Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Shared support for scanning document type declarations in HTML and XHTML. |
2 | ||
3 | This module is used as a foundation for the HTMLParser and sgmllib | |
4 | modules (indirectly, for htmllib as well). It has no documented | |
5 | public API and should not be used directly. | |
6 | ||
7 | """ | |
8 | ||
9 | import re | |
10 | ||
# Pre-compiled scanners shared by the ParserBase methods.  The two
# ``*_match`` callables are bound ``match`` methods, so they are invoked as
# ``matcher(rawdata, pos)`` and only succeed when the token starts at ``pos``.

# A declaration name: a letter followed by name characters, plus any
# trailing whitespace.
_declname_match = re.compile(
    r'[a-zA-Z][-_.a-zA-Z0-9]*\s*'
).match

# A single- or double-quoted string literal, plus any trailing whitespace.
_declstringlit_match = re.compile(
    r'(\'[^\']*\'|"[^"]*")\s*'
).match

# Comment terminator: "--", optional whitespace, then ">".
_commentclose = re.compile(
    r'--\s*>'
)

# Standard marked-section terminator: "]" "]" ">" with optional whitespace
# between the pieces.
_markedsectionclose = re.compile(
    r']\s*]\s*>'
)

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS-Word style marked-section terminator: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(
    r']\s*>'
)
20 | ||
21 | del re | |
22 | ||
23 | ||
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of those are
    defined in this class, so a subclass has to provide them.  ``error()``
    must be overridden as well, and the scanning code is written on the
    assumption that it raises.

    Scanning methods take an index into ``rawdata`` and return the index
    just past what they consumed, or a negative number when the buffer
    ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # nlines > 0 guarantees a newline exists in the range.
            pos = rawdata.rindex("\n", i, j)
            # The offset restarts after the last newline consumed.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated between tokens of a declaration.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the ``<!...>`` construct starting at rawdata[i].

        Comments and marked sections are delegated to parse_comment() and
        parse_marked_section().  For any other declaration the text between
        ``<!`` and ``>`` is passed to handle_decl() when the declaration
        name is "doctype" and to unknown_decl() otherwise.

        Returns the index just past the closing ``>``, or a negative
        number if the buffer ends before the declaration is complete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        # Look at two characters: a comment opener is "--", which a
        # one-character slice can never equal.
        opener = rawdata[j:j+2]
        if opener == "--":
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if opener[:1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j] == '[':
            # Locate [statusWord [...arbitrary SGML...]] as the body of the
            # marked section, where statusWord is one of TEMP, CDATA, IGNORE,
            # INCLUDE, RCDATA.  Note that this is extended by the Microsoft
            # Office "Save as Web" function to include [if...] and [endif].
            return self.parse_marked_section(i)

        # All other declaration elements.
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            # No extra punctuation is tolerated inside a DOCTYPE.
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an
                    # element declaration; also in data attribute
                    # specifications of attlist declarations, link type
                    # declaration subsets in linktype declarations, and
                    # link attribute specification lists in link
                    # declarations
                    self.error(
                        "unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax
    # <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting at rawdata[i] ("<![").

        When *report* is true, the text between "<![" and the section
        terminator is passed to unknown_decl().  Returns the index just
        past the terminator, or a negative number if the section is not
        complete yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', \
            "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section'
                       % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i] ("<!--").

        When *report* is true, the comment body is passed to
        handle_comment().  Returns the index just past the comment
        terminator, or -1 if no terminator is in the buffer yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset that begins at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is
        only used for position updates and error messages.  Returns the
        index of the ">" that follows the closing "]", or a negative
        number if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error(
                        "unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # Fewer than four characters left, so "<!--" cannot
                    # be told apart from a name yet; incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative number
        if the buffer ends before the declaration is complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # "#" is the last character; end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative number
        if the buffer ends before the declaration is complete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped before the
        entity name is scanned.  Returns the index just past the closing
        ">", or a negative number if the buffer ends first.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new
    # position, or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns ``(name, j)`` where *name* is the lower-cased token and
        *j* is the index after the token and its trailing whitespace.
        Returns ``(None, -1)`` when *i* is at the end of the buffer or the
        token runs up to the end of the buffer, because more input could
        still extend it.  If no name starts at *i*, error() is called.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Receive the text of a declaration this class does not interpret."""
        pass