# Initial commit of OpenSPARC T2 architecture model.
# [OpenSPARC-T2-SAM] / sam-t2 / devtools / v9 / lib / python2.4 / markupbase.py
# CommitLineData
# 920dae64
# AT
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

# Bound ``match``/``search`` helpers.  All of them are used with an explicit
# start position so the scanners can work on the raw buffer without slicing.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The scanning methods read the input from ``self.rawdata``, which this
    class never assigns; the subclass has to provide it.  Subclasses must
    also override error() and supply handle_decl() and handle_comment(),
    which are called here but not defined.

    Convention shared by the parse_*/_parse_* methods: they take an index
    into ``rawdata`` and return the index at which scanning should resume,
    or -1 when the buffer ends before the construct is complete.
    """

    def __init__(self):
        # The base class is not usable on its own.
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; must be overridden by subclasses.

        The scanners below continue with the next statement after calling
        error(), so they rely on the override not returning normally.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated inside a declaration; empty by default.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        Calls handle_decl() for a DOCTYPE declaration and unknown_decl()
        for any other named declaration, passing the text between "<!"
        and ">".  Returns the index just past the closing ">", or -1 if
        the declaration is incomplete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        # A comment opener is two characters, so compare a two-character
        # slice.  NOTE: the guard above already returns for a leading "-",
        # so this branch is currently shadowed by it.
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            # Any of the sub-scanners above may have hit the buffer end.
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at rawdata[i].

        When `report` is true, unknown_decl() receives the text between
        "<![" and the closing delimiter.  Returns the index just past the
        closing delimiter, or -1 if the section is incomplete.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            # Relies on error() raising; `match` is not bound on this path.
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return the index just past the closing
    # delimiter, or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        When `report` is true, handle_comment() receives the comment body.
        Returns the index just past the terminator, or -1 if the comment
        is not terminated in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index of the '>' that follows the trailing ']' (and any
    # whitespace after it), or -1 if the buffer ends first.
    def _parse_doctype_subset(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four characters
                    # left, so "<!--" cannot be recognised yet)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        # One search, without copying the tail of the buffer.
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            # element name runs into the end of the buffer; incomplete
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    # type name runs into the end of the buffer; incomplete
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the whitespace between '%' and the name
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token starting at rawdata[i].  Returns the
    # pair (lower-cased name, index just past the name and any whitespace
    # following it), or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                # The match runs to the end of the buffer, so the token
                # might continue in data not yet received.
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        pass