Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / HTMLParser.py
CommitLineData
920dae64
AT
1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import markupbase
12import re
13
14# Regular expressions used for parsing
15
# Scanner patterns: where the next piece of markup may start.
# Normal mode stops at '&' (reference) or '<' (tag/declaration/comment/PI).
interesting_normal = re.compile('[&<]')
# CDATA mode only stops at '</' or at a '<' that is the last buffered char.
interesting_cdata = re.compile(r'<(/|\Z)')
# '&' followed by a character that could begin an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume ONE terminating character (';' or
# whatever follows the name/number); callers decide whether to keep it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# group 1: attribute name; group 2: '= value' part (optional);
# group 3: the value itself, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' is a regex escape, not a Python string escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47
48
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be non-empty)
        lineno -- line number of the error, or None when unknown
        offset -- zero-based column of the error, or None when unknown
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair in `position`."""
        assert msg
        # Hand the message to the base class so that e.args is populated;
        # without it repr(), copying and unpickling lose the message.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the location, when it is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is stored zero-based but reported as a 1-based column
            result = result + ", column %d" % (self.offset + 1)
        return result
65
66
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose start tag switches the scanner into CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Internal -- matches "&#" followed, up to the end of the buffer, only
    # by characters that could still grow into a character reference once
    # more input arrives ("&#", "&#12", "&#x", "&#x1F", ...).
    _charref_pending = re.compile(r'&#(?:[0-9]*|[xX][0-9a-fA-F]*)\Z')

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for `message` at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (see parse_starttag).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make the scanner stop only at '</' (or a trailing '<')."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make the scanner stop at every '&' and '<' again."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume self.rawdata, dispatching to the parse_*/handle_* methods.

        Whatever cannot be processed yet (an incomplete construct at the
        end of the buffer) is kept in self.rawdata for the next call.
        When `end` is true, an unterminated tag/declaration or an
        unterminated entity reference raises via self.error(); any other
        leftover text is passed to handle_data().
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # a lone '<' that starts no construct is plain text
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered character: wait for more
                    break
                if k < 0:
                    # construct is not complete yet
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the one terminating char
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # terminator was not ';': it belongs to what follows
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if self._charref_pending.match(rawdata, i):
                    # buffer ends inside a possible char ref: wait for more
                    break
                # "&#" followed by something that can never complete a
                # char ref; pass it through as text instead of stalling
                # on it for every subsequent call.
                self.handle_data("&#")
                i = self.updatepos(i, i + 2)
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # terminator was not ';': it belongs to what follows
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' starting at index i and call handle_pi().

        handle_pi() receives the text between '<?' and the first '>'.
        Returns the index just past that '>', or -1 if no '>' is buffered.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index i and dispatch it to a handler.

        Calls handle_startendtag() for a tag closed with '/>', otherwise
        handle_starttag(); the tag name and attribute names are lowercased
        and attribute values are unquoted and unescaped.  Switches to
        CDATA mode after a tag listed in CDATA_CONTENT_ELEMENTS.  Returns
        the index just past the tag, or a negative value if the tag is
        not completely buffered yet.  Raises via self.error() when junk
        remains between the attributes and the closing '>'.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without '= value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                # references are expanded in quoted and bare values alike
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at i, or -1 if incomplete.

        Raises via self.error() when the text at i cannot be a start tag:
        a '/' that is not followed by '>', or an unexpected character
        after the tag name and attributes.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if j + 1 == len(rawdata):
                    # buffer boundary: the '>' may still arrive
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index i and call handle_endtag().

        The tag name is lowercased and CDATA mode is cleared.  Returns the
        index just past the closing '>', or -1 if no '>' is buffered yet.
        Raises via self.error() when the text up to '>' is not a valid
        end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Default: treat '<tag/>' as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lowercased tag and a list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lowercased tag name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (text after '&')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text; one run may arrive in several calls."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report a declaration that is not understood; raises via error()."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with &lt; &gt; &apos; &quot; and &amp; expanded.

        Other entity and character references are left untouched.
        """
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        # Must be last, so that "&amp;lt;" becomes "&lt;" and not "<"
        s = s.replace("&amp;", "&")
        return s