Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / htmllib.py
CommitLineData
920dae64
AT
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
6
7import sgmllib
8
9from formatter import AS_IS
10
11__all__ = ["HTMLParser", "HTMLParseError"]
12
13
14class HTMLParseError(sgmllib.SGMLParseError):
15 """Error raised when an HTML document can't be parsed."""
16
17
18class HTMLParser(sgmllib.SGMLParser):
19 """This is the basic HTML parser class.
20
21 It supports all entity names required by the XHTML 1.0 Recommendation.
22 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
23 elements.
24
25 """
26
27 from htmlentitydefs import entitydefs
28
29 def __init__(self, formatter, verbose=0):
30 """Creates an instance of the HTMLParser class.
31
32 The formatter parameter is the formatter instance associated with
33 the parser.
34
35 """
36 sgmllib.SGMLParser.__init__(self, verbose)
37 self.formatter = formatter
38
39 def error(self, message):
40 raise HTMLParseError(message)
41
42 def reset(self):
43 sgmllib.SGMLParser.reset(self)
44 self.savedata = None
45 self.isindex = 0
46 self.title = None
47 self.base = None
48 self.anchor = None
49 self.anchorlist = []
50 self.nofill = 0
51 self.list_stack = []
52
53 # ------ Methods used internally; some may be overridden
54
55 # --- Formatter interface, taking care of 'savedata' mode;
56 # shouldn't need to be overridden
57
58 def handle_data(self, data):
59 if self.savedata is not None:
60 self.savedata = self.savedata + data
61 else:
62 if self.nofill:
63 self.formatter.add_literal_data(data)
64 else:
65 self.formatter.add_flowing_data(data)
66
67 # --- Hooks to save data; shouldn't need to be overridden
68
69 def save_bgn(self):
70 """Begins saving character data in a buffer instead of sending it
71 to the formatter object.
72
73 Retrieve the stored data via the save_end() method. Use of the
74 save_bgn() / save_end() pair may not be nested.
75
76 """
77 self.savedata = ''
78
79 def save_end(self):
80 """Ends buffering character data and returns all data saved since
81 the preceding call to the save_bgn() method.
82
83 If the nofill flag is false, whitespace is collapsed to single
84 spaces. A call to this method without a preceding call to the
85 save_bgn() method will raise a TypeError exception.
86
87 """
88 data = self.savedata
89 self.savedata = None
90 if not self.nofill:
91 data = ' '.join(data.split())
92 return data
93
94 # --- Hooks for anchors; should probably be overridden
95
96 def anchor_bgn(self, href, name, type):
97 """This method is called at the start of an anchor region.
98
99 The arguments correspond to the attributes of the <A> tag with
100 the same names. The default implementation maintains a list of
101 hyperlinks (defined by the HREF attribute for <A> tags) within
102 the document. The list of hyperlinks is available as the data
103 attribute anchorlist.
104
105 """
106 self.anchor = href
107 if self.anchor:
108 self.anchorlist.append(href)
109
110 def anchor_end(self):
111 """This method is called at the end of an anchor region.
112
113 The default implementation adds a textual footnote marker using an
114 index into the list of hyperlinks created by the anchor_bgn()method.
115
116 """
117 if self.anchor:
118 self.handle_data("[%d]" % len(self.anchorlist))
119 self.anchor = None
120
121 # --- Hook for images; should probably be overridden
122
123 def handle_image(self, src, alt, *args):
124 """This method is called to handle images.
125
126 The default implementation simply passes the alt value to the
127 handle_data() method.
128
129 """
130 self.handle_data(alt)
131
132 # --------- Top level elememts
133
134 def start_html(self, attrs): pass
135 def end_html(self): pass
136
137 def start_head(self, attrs): pass
138 def end_head(self): pass
139
140 def start_body(self, attrs): pass
141 def end_body(self): pass
142
143 # ------ Head elements
144
145 def start_title(self, attrs):
146 self.save_bgn()
147
148 def end_title(self):
149 self.title = self.save_end()
150
151 def do_base(self, attrs):
152 for a, v in attrs:
153 if a == 'href':
154 self.base = v
155
156 def do_isindex(self, attrs):
157 self.isindex = 1
158
159 def do_link(self, attrs):
160 pass
161
162 def do_meta(self, attrs):
163 pass
164
165 def do_nextid(self, attrs): # Deprecated
166 pass
167
168 # ------ Body elements
169
170 # --- Headings
171
172 def start_h1(self, attrs):
173 self.formatter.end_paragraph(1)
174 self.formatter.push_font(('h1', 0, 1, 0))
175
176 def end_h1(self):
177 self.formatter.end_paragraph(1)
178 self.formatter.pop_font()
179
180 def start_h2(self, attrs):
181 self.formatter.end_paragraph(1)
182 self.formatter.push_font(('h2', 0, 1, 0))
183
184 def end_h2(self):
185 self.formatter.end_paragraph(1)
186 self.formatter.pop_font()
187
188 def start_h3(self, attrs):
189 self.formatter.end_paragraph(1)
190 self.formatter.push_font(('h3', 0, 1, 0))
191
192 def end_h3(self):
193 self.formatter.end_paragraph(1)
194 self.formatter.pop_font()
195
196 def start_h4(self, attrs):
197 self.formatter.end_paragraph(1)
198 self.formatter.push_font(('h4', 0, 1, 0))
199
200 def end_h4(self):
201 self.formatter.end_paragraph(1)
202 self.formatter.pop_font()
203
204 def start_h5(self, attrs):
205 self.formatter.end_paragraph(1)
206 self.formatter.push_font(('h5', 0, 1, 0))
207
208 def end_h5(self):
209 self.formatter.end_paragraph(1)
210 self.formatter.pop_font()
211
212 def start_h6(self, attrs):
213 self.formatter.end_paragraph(1)
214 self.formatter.push_font(('h6', 0, 1, 0))
215
216 def end_h6(self):
217 self.formatter.end_paragraph(1)
218 self.formatter.pop_font()
219
220 # --- Block Structuring Elements
221
222 def do_p(self, attrs):
223 self.formatter.end_paragraph(1)
224
225 def start_pre(self, attrs):
226 self.formatter.end_paragraph(1)
227 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
228 self.nofill = self.nofill + 1
229
230 def end_pre(self):
231 self.formatter.end_paragraph(1)
232 self.formatter.pop_font()
233 self.nofill = max(0, self.nofill - 1)
234
235 def start_xmp(self, attrs):
236 self.start_pre(attrs)
237 self.setliteral('xmp') # Tell SGML parser
238
239 def end_xmp(self):
240 self.end_pre()
241
242 def start_listing(self, attrs):
243 self.start_pre(attrs)
244 self.setliteral('listing') # Tell SGML parser
245
246 def end_listing(self):
247 self.end_pre()
248
249 def start_address(self, attrs):
250 self.formatter.end_paragraph(0)
251 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
252
253 def end_address(self):
254 self.formatter.end_paragraph(0)
255 self.formatter.pop_font()
256
257 def start_blockquote(self, attrs):
258 self.formatter.end_paragraph(1)
259 self.formatter.push_margin('blockquote')
260
261 def end_blockquote(self):
262 self.formatter.end_paragraph(1)
263 self.formatter.pop_margin()
264
265 # --- List Elements
266
267 def start_ul(self, attrs):
268 self.formatter.end_paragraph(not self.list_stack)
269 self.formatter.push_margin('ul')
270 self.list_stack.append(['ul', '*', 0])
271
272 def end_ul(self):
273 if self.list_stack: del self.list_stack[-1]
274 self.formatter.end_paragraph(not self.list_stack)
275 self.formatter.pop_margin()
276
277 def do_li(self, attrs):
278 self.formatter.end_paragraph(0)
279 if self.list_stack:
280 [dummy, label, counter] = top = self.list_stack[-1]
281 top[2] = counter = counter+1
282 else:
283 label, counter = '*', 0
284 self.formatter.add_label_data(label, counter)
285
286 def start_ol(self, attrs):
287 self.formatter.end_paragraph(not self.list_stack)
288 self.formatter.push_margin('ol')
289 label = '1.'
290 for a, v in attrs:
291 if a == 'type':
292 if len(v) == 1: v = v + '.'
293 label = v
294 self.list_stack.append(['ol', label, 0])
295
296 def end_ol(self):
297 if self.list_stack: del self.list_stack[-1]
298 self.formatter.end_paragraph(not self.list_stack)
299 self.formatter.pop_margin()
300
301 def start_menu(self, attrs):
302 self.start_ul(attrs)
303
304 def end_menu(self):
305 self.end_ul()
306
307 def start_dir(self, attrs):
308 self.start_ul(attrs)
309
310 def end_dir(self):
311 self.end_ul()
312
313 def start_dl(self, attrs):
314 self.formatter.end_paragraph(1)
315 self.list_stack.append(['dl', '', 0])
316
317 def end_dl(self):
318 self.ddpop(1)
319 if self.list_stack: del self.list_stack[-1]
320
321 def do_dt(self, attrs):
322 self.ddpop()
323
324 def do_dd(self, attrs):
325 self.ddpop()
326 self.formatter.push_margin('dd')
327 self.list_stack.append(['dd', '', 0])
328
329 def ddpop(self, bl=0):
330 self.formatter.end_paragraph(bl)
331 if self.list_stack:
332 if self.list_stack[-1][0] == 'dd':
333 del self.list_stack[-1]
334 self.formatter.pop_margin()
335
336 # --- Phrase Markup
337
338 # Idiomatic Elements
339
340 def start_cite(self, attrs): self.start_i(attrs)
341 def end_cite(self): self.end_i()
342
343 def start_code(self, attrs): self.start_tt(attrs)
344 def end_code(self): self.end_tt()
345
346 def start_em(self, attrs): self.start_i(attrs)
347 def end_em(self): self.end_i()
348
349 def start_kbd(self, attrs): self.start_tt(attrs)
350 def end_kbd(self): self.end_tt()
351
352 def start_samp(self, attrs): self.start_tt(attrs)
353 def end_samp(self): self.end_tt()
354
355 def start_strong(self, attrs): self.start_b(attrs)
356 def end_strong(self): self.end_b()
357
358 def start_var(self, attrs): self.start_i(attrs)
359 def end_var(self): self.end_i()
360
361 # Typographic Elements
362
363 def start_i(self, attrs):
364 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
365 def end_i(self):
366 self.formatter.pop_font()
367
368 def start_b(self, attrs):
369 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
370 def end_b(self):
371 self.formatter.pop_font()
372
373 def start_tt(self, attrs):
374 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
375 def end_tt(self):
376 self.formatter.pop_font()
377
378 def start_a(self, attrs):
379 href = ''
380 name = ''
381 type = ''
382 for attrname, value in attrs:
383 value = value.strip()
384 if attrname == 'href':
385 href = value
386 if attrname == 'name':
387 name = value
388 if attrname == 'type':
389 type = value.lower()
390 self.anchor_bgn(href, name, type)
391
392 def end_a(self):
393 self.anchor_end()
394
395 # --- Line Break
396
397 def do_br(self, attrs):
398 self.formatter.add_line_break()
399
400 # --- Horizontal Rule
401
402 def do_hr(self, attrs):
403 self.formatter.add_hor_rule()
404
405 # --- Image
406
407 def do_img(self, attrs):
408 align = ''
409 alt = '(image)'
410 ismap = ''
411 src = ''
412 width = 0
413 height = 0
414 for attrname, value in attrs:
415 if attrname == 'align':
416 align = value
417 if attrname == 'alt':
418 alt = value
419 if attrname == 'ismap':
420 ismap = value
421 if attrname == 'src':
422 src = value
423 if attrname == 'width':
424 try: width = int(value)
425 except ValueError: pass
426 if attrname == 'height':
427 try: height = int(value)
428 except ValueError: pass
429 self.handle_image(src, alt, ismap, align, width, height)
430
431 # --- Really Old Unofficial Deprecated Stuff
432
433 def do_plaintext(self, attrs):
434 self.start_pre(attrs)
435 self.setnomoretags() # Tell SGML parser
436
437 # --- Unhandled tags
438
439 def unknown_starttag(self, tag, attrs):
440 pass
441
442 def unknown_endtag(self, tag):
443 pass
444
445
def test(args = None):
    """Parse one HTML file and feed it through a formatter.

    args is an optional sequence of command-line style arguments; when it
    is empty or None, sys.argv[1:] is used instead.  A leading '-s'
    selects formatter.NullFormatter() (silent mode); otherwise output is
    produced by AbstractFormatter(DumbWriter()).  The next argument names
    the file to parse: 'test.html' when absent, standard input for '-'.

    If the file cannot be opened, a "<name> : <error>" line is printed
    and the process exits with status 1.  The caller's args sequence is
    never modified.

    """
    import sys, formatter

    if not args:
        args = sys.argv[1:]
    # Work on a private copy: stripping '-s' below must not alter the
    # list object the caller handed in.
    args = list(args)

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        infile = sys.stdin
    else:
        try:
            infile = open(path, 'r')
        except IOError:
            # Fetch the exception via sys.exc_info() and write the line
            # explicitly so the code parses on every interpreter version;
            # the text produced is the same "<name> : <error>" line.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)

    # Close the file even if read() fails; stdin is left open.
    try:
        data = infile.read()
    finally:
        if infile is not sys.stdin:
            infile.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()
483
484
if __name__ == '__main__':
    # Run the self-test driver only when executed as a script, never on
    # import.
    test()