Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """HTML 2.0 parser. |
2 | ||
3 | See the HTML 2.0 specification: | |
4 | http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html | |
5 | """ | |
6 | ||
7 | import sgmllib | |
8 | ||
9 | from formatter import AS_IS | |
10 | ||
11 | __all__ = ["HTMLParser", "HTMLParseError"] | |
12 | ||
13 | ||
class HTMLParseError(sgmllib.SGMLParseError):
    """Exception raised by HTMLParser.error() for unparsable HTML input.

    It derives from sgmllib.SGMLParseError, so code that already catches
    that exception also catches this one.
    """
16 | ||
17 | ||
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tag handlers use the sgmllib naming scheme: start_<tag>() and
    end_<tag>() for container elements, do_<tag>() for elements without
    an end tag.  Rendering is delegated to the formatter object given to
    the constructor.

    Font specifications pushed onto the formatter are 4-tuples of
    (size, italic, bold, teletype); AS_IS leaves a component unchanged.

    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.

        """
        # SGMLParser is initialised through an explicit base-class call,
        # as everywhere else in this class.
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse error by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the SGML parser state and all HTML-specific state."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None    # None: not saving; string: save buffer
        self.isindex = 0        # set to 1 once <ISINDEX> is seen
        self.title = None       # text of <TITLE>, set by end_title()
        self.base = None        # HREF of <BASE>, set by do_base()
        self.anchor = None      # HREF of the anchor currently open, if any
        self.anchorlist = []    # every non-empty HREF seen, in order
        self.nofill = 0         # nesting depth of literal (<PRE>-like) text
        self.list_stack = []    # entries are [kind, label, item counter]

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    #     shouldn't need to be overridden

    def handle_data(self, data):
        """Append data to the save buffer, or pass it to the formatter.

        Outside save mode the data goes to add_literal_data() when the
        nofill flag is set and to add_flowing_data() otherwise.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Raise the documented exception explicitly.  Without this
            # check the failure was an AttributeError from None.split(),
            # or a silent None return when nofill was set.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the document title in the save buffer."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of <BASE> in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Note that the document contains an <ISINDEX> element."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, tag):
        """Shared start handler for <H1>..<H6>; tag is the font size name."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        """Shared end handler for <H1>..<H6>."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._heading_bgn('h1')
    def end_h1(self): self._heading_end()

    def start_h2(self, attrs): self._heading_bgn('h2')
    def end_h2(self): self._heading_end()

    def start_h3(self, attrs): self._heading_bgn('h3')
    def end_h3(self): self._heading_end()

    def start_h4(self, attrs): self._heading_bgn('h4')
    def end_h4(self): self._heading_end()

    def start_h5(self, attrs): self._heading_bgn('h5')
    def end_h5(self): self._heading_end()

    def start_h6(self, attrs): self._heading_bgn('h6')
    def end_h6(self): self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter literal mode: teletype font, and data is no longer filled."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave one level of literal mode; nofill never drops below 0."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        # The end_paragraph() argument is true only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        The item counter of the innermost list_stack entry is incremented
        first.  Outside any list the label is '*' with a counter of 0.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list; the TYPE attribute selects the label format.

        A one-character TYPE value gets a trailing '.' appended; the
        default label format is '1.'.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1: v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack: del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close a pending <DD>, if one is open.

        bl is passed through to the formatter's end_paragraph() method.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and TYPE from <A> and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; the TYPE
        value is additionally lower-cased.  Missing attributes are passed
        as empty strings.

        """
        href = ''
        name = ''
        anchor_type = ''    # not called 'type', to keep the builtin visible
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = value.lower()
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and pass them to handle_image().

        ALT defaults to '(image)'.  WIDTH and HEIGHT default to 0 and
        keep that value when the attribute is not a valid integer.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try: width = int(value)
                except ValueError: pass
            elif attrname == 'height':
                try: height = int(value)
                except ValueError: pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
444 | ||
445 | ||
446 | def test(args = None): | |
447 | import sys, formatter | |
448 | ||
449 | if not args: | |
450 | args = sys.argv[1:] | |
451 | ||
452 | silent = args and args[0] == '-s' | |
453 | if silent: | |
454 | del args[0] | |
455 | ||
456 | if args: | |
457 | file = args[0] | |
458 | else: | |
459 | file = 'test.html' | |
460 | ||
461 | if file == '-': | |
462 | f = sys.stdin | |
463 | else: | |
464 | try: | |
465 | f = open(file, 'r') | |
466 | except IOError, msg: | |
467 | print file, ":", msg | |
468 | sys.exit(1) | |
469 | ||
470 | data = f.read() | |
471 | ||
472 | if f is not sys.stdin: | |
473 | f.close() | |
474 | ||
475 | if silent: | |
476 | f = formatter.NullFormatter() | |
477 | else: | |
478 | f = formatter.AbstractFormatter(formatter.DumbWriter()) | |
479 | ||
480 | p = HTMLParser(f) | |
481 | p.feed(data) | |
482 | p.close() | |
483 | ||
484 | ||
# Script entry point: run the command-line self-test.
if __name__ == '__main__':
    test()