Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | import pprint |
2 | import sgmllib | |
3 | import unittest | |
4 | from test import test_support | |
5 | ||
6 | ||
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records every handler callback as an event tuple.

    Each handler appends a tuple to ``self.events``.  The first item of
    the tuple names the kind of event ("starttag", "endtag", "data",
    "comment", ...) and the remaining items are the handler's arguments.
    """

    def __init__(self):
        self.events = []
        # Shortcut used by every handler below to record an event.
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces ``self.events``, and the returned
        object is that same list.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                # Fold this chunk of text into the preceding data event.
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        self.events = merged
        # Re-point the shortcut at the new list; left alone it would keep
        # appending to the discarded list and later events would be lost.
        self.append = merged.append
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record the text of a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a run of character data."""
        self.append(("data", data))

    def handle_decl(self, decl):
        """Record a declaration."""
        self.append(("decl", decl))

    def handle_entityref(self, data):
        """Record an entity reference."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration reported through unknown_decl()."""
        self.append(("unknown decl", decl))
59 | ||
60 | ||
class CDATAEventCollector(EventCollector):
    """Event collector with a dedicated handler for the <cdata> start tag."""

    def start_cdata(self, attrs):
        """Record the <cdata> start tag, then call setliteral()."""
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
65 | ||
66 | ||
class SGMLParserTestCase(unittest.TestCase):
    """Feed markup to an event collector and compare the recorded events."""

    # Parser class instantiated by the helpers below.  A test may rebind
    # it on the instance to use a specialised collector.
    collector = EventCollector

    def get_events(self, source):
        """Feed every item of *source* to a new collector; return its events.

        *source* is iterated, so a plain string is fed one character at a
        time while a list of strings is fed chunk by chunk.
        """
        parser = self.collector()
        for chunk in source:
            parser.feed(chunk)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing *source* yields exactly *expected_events*."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless feeding *source* raises sgmllib.SGMLParseError."""
        parser = self.collector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        # A DOCTYPE with an internal subset is reported as a single decl.
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
<!ELEMENT html - O EMPTY>
<!ATTLIST html
version CDATA #IMPLIED
profile CDATA 'DublinCore'>
<!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
<!ENTITY myEntity 'internal parsed entity'>
<!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
<!ENTITY % paramEntity 'name|name|name'>
%paramEntity;
<!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        # End tags are reported in document order even when misnested.
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        # The same attributes, written with different whitespace around
        # "=" and between attributes, must all parse identically.
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
            ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                          [("starttag", "a", [("b", "xxx\n\txxx"),
                                              ("c", "yyy\t\nyyy"),
                                              ("d", "\txyz\n")])
                           ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        # Inside <cdata>, comment and entity-reference syntax must come
        # through as plain data; outside it they are parsed normally.
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self.check_events("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        # Split the same markup at every interesting position and check
        # that the chunking never changes the reported events.
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
339 | ||
340 | ||
def test_main():
    """Run the SGMLParserTestCase tests through test_support."""
    test_classes = (SGMLParserTestCase,)
    test_support.run_unittest(*test_classes)


# Run the tests when this module is executed as a script.
if __name__ == "__main__":
    test_main()