Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | import test.test_support, unittest |
2 | import sys, codecs, htmlentitydefs, unicodedata | |
3 | ||
class PosReturn:
    # Error-handler callback whose restart position is configurable:
    # set the "pos" attribute before triggering the codec error.

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        # Always answers with the replacement u"<?>" together with the
        # position that was configured when the call started.
        requested = self.pos
        size = len(exc.object)
        # A negative request counts from the end of the input.
        absolute = requested
        if requested < 0:
            absolute = size + requested
        # A position that does not lie beyond the start of the error makes
        # no progress; point the next call at the end of the input so the
        # codec cannot loop forever.
        if not absolute > exc.start:
            self.pos = size
        return (u"<?>", requested)
20 | ||
21 | class CodecCallbackTest(unittest.TestCase): | |
22 | ||
23 | def test_xmlcharrefreplace(self): | |
24 | # replace unencodable characters which numeric character entities. | |
25 | # For ascii, latin-1 and charmaps this is completely implemented | |
26 | # in C and should be reasonably fast. | |
27 | s = u"\u30b9\u30d1\u30e2 \xe4nd eggs" | |
28 | self.assertEqual( | |
29 | s.encode("ascii", "xmlcharrefreplace"), | |
30 | "スパモ änd eggs" | |
31 | ) | |
32 | self.assertEqual( | |
33 | s.encode("latin-1", "xmlcharrefreplace"), | |
34 | "スパモ \xe4nd eggs" | |
35 | ) | |
36 | ||
37 | def test_xmlcharnamereplace(self): | |
38 | # This time use a named character entity for unencodable | |
39 | # characters, if one is available. | |
40 | ||
41 | def xmlcharnamereplace(exc): | |
42 | if not isinstance(exc, UnicodeEncodeError): | |
43 | raise TypeError("don't know how to handle %r" % exc) | |
44 | l = [] | |
45 | for c in exc.object[exc.start:exc.end]: | |
46 | try: | |
47 | l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)]) | |
48 | except KeyError: | |
49 | l.append(u"&#%d;" % ord(c)) | |
50 | return (u"".join(l), exc.end) | |
51 | ||
52 | codecs.register_error( | |
53 | "test.xmlcharnamereplace", xmlcharnamereplace) | |
54 | ||
55 | sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" | |
56 | sout = "«ℜ» = ⟨ሴ€⟩" | |
57 | self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) | |
58 | sout = "\xabℜ\xbb = ⟨ሴ€⟩" | |
59 | self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) | |
60 | sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩" | |
61 | self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) | |
62 | ||
63 | def test_uninamereplace(self): | |
64 | # We're using the names from the unicode database this time, | |
65 | # and we're doing "syntax highlighting" here, i.e. we include | |
66 | # the replaced text in ANSI escape sequences. For this it is | |
67 | # useful that the error handler is not called for every single | |
68 | # unencodable character, but for a complete sequence of | |
69 | # unencodable characters, otherwise we would output many | |
70 | # unneccessary escape sequences. | |
71 | ||
72 | def uninamereplace(exc): | |
73 | if not isinstance(exc, UnicodeEncodeError): | |
74 | raise TypeError("don't know how to handle %r" % exc) | |
75 | l = [] | |
76 | for c in exc.object[exc.start:exc.end]: | |
77 | l.append(unicodedata.name(c, u"0x%x" % ord(c))) | |
78 | return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end) | |
79 | ||
80 | codecs.register_error( | |
81 | "test.uninamereplace", uninamereplace) | |
82 | ||
83 | sin = u"\xac\u1234\u20ac\u8000" | |
84 | sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" | |
85 | self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) | |
86 | ||
87 | sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" | |
88 | self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) | |
89 | ||
90 | sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m" | |
91 | self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) | |
92 | ||
93 | def test_backslashescape(self): | |
94 | # Does the same as the "unicode-escape" encoding, but with different | |
95 | # base encodings. | |
96 | sin = u"a\xac\u1234\u20ac\u8000" | |
97 | if sys.maxunicode > 0xffff: | |
98 | sin += unichr(sys.maxunicode) | |
99 | sout = "a\\xac\\u1234\\u20ac\\u8000" | |
100 | if sys.maxunicode > 0xffff: | |
101 | sout += "\\U%08x" % sys.maxunicode | |
102 | self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) | |
103 | ||
104 | sout = "a\xac\\u1234\\u20ac\\u8000" | |
105 | if sys.maxunicode > 0xffff: | |
106 | sout += "\\U%08x" % sys.maxunicode | |
107 | self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) | |
108 | ||
109 | sout = "a\xac\\u1234\xa4\\u8000" | |
110 | if sys.maxunicode > 0xffff: | |
111 | sout += "\\U%08x" % sys.maxunicode | |
112 | self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) | |
113 | ||
114 | def test_decoderelaxedutf8(self): | |
115 | # This is the test for a decoding callback handler, | |
116 | # that relaxes the UTF-8 minimal encoding restriction. | |
117 | # A null byte that is encoded as "\xc0\x80" will be | |
118 | # decoded as a null byte. All other illegal sequences | |
119 | # will be handled strictly. | |
120 | def relaxedutf8(exc): | |
121 | if not isinstance(exc, UnicodeDecodeError): | |
122 | raise TypeError("don't know how to handle %r" % exc) | |
123 | if exc.object[exc.start:exc.end].startswith("\xc0\x80"): | |
124 | return (u"\x00", exc.start+2) # retry after two bytes | |
125 | else: | |
126 | raise exc | |
127 | ||
128 | codecs.register_error( | |
129 | "test.relaxedutf8", relaxedutf8) | |
130 | ||
131 | sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" | |
132 | sout = u"a\x00b\x00c\xfc\x00\x00" | |
133 | self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) | |
134 | sin = "\xc0\x80\xc0\x81" | |
135 | self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8") | |
136 | ||
137 | def test_charmapencode(self): | |
138 | # For charmap encodings the replacement string will be | |
139 | # mapped through the encoding again. This means, that | |
140 | # to be able to use e.g. the "replace" handler, the | |
141 | # charmap has to have a mapping for "?". | |
142 | charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) | |
143 | sin = u"abc" | |
144 | sout = "AABBCC" | |
145 | self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout) | |
146 | ||
147 | sin = u"abcA" | |
148 | self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) | |
149 | ||
150 | charmap[ord("?")] = "XYZ" | |
151 | sin = u"abcDEF" | |
152 | sout = "AABBCCXYZXYZXYZ" | |
153 | self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout) | |
154 | ||
155 | charmap[ord("?")] = u"XYZ" | |
156 | self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) | |
157 | ||
158 | charmap[ord("?")] = u"XYZ" | |
159 | self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) | |
160 | ||
161 | def test_decodeunicodeinternal(self): | |
162 | self.assertRaises( | |
163 | UnicodeDecodeError, | |
164 | "\x00\x00\x00\x00\x00".decode, | |
165 | "unicode-internal", | |
166 | ) | |
167 | if sys.maxunicode > 0xffff: | |
168 | def handler_unicodeinternal(exc): | |
169 | if not isinstance(exc, UnicodeDecodeError): | |
170 | raise TypeError("don't know how to handle %r" % exc) | |
171 | return (u"\x01", 1) | |
172 | ||
173 | self.assertEqual( | |
174 | "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), | |
175 | u"\u0000" | |
176 | ) | |
177 | ||
178 | self.assertEqual( | |
179 | "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), | |
180 | u"\u0000\ufffd" | |
181 | ) | |
182 | ||
183 | codecs.register_error("test.hui", handler_unicodeinternal) | |
184 | ||
185 | self.assertEqual( | |
186 | "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), | |
187 | u"\u0000\u0001\u0000" | |
188 | ) | |
189 | ||
190 | def test_callbacks(self): | |
191 | def handler1(exc): | |
192 | if not isinstance(exc, UnicodeEncodeError) \ | |
193 | and not isinstance(exc, UnicodeDecodeError): | |
194 | raise TypeError("don't know how to handle %r" % exc) | |
195 | l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] | |
196 | return (u"[%s]" % u"".join(l), exc.end) | |
197 | ||
198 | codecs.register_error("test.handler1", handler1) | |
199 | ||
200 | def handler2(exc): | |
201 | if not isinstance(exc, UnicodeDecodeError): | |
202 | raise TypeError("don't know how to handle %r" % exc) | |
203 | l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] | |
204 | return (u"[%s]" % u"".join(l), exc.end+1) # skip one character | |
205 | ||
206 | codecs.register_error("test.handler2", handler2) | |
207 | ||
208 | s = "\x00\x81\x7f\x80\xff" | |
209 | ||
210 | self.assertEqual( | |
211 | s.decode("ascii", "test.handler1"), | |
212 | u"\x00[<129>]\x7f[<128>][<255>]" | |
213 | ) | |
214 | self.assertEqual( | |
215 | s.decode("ascii", "test.handler2"), | |
216 | u"\x00[<129>][<128>]" | |
217 | ) | |
218 | ||
219 | self.assertEqual( | |
220 | "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), | |
221 | u"\u3042[<92><117><51><120>]xx" | |
222 | ) | |
223 | ||
224 | self.assertEqual( | |
225 | "\\u3042\u3xx".decode("unicode-escape", "test.handler1"), | |
226 | u"\u3042[<92><117><51><120><120>]" | |
227 | ) | |
228 | ||
229 | self.assertEqual( | |
230 | codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0], | |
231 | u"z[<98>][<99>]" | |
232 | ) | |
233 | ||
234 | self.assertEqual( | |
235 | u"g\xfc\xdfrk".encode("ascii", "test.handler1"), | |
236 | u"g[<252><223>]rk" | |
237 | ) | |
238 | ||
239 | self.assertEqual( | |
240 | u"g\xfc\xdf".encode("ascii", "test.handler1"), | |
241 | u"g[<252><223>]" | |
242 | ) | |
243 | ||
244 | def test_longstrings(self): | |
245 | # test long strings to check for memory overflow problems | |
246 | errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] | |
247 | # register the handlers under different names, | |
248 | # to prevent the codec from recognizing the name | |
249 | for err in errors: | |
250 | codecs.register_error("test." + err, codecs.lookup_error(err)) | |
251 | l = 1000 | |
252 | errors += [ "test." + err for err in errors ] | |
253 | for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: | |
254 | for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): | |
255 | for err in errors: | |
256 | try: | |
257 | uni.encode(enc, err) | |
258 | except UnicodeError: | |
259 | pass | |
260 | ||
261 | def check_exceptionobjectargs(self, exctype, args, msg): | |
262 | # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion | |
263 | # check with one missing argument | |
264 | self.assertRaises(TypeError, exctype, *args[:-1]) | |
265 | # check with one argument too much | |
266 | self.assertRaises(TypeError, exctype, *(args + ["too much"])) | |
267 | # check with one argument of the wrong type | |
268 | wrongargs = [ "spam", u"eggs", 42, 1.0, None ] | |
269 | for i in xrange(len(args)): | |
270 | for wrongarg in wrongargs: | |
271 | if type(wrongarg) is type(args[i]): | |
272 | continue | |
273 | # build argument array | |
274 | callargs = [] | |
275 | for j in xrange(len(args)): | |
276 | if i==j: | |
277 | callargs.append(wrongarg) | |
278 | else: | |
279 | callargs.append(args[i]) | |
280 | self.assertRaises(TypeError, exctype, *callargs) | |
281 | ||
282 | # check with the correct number and type of arguments | |
283 | exc = exctype(*args) | |
284 | self.assertEquals(str(exc), msg) | |
285 | ||
286 | def test_unicodeencodeerror(self): | |
287 | self.check_exceptionobjectargs( | |
288 | UnicodeEncodeError, | |
289 | ["ascii", u"g\xfcrk", 1, 2, "ouch"], | |
290 | "'ascii' codec can't encode character u'\\xfc' in position 1: ouch" | |
291 | ) | |
292 | self.check_exceptionobjectargs( | |
293 | UnicodeEncodeError, | |
294 | ["ascii", u"g\xfcrk", 1, 4, "ouch"], | |
295 | "'ascii' codec can't encode characters in position 1-3: ouch" | |
296 | ) | |
297 | self.check_exceptionobjectargs( | |
298 | UnicodeEncodeError, | |
299 | ["ascii", u"\xfcx", 0, 1, "ouch"], | |
300 | "'ascii' codec can't encode character u'\\xfc' in position 0: ouch" | |
301 | ) | |
302 | self.check_exceptionobjectargs( | |
303 | UnicodeEncodeError, | |
304 | ["ascii", u"\u0100x", 0, 1, "ouch"], | |
305 | "'ascii' codec can't encode character u'\\u0100' in position 0: ouch" | |
306 | ) | |
307 | self.check_exceptionobjectargs( | |
308 | UnicodeEncodeError, | |
309 | ["ascii", u"\uffffx", 0, 1, "ouch"], | |
310 | "'ascii' codec can't encode character u'\\uffff' in position 0: ouch" | |
311 | ) | |
312 | if sys.maxunicode > 0xffff: | |
313 | self.check_exceptionobjectargs( | |
314 | UnicodeEncodeError, | |
315 | ["ascii", u"\U00010000x", 0, 1, "ouch"], | |
316 | "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch" | |
317 | ) | |
318 | ||
319 | def test_unicodedecodeerror(self): | |
320 | self.check_exceptionobjectargs( | |
321 | UnicodeDecodeError, | |
322 | ["ascii", "g\xfcrk", 1, 2, "ouch"], | |
323 | "'ascii' codec can't decode byte 0xfc in position 1: ouch" | |
324 | ) | |
325 | self.check_exceptionobjectargs( | |
326 | UnicodeDecodeError, | |
327 | ["ascii", "g\xfcrk", 1, 3, "ouch"], | |
328 | "'ascii' codec can't decode bytes in position 1-2: ouch" | |
329 | ) | |
330 | ||
331 | def test_unicodetranslateerror(self): | |
332 | self.check_exceptionobjectargs( | |
333 | UnicodeTranslateError, | |
334 | [u"g\xfcrk", 1, 2, "ouch"], | |
335 | "can't translate character u'\\xfc' in position 1: ouch" | |
336 | ) | |
337 | self.check_exceptionobjectargs( | |
338 | UnicodeTranslateError, | |
339 | [u"g\u0100rk", 1, 2, "ouch"], | |
340 | "can't translate character u'\\u0100' in position 1: ouch" | |
341 | ) | |
342 | self.check_exceptionobjectargs( | |
343 | UnicodeTranslateError, | |
344 | [u"g\uffffrk", 1, 2, "ouch"], | |
345 | "can't translate character u'\\uffff' in position 1: ouch" | |
346 | ) | |
347 | if sys.maxunicode > 0xffff: | |
348 | self.check_exceptionobjectargs( | |
349 | UnicodeTranslateError, | |
350 | [u"g\U00010000rk", 1, 2, "ouch"], | |
351 | "can't translate character u'\\U00010000' in position 1: ouch" | |
352 | ) | |
353 | self.check_exceptionobjectargs( | |
354 | UnicodeTranslateError, | |
355 | [u"g\xfcrk", 1, 3, "ouch"], | |
356 | "can't translate characters in position 1-2: ouch" | |
357 | ) | |
358 | ||
359 | def test_badandgoodstrictexceptions(self): | |
360 | # "strict" complains about a non-exception passed in | |
361 | self.assertRaises( | |
362 | TypeError, | |
363 | codecs.strict_errors, | |
364 | 42 | |
365 | ) | |
366 | # "strict" complains about the wrong exception type | |
367 | self.assertRaises( | |
368 | Exception, | |
369 | codecs.strict_errors, | |
370 | Exception("ouch") | |
371 | ) | |
372 | ||
373 | # If the correct exception is passed in, "strict" raises it | |
374 | self.assertRaises( | |
375 | UnicodeEncodeError, | |
376 | codecs.strict_errors, | |
377 | UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch") | |
378 | ) | |
379 | ||
380 | def test_badandgoodignoreexceptions(self): | |
381 | # "ignore" complains about a non-exception passed in | |
382 | self.assertRaises( | |
383 | TypeError, | |
384 | codecs.ignore_errors, | |
385 | 42 | |
386 | ) | |
387 | # "ignore" complains about the wrong exception type | |
388 | self.assertRaises( | |
389 | TypeError, | |
390 | codecs.ignore_errors, | |
391 | UnicodeError("ouch") | |
392 | ) | |
393 | # If the correct exception is passed in, "ignore" returns an empty replacement | |
394 | self.assertEquals( | |
395 | codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), | |
396 | (u"", 1) | |
397 | ) | |
398 | self.assertEquals( | |
399 | codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), | |
400 | (u"", 1) | |
401 | ) | |
402 | self.assertEquals( | |
403 | codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), | |
404 | (u"", 1) | |
405 | ) | |
406 | ||
407 | def test_badandgoodreplaceexceptions(self): | |
408 | # "replace" complains about a non-exception passed in | |
409 | self.assertRaises( | |
410 | TypeError, | |
411 | codecs.replace_errors, | |
412 | 42 | |
413 | ) | |
414 | # "replace" complains about the wrong exception type | |
415 | self.assertRaises( | |
416 | TypeError, | |
417 | codecs.replace_errors, | |
418 | UnicodeError("ouch") | |
419 | ) | |
420 | # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement | |
421 | self.assertEquals( | |
422 | codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), | |
423 | (u"?", 1) | |
424 | ) | |
425 | self.assertEquals( | |
426 | codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), | |
427 | (u"\ufffd", 1) | |
428 | ) | |
429 | self.assertEquals( | |
430 | codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), | |
431 | (u"\ufffd", 1) | |
432 | ) | |
433 | ||
434 | def test_badandgoodxmlcharrefreplaceexceptions(self): | |
435 | # "xmlcharrefreplace" complains about a non-exception passed in | |
436 | self.assertRaises( | |
437 | TypeError, | |
438 | codecs.xmlcharrefreplace_errors, | |
439 | 42 | |
440 | ) | |
441 | # "xmlcharrefreplace" complains about the wrong exception types | |
442 | self.assertRaises( | |
443 | TypeError, | |
444 | codecs.xmlcharrefreplace_errors, | |
445 | UnicodeError("ouch") | |
446 | ) | |
447 | # "xmlcharrefreplace" can only be used for encoding | |
448 | self.assertRaises( | |
449 | TypeError, | |
450 | codecs.xmlcharrefreplace_errors, | |
451 | UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") | |
452 | ) | |
453 | self.assertRaises( | |
454 | TypeError, | |
455 | codecs.xmlcharrefreplace_errors, | |
456 | UnicodeTranslateError(u"\u3042", 0, 1, "ouch") | |
457 | ) | |
458 | # Use the correct exception | |
459 | self.assertEquals( | |
460 | codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), | |
461 | (u"&#%d;" % 0x3042, 1) | |
462 | ) | |
463 | ||
464 | def test_badandgoodbackslashreplaceexceptions(self): | |
465 | # "backslashreplace" complains about a non-exception passed in | |
466 | self.assertRaises( | |
467 | TypeError, | |
468 | codecs.backslashreplace_errors, | |
469 | 42 | |
470 | ) | |
471 | # "backslashreplace" complains about the wrong exception types | |
472 | self.assertRaises( | |
473 | TypeError, | |
474 | codecs.backslashreplace_errors, | |
475 | UnicodeError("ouch") | |
476 | ) | |
477 | # "backslashreplace" can only be used for encoding | |
478 | self.assertRaises( | |
479 | TypeError, | |
480 | codecs.backslashreplace_errors, | |
481 | UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") | |
482 | ) | |
483 | self.assertRaises( | |
484 | TypeError, | |
485 | codecs.backslashreplace_errors, | |
486 | UnicodeTranslateError(u"\u3042", 0, 1, "ouch") | |
487 | ) | |
488 | # Use the correct exception | |
489 | self.assertEquals( | |
490 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), | |
491 | (u"\\u3042", 1) | |
492 | ) | |
493 | self.assertEquals( | |
494 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")), | |
495 | (u"\\x00", 1) | |
496 | ) | |
497 | self.assertEquals( | |
498 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")), | |
499 | (u"\\xff", 1) | |
500 | ) | |
501 | self.assertEquals( | |
502 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")), | |
503 | (u"\\u0100", 1) | |
504 | ) | |
505 | self.assertEquals( | |
506 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")), | |
507 | (u"\\uffff", 1) | |
508 | ) | |
509 | if sys.maxunicode>0xffff: | |
510 | self.assertEquals( | |
511 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")), | |
512 | (u"\\U00010000", 1) | |
513 | ) | |
514 | self.assertEquals( | |
515 | codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")), | |
516 | (u"\\U0010ffff", 1) | |
517 | ) | |
518 | ||
519 | def test_badhandlerresults(self): | |
520 | results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) | |
521 | encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") | |
522 | ||
523 | for res in results: | |
524 | codecs.register_error("test.badhandler", lambda: res) | |
525 | for enc in encs: | |
526 | self.assertRaises( | |
527 | TypeError, | |
528 | u"\u3042".encode, | |
529 | enc, | |
530 | "test.badhandler" | |
531 | ) | |
532 | for (enc, bytes) in ( | |
533 | ("ascii", "\xff"), | |
534 | ("utf-8", "\xff"), | |
535 | ("utf-7", "+x-"), | |
536 | ("unicode-internal", "\x00"), | |
537 | ): | |
538 | self.assertRaises( | |
539 | TypeError, | |
540 | bytes.decode, | |
541 | enc, | |
542 | "test.badhandler" | |
543 | ) | |
544 | ||
545 | def test_lookup(self): | |
546 | self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) | |
547 | self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore")) | |
548 | self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) | |
549 | self.assertEquals( | |
550 | codecs.xmlcharrefreplace_errors, | |
551 | codecs.lookup_error("xmlcharrefreplace") | |
552 | ) | |
553 | self.assertEquals( | |
554 | codecs.backslashreplace_errors, | |
555 | codecs.lookup_error("backslashreplace") | |
556 | ) | |
557 | ||
558 | def test_unencodablereplacement(self): | |
559 | def unencrepl(exc): | |
560 | if isinstance(exc, UnicodeEncodeError): | |
561 | return (u"\u4242", exc.end) | |
562 | else: | |
563 | raise TypeError("don't know how to handle %r" % exc) | |
564 | codecs.register_error("test.unencreplhandler", unencrepl) | |
565 | for enc in ("ascii", "iso-8859-1", "iso-8859-15"): | |
566 | self.assertRaises( | |
567 | UnicodeEncodeError, | |
568 | u"\u4242".encode, | |
569 | enc, | |
570 | "test.unencreplhandler" | |
571 | ) | |
572 | ||
573 | def test_badregistercall(self): | |
574 | # enhance coverage of: | |
575 | # Modules/_codecsmodule.c::register_error() | |
576 | # Python/codecs.c::PyCodec_RegisterError() | |
577 | self.assertRaises(TypeError, codecs.register_error, 42) | |
578 | self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) | |
579 | ||
580 | def test_unknownhandler(self): | |
581 | # enhance coverage of: | |
582 | # Modules/_codecsmodule.c::lookup_error() | |
583 | self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") | |
584 | ||
585 | def test_xmlcharrefvalues(self): | |
586 | # enhance coverage of: | |
587 | # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() | |
588 | # and inline implementations | |
589 | v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000) | |
590 | if sys.maxunicode>=100000: | |
591 | v += (100000, 500000, 1000000) | |
592 | s = u"".join([unichr(x) for x in v]) | |
593 | codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) | |
594 | for enc in ("ascii", "iso-8859-15"): | |
595 | for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): | |
596 | s.encode(enc, err) | |
597 | ||
598 | def test_decodehelper(self): | |
599 | # enhance coverage of: | |
600 | # Objects/unicodeobject.c::unicode_decode_call_errorhandler() | |
601 | # and callers | |
602 | self.assertRaises(LookupError, "\xff".decode, "ascii", "test.unknown") | |
603 | ||
604 | def baddecodereturn1(exc): | |
605 | return 42 | |
606 | codecs.register_error("test.baddecodereturn1", baddecodereturn1) | |
607 | self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1") | |
608 | self.assertRaises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1") | |
609 | self.assertRaises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1") | |
610 | self.assertRaises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1") | |
611 | self.assertRaises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") | |
612 | self.assertRaises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") | |
613 | ||
614 | def baddecodereturn2(exc): | |
615 | return (u"?", None) | |
616 | codecs.register_error("test.baddecodereturn2", baddecodereturn2) | |
617 | self.assertRaises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn2") | |
618 | ||
619 | handler = PosReturn() | |
620 | codecs.register_error("test.posreturn", handler.handle) | |
621 | ||
622 | # Valid negative position | |
623 | handler.pos = -1 | |
624 | self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0") | |
625 | ||
626 | # Valid negative position | |
627 | handler.pos = -2 | |
628 | self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?><?>") | |
629 | ||
630 | # Negative position out of bounds | |
631 | handler.pos = -3 | |
632 | self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") | |
633 | ||
634 | # Valid positive position | |
635 | handler.pos = 1 | |
636 | self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>0") | |
637 | ||
638 | # Largest valid positive position (one beyond end of input) | |
639 | handler.pos = 2 | |
640 | self.assertEquals("\xff0".decode("ascii", "test.posreturn"), u"<?>") | |
641 | ||
642 | # Invalid positive position | |
643 | handler.pos = 3 | |
644 | self.assertRaises(IndexError, "\xff0".decode, "ascii", "test.posreturn") | |
645 | ||
646 | # Restart at the "0" | |
647 | handler.pos = 6 | |
648 | self.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u"<?>0") | |
649 | ||
650 | class D(dict): | |
651 | def __getitem__(self, key): | |
652 | raise ValueError | |
653 | self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None}) | |
654 | self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D()) | |
655 | self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1}) | |
656 | ||
657 | def test_encodehelper(self): | |
658 | # enhance coverage of: | |
659 | # Objects/unicodeobject.c::unicode_encode_call_errorhandler() | |
660 | # and callers | |
661 | self.assertRaises(LookupError, u"\xff".encode, "ascii", "test.unknown") | |
662 | ||
663 | def badencodereturn1(exc): | |
664 | return 42 | |
665 | codecs.register_error("test.badencodereturn1", badencodereturn1) | |
666 | self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn1") | |
667 | ||
668 | def badencodereturn2(exc): | |
669 | return (u"?", None) | |
670 | codecs.register_error("test.badencodereturn2", badencodereturn2) | |
671 | self.assertRaises(TypeError, u"\xff".encode, "ascii", "test.badencodereturn2") | |
672 | ||
673 | handler = PosReturn() | |
674 | codecs.register_error("test.posreturn", handler.handle) | |
675 | ||
676 | # Valid negative position | |
677 | handler.pos = -1 | |
678 | self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") | |
679 | ||
680 | # Valid negative position | |
681 | handler.pos = -2 | |
682 | self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>") | |
683 | ||
684 | # Negative position out of bounds | |
685 | handler.pos = -3 | |
686 | self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") | |
687 | ||
688 | # Valid positive position | |
689 | handler.pos = 1 | |
690 | self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0") | |
691 | ||
692 | # Largest valid positive position (one beyond end of input | |
693 | handler.pos = 2 | |
694 | self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>") | |
695 | ||
696 | # Invalid positive position | |
697 | handler.pos = 3 | |
698 | self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn") | |
699 | ||
700 | handler.pos = 0 | |
701 | ||
702 | class D(dict): | |
703 | def __getitem__(self, key): | |
704 | raise ValueError | |
705 | for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"): | |
706 | self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None}) | |
707 | self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D()) | |
708 | self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300}) | |
709 | ||
710 | def test_translatehelper(self): | |
711 | # enhance coverage of: | |
712 | # Objects/unicodeobject.c::unicode_encode_call_errorhandler() | |
713 | # and callers | |
714 | # (Unfortunately the errors argument is not directly accessible | |
715 | # from Python, so we can't test that much) | |
716 | class D(dict): | |
717 | def __getitem__(self, key): | |
718 | raise ValueError | |
719 | self.assertRaises(ValueError, u"\xff".translate, D()) | |
720 | self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1}) | |
721 | self.assertRaises(TypeError, u"\xff".translate, {0xff: ()}) | |
722 | ||
723 | def test_bug828737(self): | |
724 | charmap = { | |
725 | ord("&"): u"&", | |
726 | ord("<"): u"<", | |
727 | ord(">"): u">", | |
728 | ord('"'): u""", | |
729 | } | |
730 | ||
731 | for n in (1, 10, 100, 1000): | |
732 | text = u'abc<def>ghi'*n | |
733 | text.translate(charmap) | |
734 | ||
def test_main():
    # Hand the CodecCallbackTest case to test.test_support.run_unittest.
    test.test_support.run_unittest(CodecCallbackTest)
737 | ||
# Run the tests when this file is executed as a script rather than imported.
if __name__ == "__main__":
    test_main()