import test
.test_support
, unittest
import sys
, codecs
, htmlentitydefs
, unicodedata
# this can be used for configurable callbacks
realpos
= len(exc
.object) + realpos
# if we don't advance this time, terminate on the next call
# otherwise we'd get an endless loop
self
.pos
= len(exc
.object)
class CodecCallbackTest(unittest
.TestCase
):
def test_xmlcharrefreplace(self
):
# Replace unencodable characters with numeric character entities.
# For ascii, latin-1 and charmaps this is completely implemented
# in C and should be reasonably fast.
s
= u
"\u30b9\u30d1\u30e2 \xe4nd eggs"
s
.encode("ascii", "xmlcharrefreplace"),
"スパモ änd eggs"
s
.encode("latin-1", "xmlcharrefreplace"),
"スパモ \xe4nd eggs"
def test_xmlcharnamereplace(self
):
# This time use a named character entity for unencodable
# characters, if one is available.
def xmlcharnamereplace(exc
):
if not isinstance(exc
, UnicodeEncodeError):
raise TypeError("don't know how to handle %r" % exc
)
for c
in exc
.object[exc
.start
:exc
.end
]:
l
.append(u
"&%s;" % htmlentitydefs
.codepoint2name
[ord(c
)])
l
.append(u
"&#%d;" % ord(c
))
return (u
"".join(l
), exc
.end
)
"test.xmlcharnamereplace", xmlcharnamereplace
)
sin
= u
"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
sout
= "«ℜ» = ⟨ሴ€⟩"
self
.assertEqual(sin
.encode("ascii", "test.xmlcharnamereplace"), sout
)
sout
= "\xabℜ\xbb = ⟨ሴ€⟩"
self
.assertEqual(sin
.encode("latin-1", "test.xmlcharnamereplace"), sout
)
sout
= "\xabℜ\xbb = ⟨ሴ\xa4⟩"
self
.assertEqual(sin
.encode("iso-8859-15", "test.xmlcharnamereplace"), sout
)
def test_uninamereplace(self
):
# We're using the names from the unicode database this time,
# and we're doing "syntax highlighting" here, i.e. we include
# the replaced text in ANSI escape sequences. For this it is
# useful that the error handler is not called for every single
# unencodable character, but for a complete sequence of
# unencodable characters, otherwise we would output many
# unnecessary escape sequences.
if not isinstance(exc
, UnicodeEncodeError):
raise TypeError("don't know how to handle %r" % exc
)
for c
in exc
.object[exc
.start
:exc
.end
]:
l
.append(unicodedata
.name(c
, u
"0x%x" % ord(c
)))
return (u
"\033[1m%s\033[0m" % u
", ".join(l
), exc
.end
)
"test.uninamereplace", uninamereplace
)
sin
= u
"\xac\u1234\u20ac\u8000"
sout
= "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
self
.assertEqual(sin
.encode("ascii", "test.uninamereplace"), sout
)
sout
= "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
self
.assertEqual(sin
.encode("latin-1", "test.uninamereplace"), sout
)
sout
= "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
self
.assertEqual(sin
.encode("iso-8859-15", "test.uninamereplace"), sout
)
def test_backslashescape(self):
    # "backslashreplace" emits the same escapes as the "unicode-escape"
    # codec, but only for the characters the target encoding cannot
    # represent; the expected strings below therefore differ per encoding.
    wide_build = sys.maxunicode > 0xffff
    text = u"a\xac\u1234\u20ac\u8000"
    if wide_build:
        # Also cover the highest code point when the build supports it.
        text += unichr(sys.maxunicode)
    expectations = (
        ("ascii", "a\\xac\\u1234\\u20ac\\u8000"),
        ("latin-1", "a\xac\\u1234\\u20ac\\u8000"),
        ("iso-8859-15", "a\xac\\u1234\xa4\\u8000"),
    )
    for encoding, expected in expectations:
        if wide_build:
            expected += "\\U%08x" % sys.maxunicode
        self.assertEqual(text.encode(encoding, "backslashreplace"), expected)
def test_decoderelaxedutf8(self
):
# This is the test for a decoding callback handler,
# that relaxes the UTF-8 minimal encoding restriction.
# A null byte that is encoded as "\xc0\x80" will be
# decoded as a null byte. All other illegal sequences
# will be handled strictly.
if not isinstance(exc
, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc
)
if exc
.object[exc
.start
:exc
.end
].startswith("\xc0\x80"):
return (u
"\x00", exc
.start
+2) # retry after two bytes
"test.relaxedutf8", relaxedutf8
)
sin
= "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
sout
= u
"a\x00b\x00c\xfc\x00\x00"
self
.assertEqual(sin
.decode("utf-8", "test.relaxedutf8"), sout
)
self
.assertRaises(UnicodeError, sin
.decode
, "utf-8", "test.relaxedutf8")
def test_charmapencode(self
):
# For charmap encodings the replacement string will be
# mapped through the encoding again. This means, that
# to be able to use e.g. the "replace" handler, the
# charmap has to have a mapping for "?".
charmap
= dict([ (ord(c
), 2*c
.upper()) for c
in "abcdefgh"])
self
.assertEquals(codecs
.charmap_encode(sin
, "strict", charmap
)[0], sout
)
self
.assertRaises(UnicodeError, codecs
.charmap_encode
, sin
, "strict", charmap
)
charmap
[ord("?")] = "XYZ"
self
.assertEquals(codecs
.charmap_encode(sin
, "replace", charmap
)[0], sout
)
charmap
[ord("?")] = u
"XYZ"
self
.assertRaises(TypeError, codecs
.charmap_encode
, sin
, "replace", charmap
)
charmap
[ord("?")] = u
"XYZ"
self
.assertRaises(TypeError, codecs
.charmap_encode
, sin
, "replace", charmap
)
def test_decodeunicodeinternal(self
):
"\x00\x00\x00\x00\x00".decode
,
if sys
.maxunicode
> 0xffff:
def handler_unicodeinternal(exc
):
if not isinstance(exc
, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc
)
"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
codecs
.register_error("test.hui", handler_unicodeinternal
)
"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
def test_callbacks(self
):
if not isinstance(exc
, UnicodeEncodeError) \
and not isinstance(exc
, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc
)
l
= [u
"<%d>" % ord(exc
.object[pos
]) for pos
in xrange(exc
.start
, exc
.end
)]
return (u
"[%s]" % u
"".join(l
), exc
.end
)
codecs
.register_error("test.handler1", handler1
)
if not isinstance(exc
, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc
)
l
= [u
"<%d>" % ord(exc
.object[pos
]) for pos
in xrange(exc
.start
, exc
.end
)]
return (u
"[%s]" % u
"".join(l
), exc
.end
+1) # skip one character
codecs
.register_error("test.handler2", handler2
)
s
= "\x00\x81\x7f\x80\xff"
s
.decode("ascii", "test.handler1"),
u
"\x00[<129>]\x7f[<128>][<255>]"
s
.decode("ascii", "test.handler2"),
"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
u
"\u3042[<92><117><51><120>]xx"
"\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
u
"\u3042[<92><117><51><120><120>]"
codecs
.charmap_decode("abc", "test.handler1", {ord("a"): u
"z"})[0],
u
"g\xfc\xdfrk".encode("ascii", "test.handler1"),
u
"g\xfc\xdf".encode("ascii", "test.handler1"),
def test_longstrings(self
):
# test long strings to check for memory overflow problems
errors
= [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
codecs
.register_error("test." + err
, codecs
.lookup_error(err
))
errors
+= [ "test." + err
for err
in errors
]
for uni
in [ s
*l
for s
in (u
"x", u
"\u3042", u
"a\xe4") ]:
for enc
in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
def check_exceptionobjectargs(self
, exctype
, args
, msg
):
# Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
# check with one missing argument
self
.assertRaises(TypeError, exctype
, *args
[:-1])
# check with one argument too much
self
.assertRaises(TypeError, exctype
, *(args
+ ["too much"]))
# check with one argument of the wrong type
wrongargs
= [ "spam", u
"eggs", 42, 1.0, None ]
for i
in xrange(len(args
)):
for wrongarg
in wrongargs
:
if type(wrongarg
) is type(args
[i
]):
for j
in xrange(len(args
)):
callargs
.append(wrongarg
)
self
.assertRaises(TypeError, exctype
, *callargs
)
# check with the correct number and type of arguments
self
.assertEquals(str(exc
), msg
)
def test_unicodeencodeerror(self
):
self
.check_exceptionobjectargs(
["ascii", u
"g\xfcrk", 1, 2, "ouch"],
"'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
self
.check_exceptionobjectargs(
["ascii", u
"g\xfcrk", 1, 4, "ouch"],
"'ascii' codec can't encode characters in position 1-3: ouch"
self
.check_exceptionobjectargs(
["ascii", u
"\xfcx", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
self
.check_exceptionobjectargs(
["ascii", u
"\u0100x", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
self
.check_exceptionobjectargs(
["ascii", u
"\uffffx", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
if sys
.maxunicode
> 0xffff:
self
.check_exceptionobjectargs(
["ascii", u
"\U00010000x", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
def test_unicodedecodeerror(self
):
self
.check_exceptionobjectargs(
["ascii", "g\xfcrk", 1, 2, "ouch"],
"'ascii' codec can't decode byte 0xfc in position 1: ouch"
self
.check_exceptionobjectargs(
["ascii", "g\xfcrk", 1, 3, "ouch"],
"'ascii' codec can't decode bytes in position 1-2: ouch"
def test_unicodetranslateerror(self
):
self
.check_exceptionobjectargs(
[u
"g\xfcrk", 1, 2, "ouch"],
"can't translate character u'\\xfc' in position 1: ouch"
self
.check_exceptionobjectargs(
[u
"g\u0100rk", 1, 2, "ouch"],
"can't translate character u'\\u0100' in position 1: ouch"
self
.check_exceptionobjectargs(
[u
"g\uffffrk", 1, 2, "ouch"],
"can't translate character u'\\uffff' in position 1: ouch"
if sys
.maxunicode
> 0xffff:
self
.check_exceptionobjectargs(
[u
"g\U00010000rk", 1, 2, "ouch"],
"can't translate character u'\\U00010000' in position 1: ouch"
self
.check_exceptionobjectargs(
[u
"g\xfcrk", 1, 3, "ouch"],
"can't translate characters in position 1-2: ouch"
def test_badandgoodstrictexceptions(self
):
# "strict" complains about a non-exception passed in
# "strict" complains about the wrong exception type
# If the correct exception is passed in, "strict" raises it
UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")
def test_badandgoodignoreexceptions(self
):
# "ignore" complains about a non-exception passed in
# "ignore" complains about the wrong exception type
# If the correct exception is passed in, "ignore" returns an empty replacement
codecs
.ignore_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
codecs
.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
codecs
.ignore_errors(UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")),
def test_badandgoodreplaceexceptions(self
):
# "replace" complains about a non-exception passed in
# "replace" complains about the wrong exception type
# With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
codecs
.replace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
codecs
.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
codecs
.replace_errors(UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")),
def test_badandgoodxmlcharrefreplaceexceptions(self
):
# "xmlcharrefreplace" complains about a non-exception passed in
codecs
.xmlcharrefreplace_errors
,
# "xmlcharrefreplace" complains about the wrong exception types
codecs
.xmlcharrefreplace_errors
,
# "xmlcharrefreplace" can only be used for encoding
codecs
.xmlcharrefreplace_errors
,
UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
codecs
.xmlcharrefreplace_errors
,
UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")
# Use the correct exception
codecs
.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
def test_badandgoodbackslashreplaceexceptions(self
):
# "backslashreplace" complains about a non-exception passed in
codecs
.backslashreplace_errors
,
# "backslashreplace" complains about the wrong exception types
codecs
.backslashreplace_errors
,
# "backslashreplace" can only be used for encoding
codecs
.backslashreplace_errors
,
UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
codecs
.backslashreplace_errors
,
UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")
# Use the correct exception
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\x00", 0, 1, "ouch")),
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\xff", 0, 1, "ouch")),
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\u0100", 0, 1, "ouch")),
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\uffff", 0, 1, "ouch")),
if sys
.maxunicode
>0xffff:
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\U00010000", 0, 1, "ouch")),
codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\U0010ffff", 0, 1, "ouch")),
def test_badhandlerresults(self
):
results
= ( 42, u
"foo", (1,2,3), (u
"foo", 1, 3), (u
"foo", None), (u
"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
encs
= ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
codecs
.register_error("test.badhandler", lambda: res
)
("unicode-internal", "\x00"),
self
.assertEquals(codecs
.strict_errors
, codecs
.lookup_error("strict"))
self
.assertEquals(codecs
.ignore_errors
, codecs
.lookup_error("ignore"))
self
.assertEquals(codecs
.strict_errors
, codecs
.lookup_error("strict"))
codecs
.xmlcharrefreplace_errors
,
codecs
.lookup_error("xmlcharrefreplace")
codecs
.backslashreplace_errors
,
codecs
.lookup_error("backslashreplace")
def test_unencodablereplacement(self
):
if isinstance(exc
, UnicodeEncodeError):
return (u
"\u4242", exc
.end
)
raise TypeError("don't know how to handle %r" % exc
)
codecs
.register_error("test.unencreplhandler", unencrepl
)
for enc
in ("ascii", "iso-8859-1", "iso-8859-15"):
def test_badregistercall(self
):
# Modules/_codecsmodule.c::register_error()
# Python/codecs.c::PyCodec_RegisterError()
self
.assertRaises(TypeError, codecs
.register_error
, 42)
self
.assertRaises(TypeError, codecs
.register_error
, "test.dummy", 42)
def test_unknownhandler(self
):
# Modules/_codecsmodule.c::lookup_error()
self
.assertRaises(LookupError, codecs
.lookup_error
, "test.unknown")
def test_xmlcharrefvalues(self
):
# Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
# and inline implementations
v
= (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
if sys
.maxunicode
>=100000:
v
+= (100000, 500000, 1000000)
s
= u
"".join([unichr(x
) for x
in v
])
codecs
.register_error("test.xmlcharrefreplace", codecs
.xmlcharrefreplace_errors
)
for enc
in ("ascii", "iso-8859-15"):
for err
in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
def test_decodehelper(self
):
# Objects/unicodeobject.c::unicode_decode_call_errorhandler()
self
.assertRaises(LookupError, "\xff".decode
, "ascii", "test.unknown")
def baddecodereturn1(exc
):
codecs
.register_error("test.baddecodereturn1", baddecodereturn1
)
self
.assertRaises(TypeError, "\xff".decode
, "ascii", "test.baddecodereturn1")
self
.assertRaises(TypeError, "\\".decode
, "unicode-escape", "test.baddecodereturn1")
self
.assertRaises(TypeError, "\\x0".decode
, "unicode-escape", "test.baddecodereturn1")
self
.assertRaises(TypeError, "\\x0y".decode
, "unicode-escape", "test.baddecodereturn1")
self
.assertRaises(TypeError, "\\Uffffeeee".decode
, "unicode-escape", "test.baddecodereturn1")
self
.assertRaises(TypeError, "\\uyyyy".decode
, "raw-unicode-escape", "test.baddecodereturn1")
def baddecodereturn2(exc
):
codecs
.register_error("test.baddecodereturn2", baddecodereturn2
)
self
.assertRaises(TypeError, "\xff".decode
, "ascii", "test.baddecodereturn2")
codecs
.register_error("test.posreturn", handler
.handle
)
# Valid negative position
self
.assertEquals("\xff0".decode("ascii", "test.posreturn"), u
"<?>0")
# Valid negative position
self
.assertEquals("\xff0".decode("ascii", "test.posreturn"), u
"<?><?>")
# Negative position out of bounds
self
.assertRaises(IndexError, "\xff0".decode
, "ascii", "test.posreturn")
# Valid positive position
self
.assertEquals("\xff0".decode("ascii", "test.posreturn"), u
"<?>0")
# Largest valid positive position (one beyond end of input)
self
.assertEquals("\xff0".decode("ascii", "test.posreturn"), u
"<?>")
# Invalid positive position
self
.assertRaises(IndexError, "\xff0".decode
, "ascii", "test.posreturn")
self
.assertEquals("\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), u
"<?>0")
def __getitem__(self
, key
):
self
.assertRaises(UnicodeError, codecs
.charmap_decode
, "\xff", "strict", {0xff: None})
self
.assertRaises(ValueError, codecs
.charmap_decode
, "\xff", "strict", D())
self
.assertRaises(TypeError, codecs
.charmap_decode
, "\xff", "strict", {0xff: sys
.maxunicode
+1})
def test_encodehelper(self
):
# Objects/unicodeobject.c::unicode_encode_call_errorhandler()
self
.assertRaises(LookupError, u
"\xff".encode
, "ascii", "test.unknown")
def badencodereturn1(exc
):
codecs
.register_error("test.badencodereturn1", badencodereturn1
)
self
.assertRaises(TypeError, u
"\xff".encode
, "ascii", "test.badencodereturn1")
def badencodereturn2(exc
):
codecs
.register_error("test.badencodereturn2", badencodereturn2
)
self
.assertRaises(TypeError, u
"\xff".encode
, "ascii", "test.badencodereturn2")
codecs
.register_error("test.posreturn", handler
.handle
)
# Valid negative position
self
.assertEquals(u
"\xff0".encode("ascii", "test.posreturn"), "<?>0")
# Valid negative position
self
.assertEquals(u
"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
# Negative position out of bounds
self
.assertRaises(IndexError, u
"\xff0".encode
, "ascii", "test.posreturn")
# Valid positive position
self
.assertEquals(u
"\xff0".encode("ascii", "test.posreturn"), "<?>0")
# Largest valid positive position (one beyond end of input)
self
.assertEquals(u
"\xff0".encode("ascii", "test.posreturn"), "<?>")
# Invalid positive position
self
.assertRaises(IndexError, u
"\xff0".encode
, "ascii", "test.posreturn")
def __getitem__(self
, key
):
for err
in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
self
.assertRaises(UnicodeError, codecs
.charmap_encode
, u
"\xff", err
, {0xff: None})
self
.assertRaises(ValueError, codecs
.charmap_encode
, u
"\xff", err
, D())
self
.assertRaises(TypeError, codecs
.charmap_encode
, u
"\xff", err
, {0xff: 300})
def test_translatehelper(self
):
# Objects/unicodeobject.c::unicode_encode_call_errorhandler()
# (Unfortunately the errors argument is not directly accessible
# from Python, so we can't test that much)
def __getitem__(self
, key
):
self
.assertRaises(ValueError, u
"\xff".translate
, D())
self
.assertRaises(TypeError, u
"\xff".translate
, {0xff: sys
.maxunicode
+1})
self
.assertRaises(TypeError, u
"\xff".translate
, {0xff: ()})
def test_bug828737(self
):
for n
in (1, 10, 100, 1000):
test
.test_support
.run_unittest(CodecCallbackTest
)
if __name__
== "__main__":