Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / amd64 / lib / python2.4 / test / test_unicode.py
CommitLineData
920dae64
AT
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9import unittest, sys, string, codecs, new
10from test import test_support, string_tests
11
12class UnicodeTest(
13 string_tests.CommonTest,
14 string_tests.MixinStrUnicodeUserStringTest,
15 string_tests.MixinStrUnicodeTest,
16 ):
17 type2test = unicode
18
19 def checkequalnofix(self, result, object, methodname, *args):
20 method = getattr(object, methodname)
21 realresult = method(*args)
22 self.assertEqual(realresult, result)
23 self.assert_(type(realresult) is type(result))
24
25 # if the original is returned make sure that
26 # this doesn't happen with subclasses
27 if realresult is object:
28 class usub(unicode):
29 def __repr__(self):
30 return 'usub(%r)' % unicode.__repr__(self)
31 object = usub(object)
32 method = getattr(object, methodname)
33 realresult = method(*args)
34 self.assertEqual(realresult, result)
35 self.assert_(object is not realresult)
36
37 def test_literals(self):
38 self.assertEqual(u'\xff', u'\u00ff')
39 self.assertEqual(u'\uffff', u'\U0000ffff')
40 self.assertRaises(UnicodeError, eval, 'u\'\\Ufffffffe\'')
41 self.assertRaises(UnicodeError, eval, 'u\'\\Uffffffff\'')
42 self.assertRaises(UnicodeError, eval, 'u\'\\U%08x\'' % 0x110000)
43
44 def test_repr(self):
45 if not sys.platform.startswith('java'):
46 # Test basic sanity of repr()
47 self.assertEqual(repr(u'abc'), "u'abc'")
48 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
49 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
50 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
51 self.assertEqual(repr(u'\\'), "u'\\\\'")
52 self.assertEqual(repr(u'\n'), "u'\\n'")
53 self.assertEqual(repr(u'\r'), "u'\\r'")
54 self.assertEqual(repr(u'\t'), "u'\\t'")
55 self.assertEqual(repr(u'\b'), "u'\\x08'")
56 self.assertEqual(repr(u"'\""), """u'\\'"'""")
57 self.assertEqual(repr(u"'\""), """u'\\'"'""")
58 self.assertEqual(repr(u"'"), '''u"'"''')
59 self.assertEqual(repr(u'"'), """u'"'""")
60 latin1repr = (
61 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
62 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
63 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
64 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
65 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
66 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
67 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
68 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
69 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
70 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
71 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
72 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
73 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
74 "\\xfe\\xff'")
75 testrepr = repr(u''.join(map(unichr, xrange(256))))
76 self.assertEqual(testrepr, latin1repr)
77
78 def test_count(self):
79 string_tests.CommonTest.test_count(self)
80 # check mixed argument types
81 self.checkequalnofix(3, 'aaa', 'count', u'a')
82 self.checkequalnofix(0, 'aaa', 'count', u'b')
83 self.checkequalnofix(3, u'aaa', 'count', 'a')
84 self.checkequalnofix(0, u'aaa', 'count', 'b')
85 self.checkequalnofix(0, u'aaa', 'count', 'b')
86 self.checkequalnofix(1, u'aaa', 'count', 'a', -1)
87 self.checkequalnofix(3, u'aaa', 'count', 'a', -10)
88 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1)
89 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10)
90
91 def test_find(self):
92 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
93 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
94 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
95
96 self.assertRaises(TypeError, u'hello'.find)
97 self.assertRaises(TypeError, u'hello'.find, 42)
98
99 def test_rfind(self):
100 string_tests.CommonTest.test_rfind(self)
101 # check mixed argument types
102 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc')
103 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'')
104 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '')
105
106 def test_index(self):
107 string_tests.CommonTest.test_index(self)
108 # check mixed argument types
109 for (t1, t2) in ((str, unicode), (unicode, str)):
110 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
111 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
112 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
113 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
114 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
115 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
116 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
117 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
118
119 def test_rindex(self):
120 string_tests.CommonTest.test_rindex(self)
121 # check mixed argument types
122 for (t1, t2) in ((str, unicode), (unicode, str)):
123 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
124 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
125 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
126 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)
127
128 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
129 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
130 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
131 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
132 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
133
134 def test_translate(self):
135 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
136 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
137 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
138 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
139 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
140 self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
141
142 self.assertRaises(TypeError, u'hello'.translate)
143 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
144
145 def test_split(self):
146 string_tests.CommonTest.test_split(self)
147
148 # Mixed arguments
149 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
150 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
151 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
152
153 def test_join(self):
154 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
155
156 # mixed arguments
157 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
158 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
159 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
160 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
161 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
162 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
163 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
164
165 def test_strip(self):
166 string_tests.CommonTest.test_strip(self)
167 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
168
169 def test_replace(self):
170 string_tests.CommonTest.test_replace(self)
171
172 # method call forwarded from str implementation because of unicode argument
173 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
174 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
175
176 def test_comparison(self):
177 # Comparisons:
178 self.assertEqual(u'abc', 'abc')
179 self.assertEqual('abc', u'abc')
180 self.assertEqual(u'abc', u'abc')
181 self.assert_(u'abcd' > 'abc')
182 self.assert_('abcd' > u'abc')
183 self.assert_(u'abcd' > u'abc')
184 self.assert_(u'abc' < 'abcd')
185 self.assert_('abc' < u'abcd')
186 self.assert_(u'abc' < u'abcd')
187
188 if 0:
189 # Move these tests to a Unicode collation module test...
190 # Testing UTF-16 code point order comparisons...
191
192 # No surrogates, no fixup required.
193 self.assert_(u'\u0061' < u'\u20ac')
194 # Non surrogate below surrogate value, no fixup required
195 self.assert_(u'\u0061' < u'\ud800\udc02')
196
197 # Non surrogate above surrogate value, fixup required
198 def test_lecmp(s, s2):
199 self.assert_(s < s2)
200
201 def test_fixup(s):
202 s2 = u'\ud800\udc01'
203 test_lecmp(s, s2)
204 s2 = u'\ud900\udc01'
205 test_lecmp(s, s2)
206 s2 = u'\uda00\udc01'
207 test_lecmp(s, s2)
208 s2 = u'\udb00\udc01'
209 test_lecmp(s, s2)
210 s2 = u'\ud800\udd01'
211 test_lecmp(s, s2)
212 s2 = u'\ud900\udd01'
213 test_lecmp(s, s2)
214 s2 = u'\uda00\udd01'
215 test_lecmp(s, s2)
216 s2 = u'\udb00\udd01'
217 test_lecmp(s, s2)
218 s2 = u'\ud800\ude01'
219 test_lecmp(s, s2)
220 s2 = u'\ud900\ude01'
221 test_lecmp(s, s2)
222 s2 = u'\uda00\ude01'
223 test_lecmp(s, s2)
224 s2 = u'\udb00\ude01'
225 test_lecmp(s, s2)
226 s2 = u'\ud800\udfff'
227 test_lecmp(s, s2)
228 s2 = u'\ud900\udfff'
229 test_lecmp(s, s2)
230 s2 = u'\uda00\udfff'
231 test_lecmp(s, s2)
232 s2 = u'\udb00\udfff'
233 test_lecmp(s, s2)
234
235 test_fixup(u'\ue000')
236 test_fixup(u'\uff61')
237
238 # Surrogates on both sides, no fixup required
239 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
240
241 def test_islower(self):
242 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
243 self.checkequalnofix(False, u'\u1FFc', 'islower')
244
245 def test_isupper(self):
246 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
247 if not sys.platform.startswith('java'):
248 self.checkequalnofix(False, u'\u1FFc', 'isupper')
249
250 def test_istitle(self):
251 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
252 self.checkequalnofix(True, u'\u1FFc', 'istitle')
253 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
254
255 def test_isspace(self):
256 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
257 self.checkequalnofix(True, u'\u2000', 'isspace')
258 self.checkequalnofix(True, u'\u200a', 'isspace')
259 self.checkequalnofix(False, u'\u2014', 'isspace')
260
261 def test_isalpha(self):
262 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
263 self.checkequalnofix(True, u'\u1FFc', 'isalpha')
264
265 def test_isdecimal(self):
266 self.checkequalnofix(False, u'', 'isdecimal')
267 self.checkequalnofix(False, u'a', 'isdecimal')
268 self.checkequalnofix(True, u'0', 'isdecimal')
269 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
270 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
271 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
272 self.checkequalnofix(True, u'0123456789', 'isdecimal')
273 self.checkequalnofix(False, u'0123456789a', 'isdecimal')
274
275 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
276
277 def test_isdigit(self):
278 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
279 self.checkequalnofix(True, u'\u2460', 'isdigit')
280 self.checkequalnofix(False, u'\xbc', 'isdigit')
281 self.checkequalnofix(True, u'\u0660', 'isdigit')
282
283 def test_isnumeric(self):
284 self.checkequalnofix(False, u'', 'isnumeric')
285 self.checkequalnofix(False, u'a', 'isnumeric')
286 self.checkequalnofix(True, u'0', 'isnumeric')
287 self.checkequalnofix(True, u'\u2460', 'isnumeric')
288 self.checkequalnofix(True, u'\xbc', 'isnumeric')
289 self.checkequalnofix(True, u'\u0660', 'isnumeric')
290 self.checkequalnofix(True, u'0123456789', 'isnumeric')
291 self.checkequalnofix(False, u'0123456789a', 'isnumeric')
292
293 self.assertRaises(TypeError, u"abc".isnumeric, 42)
294
295 def test_contains(self):
296 # Testing Unicode contains method
297 self.assert_('a' in u'abdb')
298 self.assert_('a' in u'bdab')
299 self.assert_('a' in u'bdaba')
300 self.assert_('a' in u'bdba')
301 self.assert_('a' in u'bdba')
302 self.assert_(u'a' in u'bdba')
303 self.assert_(u'a' not in u'bdb')
304 self.assert_(u'a' not in 'bdb')
305 self.assert_(u'a' in 'bdba')
306 self.assert_(u'a' in ('a',1,None))
307 self.assert_(u'a' in (1,None,'a'))
308 self.assert_(u'a' in (1,None,u'a'))
309 self.assert_('a' in ('a',1,None))
310 self.assert_('a' in (1,None,'a'))
311 self.assert_('a' in (1,None,u'a'))
312 self.assert_('a' not in ('x',1,u'y'))
313 self.assert_('a' not in ('x',1,None))
314 self.assert_(u'abcd' not in u'abcxxxx')
315 self.assert_(u'ab' in u'abcd')
316 self.assert_('ab' in u'abc')
317 self.assert_(u'ab' in 'abc')
318 self.assert_(u'ab' in (1,None,u'ab'))
319 self.assert_(u'' in u'abc')
320 self.assert_('' in u'abc')
321
322 # If the following fails either
323 # the contains operator does not propagate UnicodeErrors or
324 # someone has changed the default encoding
325 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')
326
327 self.assert_(u'' in '')
328 self.assert_('' in u'')
329 self.assert_(u'' in u'')
330 self.assert_(u'' in 'abc')
331 self.assert_('' in u'abc')
332 self.assert_(u'' in u'abc')
333 self.assert_(u'\0' not in 'abc')
334 self.assert_('\0' not in u'abc')
335 self.assert_(u'\0' not in u'abc')
336 self.assert_(u'\0' in '\0abc')
337 self.assert_('\0' in u'\0abc')
338 self.assert_(u'\0' in u'\0abc')
339 self.assert_(u'\0' in 'abc\0')
340 self.assert_('\0' in u'abc\0')
341 self.assert_(u'\0' in u'abc\0')
342 self.assert_(u'a' in '\0abc')
343 self.assert_('a' in u'\0abc')
344 self.assert_(u'a' in u'\0abc')
345 self.assert_(u'asdf' in 'asdf')
346 self.assert_('asdf' in u'asdf')
347 self.assert_(u'asdf' in u'asdf')
348 self.assert_(u'asdf' not in 'asd')
349 self.assert_('asdf' not in u'asd')
350 self.assert_(u'asdf' not in u'asd')
351 self.assert_(u'asdf' not in '')
352 self.assert_('asdf' not in u'')
353 self.assert_(u'asdf' not in u'')
354
355 self.assertRaises(TypeError, u"abc".__contains__)
356
357 def test_formatting(self):
358 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
359 # Testing Unicode formatting strings...
360 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
361 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00')
362 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00')
363 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50')
364 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57')
365 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
366 if not sys.platform.startswith('java'):
367 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
368 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
369 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
370
371 self.assertEqual(u'%c' % 0x1234, u'\u1234')
372 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
373
374 # formatting jobs delegated from the string implementation:
375 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
376 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
377 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
378 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
379 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
380 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
381 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
382 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
383 self.assertEqual('...%s...' % u"abc", u'...abc...')
384 self.assertEqual('%*s' % (5,u'abc',), u' abc')
385 self.assertEqual('%*s' % (-5,u'abc',), u'abc ')
386 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab')
387 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc')
388 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc')
389 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc')
390 self.assertEqual('%c' % u'a', u'a')
391
392
393 def test_constructor(self):
394 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
395
396 self.assertEqual(
397 unicode(u'unicode remains unicode'),
398 u'unicode remains unicode'
399 )
400
401 class UnicodeSubclass(unicode):
402 pass
403
404 self.assertEqual(
405 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
406 u'unicode subclass becomes unicode'
407 )
408
409 self.assertEqual(
410 unicode('strings are converted to unicode'),
411 u'strings are converted to unicode'
412 )
413
414 class UnicodeCompat:
415 def __init__(self, x):
416 self.x = x
417 def __unicode__(self):
418 return self.x
419
420 self.assertEqual(
421 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
422 u'__unicode__ compatible objects are recognized')
423
424 class StringCompat:
425 def __init__(self, x):
426 self.x = x
427 def __str__(self):
428 return self.x
429
430 self.assertEqual(
431 unicode(StringCompat('__str__ compatible objects are recognized')),
432 u'__str__ compatible objects are recognized'
433 )
434
435 # unicode(obj) is compatible to str():
436
437 o = StringCompat('unicode(obj) is compatible to str()')
438 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
439 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
440
441 # %-formatting and .__unicode__()
442 self.assertEqual(u'%s' %
443 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
444 u"u'%s' % obj uses obj.__unicode__()")
445 self.assertEqual(u'%s' %
446 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
447 u"u'%s' % obj falls back to obj.__str__()")
448
449 for obj in (123, 123.45, 123L):
450 self.assertEqual(unicode(obj), unicode(str(obj)))
451
452 # unicode(obj, encoding, error) tests (this maps to
453 # PyUnicode_FromEncodedObject() at C level)
454
455 if not sys.platform.startswith('java'):
456 self.assertRaises(
457 TypeError,
458 unicode,
459 u'decoding unicode is not supported',
460 'utf-8',
461 'strict'
462 )
463
464 self.assertEqual(
465 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
466 u'strings are decoded to unicode'
467 )
468
469 if not sys.platform.startswith('java'):
470 self.assertEqual(
471 unicode(
472 buffer('character buffers are decoded to unicode'),
473 'utf-8',
474 'strict'
475 ),
476 u'character buffers are decoded to unicode'
477 )
478
479 self.assertRaises(TypeError, unicode, 42, 42, 42)
480
481 def test_codecs_utf7(self):
482 utfTests = [
483 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
484 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
485 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
486 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
487 (u'+', '+-'),
488 (u'+-', '+--'),
489 (u'+?', '+-?'),
490 (u'\?', '+AFw?'),
491 (u'+?', '+-?'),
492 (ur'\\?', '+AFwAXA?'),
493 (ur'\\\?', '+AFwAXABc?'),
494 (ur'++--', '+-+---')
495 ]
496
497 for (x, y) in utfTests:
498 self.assertEqual(x.encode('utf-7'), y)
499
500 # surrogates not supported
501 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
502
503 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
504
505 def test_codecs_utf8(self):
506 self.assertEqual(u''.encode('utf-8'), '')
507 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
508 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
509 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
510 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
511 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
512 self.assertEqual(
513 (u'\ud800\udc02'*1000).encode('utf-8'),
514 '\xf0\x90\x80\x82'*1000
515 )
516 self.assertEqual(
517 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
518 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
519 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
520 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
521 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
522 u' Nunstuck git und'.encode('utf-8'),
523 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
524 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
525 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
526 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
527 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
528 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
529 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
530 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
531 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
532 '\xe3\x80\x8cWenn ist das Nunstuck git und'
533 )
534
535 # UTF-8 specific decoding tests
536 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
537 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
538 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
539
540 # Other possible utf-8 test cases:
541 # * strict decoding testing for all of the
542 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
543
544 def test_codecs_idna(self):
545 # Test whether trailing dot is preserved
546 self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
547
548 def test_codecs_errors(self):
549 # Error handling (encoding)
550 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
551 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
552 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
553 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
554
555 # Error handling (decoding)
556 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
557 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
558 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
559 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
560
561 # Error handling (unknown character names)
562 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
563
564 # Error handling (truncated escape sequence)
565 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
566
567 # Error handling (bad decoder return)
568 def search_function(encoding):
569 def decode1(input, errors="strict"):
570 return 42 # not a tuple
571 def encode1(input, errors="strict"):
572 return 42 # not a tuple
573 def encode2(input, errors="strict"):
574 return (42, 42) # no unicode
575 def decode2(input, errors="strict"):
576 return (42, 42) # no unicode
577 if encoding=="test.unicode1":
578 return (encode1, decode1, None, None)
579 elif encoding=="test.unicode2":
580 return (encode2, decode2, None, None)
581 else:
582 return None
583 codecs.register(search_function)
584 self.assertRaises(TypeError, "hello".decode, "test.unicode1")
585 self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
586 self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
587 self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
588 # executes PyUnicode_Encode()
589 import imp
590 self.assertRaises(
591 ImportError,
592 imp.find_module,
593 "non-existing module",
594 [u"non-existing dir"]
595 )
596
597 # Error handling (wrong arguments)
598 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
599
600 # Error handling (PyUnicode_EncodeDecimal())
601 self.assertRaises(UnicodeError, int, u"\u0200")
602
603 def test_codecs(self):
604 # Encoding
605 self.assertEqual(u'hello'.encode('ascii'), 'hello')
606 self.assertEqual(u'hello'.encode('utf-7'), 'hello')
607 self.assertEqual(u'hello'.encode('utf-8'), 'hello')
608 self.assertEqual(u'hello'.encode('utf8'), 'hello')
609 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
610 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
611 self.assertEqual(u'hello'.encode('latin-1'), 'hello')
612
613 # Roundtrip safety for BMP (just the first 1024 chars)
614 u = u''.join(map(unichr, xrange(1024)))
615 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
616 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
617 self.assertEqual(unicode(u.encode(encoding),encoding), u)
618
619 # Roundtrip safety for BMP (just the first 256 chars)
620 u = u''.join(map(unichr, xrange(256)))
621 for encoding in ('latin-1',):
622 self.assertEqual(unicode(u.encode(encoding),encoding), u)
623
624 # Roundtrip safety for BMP (just the first 128 chars)
625 u = u''.join(map(unichr, xrange(128)))
626 for encoding in ('ascii',):
627 self.assertEqual(unicode(u.encode(encoding),encoding), u)
628
629 # Roundtrip safety for non-BMP (just a few chars)
630 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
631 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
632 #'raw_unicode_escape',
633 'unicode_escape', 'unicode_internal'):
634 self.assertEqual(unicode(u.encode(encoding),encoding), u)
635
636 # UTF-8 must be roundtrip safe for all UCS-2 code points
637 # This excludes surrogates: in the full range, there would be
638 # a surrogate pair (\udbff\udc00), which gets converted back
639 # to a non-BMP character (\U0010fc00)
640 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
641 for encoding in ('utf-8',):
642 self.assertEqual(unicode(u.encode(encoding),encoding), u)
643
644 def test_codecs_charmap(self):
645 # 0-127
646 s = ''.join(map(chr, xrange(128)))
647 for encoding in (
648 'cp037', 'cp1026',
649 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
650 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
651 'cp863', 'cp865', 'cp866',
652 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
653 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
654 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
655 'mac_cyrillic', 'mac_latin2',
656
657 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
658 'cp1256', 'cp1257', 'cp1258',
659 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
660
661 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
662 'cp1006', 'iso8859_8',
663
664 ### These have undefined mappings:
665 #'cp424',
666
667 ### These fail the round-trip:
668 #'cp875'
669
670 ):
671 self.assertEqual(unicode(s, encoding).encode(encoding), s)
672
673 # 128-255
674 s = ''.join(map(chr, xrange(128, 256)))
675 for encoding in (
676 'cp037', 'cp1026',
677 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
678 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
679 'cp863', 'cp865', 'cp866',
680 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
681 'iso8859_2', 'iso8859_4', 'iso8859_5',
682 'iso8859_9', 'koi8_r', 'latin_1',
683 'mac_cyrillic', 'mac_latin2',
684
685 ### These have undefined mappings:
686 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
687 #'cp1256', 'cp1257', 'cp1258',
688 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
689 #'iso8859_3', 'iso8859_6', 'iso8859_7',
690 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
691
692 ### These fail the round-trip:
693 #'cp1006', 'cp875', 'iso8859_8',
694
695 ):
696 self.assertEqual(unicode(s, encoding).encode(encoding), s)
697
698 def test_concatenation(self):
699 self.assertEqual((u"abc" u"def"), u"abcdef")
700 self.assertEqual(("abc" u"def"), u"abcdef")
701 self.assertEqual((u"abc" "def"), u"abcdef")
702 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
703 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
704
705 def test_printing(self):
706 class BitBucket:
707 def write(self, text):
708 pass
709
710 out = BitBucket()
711 print >>out, u'abc'
712 print >>out, u'abc', u'def'
713 print >>out, u'abc', 'def'
714 print >>out, 'abc', u'def'
715 print >>out, u'abc\n'
716 print >>out, u'abc\n',
717 print >>out, u'abc\n',
718 print >>out, u'def\n'
719 print >>out, u'def\n'
720
721 def test_ucs4(self):
722 if sys.maxunicode == 0xFFFF:
723 return
724 x = u'\U00100000'
725 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
726 self.assertEqual(x, y)
727
def test_main():
    """Run every test in UnicodeTest through test_support.run_unittest()."""
    test_support.run_unittest(UnicodeTest)
730
# Run the tests only when this file is executed as a script.
if __name__ == "__main__":
    test_main()