| 1 | from test.test_support import verbose, TestFailed, TestSkipped, verify |
| 2 | import sys |
| 3 | import os |
| 4 | from unicodedata import normalize |
| 5 | |
TESTDATAFILE = "NormalizationTest-3.2.0" + os.extsep + "txt"

# This search allows using a build directory just inside the source
# directory, and saving just one copy of the test data in the source
# tree, rather than having a copy in each build directory.
# There might be a better way to do this.

# Probe the current directory first, then its parent; the first hit wins.
for candidate_dir in [os.path.curdir, os.path.pardir]:
    candidate = os.path.join(candidate_dir, TESTDATAFILE)
    skip_expected = not os.path.exists(candidate)
    if not skip_expected:
        TESTDATAFILE = candidate
        break
| 19 | |
class RangeError(Exception):
    """Raised by unistr() for code points above sys.maxunicode.

    Deriving from Exception (instead of a bare old-style class) keeps
    ``raise RangeError`` / ``except RangeError`` valid on interpreters
    that require exceptions to subclass BaseException, while remaining
    fully backward-compatible for existing callers.
    """
    pass
| 22 | |
def NFC(s):
    """Return the NFC (canonical decomposition + composition) form of *s*.

    Parameter renamed from ``str`` to avoid shadowing the builtin.
    """
    return normalize("NFC", s)
| 25 | |
def NFKC(s):
    """Return the NFKC (compatibility decomposition + composition) form of *s*.

    Parameter renamed from ``str`` to avoid shadowing the builtin.
    """
    return normalize("NFKC", s)
| 28 | |
def NFD(s):
    """Return the NFD (canonical decomposition) form of *s*.

    Parameter renamed from ``str`` to avoid shadowing the builtin.
    """
    return normalize("NFD", s)
| 31 | |
def NFKD(s):
    """Return the NFKD (compatibility decomposition) form of *s*.

    Parameter renamed from ``str`` to avoid shadowing the builtin.
    """
    return normalize("NFKD", s)
| 34 | |
def unistr(data):
    """Decode a space-separated list of hex code points into a unicode string.

    Raises RangeError if any code point exceeds sys.maxunicode (i.e. it
    is not representable in this build's unicode type).
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    for cp in codepoints:
        if cp > sys.maxunicode:
            raise RangeError
    return u"".join(map(unichr, codepoints))
| 41 | |
| 42 | def test_main(): |
| 43 | if skip_expected: |
| 44 | raise TestSkipped(TESTDATAFILE + " not found, download from " + |
| 45 | "http://www.unicode.org/Public/3.2-Update/" + TESTDATAFILE) |
| 46 | |
| 47 | part1_data = {} |
| 48 | for line in open(TESTDATAFILE): |
| 49 | if '#' in line: |
| 50 | line = line.split('#')[0] |
| 51 | line = line.strip() |
| 52 | if not line: |
| 53 | continue |
| 54 | if line.startswith("@Part"): |
| 55 | part = line |
| 56 | continue |
| 57 | try: |
| 58 | c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]] |
| 59 | except RangeError: |
| 60 | # Skip unsupported characters |
| 61 | continue |
| 62 | |
| 63 | if verbose: |
| 64 | print line |
| 65 | |
| 66 | # Perform tests |
| 67 | verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) |
| 68 | verify(c4 == NFC(c4) == NFC(c5), line) |
| 69 | verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) |
| 70 | verify(c5 == NFD(c4) == NFD(c5), line) |
| 71 | verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), |
| 72 | line) |
| 73 | verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), |
| 74 | line) |
| 75 | |
| 76 | # Record part 1 data |
| 77 | if part == "@Part1": |
| 78 | part1_data[c1] = 1 |
| 79 | |
| 80 | # Perform tests for all other data |
| 81 | for c in range(sys.maxunicode+1): |
| 82 | X = unichr(c) |
| 83 | if X in part1_data: |
| 84 | continue |
| 85 | assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c |
| 86 | |
| 87 | # Check for bug 834676 |
| 88 | normalize('NFC',u'\ud55c\uae00') |
| 89 | |
# Allow running the suite directly: python test_normalization.py
if __name__ == "__main__":
    test_main()