Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
2 | ||
3 | import stringprep, unicodedata, re, codecs | |
4 | ||
# IDNA section 3.1: the characters recognised as label separators --
# U+002E (full stop), U+3002 (ideographic full stop),
# U+FF0E (fullwidth full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix that marks a Punycode-encoded label.
# It is kept both as a byte string (compared against ASCII labels in
# ToUnicode) and as a Unicode string (compared against labels in ToASCII).
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
11 | ||
12 | # This assumes query strings, so AllowUnassigned is true | |
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to one label.

    The label is mapped (tables B.1 and B.2), normalized with NFKC, checked
    for prohibited characters and finally checked against the bidirectional
    requirements.  Unassigned code points are not rejected.

    Arguments:
        label -- the Unicode label to prepare.

    Returns the prepared Unicode label.

    Raises UnicodeError if the label contains a prohibited character
    (tables C.1.2, C.2.2 and C.3-C.9) or violates bidi requirement 2 or 3.
    """
    # Map: characters in table B.1 are mapped to nothing, everything else
    # goes through the table B.2 mapping.
    mapped = [stringprep.map_table_b2(c)
              for c in label
              if not stringprep.in_table_b1(c)]
    label = u"".join(mapped)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %s" % repr(c))

    # Check bidi.  A real list is built because it is indexed below.
    rand_al = [stringprep.in_table_d1(c) for c in label]
    if any(rand_al):
        # There is a RandAL char in the string, so further tests are
        # required.  They are performed once for the whole label rather
        # than once per RandAL character.
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked.
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not rand_al[0] or not rand_al[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
60 | ||
def _check_label_length(label):
    """Return label if it is 1 to 63 characters long, else raise UnicodeError."""
    if 0 < len(label) < 64:
        return label
    # The range check rejects empty labels as well as over-long ones, so
    # the message names both conditions.
    raise UnicodeError("label empty or too long")


def ToASCII(label):
    """Convert one label to its ASCII form (the IDNA ToASCII operation).

    UseSTD3ASCIIRules is treated as false.

    Arguments:
        label -- the label to convert.

    Returns the ASCII-encoded label; a label that is not ASCII after
    nameprep is Punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeError if the resulting label is empty or longer than 63
    characters, if nameprep rejects the label, or if a non-ASCII label
    starts with the ACE prefix after nameprep.
    """
    try:
        # Step 1: try ASCII
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # skip to step 8.
        return _check_label_length(label)

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        return _check_label_length(label)

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    label = ace_prefix + label.encode("punycode")

    # Step 8: Check size
    return _check_label_length(label)
103 | ||
def ToUnicode(label):
    """Convert one label to its Unicode form (the IDNA ToUnicode operation).

    Arguments:
        label -- the label to convert, as a byte string or Unicode string.

    Returns the label as a Unicode string.  A label without the ACE prefix
    is returned as its ASCII text; a label with the prefix is
    Punycode-decoded.

    Raises UnicodeError if the label is still not ASCII after nameprep, or
    if re-encoding the decoded label with ToASCII does not give back the
    lower-cased input.
    """
    # Step 1: Check for ASCII.  Byte strings are taken to be ASCII already;
    # anything else counts as ASCII only if it can be encoded as such.
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3;
    # the ToASCII result will already be in lower case.
    if label.lower() != reencoded:
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
142 | ||
143 | ### Codec APIs | |
144 | ||
class Codec(codecs.Codec):
    """Stateless IDNA codec that converts whole domain names label by label."""

    def encode(self, input, errors='strict'):
        """Encode a domain name to its ASCII form.

        The name is split on the IDNA dot characters, each label is passed
        through ToASCII and the results are joined with U+002E.  A single
        trailing dot is preserved.

        Returns a tuple (encoded name, len(input)).
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling " + errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # An empty final label means the name ended with a dot.
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(part) for part in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name to its Unicode form.

        Unicode input is split on the IDNA dot characters; any other input
        is converted with str(), must be ASCII, and is split on ".".  Each
        label is passed through ToUnicode.  A single trailing dot is
        preserved.

        Returns a tuple (decoded name, len(input)).
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling " + errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion result is discarded,
            # it is only performed so that non-ASCII input raises.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            # An empty final label means the name ended with a dot.
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(part) for part in labels]
        return u".".join(decoded) + trailing_dot, len(input)
195 | ||
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining this module's Codec with codecs.StreamWriter."""
198 | ||
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining this module's Codec with codecs.StreamReader."""
201 | ||
202 | ### encodings module API | |
203 | ||
def getregentry():
    """Return the codec registry entry for this encoding.

    The entry is the tuple (encode function, decode function,
    stream reader class, stream writer class).
    """
    # Separate Codec instances back the encoder and the decoder.
    encoder = Codec().encode
    decoder = Codec().decode
    return (encoder, decoder, StreamReader, StreamWriter)