Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package Encode::Unicode; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | ||
6 | our $VERSION = do { my @r = (q$Revision: 1.37 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
7 | ||
8 | use XSLoader; | |
9 | XSLoader::load(__PACKAGE__,$VERSION); | |
10 | ||
11 | # | |
12 | # Object Generator 8 transcoders all at once! | |
13 | # | |
14 | ||
15 | require Encode; | |
16 | ||
17 | for my $name (qw(UTF-16 UTF-16BE UTF-16LE | |
18 | UTF-32 UTF-32BE UTF-32LE | |
19 | UCS-2BE UCS-2LE)) | |
20 | { | |
21 | my ($size, $endian, $ucs2, $mask); | |
22 | $name =~ /^(\w+)-(\d+)(\w*)$/o; | |
23 | if ($ucs2 = ($1 eq 'UCS')){ | |
24 | $size = 2; | |
25 | }else{ | |
26 | $size = $2/8; | |
27 | } | |
28 | $endian = ($3 eq 'BE') ? 'n' : ($3 eq 'LE') ? 'v' : '' ; | |
29 | $size == 4 and $endian = uc($endian); | |
30 | ||
31 | $Encode::Encoding{$name} = | |
32 | bless { | |
33 | Name => $name, | |
34 | size => $size, | |
35 | endian => $endian, | |
36 | ucs2 => $ucs2, | |
37 | } => __PACKAGE__; | |
38 | ||
39 | } | |
40 | ||
41 | use base qw(Encode::Encoding); | |
42 | ||
43 | # | |
44 | # three implementations of (en|de)code exist. The XS version is the | |
45 | # fastest. *_modern uses an array and *_classic sticks with substr. | |
46 | # *_classic is much slower but more memory conservative. | |
47 | # *_xs is the default. | |
48 | ||
49 | sub set_transcoder{ | |
50 | no warnings qw(redefine); | |
51 | my $type = shift; | |
52 | if ($type eq "xs"){ | |
53 | *decode = \&decode_xs; | |
54 | *encode = \&encode_xs; | |
55 | }elsif($type eq "modern"){ | |
56 | *decode = \&decode_modern; | |
57 | *encode = \&encode_modern; | |
58 | }elsif($type eq "classic"){ | |
59 | *decode = \&decode_classic; | |
60 | *encode = \&encode_classic; | |
61 | }else{ | |
62 | require Carp; | |
63 | Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)"; | |
64 | } | |
65 | } | |
66 | ||
67 | set_transcoder("xs"); | |
68 | ||
69 | # | |
70 | # Aux. subs & constants | |
71 | # | |
72 | ||
73 | sub FBCHAR(){ 0xFFFd } | |
74 | sub BOM_BE(){ 0xFeFF } | |
75 | sub BOM16LE(){ 0xFFFe } | |
76 | sub BOM32LE(){ 0xFFFe0000 } | |
77 | ||
78 | sub valid_ucs2($){ | |
79 | return | |
80 | (0 <= $_[0] && $_[0] < 0xD800) | |
81 | || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); | |
82 | } | |
83 | ||
84 | sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } | |
85 | sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } | |
86 | sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } | |
87 | ||
88 | sub ensurrogate($){ | |
89 | use integer; # we have divisions | |
90 | my $uni = shift; | |
91 | my $hi = ($uni - 0x10000) / 0x400 + 0xD800; | |
92 | my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | |
93 | return ($hi, $lo); | |
94 | } | |
95 | ||
96 | sub desurrogate($$){ | |
97 | my ($hi, $lo) = @_; | |
98 | return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); | |
99 | } | |
100 | ||
101 | sub Mask { {2 => 0xffff, 4 => 0xffffffff} } | |
102 | ||
103 | # | |
104 | # *_modern are much faster but guzzle more memory | |
105 | # | |
106 | ||
107 | sub decode_modern($$;$) | |
108 | { | |
109 | my ($obj, $str, $chk ) = @_; | |
110 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; | |
111 | ||
112 | # warn "$size, $endian, $ucs2"; | |
113 | $endian ||= BOMB($size, substr($str, 0, $size, '')) | |
114 | or poisoned2death($obj, "Where's the BOM?"); | |
115 | my $mask = Mask->{$size}; | |
116 | my $utf8 = ''; | |
117 | my @ord = unpack("$endian*", $str); | |
118 | undef $str; # to conserve memory | |
119 | while (@ord){ | |
120 | my $ord = shift @ord; | |
121 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ | |
122 | if ($ucs2){ | |
123 | $chk and | |
124 | poisoned2death($obj, "no surrogates allowed", $ord); | |
125 | shift @ord; # skip the next one as well | |
126 | $ord = FBCHAR; | |
127 | }else{ | |
128 | unless (isHiSurrogate($ord)){ | |
129 | poisoned2death($obj, "Malformed HI surrogate", $ord); | |
130 | } | |
131 | my $lo = shift @ord; | |
132 | unless (isLoSurrogate($lo &= $mask)){ | |
133 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); | |
134 | } | |
135 | $ord = desurrogate($ord, $lo); | |
136 | } | |
137 | } | |
138 | $utf8 .= chr($ord); | |
139 | } | |
140 | utf8::upgrade($utf8); | |
141 | return $utf8; | |
142 | } | |
143 | ||
144 | sub encode_modern($$;$) | |
145 | { | |
146 | my ($obj, $utf8, $chk) = @_; | |
147 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; | |
148 | my @str = (); | |
149 | unless ($endian){ | |
150 | $endian = ($size == 4) ? 'N' : 'n'; | |
151 | push @str, BOM_BE; | |
152 | } | |
153 | my @ord = unpack("U*", $utf8); | |
154 | undef $utf8; # to conserve memory | |
155 | for my $ord (@ord){ | |
156 | unless ($size == 4 or valid_ucs2($ord)) { | |
157 | unless(issurrogate($ord)){ | |
158 | if ($ucs2){ | |
159 | $chk and | |
160 | poisoned2death($obj, "code point too high", $ord); | |
161 | ||
162 | push @str, FBCHAR; | |
163 | }else{ | |
164 | ||
165 | push @str, ensurrogate($ord); | |
166 | } | |
167 | }else{ # not supposed to happen | |
168 | push @str, FBCHAR; | |
169 | } | |
170 | }else{ | |
171 | push @str, $ord; | |
172 | } | |
173 | } | |
174 | return pack("$endian*", @str); | |
175 | } | |
176 | ||
177 | # | |
178 | # *_classic are slower but more memory conservative | |
179 | # | |
180 | ||
181 | sub decode_classic($$;$) | |
182 | { | |
183 | my ($obj, $str, $chk ) = @_; | |
184 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; | |
185 | ||
186 | # warn "$size, $endian, $ucs2"; | |
187 | $endian ||= BOMB($size, substr($str, 0, $size, '')) | |
188 | or poisoned2death($obj, "Where's the BOM?"); | |
189 | my $mask = Mask->{$size}; | |
190 | my $utf8 = ''; | |
191 | my @ord = unpack("$endian*", $str); | |
192 | while (length($str)){ | |
193 | my $ord = unpack($endian, substr($str, 0, $size, '')); | |
194 | unless ($size == 4 or valid_ucs2($ord &= $mask)){ | |
195 | if ($ucs2){ | |
196 | $chk and | |
197 | poisoned2death($obj, "no surrogates allowed", $ord); | |
198 | substr($str,0,$size,''); # skip the next one as well | |
199 | $ord = FBCHAR; | |
200 | }else{ | |
201 | unless (isHiSurrogate($ord)){ | |
202 | poisoned2death($obj, "Malformed HI surrogate", $ord); | |
203 | } | |
204 | my $lo = unpack($endian ,substr($str,0,$size,'')); | |
205 | unless (isLoSurrogate($lo &= $mask)){ | |
206 | poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); | |
207 | } | |
208 | $ord = desurrogate($ord, $lo); | |
209 | } | |
210 | } | |
211 | $utf8 .= chr($ord); | |
212 | } | |
213 | utf8::upgrade($utf8); | |
214 | return $utf8; | |
215 | } | |
216 | ||
217 | sub encode_classic($$;$) | |
218 | { | |
219 | my ($obj, $utf8, $chk) = @_; | |
220 | my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; | |
221 | # warn join ", ", $size, $ucs2, $endian, $mask; | |
222 | my $str = ''; | |
223 | unless ($endian){ | |
224 | $endian = ($size == 4) ? 'N' : 'n'; | |
225 | $str .= pack($endian, BOM_BE); | |
226 | } | |
227 | while (length($utf8)){ | |
228 | my $ord = ord(substr($utf8,0,1,'')); | |
229 | unless ($size == 4 or valid_ucs2($ord)) { | |
230 | unless(issurrogate($ord)){ | |
231 | if ($ucs2){ | |
232 | $chk and | |
233 | poisoned2death($obj, "code point too high", $ord); | |
234 | $str .= pack($endian, FBCHAR); | |
235 | }else{ | |
236 | $str .= pack($endian.2, ensurrogate($ord)); | |
237 | } | |
238 | }else{ # not supposed to happen | |
239 | $str .= pack($endian, FBCHAR); | |
240 | } | |
241 | }else{ | |
242 | $str .= pack($endian, $ord); | |
243 | } | |
244 | } | |
245 | return $str; | |
246 | } | |
247 | ||
248 | sub BOMB { | |
249 | my ($size, $bom) = @_; | |
250 | my $N = $size == 2 ? 'n' : 'N'; | |
251 | my $ord = unpack($N, $bom); | |
252 | return ($ord eq BOM_BE) ? $N : | |
253 | ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; | |
254 | } | |
255 | ||
256 | sub poisoned2death{ | |
257 | my $obj = shift; | |
258 | my $msg = shift; | |
259 | my $pair = join(", ", map {sprintf "\\x%x", $_} @_); | |
260 | require Carp; | |
261 | Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; | |
262 | } | |
263 | ||
264 | 1; | |
265 | __END__ | |
266 | ||
267 | =head1 NAME | |
268 | ||
269 | Encode::Unicode -- Various Unicode Transformation Formats | |
270 | ||
271 | =cut | |
272 | ||
273 | =head1 SYNOPSIS | |
274 | ||
275 | use Encode qw/encode decode/; | |
276 | $ucs2 = encode("UCS-2BE", $utf8); | |
277 | $utf8 = decode("UCS-2BE", $ucs2); | |
278 | ||
279 | =head1 ABSTRACT | |
280 | ||
281 | This module implements all Character Encoding Schemes of Unicode that | |
282 | are officially documented by Unicode Consortium (except, of course, | |
283 | for UTF-8, which is a native format in perl). | |
284 | ||
285 | =over 4 | |
286 | ||
287 | =item L<http://www.unicode.org/glossary/> says: | |
288 | ||
289 | I<Character Encoding Scheme> A character encoding form plus byte | |
290 | serialization. There are seven character encoding schemes in Unicode: | |
291 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and | |
292 | UTF-32LE (UCS-4LE). | |
293 | ||
294 | =item Quick Reference | |
295 | ||
296 | Decodes from ord(N) Encodes chr(N) to... | |
297 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == | |
298 | ---------------+-----------------+------------------------------ | |
299 | UCS-2BE 2 N N is bogus Not Available | |
300 | UCS-2LE 2 N N bogus Not Available | |
301 | UTF-16 2/4 Y Y is S.P S.P BE/LE | |
302 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd | |
303 | UTF-16LE 2 N Y S.P S.P 0x2ad8,0xcddf | |
304 | UTF-32 4 Y - is bogus As is BE/LE | |
305 | UTF-32BE 4 N - bogus As is 0x0001abcd | |
306 | UTF-32LE 4 N - bogus As is 0xcdab0100 | |
307 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d | |
308 | ---------------+-----------------+------------------------------ | |
309 | ||
310 | =back | |
311 | ||
312 | =head1 Size, Endianness, and BOM | |
313 | ||
314 | You can categorize these CES by 3 criteria: size of each character, | |
315 | endianness, and Byte Order Mark. | |
316 | ||
317 | =head2 by size | |
318 | ||
319 | UCS-2 is a fixed-length encoding with each character taking 16 bits. | |
320 | It B<does not> support I<surrogate pairs>. When a surrogate pair | |
321 | is encountered during decode(), its place is filled with \x{FFFD} | |
322 | if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a | |
323 | character whose ord value is larger than 0xFFFF is encountered, | |
324 | its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine | |
325 | croaks if I<CHECK> is 1. | |
326 | ||
327 | UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>. | |
328 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the | |
329 | following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to | |
330 | form a character. Bogus surrogates result in death. When \x{10000} | |
331 | or above is encountered during encode(), it C<ensurrogate>s them and | |
332 | pushes the surrogate pair to the output stream. | |
333 | ||
334 | UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits. | |
335 | Since it is 32-bit, there is no need for I<surrogate pairs>. | |
336 | ||
337 | =head2 by endianness | |
338 | ||
339 | The first (and now failed) goal of Unicode was to map all character | |
340 | repertoires into a fixed-length integer so that programmers are happy. | |
341 | Since each character is either a I<short> or I<long> in C, you have to | |
342 | pay attention to the endianness of each platform when you pass data | |
343 | to one another. | |
344 | ||
345 | Anything marked as BE is Big Endian (or network byte order) and LE is | |
346 | Little Endian (aka VAX byte order). For anything not marked either | |
347 | BE or LE, a character called Byte Order Mark (BOM) indicating the | |
348 | endianness is prepended to the string. | |
349 | ||
350 | =over 4 | |
351 | ||
352 | =item BOM as integer when fetched in network byte order | |
353 | ||
354 | 16 32 bits/char | |
355 | ------------------------- | |
356 | BE 0xFeFF 0x0000FeFF | |
357 | LE 0xFFeF 0xFFFe0000 | |
358 | ------------------------- | |
359 | ||
360 | =back | |
361 | ||
362 | This modules handles the BOM as follows. | |
363 | ||
364 | =over 4 | |
365 | ||
366 | =item * | |
367 | ||
368 | When BE or LE is explicitly stated as the name of encoding, BOM is | |
369 | simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). | |
370 | ||
371 | =item * | |
372 | ||
373 | When BE or LE is omitted during decode(), it checks if BOM is at the | |
374 | beginning of the string; if one is found, the endianness is set to | |
375 | what the BOM says. If no BOM is found, the routine dies. | |
376 | ||
377 | =item * | |
378 | ||
379 | When BE or LE is omitted during encode(), it returns a BE-encoded | |
380 | string with BOM prepended. So when you want to encode a whole text | |
381 | file, make sure you encode() the whole text at once, not line by line | |
382 | or each line, not file, will have a BOM prepended. | |
383 | ||
384 | =item * | |
385 | ||
386 | C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE. | |
387 | UCS-2 is already registered by IANA and others that way. | |
388 | ||
389 | =back | |
390 | ||
391 | =head1 Surrogate Pairs | |
392 | ||
393 | To say the least, surrogate pairs were the biggest mistake of the | |
394 | Unicode Consortium. But according to the late Douglas Adams in I<The | |
395 | Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the | |
396 | Universe was created. This has made a lot of people very angry and | |
397 | been widely regarded as a bad move>. Their mistake was not of this | |
398 | magnitude so let's forgive them. | |
399 | ||
400 | (I don't dare make any comparison with Unicode Consortium and the | |
401 | Vogons here ;) Or, comparing Encode to Babel Fish is completely | |
402 | appropriate -- if you can only stick this into your ear :) | |
403 | ||
404 | Surrogate pairs were born when the Unicode Consortium finally | |
405 | admitted that 16 bits were not big enough to hold all the world's | |
406 | character repertoires. But they already made UCS-2 16-bit. What | |
407 | do we do? | |
408 | ||
409 | Back then, the range 0xD800-0xDFFF was not allocated. Let's split | |
410 | that range in half and use the first half to represent the C<upper | |
411 | half of a character> and the second half to represent the C<lower | |
412 | half of a character>. That way, you can represent 1024 * 1024 = | |
413 | 1048576 more characters. Now we can store character ranges up to | |
414 | \x{10ffff} even with 16-bit encodings. This pair of half-character is | |
415 | now called a I<surrogate pair> and UTF-16 is the name of the encoding | |
416 | that embraces them. | |
417 | ||
418 | Here is a formula to ensurrogate a Unicode character \x{10000} and | |
419 | above; | |
420 | ||
421 | $hi = ($uni - 0x10000) / 0x400 + 0xD800; | |
422 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | |
423 | ||
424 | And to desurrogate; | |
425 | ||
426 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); | |
427 | ||
428 | Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but | |
429 | perl does not prohibit the use of characters within this range. To perl, | |
430 | every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. | |
431 | ||
432 | (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit | |
433 | integer support! | |
434 | ||
435 | =head1 SEE ALSO | |
436 | ||
437 | L<Encode>, L<http://www.unicode.org/glossary/>, | |
438 | L<http://www.unicode.org/unicode/faq/utf_bom.html>, | |
439 | ||
440 | RFC 2781 L<http://rfc.net/rfc2781.html>, | |
441 | ||
442 | The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html> | |
443 | ||
444 | Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> | |
445 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
446 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
447 | ||
448 | =cut |