Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::Unicode; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
5 | no warnings 'redefine'; | |
6 | ||
7 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
8 | ||
9 | use XSLoader; | |
10 | XSLoader::load(__PACKAGE__,$VERSION); | |
11 | ||
#
# Object Generator: registers all 8 transcoders at once!
#
# Each encoding object is a hash blessed into this package and stored in
# %Encode::Encoding under its name, with these keys:
#   Name   - the encoding name it is registered under
#   size   - octets per code unit (2 or 4)
#   endian - 'n' / 'v' for BE / LE (upper-cased to 'N' / 'V' when size is 4),
#            or '' when the name does not state a byte order
#   ucs2   - true for the UCS-2 flavours, false otherwise
#

require Encode;

# Names that carry no BE/LE suffix; renew() clones only these.
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);

for my $name (qw(UTF-16 UTF-16BE UTF-16LE
                 UTF-32 UTF-32BE UTF-32LE
                 UCS-2BE UCS-2LE))
{
    # Split e.g. "UTF-16LE" into family, number and byte-order suffix.
    # Fail loudly instead of registering an object built from the stale
    # captures of a previous iteration if a name ever stops matching.
    my ($family, $number, $order) = $name =~ /^(\w+)-(\d+)(\w*)$/
        or die "Encode::Unicode: cannot parse encoding name '$name'";

    # For UTF-nn the number is a bit width; for UCS-2 the size is fixed at 2.
    my $ucs2 = ($family eq 'UCS');
    my $size = $ucs2 ? 2 : $number / 8;

    my $endian = ($order eq 'BE') ? 'n'
               : ($order eq 'LE') ? 'v'
               :                    '';
    $size == 4 and $endian = uc($endian);    # 'N' / 'V' for 4-octet units

    $Encode::Encoding{$name} = bless {
        Name   => $name,
        size   => $size,
        endian => $endian,
        ucs2   => $ucs2,
    } => __PACKAGE__;
}
42 | ||
43 | use base qw(Encode::Encoding); | |
44 | ||
# renew()
#   Encodings not listed in %BOM_Unknown are returned unchanged.  For the
#   listed ones a shallow copy of the object is returned instead, blessed
#   into the same class, with its "renewed" counter incremented so the
#   caller can tell it received a copy.
sub renew {
    my ($encoder) = @_;
    return $encoder unless $BOM_Unknown{ $encoder->name };

    my %state = %{$encoder};
    $state{renewed}++;    # flag the copy as renewed
    return bless \%state, ref $encoder;
}
52 | ||
# There used to be a perl implementation of (en|de)code here, but now that
# the XS version is ripe the perl version has been zapped for optimal speed.
# The public method names are plain aliases for the *_xs subs.

*encode = \&encode_xs;
*decode = \&decode_xs;
58 | ||
59 | 1; | |
60 | __END__ | |
61 | ||
62 | =head1 NAME | |
63 | ||
64 | Encode::Unicode -- Various Unicode Transformation Formats | |
65 | ||
66 | =cut | |
67 | ||
68 | =head1 SYNOPSIS | |
69 | ||
70 | use Encode qw/encode decode/; | |
71 | $ucs2 = encode("UCS-2BE", $utf8); | |
72 | $utf8 = decode("UCS-2BE", $ucs2); | |
73 | ||
74 | =head1 ABSTRACT | |
75 | ||
76 | This module implements all Character Encoding Schemes of Unicode that | |
77 | are officially documented by Unicode Consortium (except, of course, | |
78 | for UTF-8, which is a native format in perl). | |
79 | ||
80 | =over 4 | |
81 | ||
82 | =item L<http://www.unicode.org/glossary/> says: | |
83 | ||
84 | I<Character Encoding Scheme> A character encoding form plus byte | |
85 | serialization. There are Seven character encoding schemes in Unicode: | |
86 | UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and | |
87 | UTF-32LE (UCS-4LE), and UTF-7. | |
88 | ||
89 | Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of | |
90 | Unicode's Character Encoding Scheme. It is separately implemented in | |
91 | Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>. | |
92 | ||
93 | =item Quick Reference | |
94 | ||
95 | Decodes from ord(N) Encodes chr(N) to... | |
96 | octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} == | |
97 | ---------------+-----------------+------------------------------ | |
98 | UCS-2BE 2 N N is bogus Not Available | |
99 | UCS-2LE 2 N N bogus Not Available | |
100 | UTF-16 2/4 Y Y is S.P S.P BE/LE | |
101 | UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd | |
102 | UTF-16LE 2/4 N Y S.P S.P 0x2ad8,0xcddf | |
103 | UTF-32 4 Y - is bogus As is BE/LE | |
104 | UTF-32BE 4 N - bogus As is 0x0001abcd | |
105 | UTF-32LE 4 N - bogus As is 0xcdab0100 | |
106 | UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\xaf\x8d | |
107 | ---------------+-----------------+------------------------------ | |
108 | ||
109 | =back | |
110 | ||
111 | =head1 Size, Endianness, and BOM | |
112 | ||
113 | You can categorize these CES by 3 criteria: size of each character, | |
114 | endianness, and Byte Order Mark. | |
115 | ||
116 | =head2 by size | |
117 | ||
118 | UCS-2 is a fixed-length encoding with each character taking 16 bits. | |
119 | It B<does not> support I<surrogate pairs>. When a surrogate pair | |
120 | is encountered during decode(), its place is filled with \x{FFFD} | |
121 | if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a | |
122 | character whose ord value is larger than 0xFFFF is encountered during encode(), | |
123 | its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine | |
124 | croaks if I<CHECK> is 1. | |
125 | ||
126 | UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>. | |
127 | When it encounters a high surrogate (0xD800-0xDBFF), it fetches the | |
128 | following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to | |
129 | form a character. Bogus surrogates result in death. When \x{10000} | |
130 | or above is encountered during encode(), it C<ensurrogate>s them and | |
131 | pushes the surrogate pair to the output stream. | |
132 | ||
133 | UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits. | |
134 | Since it is 32-bit, there is no need for I<surrogate pairs>. | |
135 | ||
136 | =head2 by endianness | |
137 | ||
138 | The first (and now failed) goal of Unicode was to map all character | |
139 | repertoires into a fixed-length integer so that programmers are happy. | |
140 | Since each character is either a I<short> or I<long> in C, you have to | |
141 | pay attention to the endianness of each platform when you pass data | |
142 | to one another. | |
143 | ||
144 | Anything marked as BE is Big Endian (or network byte order) and LE is | |
145 | Little Endian (aka VAX byte order). For anything not marked either | |
146 | BE or LE, a character called Byte Order Mark (BOM) indicating the | |
147 | endianness is prepended to the string. | |
148 | ||
149 | CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless | |
150 | and as of this writing the Encode suite just leaves it as is (\x{FeFF}). | |
151 | ||
152 | =over 4 | |
153 | ||
154 | =item BOM as integer when fetched in network byte order | |
155 | ||
156 | 16 32 bits/char | |
157 | ------------------------- | |
158 | BE 0xFeFF 0x0000FeFF | |
159 | LE 0xFFFe 0xFFFe0000 | |
160 | ------------------------- | |
161 | ||
162 | =back | |
163 | ||
164 | This module handles the BOM as follows. | |
165 | ||
166 | =over 4 | |
167 | ||
168 | =item * | |
169 | ||
170 | When BE or LE is explicitly stated as the name of encoding, BOM is | |
171 | simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE). | |
172 | ||
173 | =item * | |
174 | ||
175 | When BE or LE is omitted during decode(), it checks if BOM is at the | |
176 | beginning of the string; if one is found, the endianness is set to | |
177 | what the BOM says. If no BOM is found, the routine dies. | |
178 | ||
179 | =item * | |
180 | ||
181 | When BE or LE is omitted during encode(), it returns a BE-encoded | |
182 | string with BOM prepended. So when you want to encode a whole text | |
183 | file, make sure you encode() the whole text at once, not line by line; | |
184 | otherwise each line, rather than the file, will have a BOM prepended. | |
185 | ||
186 | =item * | |
187 | ||
188 | C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE. | |
189 | UCS-2 is already registered by IANA and others that way. | |
190 | ||
191 | =back | |
192 | ||
193 | =head1 Surrogate Pairs | |
194 | ||
195 | To say the least, surrogate pairs were the biggest mistake of the | |
196 | Unicode Consortium. But according to the late Douglas Adams in I<The | |
197 | Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the | |
198 | Universe was created. This has made a lot of people very angry and | |
199 | been widely regarded as a bad move>. Their mistake was not of this | |
200 | magnitude so let's forgive them. | |
201 | ||
202 | (I don't dare make any comparison with Unicode Consortium and the | |
203 | Vogons here ;) Or, comparing Encode to Babel Fish is completely | |
204 | appropriate -- if you can only stick this into your ear :) | |
205 | ||
206 | Surrogate pairs were born when the Unicode Consortium finally | |
207 | admitted that 16 bits were not big enough to hold all the world's | |
208 | character repertoires. But they already made UCS-2 16-bit. What | |
209 | do we do? | |
210 | ||
211 | Back then, the range 0xD800-0xDFFF was not allocated. Let's split | |
212 | that range in half and use the first half to represent the C<upper | |
213 | half of a character> and the second half to represent the C<lower | |
214 | half of a character>. That way, you can represent 1024 * 1024 = | |
215 | 1048576 more characters. Now we can store character ranges up to | |
216 | \x{10ffff} even with 16-bit encodings. This pair of half-characters is | |
217 | now called a I<surrogate pair> and UTF-16 is the name of the encoding | |
218 | that embraces them. | |
219 | ||
220 | Here is a formula to ensurrogate a Unicode character \x{10000} and | |
221 | above; | |
222 | ||
223 | $hi = int(($uni - 0x10000) / 0x400) + 0xD800; | |
224 | $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | |
225 | ||
226 | And to desurrogate; | |
227 | ||
228 | $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); | |
229 | ||
230 | Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but | |
231 | perl does not prohibit the use of characters within this range. To perl, | |
232 | every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>. | |
233 | ||
234 | (*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit | |
235 | integer support! | |
236 | ||
237 | =head1 Error Checking | |
238 | ||
239 | Unlike most encodings which accept various ways to handle errors, | |
240 | Unicode encodings simply croak. | |
241 | ||
242 | % perl -MEncode -e '$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \ | |
243 | -e 'Encode::from_to($_, "utf16","shift_jis", 0); print' | |
244 | UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184. | |
245 | % perl -MEncode -e '$a = "BOM missing"' \ | |
246 | -e ' Encode::from_to($a, "utf16", "shift_jis", 0); print' | |
247 | UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184. | |
248 | ||
249 | Unlike other encodings where mappings are not one-to-one against | |
250 | Unicode, UTFs are supposed to map 100% against one another. So Encode | |
251 | is more strict on UTFs. | |
252 | ||
253 | Consider that "division by zero" of Encode :) | |
254 | ||
255 | =head1 SEE ALSO | |
256 | ||
257 | L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>, | |
258 | L<http://www.unicode.org/unicode/faq/utf_bom.html>, | |
259 | ||
260 | RFC 2781 L<http://rfc.net/rfc2781.html>, | |
261 | ||
262 | The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html> | |
263 | ||
264 | Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)> | |
265 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
266 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
267 | ||
268 | =cut |