Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLUNIINTRO 1" | |
132 | .TH PERLUNIINTRO 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perluniintro \- Perl Unicode introduction | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | This document gives a general idea of Unicode and how to use Unicode | |
138 | in Perl. | |
139 | .Sh "Unicode" | |
140 | .IX Subsection "Unicode" | |
141 | Unicode is a character set standard which plans to codify all of the | |
142 | writing systems of the world, plus many other symbols. | |
143 | .PP | |
144 | Unicode and \s-1ISO/IEC\s0 10646 are coordinated standards that provide code | |
145 | points for characters in almost all modern character set standards, | |
146 | covering more than 30 writing systems and hundreds of languages, | |
147 | including all commercially-important modern languages. All characters | |
148 | in the largest Chinese, Japanese, and Korean dictionaries are also | |
149 | encoded. The standards will eventually cover almost all characters in | |
150 | more than 250 writing systems and thousands of languages. | |
151 | Unicode 1.0 was released in October 1991, and 4.0 in April 2003. | |
152 | .PP | |
153 | A Unicode \fIcharacter\fR is an abstract entity. It is not bound to any | |
154 | particular integer width, especially not to the C language \f(CW\*(C`char\*(C'\fR. | |
155 | Unicode is language-neutral and display\-neutral: it does not encode the | |
156 | language of the text and it does not define fonts or other graphical | |
157 | layout details. Unicode operates on characters and on text built from | |
158 | those characters. | |
159 | .PP | |
160 | Unicode defines characters like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR or \f(CW\*(C`GREEK | |
161 | SMALL LETTER ALPHA\*(C'\fR and unique numbers for the characters, in this | |
162 | case 0x0041 and 0x03B1, respectively. These unique numbers are called | |
163 | \&\fIcode points\fR. | |
164 | .PP | |
165 | The Unicode standard prefers using hexadecimal notation for the code | |
166 | points. If numbers like \f(CW0x0041\fR are unfamiliar to you, take a peek | |
167 | at a later section, \*(L"Hexadecimal Notation\*(R". The Unicode standard | |
168 | uses the notation \f(CW\*(C`U+0041 LATIN CAPITAL LETTER A\*(C'\fR, to give the | |
169 | hexadecimal code point and the normative name of the character. | |
170 | .PP | |
171 | Unicode also defines various \fIproperties\fR for the characters, like | |
172 | \&\*(L"uppercase\*(R" or \*(L"lowercase\*(R", \*(L"decimal digit\*(R", or \*(L"punctuation\*(R"; | |
173 | these properties are independent of the names of the characters. | |
174 | Furthermore, various operations on the characters like uppercasing, | |
175 | lowercasing, and collating (sorting) are defined. | |
176 | .PP | |
177 | A Unicode character consists either of a single code point, or a | |
178 | \&\fIbase character\fR (like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR), followed by one or | |
179 | more \fImodifiers\fR (like \f(CW\*(C`COMBINING ACUTE ACCENT\*(C'\fR). This sequence of | |
180 | base character and modifiers is called a \fIcombining character | |
181 | sequence\fR. | |
182 | .PP | |
183 | Whether to call these combining character sequences \*(L"characters\*(R" | |
184 | depends on your point of view. If you are a programmer, you probably | |
185 | would tend towards seeing each element in the sequences as one unit, | |
186 | or \*(L"character\*(R". The whole sequence could be seen as one \*(L"character\*(R", | |
187 | however, from the user's point of view, since that's probably what it | |
188 | looks like in the context of the user's language. | |
189 | .PP | |
190 | With this \*(L"whole sequence\*(R" view of characters, the total number of | |
191 | characters is open\-ended. But in the programmer's \*(L"one unit is one | |
192 | character\*(R" point of view, the concept of \*(L"characters\*(R" is more | |
193 | deterministic. In this document, we take that second point of view: | |
194 | one \*(L"character\*(R" is one Unicode code point, be it a base character or | |
195 | a combining character. | |
196 | .PP | |
197 | For some combinations, there are \fIprecomposed\fR characters. | |
198 | \&\f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR, for example, is defined as | |
199 | a single code point. These precomposed characters are, however, | |
200 | only available for some combinations, and are mainly | |
201 | meant to support round-trip conversions between Unicode and legacy | |
202 | standards (like the \s-1ISO\s0 8859). In the general case, the composing | |
203 | method is more extensible. To support conversion between | |
204 | different compositions of the characters, various \fInormalization | |
205 | forms\fR to standardize representations are also defined. | |
206 | .PP | |
207 | Because of backward compatibility with legacy encodings, the \*(L"a unique | |
208 | number for every character\*(R" idea breaks down a bit: instead, there is | |
209 | \&\*(L"at least one number for every character\*(R". The same character could | |
210 | be represented differently in several legacy encodings. The | |
211 | converse is also not true: some code points do not have an assigned | |
212 | character. Firstly, there are unallocated code points within | |
213 | otherwise used blocks. Secondly, there are special Unicode control | |
214 | characters that do not represent true characters. | |
215 | .PP | |
216 | A common myth about Unicode is that it would be \*(L"16\-bit\*(R", that is, | |
217 | Unicode is only represented as \f(CW0x10000\fR (or 65536) characters from | |
218 | \&\f(CW0x0000\fR to \f(CW0xFFFF\fR. \fBThis is untrue.\fR Since Unicode 2.0 (July | |
219 | 1996), Unicode has been defined all the way up to 21 bits (\f(CW0x10FFFF\fR), | |
220 | and since Unicode 3.1 (March 2001), characters have been defined | |
221 | beyond \f(CW0xFFFF\fR. The first \f(CW0x10000\fR characters are called the | |
222 | \&\fIPlane 0\fR, or the \fIBasic Multilingual Plane\fR (\s-1BMP\s0). With Unicode | |
223 | 3.1, 17 (yes, seventeen) planes in all were defined\*(--but they are | |
224 | nowhere near full of defined characters, yet. | |
225 | .PP | |
226 | Another myth is that the 256\-character blocks have something to | |
227 | do with languages\*(--that each block would define the characters used | |
228 | by a language or a set of languages. \fBThis is also untrue.\fR | |
229 | The division into blocks exists, but it is almost completely | |
230 | accidental\*(--an artifact of how the characters have been and | |
231 | still are allocated. Instead, there is a concept called \fIscripts\fR, | |
232 | which is more useful: there is \f(CW\*(C`Latin\*(C'\fR script, \f(CW\*(C`Greek\*(C'\fR script, and | |
233 | so on. Scripts usually span varied parts of several blocks. | |
234 | For further information see Unicode::UCD. | |
235 | .PP | |
236 | The Unicode code points are just abstract numbers. To input and | |
237 | output these abstract numbers, the numbers must be \fIencoded\fR or | |
238 | \&\fIserialised\fR somehow. Unicode defines several \fIcharacter encoding | |
239 | forms\fR, of which \fI\s-1UTF\-8\s0\fR is perhaps the most popular. \s-1UTF\-8\s0 is a | |
240 | variable length encoding that encodes Unicode characters as 1 to 6 | |
241 | bytes (only 4 with the currently defined characters). Other encodings | |
242 | include \s-1UTF\-16\s0 and \s-1UTF\-32\s0 and their big\- and little-endian variants | |
243 | (\s-1UTF\-8\s0 is byte-order independent). The \s-1ISO/IEC\s0 10646 defines the \s-1UCS\-2\s0 | |
244 | and \s-1UCS\-4\s0 encoding forms. | |
245 | .PP | |
246 | For more information about encodings\*(--for instance, to learn what | |
247 | \&\fIsurrogates\fR and \fIbyte order marks\fR (BOMs) are\*(--see perlunicode. | |
248 | .Sh "Perl's Unicode Support" | |
249 | .IX Subsection "Perl's Unicode Support" | |
250 | Starting from Perl 5.6.0, Perl has had the capacity to handle Unicode | |
251 | natively. Perl 5.8.0, however, is the first recommended release for | |
252 | serious Unicode work. The maintenance release 5.6.1 fixed many of the | |
253 | problems of the initial Unicode implementation, but for example | |
254 | regular expressions still do not work with Unicode in 5.6.1. | |
255 | .PP | |
256 | \&\fBStarting from Perl 5.8.0, the use of \f(CB\*(C`use utf8\*(C'\fB is no longer | |
257 | necessary.\fR In earlier releases the \f(CW\*(C`utf8\*(C'\fR pragma was used to declare | |
258 | that operations in the current block or file would be Unicode\-aware. | |
259 | This model was found to be wrong, or at least clumsy: the \*(L"Unicodeness\*(R" | |
260 | is now carried with the data, instead of being attached to the | |
261 | operations. Only one case remains where an explicit \f(CW\*(C`use utf8\*(C'\fR is | |
262 | needed: if your Perl script itself is encoded in \s-1UTF\-8\s0, you can use | |
263 | \&\s-1UTF\-8\s0 in your identifier names, and in string and regular expression | |
264 | literals, by saying \f(CW\*(C`use utf8\*(C'\fR. This is not the default because | |
265 | scripts with legacy 8\-bit data in them would break. See utf8. | |
266 | .Sh "Perl's Unicode Model" | |
267 | .IX Subsection "Perl's Unicode Model" | |
268 | Perl supports both pre\-5.6 strings of eight-bit native bytes, and | |
269 | strings of Unicode characters. The principle is that Perl tries to | |
270 | keep its data as eight-bit bytes for as long as possible, but as soon | |
271 | as Unicodeness cannot be avoided, the data is transparently upgraded | |
272 | to Unicode. | |
273 | .PP | |
274 | Internally, Perl currently uses either the native eight-bit | |
275 | character set of the platform (for example Latin\-1) or | |
276 | \&\s-1UTF\-8\s0 to encode Unicode strings. Specifically, if all code points in | |
277 | the string are \f(CW0xFF\fR or less, Perl uses the native eight-bit | |
278 | character set. Otherwise, it uses \s-1UTF\-8\s0. | |
279 | .PP | |
280 | A user of Perl does not normally need to know nor care how Perl | |
281 | happens to encode its internal strings, but it becomes relevant when | |
282 | outputting Unicode strings to a stream without a PerlIO layer \*(-- one with | |
283 | the \*(L"default\*(R" encoding. In such a case, the raw bytes used internally | |
284 | (the native character set or \s-1UTF\-8\s0, as appropriate for each string) | |
285 | will be used, and a \*(L"Wide character\*(R" warning will be issued if those | |
286 | strings contain a character beyond 0x00FF. | |
287 | .PP | |
288 | For example, | |
289 | .PP | |
290 | .Vb 1 | |
291 | \& perl -e 'print "\ex{DF}\en", "\ex{0100}\ex{DF}\en"' | |
292 | .Ve | |
293 | .PP | |
294 | produces a fairly useless mixture of native bytes and \s-1UTF\-8\s0, as well | |
295 | as a warning: | |
296 | .PP | |
297 | .Vb 1 | |
298 | \& Wide character in print at ... | |
299 | .Ve | |
300 | .PP | |
301 | To output \s-1UTF\-8\s0, use the \f(CW\*(C`:utf8\*(C'\fR output layer. Prepending | |
302 | .PP | |
303 | .Vb 1 | |
304 | \& binmode(STDOUT, ":utf8"); | |
305 | .Ve | |
306 | .PP | |
307 | to this sample program ensures that the output is completely \s-1UTF\-8\s0, | |
308 | and removes the program's warning. | |
309 | .PP | |
310 | You can enable automatic UTF\-8\-ification of your standard file | |
311 | handles, default \f(CW\*(C`open()\*(C'\fR layer, and \f(CW@ARGV\fR by using either | |
312 | the \f(CW\*(C`\-C\*(C'\fR command line switch or the \f(CW\*(C`PERL_UNICODE\*(C'\fR environment | |
313 | variable, see perlrun for the documentation of the \f(CW\*(C`\-C\*(C'\fR switch. | |
314 | .PP | |
315 | Note that this means that Perl expects other software to work, too: | |
316 | if Perl has been led to believe that \s-1STDIN\s0 should be \s-1UTF\-8\s0, but then | |
317 | \&\s-1STDIN\s0 coming in from another command is not \s-1UTF\-8\s0, Perl will complain | |
318 | about the malformed \s-1UTF\-8\s0. | |
319 | .PP | |
320 | All features that combine Unicode and I/O also require using the new | |
321 | PerlIO feature. Almost all Perl 5.8 platforms do use PerlIO, though: | |
322 | you can see whether yours is by running \*(L"perl \-V\*(R" and looking for | |
323 | \&\f(CW\*(C`useperlio=define\*(C'\fR. | |
324 | .Sh "Unicode and \s-1EBCDIC\s0" | |
325 | .IX Subsection "Unicode and EBCDIC" | |
326 | Perl 5.8.0 also supports Unicode on \s-1EBCDIC\s0 platforms. There, | |
327 | Unicode support is somewhat more complex to implement since | |
328 | additional conversions are needed at every step. Some problems | |
329 | remain, see perlebcdic for details. | |
330 | .PP | |
331 | In any case, the Unicode support on \s-1EBCDIC\s0 platforms is better than | |
332 | in the 5.6 series, which didn't work much at all for \s-1EBCDIC\s0 platforms. | |
333 | On \s-1EBCDIC\s0 platforms, the internal Unicode encoding form is UTF-EBCDIC | |
334 | instead of \s-1UTF\-8\s0. The difference is that \s-1UTF\-8\s0 is \*(L"ASCII\-safe\*(R" in | |
335 | that \s-1ASCII\s0 characters encode to \s-1UTF\-8\s0 as\-is, while UTF-EBCDIC is | |
336 | \&\*(L"EBCDIC\-safe\*(R". | |
337 | .Sh "Creating Unicode" | |
338 | .IX Subsection "Creating Unicode" | |
339 | To create Unicode characters in literals for code points above \f(CW0xFF\fR, | |
340 | use the \f(CW\*(C`\ex{...}\*(C'\fR notation in double-quoted strings: | |
341 | .PP | |
342 | .Vb 1 | |
343 | \& my $smiley = "\ex{263a}"; | |
344 | .Ve | |
345 | .PP | |
346 | Similarly, it can be used in regular expression literals: | |
347 | .PP | |
348 | .Vb 1 | |
349 | \& $smiley =~ /\ex{263a}/; | |
350 | .Ve | |
351 | .PP | |
352 | At run-time you can use \f(CW\*(C`chr()\*(C'\fR: | |
353 | .PP | |
354 | .Vb 1 | |
355 | \& my $hebrew_alef = chr(0x05d0); | |
356 | .Ve | |
357 | .PP | |
358 | See \*(L"Further Resources\*(R" for how to find all these numeric codes. | |
359 | .PP | |
360 | Naturally, \f(CW\*(C`ord()\*(C'\fR will do the reverse: it turns a character into | |
361 | a code point. | |
362 | .PP | |
363 | Note that \f(CW\*(C`\ex..\*(C'\fR (no \f(CW\*(C`{}\*(C'\fR and only two hexadecimal digits), \f(CW\*(C`\ex{...}\*(C'\fR, | |
364 | and \f(CW\*(C`chr(...)\*(C'\fR for arguments less than \f(CW0x100\fR (decimal 256) | |
365 | generate an eight-bit character for backward compatibility with older | |
366 | Perls. For arguments of \f(CW0x100\fR or more, Unicode characters are | |
367 | always produced. If you want to force the production of Unicode | |
368 | characters regardless of the numeric value, use \f(CW\*(C`pack("U", ...)\*(C'\fR | |
369 | instead of \f(CW\*(C`\ex..\*(C'\fR, \f(CW\*(C`\ex{...}\*(C'\fR, or \f(CW\*(C`chr()\*(C'\fR. | |
370 | .PP | |
371 | You can also use the \f(CW\*(C`charnames\*(C'\fR pragma to invoke characters | |
372 | by name in double-quoted strings: | |
373 | .PP | |
374 | .Vb 2 | |
375 | \& use charnames ':full'; | |
376 | \& my $arabic_alef = "\eN{ARABIC LETTER ALEF}"; | |
377 | .Ve | |
378 | .PP | |
379 | And, as mentioned above, you can also \f(CW\*(C`pack()\*(C'\fR numbers into Unicode | |
380 | characters: | |
381 | .PP | |
382 | .Vb 1 | |
383 | \& my $georgian_an = pack("U", 0x10a0); | |
384 | .Ve | |
385 | .PP | |
386 | Note that both \f(CW\*(C`\ex{...}\*(C'\fR and \f(CW\*(C`\eN{...}\*(C'\fR are compile-time string | |
387 | constants: you cannot use variables in them. If you want similar | |
388 | run-time functionality, use \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`charnames::vianame()\*(C'\fR. | |
389 | .PP | |
390 | If you want to force the result to Unicode characters, use the special | |
391 | \&\f(CW"U0"\fR prefix. It consumes no arguments but forces the result to be | |
392 | in Unicode characters, instead of bytes. | |
393 | .PP | |
394 | .Vb 1 | |
395 | \& my $chars = pack("U0C*", 0x80, 0x42); | |
396 | .Ve | |
397 | .PP | |
398 | Likewise, you can force the result to be bytes by using the special | |
399 | \&\f(CW"C0"\fR prefix. | |
400 | .Sh "Handling Unicode" | |
401 | .IX Subsection "Handling Unicode" | |
402 | Handling Unicode is for the most part transparent: just use the | |
403 | strings as usual. Functions like \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`length()\*(C'\fR, and | |
404 | \&\f(CW\*(C`substr()\*(C'\fR will work on the Unicode characters; regular expressions | |
405 | will work on the Unicode characters (see perlunicode and perlretut). | |
406 | .PP | |
407 | Note that Perl considers combining character sequences to be | |
408 | separate characters, so for example | |
409 | .PP | |
410 | .Vb 2 | |
411 | \& use charnames ':full'; | |
412 | \& print length("\eN{LATIN CAPITAL LETTER A}\eN{COMBINING ACUTE ACCENT}"), "\en"; | |
413 | .Ve | |
414 | .PP | |
415 | will print 2, not 1. The only exception is that regular expressions | |
416 | have \f(CW\*(C`\eX\*(C'\fR for matching a combining character sequence. | |
417 | .PP | |
418 | Life is not quite so transparent, however, when working with legacy | |
419 | encodings, I/O, and certain special cases: | |
420 | .Sh "Legacy Encodings" | |
421 | .IX Subsection "Legacy Encodings" | |
422 | When you combine legacy data and Unicode, the legacy data needs | |
423 | to be upgraded to Unicode. Normally \s-1ISO\s0 8859\-1 (or \s-1EBCDIC\s0, if | |
424 | applicable) is assumed. You can override this assumption by | |
425 | using the \f(CW\*(C`encoding\*(C'\fR pragma, for example | |
426 | .PP | |
427 | .Vb 1 | |
428 | \& use encoding 'latin2'; # ISO 8859-2 | |
429 | .Ve | |
430 | .PP | |
431 | in which case literals (string or regular expressions), \f(CW\*(C`chr()\*(C'\fR, | |
432 | and \f(CW\*(C`ord()\*(C'\fR in your whole script are assumed to produce Unicode | |
433 | characters from \s-1ISO\s0 8859\-2 code points. Note that the matching for | |
434 | encoding names is forgiving: instead of \f(CW\*(C`latin2\*(C'\fR you could have | |
435 | said \f(CW\*(C`Latin 2\*(C'\fR, or \f(CW\*(C`iso8859\-2\*(C'\fR, or other variations. With just | |
436 | .PP | |
437 | .Vb 1 | |
438 | \& use encoding; | |
439 | .Ve | |
440 | .PP | |
441 | the environment variable \f(CW\*(C`PERL_ENCODING\*(C'\fR will be consulted. | |
442 | If that variable isn't set, the encoding pragma will fail. | |
443 | .PP | |
444 | The \f(CW\*(C`Encode\*(C'\fR module knows about many encodings and has interfaces | |
445 | for doing conversions between those encodings: | |
446 | .PP | |
447 | .Vb 2 | |
448 | \& use Encode 'decode'; | |
449 | \& $data = decode("iso-8859-3", $data); # convert from legacy to utf-8 | |
450 | .Ve | |
451 | .Sh "Unicode I/O" | |
452 | .IX Subsection "Unicode I/O" | |
453 | Normally, writing out Unicode data | |
454 | .PP | |
455 | .Vb 1 | |
456 | \& print FH $some_string_with_unicode, "\en"; | |
457 | .Ve | |
458 | .PP | |
459 | produces raw bytes that Perl happens to use to internally encode the | |
460 | Unicode string. Perl's internal encoding depends on the system as | |
461 | well as what characters happen to be in the string at the time. If | |
462 | any of the characters are at code points \f(CW0x100\fR or above, you will get | |
463 | a warning. To ensure that the output is explicitly rendered in the | |
464 | encoding you desire\*(--and to avoid the warning\*(--open the stream with | |
465 | the desired encoding. Some examples: | |
466 | .PP | |
467 | .Vb 1 | |
468 | \& open FH, ">:utf8", "file"; | |
469 | .Ve | |
470 | .PP | |
471 | .Vb 3 | |
472 | \& open FH, ">:encoding(ucs2)", "file"; | |
473 | \& open FH, ">:encoding(UTF-8)", "file"; | |
474 | \& open FH, ">:encoding(shift_jis)", "file"; | |
475 | .Ve | |
476 | .PP | |
477 | and on already open streams, use \f(CW\*(C`binmode()\*(C'\fR: | |
478 | .PP | |
479 | .Vb 1 | |
480 | \& binmode(STDOUT, ":utf8"); | |
481 | .Ve | |
482 | .PP | |
483 | .Vb 3 | |
484 | \& binmode(STDOUT, ":encoding(ucs2)"); | |
485 | \& binmode(STDOUT, ":encoding(UTF-8)"); | |
486 | \& binmode(STDOUT, ":encoding(shift_jis)"); | |
487 | .Ve | |
488 | .PP | |
489 | The matching of encoding names is loose: case does not matter, and | |
490 | many encodings have several aliases. Note that the \f(CW\*(C`:utf8\*(C'\fR layer | |
491 | must always be specified exactly like that; it is \fInot\fR subject to | |
492 | the loose matching of encoding names. | |
493 | .PP | |
494 | See PerlIO for the \f(CW\*(C`:utf8\*(C'\fR layer, PerlIO::encoding and | |
495 | Encode::PerlIO for the \f(CW\*(C`:encoding()\*(C'\fR layer, and | |
496 | Encode::Supported for many encodings supported by the \f(CW\*(C`Encode\*(C'\fR | |
497 | module. | |
498 | .PP | |
499 | Reading in a file that you know happens to be encoded in one of the | |
500 | Unicode or legacy encodings does not magically turn the data into | |
501 | Unicode in Perl's eyes. To do that, specify the appropriate | |
502 | layer when opening files | |
503 | .PP | |
504 | .Vb 2 | |
505 | \& open(my $fh,'<:utf8', 'anything'); | |
506 | \& my $line_of_unicode = <$fh>; | |
507 | .Ve | |
508 | .PP | |
509 | .Vb 2 | |
510 | \& open(my $fh,'<:encoding(Big5)', 'anything'); | |
511 | \& my $line_of_unicode = <$fh>; | |
512 | .Ve | |
513 | .PP | |
514 | The I/O layers can also be specified more flexibly with | |
515 | the \f(CW\*(C`open\*(C'\fR pragma. See open, or look at the following example. | |
516 | .PP | |
517 | .Vb 7 | |
518 | \& use open ':utf8'; # input and output default layer will be UTF-8 | |
519 | \& open X, ">file"; | |
520 | \& print X chr(0x100), "\en"; | |
521 | \& close X; | |
522 | \& open Y, "<file"; | |
523 | \& printf "%#x\en", ord(<Y>); # this should print 0x100 | |
524 | \& close Y; | |
525 | .Ve | |
526 | .PP | |
527 | With the \f(CW\*(C`open\*(C'\fR pragma you can use the \f(CW\*(C`:locale\*(C'\fR layer | |
528 | .PP | |
529 | .Vb 9 | |
530 | \& BEGIN { $ENV{LC_ALL} = $ENV{LANG} = 'ru_RU.KOI8-R' } | |
531 | \& # the :locale will probe the locale environment variables like LC_ALL | |
532 | \& use open OUT => ':locale'; # russki parusski | |
533 | \& open(O, ">koi8"); | |
534 | \& print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1 | |
535 | \& close O; | |
536 | \& open(I, "<koi8"); | |
537 | \& printf "%#x\en", ord(<I>), "\en"; # this should print 0xc1 | |
538 | \& close I; | |
539 | .Ve | |
540 | .PP | |
541 | or you can also use the \f(CW':encoding(...)'\fR layer | |
542 | .PP | |
543 | .Vb 2 | |
544 | \& open(my $epic,'<:encoding(iso-8859-7)','iliad.greek'); | |
545 | \& my $line_of_unicode = <$epic>; | |
546 | .Ve | |
547 | .PP | |
548 | These methods install a transparent filter on the I/O stream that | |
549 | converts data from the specified encoding when it is read in from the | |
550 | stream. The result is always Unicode. | |
551 | .PP | |
552 | The open pragma affects all the \f(CW\*(C`open()\*(C'\fR calls after the pragma by | |
553 | setting default layers. If you want to affect only certain | |
554 | streams, use explicit layers directly in the \f(CW\*(C`open()\*(C'\fR call. | |
555 | .PP | |
556 | You can switch encodings on an already opened stream by using | |
557 | \&\f(CW\*(C`binmode()\*(C'\fR; see \*(L"binmode\*(R" in perlfunc. | |
558 | .PP | |
559 | The \f(CW\*(C`:locale\*(C'\fR layer does not currently (as of Perl 5.8.0) work with | |
560 | \&\f(CW\*(C`open()\*(C'\fR and \f(CW\*(C`binmode()\*(C'\fR, only with the \f(CW\*(C`open\*(C'\fR pragma. The | |
561 | \&\f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding(...)\*(C'\fR methods do work with all of \f(CW\*(C`open()\*(C'\fR, | |
562 | \&\f(CW\*(C`binmode()\*(C'\fR, and the \f(CW\*(C`open\*(C'\fR pragma. | |
563 | .PP | |
564 | Similarly, you may use these I/O layers on output streams to | |
565 | automatically convert Unicode to the specified encoding when it is | |
566 | written to the stream. For example, the following snippet copies the | |
567 | contents of the file \*(L"text.jis\*(R" (encoded as \s-1ISO\-2022\-JP\s0, aka \s-1JIS\s0) to | |
568 | the file \*(L"text.utf8\*(R", encoded as \s-1UTF\-8:\s0 | |
569 | .PP | |
570 | .Vb 3 | |
571 | \& open(my $nihongo, '<:encoding(iso-2022-jp)', 'text.jis'); | |
572 | \& open(my $unicode, '>:utf8', 'text.utf8'); | |
573 | \& while (<$nihongo>) { print $unicode $_ } | |
574 | .Ve | |
575 | .PP | |
576 | The naming of encodings, both by the \f(CW\*(C`open()\*(C'\fR and by the \f(CW\*(C`open\*(C'\fR | |
577 | pragma, is similar to the \f(CW\*(C`encoding\*(C'\fR pragma in that it allows for | |
578 | flexible names: \f(CW\*(C`koi8\-r\*(C'\fR and \f(CW\*(C`KOI8R\*(C'\fR will both be understood. | |
579 | .PP | |
580 | Common encodings recognized by \s-1ISO\s0, \s-1MIME\s0, \s-1IANA\s0, and various other | |
581 | standardisation organisations are recognised; for a more detailed | |
582 | list see Encode::Supported. | |
583 | .PP | |
584 | \&\f(CW\*(C`read()\*(C'\fR reads characters and returns the number of characters. | |
585 | \&\f(CW\*(C`seek()\*(C'\fR and \f(CW\*(C`tell()\*(C'\fR operate on byte counts, as do \f(CW\*(C`sysread()\*(C'\fR | |
586 | and \f(CW\*(C`sysseek()\*(C'\fR. | |
587 | .PP | |
588 | Notice that because of the default behaviour of not doing any | |
589 | conversion upon input if there is no default layer, | |
590 | it is easy to mistakenly write code that keeps on expanding a file | |
591 | by repeatedly encoding the data: | |
592 | .PP | |
593 | .Vb 8 | |
594 | \& # BAD CODE WARNING | |
595 | \& open F, "file"; | |
596 | \& local $/; ## read in the whole file of 8-bit characters | |
597 | \& $t = <F>; | |
598 | \& close F; | |
599 | \& open F, ">:utf8", "file"; | |
600 | \& print F $t; ## convert to UTF-8 on output | |
601 | \& close F; | |
602 | .Ve | |
603 | .PP | |
604 | If you run this code twice, the contents of the \fIfile\fR will be twice | |
605 | \&\s-1UTF\-8\s0 encoded. A \f(CW\*(C`use open ':utf8'\*(C'\fR would have avoided the bug, as | |
606 | would explicitly opening the \fIfile\fR for input as \s-1UTF\-8\s0 too. | |
607 | .PP | |
608 | \&\fB\s-1NOTE\s0\fR: the \f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding\*(C'\fR features work only if your | |
609 | Perl has been built with the new PerlIO feature (which is the default | |
610 | on most systems). | |
611 | .Sh "Displaying Unicode As Text" | |
612 | .IX Subsection "Displaying Unicode As Text" | |
613 | Sometimes you might want to display Perl scalars containing Unicode as | |
614 | simple \s-1ASCII\s0 (or \s-1EBCDIC\s0) text. The following subroutine converts | |
615 | its argument so that Unicode characters with code points greater than | |
616 | 255 are displayed as \f(CW\*(C`\ex{...}\*(C'\fR, control characters (like \f(CW\*(C`\en\*(C'\fR) are | |
617 | displayed as \f(CW\*(C`\ex..\*(C'\fR, and the rest of the characters as themselves: | |
618 | .PP | |
619 | .Vb 9 | |
620 | \& sub nice_string { | |
621 | \& join("", | |
622 | \& map { $_ > 255 ? # if wide character... | |
623 | \& sprintf("\e\ex{%04X}", $_) : # \ex{...} | |
624 | \& chr($_) =~ /[[:cntrl:]]/ ? # else if control character ... | |
625 | \& sprintf("\e\ex%02X", $_) : # \ex.. | |
626 | \& quotemeta(chr($_)) # else quoted or as themselves | |
627 | \& } unpack("U*", $_[0])); # unpack Unicode characters | |
628 | \& } | |
629 | .Ve | |
630 | .PP | |
631 | For example, | |
632 | .PP | |
633 | .Vb 1 | |
634 | \& nice_string("foo\ex{100}bar\en") | |
635 | .Ve | |
636 | .PP | |
637 | returns the string | |
638 | .PP | |
639 | .Vb 1 | |
640 | \& 'foo\ex{0100}bar\ex0A' | |
641 | .Ve | |
642 | .PP | |
643 | which is ready to be printed. | |
644 | .Sh "Special Cases" | |
645 | .IX Subsection "Special Cases" | |
646 | .IP "\(bu" 4 | |
647 | Bit Complement Operator ~ And \fIvec()\fR | |
648 | .Sp | |
649 | The bit complement operator \f(CW\*(C`~\*(C'\fR may produce surprising results if | |
650 | used on strings containing characters with ordinal values above | |
651 | 255. In such a case, the results are consistent with the internal | |
652 | encoding of the characters, but not with much else. So don't do | |
653 | that. Similarly for \f(CW\*(C`vec()\*(C'\fR: you will be operating on the | |
654 | internally-encoded bit patterns of the Unicode characters, not on | |
655 | the code point values, which is very probably not what you want. | |
656 | .IP "\(bu" 4 | |
657 | Peeking At Perl's Internal Encoding | |
658 | .Sp | |
659 | Normal users of Perl should never care how Perl encodes any particular | |
660 | Unicode string (because the normal ways to get at the contents of a | |
661 | string with Unicode\*(--via input and output\*(--should always be via | |
662 | explicitly-defined I/O layers). But if you must, there are two | |
663 | ways of looking behind the scenes. | |
664 | .Sp | |
665 | One way of peeking inside the internal encoding of Unicode characters | |
666 | is to use \f(CW\*(C`unpack("C*", ...)\*(C'\fR to get the bytes or \f(CW\*(C`unpack("H*", ...)\*(C'\fR | |
667 | to display the bytes: | |
668 | .Sp | |
669 | .Vb 2 | |
670 | \& # this prints c4 80 for the UTF-8 bytes 0xc4 0x80 | |
671 | \& print join(" ", unpack("H*", pack("U", 0x100))), "\en"; | |
672 | .Ve | |
673 | .Sp | |
674 | Yet another way would be to use the Devel::Peek module: | |
675 | .Sp | |
676 | .Vb 1 | |
677 | \& perl -MDevel::Peek -e 'Dump(chr(0x100))' | |
678 | .Ve | |
679 | .Sp | |
680 | That shows the \f(CW\*(C`UTF8\*(C'\fR flag in \s-1FLAGS\s0 and both the \s-1UTF\-8\s0 bytes | |
681 | and Unicode characters in \f(CW\*(C`PV\*(C'\fR. See also later in this document | |
682 | the discussion about the \f(CW\*(C`utf8::is_utf8()\*(C'\fR function. | |
683 | .Sh "Advanced Topics" | |
684 | .IX Subsection "Advanced Topics" | |
685 | .IP "\(bu" 4 | |
686 | String Equivalence | |
687 | .Sp | |
688 | The question of string equivalence turns somewhat complicated | |
689 | in Unicode: what do you mean by \*(L"equal\*(R"? | |
690 | .Sp | |
691 | (Is \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR equal to | |
692 | \&\f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR?) | |
693 | .Sp | |
694 | The short answer is that by default Perl compares equivalence (\f(CW\*(C`eq\*(C'\fR, | |
695 | \&\f(CW\*(C`ne\*(C'\fR) based only on code points of the characters. In the above | |
696 | case, the answer is no (because 0x00C1 != 0x0041). But sometimes, any | |
697 | \&\s-1CAPITAL\s0 \s-1LETTER\s0 As should be considered equal, or even As of any case. | |
698 | .Sp | |
699 | The long answer is that you need to consider character normalization | |
700 | and casing issues: see Unicode::Normalize, Unicode Technical | |
701 | Reports #15 and #21, \fIUnicode Normalization Forms\fR and \fICase | |
702 | Mappings\fR, http://www.unicode.org/unicode/reports/tr15/ and | |
703 | http://www.unicode.org/unicode/reports/tr21/ | |
704 | .Sp | |
705 | As of Perl 5.8.0, the \*(L"Full\*(R" case-folding of \fICase | |
706 | Mappings/SpecialCasing\fR is implemented. | |
707 | .IP "\(bu" 4 | |
708 | String Collation | |
709 | .Sp | |
710 | People like to see their strings nicely sorted\*(--or as Unicode | |
711 | parlance goes, collated. But again, what do you mean by collate? | |
712 | .Sp | |
713 | (Does \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR come before or after | |
714 | \&\f(CW\*(C`LATIN CAPITAL LETTER A WITH GRAVE\*(C'\fR?) | |
715 | .Sp | |
716 | The short answer is that by default, Perl compares strings (\f(CW\*(C`lt\*(C'\fR, | |
717 | \&\f(CW\*(C`le\*(C'\fR, \f(CW\*(C`cmp\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, \f(CW\*(C`gt\*(C'\fR) based only on the code points of the | |
718 | characters. In the above case, the answer is \*(L"after\*(R", since | |
719 | \&\f(CW0x00C1\fR > \f(CW0x00C0\fR. | |
720 | .Sp | |
721 | The long answer is that \*(L"it depends\*(R", and a good answer cannot be | |
722 | given without knowing (at the very least) the language context. | |
723 | See Unicode::Collate, and \fIUnicode Collation Algorithm\fR | |
724 | http://www.unicode.org/unicode/reports/tr10/ | |
725 | .Sh "Miscellaneous" | |
726 | .IX Subsection "Miscellaneous" | |
727 | .IP "\(bu" 4 | |
728 | Character Ranges and Classes | |
729 | .Sp | |
730 | Character ranges in regular expression character classes (\f(CW\*(C`/[a\-z]/\*(C'\fR) | |
731 | and in the \f(CW\*(C`tr///\*(C'\fR (also known as \f(CW\*(C`y///\*(C'\fR) operator are not magically | |
732 | Unicode\-aware. What this means is that \f(CW\*(C`[A\-Za\-z]\*(C'\fR will not magically start | |
733 | to mean \*(L"all alphabetic letters\*(R"; not that it does mean that even for | |
734 | 8\-bit characters, you should be using \f(CW\*(C`/[[:alpha:]]/\*(C'\fR in that case. | |
735 | .Sp | |
736 | For specifying character classes like that in regular expressions, | |
737 | you can use the various Unicode properties\*(--\f(CW\*(C`\epL\*(C'\fR, or perhaps | |
738 | \&\f(CW\*(C`\ep{Alphabetic}\*(C'\fR, in this particular case. You can use Unicode | |
739 | code points as the end points of character ranges, but there is no | |
740 | magic associated with specifying a certain range. For further | |
741 | information\*(--there are dozens of Unicode character classes\*(--see | |
742 | perlunicode. | |
743 | .IP "\(bu" 4 | |
744 | String-To-Number Conversions | |
745 | .Sp | |
746 | Unicode does define several other decimal\*(--and numeric\*(--characters | |
747 | besides the familiar 0 to 9, such as the Arabic and Indic digits. | |
748 | Perl does not support string-to-number conversion for digits other | |
749 | than \s-1ASCII\s0 0 to 9 (and \s-1ASCII\s0 a to f for hexadecimal). | |
750 | .Sh "Questions With Answers" | |
751 | .IX Subsection "Questions With Answers" | |
752 | .IP "\(bu" 4 | |
753 | Will My Old Scripts Break? | |
754 | .Sp | |
755 | Very probably not. Unless you are generating Unicode characters | |
756 | somehow, old behaviour should be preserved. About the only behaviour | |
757 | that has changed and which could start generating Unicode is the old | |
758 | behaviour of \f(CW\*(C`chr()\*(C'\fR where supplying an argument more than 255 | |
759 | produced a character modulo 255. \f(CW\*(C`chr(300)\*(C'\fR, for example, was equal | |
760 | to \f(CW\*(C`chr(45)\*(C'\fR or \*(L"\-\*(R" (in \s-1ASCII\s0), now it is \s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 I \s-1WITH\s0 | |
761 | \&\s-1BREVE\s0. | |
762 | .IP "\(bu" 4 | |
763 | How Do I Make My Scripts Work With Unicode? | |
764 | .Sp | |
765 | Very little work should be needed since nothing changes until you | |
766 | generate Unicode data. The most important thing is getting input as | |
767 | Unicode; for that, see the earlier I/O discussion. | |
768 | .IP "\(bu" 4 | |
769 | How Do I Know Whether My String Is In Unicode? | |
770 | .Sp | |
771 | You shouldn't care. No, you really shouldn't. No, really. If you | |
772 | have to care\*(--beyond the cases described above\*(--it means that we | |
773 | didn't get the transparency of Unicode quite right. | |
774 | .Sp | |
775 | Okay, if you insist: | |
776 | .Sp | |
777 | .Vb 1 | |
778 | \& print utf8::is_utf8($string) ? 1 : 0, "\en"; | |
779 | .Ve | |
780 | .Sp | |
781 | But note that this doesn't mean that any of the characters in the | |
782 | string are necessarily \s-1UTF\-8\s0 encoded, or that any of the characters have | |
783 | code points greater than 0xFF (255) or even 0x80 (128), or that the | |
784 | string has any characters at all. All the \f(CW\*(C`is_utf8()\*(C'\fR does is to | |
785 | return the value of the internal \*(L"utf8ness\*(R" flag attached to the | |
786 | \&\f(CW$string\fR. If the flag is off, the bytes in the scalar are interpreted | |
787 | as a single byte encoding. If the flag is on, the bytes in the scalar | |
788 | are interpreted as the (multi\-byte, variable\-length) \s-1UTF\-8\s0 encoded code | |
789 | points of the characters. Bytes added to a \s-1UTF\-8\s0 encoded string are | |
790 | automatically upgraded to \s-1UTF\-8\s0. If mixed non\-UTF\-8 and \s-1UTF\-8\s0 scalars | |
791 | are merged (double\-quoted interpolation, explicit concatenation, and | |
792 | printf/sprintf parameter substitution), the result will be \s-1UTF\-8\s0 encoded | |
793 | as if copies of the byte strings were upgraded to \s-1UTF\-8:\s0 for example, | |
794 | .Sp | |
795 | .Vb 3 | |
796 | \& $a = "ab\ex80c"; | |
797 | \& $b = "\ex{100}"; | |
798 | \& print "$a = $b\en"; | |
799 | .Ve | |
800 | .Sp | |
801 | the output string will be UTF\-8\-encoded \f(CW\*(C`ab\ex80c = \ex{100}\en\*(C'\fR, but | |
802 | \&\f(CW$a\fR will stay byte\-encoded. | |
803 | .Sp | |
804 | Sometimes you might really need to know the byte length of a string | |
805 | instead of the character length. For that use either the | |
806 | \&\f(CW\*(C`Encode::encode_utf8()\*(C'\fR function or the \f(CW\*(C`bytes\*(C'\fR pragma and its only | |
807 | defined function \f(CW\*(C`length()\*(C'\fR: | |
808 | .Sp | |
809 | .Vb 7 | |
810 | \& my $unicode = chr(0x100); | |
811 | \& print length($unicode), "\en"; # will print 1 | |
812 | \& require Encode; | |
813 | \& print length(Encode::encode_utf8($unicode)), "\en"; # will print 2 | |
814 | \& use bytes; | |
815 | \& print length($unicode), "\en"; # will also print 2 | |
816 | \& # (the 0xC4 0x80 of the UTF-8) | |
817 | .Ve | |
818 | .IP "\(bu" 4 | |
819 | How Do I Detect Data That's Not Valid In a Particular Encoding? | |
820 | .Sp | |
821 | Use the \f(CW\*(C`Encode\*(C'\fR package to try converting it. | |
822 | For example, | |
823 | .Sp | |
824 | .Vb 6 | |
825 | \& use Encode 'decode_utf8'; | |
826 | \& if (decode_utf8($string_of_bytes_that_I_think_is_utf8)) { | |
827 | \& # valid | |
828 | \& } else { | |
829 | \& # invalid | |
830 | \& } | |
831 | .Ve | |
832 | .Sp | |
833 | For \s-1UTF\-8\s0 only, you can use: | |
834 | .Sp | |
835 | .Vb 2 | |
836 | \& use warnings; | |
837 | \& @chars = unpack("U0U*", $string_of_bytes_that_I_think_is_utf8); | |
838 | .Ve | |
839 | .Sp | |
840 | If invalid, a \f(CW\*(C`Malformed UTF\-8 character (byte 0x##) in unpack\*(C'\fR | |
841 | warning is produced. The \*(L"U0\*(R" means \*(L"expect strictly \s-1UTF\-8\s0 encoded | |
842 | Unicode\*(R". Without that the \f(CW\*(C`unpack("U*", ...)\*(C'\fR would accept also | |
843 | data like \f(CW\*(C`chr(0xFF)\*(C'\fR, similarly to the \f(CW\*(C`pack\*(C'\fR as we saw earlier. | |
844 | .IP "\(bu" 4 | |
845 | How Do I Convert Binary Data Into a Particular Encoding, Or Vice Versa? | |
846 | .Sp | |
847 | This probably isn't as useful as you might think. | |
848 | Normally, you shouldn't need to. | |
849 | .Sp | |
850 | In one sense, what you are asking doesn't make much sense: encodings | |
851 | are for characters, and binary data are not \*(L"characters\*(R", so converting | |
852 | \&\*(L"data\*(R" into some encoding isn't meaningful unless you know what | |
853 | character set and encoding the binary data is in, in which case it's | |
854 | not just binary data, now is it? | |
855 | .Sp | |
856 | If you have a raw sequence of bytes that you know should be | |
857 | interpreted via a particular encoding, you can use \f(CW\*(C`Encode\*(C'\fR: | |
858 | .Sp | |
859 | .Vb 2 | |
860 | \& use Encode 'from_to'; | |
861 | \& from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8 | |
862 | .Ve | |
863 | .Sp | |
864 | The call to \f(CW\*(C`from_to()\*(C'\fR changes the bytes in \f(CW$data\fR, but nothing | |
865 | material about the nature of the string has changed as far as Perl is | |
866 | concerned. Both before and after the call, the string \f(CW$data\fR | |
867 | contains just a bunch of 8\-bit bytes. As far as Perl is concerned, | |
868 | the encoding of the string remains as \*(L"system\-native 8\-bit bytes\*(R". | |
869 | .Sp | |
870 | You might relate this to a fictional 'Translate' module: | |
871 | .Sp | |
872 | .Vb 4 | |
873 | \& use Translate; | |
874 | \& my $phrase = "Yes"; | |
875 | \& Translate::from_to($phrase, 'english', 'deutsch'); | |
876 | \& ## phrase now contains "Ja" | |
877 | .Ve | |
878 | .Sp | |
879 | The contents of the string change, but not the nature of the string. | |
880 | Perl doesn't know any more after the call than before that the | |
881 | contents of the string indicate the affirmative. | |
882 | .Sp | |
883 | Back to converting data. If you have (or want) data in your system's | |
884 | native 8\-bit encoding (e.g. Latin\-1, \s-1EBCDIC\s0, etc.), you can use | |
885 | pack/unpack to convert to/from Unicode. | |
886 | .Sp | |
887 | .Vb 2 | |
888 | \& $native_string = pack("C*", unpack("U*", $Unicode_string)); | |
889 | \& $Unicode_string = pack("U*", unpack("C*", $native_string)); | |
890 | .Ve | |
891 | .Sp | |
892 | If you have a sequence of bytes you \fBknow\fR is valid \s-1UTF\-8\s0, | |
893 | but Perl doesn't know it yet, you can make Perl a believer, too: | |
894 | .Sp | |
895 | .Vb 2 | |
896 | \& use Encode 'decode_utf8'; | |
897 | \& $Unicode = decode_utf8($bytes); | |
898 | .Ve | |
899 | .Sp | |
900 | You can convert well-formed \s-1UTF\-8\s0 to a sequence of bytes, but if | |
901 | you just want to convert random binary data into \s-1UTF\-8\s0, you can't. | |
902 | \&\fBNot every random collection of bytes is well-formed \s-1UTF\-8\s0\fR. You can | |
903 | use \f(CW\*(C`unpack("C*", $string)\*(C'\fR for the former, and you can create | |
904 | well-formed Unicode data by \f(CW\*(C`pack("U*", 0xff, ...)\*(C'\fR. | |
905 | .IP "\(bu" 4 | |
906 | How Do I Display Unicode? How Do I Input Unicode? | |
907 | .Sp | |
908 | See http://www.alanwood.net/unicode/ and | |
909 | http://www.cl.cam.ac.uk/~mgk25/unicode.html | |
910 | .IP "\(bu" 4 | |
911 | How Does Unicode Work With Traditional Locales? | |
912 | .Sp | |
913 | In Perl, not very well. Avoid using locales through the \f(CW\*(C`locale\*(C'\fR | |
914 | pragma. Use only one or the other. But see perlrun for the | |
915 | description of the \f(CW\*(C`\-C\*(C'\fR switch and its environment counterpart, | |
916 | \&\f(CW$ENV{PERL_UNICODE}\fR to see how to enable various Unicode features, | |
917 | for example by using locale settings. | |
918 | .Sh "Hexadecimal Notation" | |
919 | .IX Subsection "Hexadecimal Notation" | |
920 | The Unicode standard prefers using hexadecimal notation because | |
921 | that more clearly shows the division of Unicode into blocks of 256 characters. | |
922 | Hexadecimal is also simply shorter than decimal. You can use decimal | |
923 | notation, too, but learning to use hexadecimal just makes life easier | |
924 | with the Unicode standard. The \f(CW\*(C`U+HHHH\*(C'\fR notation uses hexadecimal, | |
925 | for example. | |
926 | .PP | |
927 | The \f(CW\*(C`0x\*(C'\fR prefix means a hexadecimal number, the digits are 0\-9 \fIand\fR | |
928 | a\-f (or A\-F, case doesn't matter). Each hexadecimal digit represents | |
929 | four bits, or half a byte. \f(CW\*(C`print 0x..., "\en"\*(C'\fR will show a | |
930 | hexadecimal number in decimal, and \f(CW\*(C`printf "%x\en", $decimal\*(C'\fR will | |
931 | show a decimal number in hexadecimal. If you have just the | |
932 | \&\*(L"hex digits\*(R" of a hexadecimal number, you can use the \f(CW\*(C`hex()\*(C'\fR function. | |
933 | .PP | |
934 | .Vb 6 | |
935 | \& print 0x0009, "\en"; # 9 | |
936 | \& print 0x000a, "\en"; # 10 | |
937 | \& print 0x000f, "\en"; # 15 | |
938 | \& print 0x0010, "\en"; # 16 | |
939 | \& print 0x0011, "\en"; # 17 | |
940 | \& print 0x0100, "\en"; # 256 | |
941 | .Ve | |
942 | .PP | |
943 | .Vb 1 | |
944 | \& print 0x0041, "\en"; # 65 | |
945 | .Ve | |
946 | .PP | |
947 | .Vb 2 | |
948 | \& printf "%x\en", 65; # 41 | |
949 | \& printf "%#x\en", 65; # 0x41 | |
950 | .Ve | |
951 | .PP | |
952 | .Vb 1 | |
953 | \& print hex("41"), "\en"; # 65 | |
954 | .Ve | |
955 | .Sh "Further Resources" | |
956 | .IX Subsection "Further Resources" | |
957 | .IP "\(bu" 4 | |
958 | Unicode Consortium | |
959 | .Sp | |
960 | .Vb 1 | |
961 | \& http://www.unicode.org/ | |
962 | .Ve | |
963 | .IP "\(bu" 4 | |
964 | Unicode \s-1FAQ\s0 | |
965 | .Sp | |
966 | .Vb 1 | |
967 | \& http://www.unicode.org/unicode/faq/ | |
968 | .Ve | |
969 | .IP "\(bu" 4 | |
970 | Unicode Glossary | |
971 | .Sp | |
972 | .Vb 1 | |
973 | \& http://www.unicode.org/glossary/ | |
974 | .Ve | |
975 | .IP "\(bu" 4 | |
976 | Unicode Useful Resources | |
977 | .Sp | |
978 | .Vb 1 | |
979 | \& http://www.unicode.org/unicode/onlinedat/resources.html | |
980 | .Ve | |
981 | .IP "\(bu" 4 | |
982 | Unicode and Multilingual Support in \s-1HTML\s0, Fonts, Web Browsers and Other Applications | |
983 | .Sp | |
984 | .Vb 1 | |
985 | \& http://www.alanwood.net/unicode/ | |
986 | .Ve | |
987 | .IP "\(bu" 4 | |
988 | \&\s-1UTF\-8\s0 and Unicode \s-1FAQ\s0 for Unix/Linux | |
989 | .Sp | |
990 | .Vb 1 | |
991 | \& http://www.cl.cam.ac.uk/~mgk25/unicode.html | |
992 | .Ve | |
993 | .IP "\(bu" 4 | |
994 | Legacy Character Sets | |
995 | .Sp | |
996 | .Vb 2 | |
997 | \& http://www.czyborra.com/ | |
998 | \& http://www.eki.ee/letter/ | |
999 | .Ve | |
1000 | .IP "\(bu" 4 | |
1001 | The Unicode support files live within the Perl installation in the | |
1002 | directory | |
1003 | .Sp | |
1004 | .Vb 1 | |
1005 | \& $Config{installprivlib}/unicore | |
1006 | .Ve | |
1007 | .Sp | |
1008 | in Perl 5.8.0 or newer, and | |
1009 | .Sp | |
1010 | .Vb 1 | |
1011 | \& $Config{installprivlib}/unicode | |
1012 | .Ve | |
1013 | .Sp | |
1014 | in the Perl 5.6 series. (The renaming to \fIlib/unicore\fR was done to | |
1015 | avoid naming conflicts with lib/Unicode in case-insensitive filesystems.) | |
1016 | The main Unicode data file is \fIUnicodeData.txt\fR (or \fIUnicode.301\fR in | |
1017 | Perl 5.6.1.) You can find the \f(CW$Config{installprivlib}\fR by | |
1018 | .Sp | |
1019 | .Vb 1 | |
1020 | \& perl "-V:installprivlib" | |
1021 | .Ve | |
1022 | .Sp | |
1023 | You can explore various information from the Unicode data files using | |
1024 | the \f(CW\*(C`Unicode::UCD\*(C'\fR module. | |
1025 | .SH "UNICODE IN OLDER PERLS" | |
1026 | .IX Header "UNICODE IN OLDER PERLS" | |
1027 | If you cannot upgrade your Perl to 5.8.0 or later, you can still | |
1028 | do some Unicode processing by using the modules \f(CW\*(C`Unicode::String\*(C'\fR, | |
1029 | \&\f(CW\*(C`Unicode::Map8\*(C'\fR, and \f(CW\*(C`Unicode::Map\*(C'\fR, available from \s-1CPAN\s0. | |
1030 | If you have the \s-1GNU\s0 recode installed, you can also use the | |
1031 | Perl front-end \f(CW\*(C`Convert::Recode\*(C'\fR for character conversions. | |
1032 | .PP | |
1033 | The following are fast conversions from \s-1ISO\s0 8859\-1 (Latin\-1) bytes | |
1034 | to \s-1UTF\-8\s0 bytes and back; the code works even with older Perl 5 versions. | |
1035 | .PP | |
1036 | .Vb 2 | |
1037 | \& # ISO 8859-1 to UTF-8 | |
1038 | \& s/([\ex80-\exFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg; | |
1039 | .Ve | |
1040 | .PP | |
1041 | .Vb 2 | |
1042 | \& # UTF-8 to ISO 8859-1 | |
1043 | \& s/([\exC2\exC3])([\ex80-\exBF])/chr(ord($1)<<6&0xC0|ord($2)&0x3F)/eg; | |
1044 | .Ve | |
1045 | .SH "SEE ALSO" | |
1046 | .IX Header "SEE ALSO" | |
1047 | perlunicode, Encode, encoding, open, utf8, bytes, | |
1048 | perlretut, perlrun, Unicode::Collate, Unicode::Normalize, | |
1049 | Unicode::UCD | |
1050 | .SH "ACKNOWLEDGMENTS" | |
1051 | .IX Header "ACKNOWLEDGMENTS" | |
1052 | Thanks to the kind readers of the perl5\-porters@perl.org, | |
1053 | perl\-unicode@perl.org, linux\-utf8@nl.linux.org, and unicore@unicode.org | |
1054 | mailing lists for their valuable feedback. | |
1055 | .SH "AUTHOR, COPYRIGHT, AND LICENSE" | |
1056 | .IX Header "AUTHOR, COPYRIGHT, AND LICENSE" | |
1057 | Copyright 2001\-2002 Jarkko Hietaniemi <jhi@iki.fi> | |
1058 | .PP | |
1059 | This document may be distributed under the same terms as Perl itself. |