Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man1 / perluniintro.1
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "PERLUNIINTRO 1"
132.TH PERLUNIINTRO 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134perluniintro \- Perl Unicode introduction
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137This document gives a general idea of Unicode and how to use Unicode
138in Perl.
139.Sh "Unicode"
140.IX Subsection "Unicode"
141Unicode is a character set standard which plans to codify all of the
142writing systems of the world, plus many other symbols.
143.PP
144Unicode and \s-1ISO/IEC\s0 10646 are coordinated standards that provide code
145points for characters in almost all modern character set standards,
146covering more than 30 writing systems and hundreds of languages,
147including all commercially-important modern languages. All characters
148in the largest Chinese, Japanese, and Korean dictionaries are also
149encoded. The standards will eventually cover almost all characters in
150more than 250 writing systems and thousands of languages.
151.PP
152A Unicode \fIcharacter\fR is an abstract entity. It is not bound to any
153particular integer width, especially not to the C language \f(CW\*(C`char\*(C'\fR.
154Unicode is language-neutral and display\-neutral: it does not encode the
155language of the text and it does not define fonts or other graphical
156layout details. Unicode operates on characters and on text built from
157those characters.
158.PP
159Unicode defines characters like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR or \f(CW\*(C`GREEK
160SMALL LETTER ALPHA\*(C'\fR and unique numbers for the characters, in this
161case 0x0041 and 0x03B1, respectively. These unique numbers are called
162\&\fIcode points\fR.
163.PP
164The Unicode standard prefers using hexadecimal notation for the code
165points. If numbers like \f(CW0x0041\fR are unfamiliar to
166you, take a peek at a later section, \*(L"Hexadecimal Notation\*(R".
167The Unicode standard uses the notation \f(CW\*(C`U+0041 LATIN CAPITAL LETTER A\*(C'\fR,
168to give the hexadecimal code point and the normative name of
169the character.
170.PP
171Unicode also defines various \fIproperties\fR for the characters, like
172\&\*(L"uppercase\*(R" or \*(L"lowercase\*(R", \*(L"decimal digit\*(R", or \*(L"punctuation\*(R";
173these properties are independent of the names of the characters.
174Furthermore, various operations on the characters like uppercasing,
175lowercasing, and collating (sorting) are defined.
176.PP
177A Unicode character consists either of a single code point, or a
178\&\fIbase character\fR (like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR), followed by one or
179more \fImodifiers\fR (like \f(CW\*(C`COMBINING ACUTE ACCENT\*(C'\fR). This sequence of
180base character and modifiers is called a \fIcombining character
181sequence\fR.
182.PP
183Whether to call these combining character sequences \*(L"characters\*(R"
184depends on your point of view. If you are a programmer, you probably
185would tend towards seeing each element in the sequences as one unit,
186or \*(L"character\*(R". The whole sequence could be seen as one \*(L"character\*(R",
187however, from the user's point of view, since that's probably what it
188looks like in the context of the user's language.
189.PP
190With this \*(L"whole sequence\*(R" view of characters, the total number of
191characters is open\-ended. But in the programmer's \*(L"one unit is one
192character\*(R" point of view, the concept of \*(L"characters\*(R" is more
193deterministic. In this document, we take that second point of view:
194one \*(L"character\*(R" is one Unicode code point, be it a base character or
195a combining character.
196.PP
197For some combinations, there are \fIprecomposed\fR characters.
198\&\f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR, for example, is defined as
199a single code point. These precomposed characters are, however,
200only available for some combinations, and are mainly
201meant to support round-trip conversions between Unicode and legacy
202standards (like the \s-1ISO\s0 8859). In the general case, the composing
203method is more extensible. To support conversion between
204different compositions of the characters, various \fInormalization
205forms\fR to standardize representations are also defined.
206.PP
207Because of backward compatibility with legacy encodings, the \*(L"a unique
208number for every character\*(R" idea breaks down a bit: instead, there is
209\&\*(L"at least one number for every character\*(R". The same character could
210be represented differently in several legacy encodings. The
211converse is also not true: some code points do not have an assigned
212character. Firstly, there are unallocated code points within
213otherwise used blocks. Secondly, there are special Unicode control
214characters that do not represent true characters.
215.PP
216A common myth about Unicode is that it would be \*(L"16\-bit\*(R", that is,
217Unicode is only represented as \f(CW0x10000\fR (or 65536) characters from
218\&\f(CW0x0000\fR to \f(CW0xFFFF\fR. \fBThis is untrue.\fR Since Unicode 2.0, Unicode
219has been defined all the way up to 21 bits (\f(CW0x10FFFF\fR), and since
220Unicode 3.1, characters have been defined beyond \f(CW0xFFFF\fR. The first
221\&\f(CW0x10000\fR characters are called the \fIPlane 0\fR, or the \fIBasic
222Multilingual Plane\fR (\s-1BMP\s0). With Unicode 3.1, 17 planes in all are
223defined\*(--but nowhere near full of defined characters, yet.
224.PP
225Another myth is that the 256\-character blocks have something to
226do with languages\*(--that each block would define the characters used
227by a language or a set of languages. \fBThis is also untrue.\fR
228The division into blocks exists, but it is almost completely
229accidental\*(--an artifact of how the characters have been and
230still are allocated. Instead, there is a concept called \fIscripts\fR,
231which is more useful: there is \f(CW\*(C`Latin\*(C'\fR script, \f(CW\*(C`Greek\*(C'\fR script, and
232so on. Scripts usually span varied parts of several blocks.
233For further information see Unicode::UCD.
234.PP
235The Unicode code points are just abstract numbers. To input and
236output these abstract numbers, the numbers must be \fIencoded\fR somehow.
237Unicode defines several \fIcharacter encoding forms\fR, of which \fI\s-1UTF\-8\s0\fR
238is perhaps the most popular. \s-1UTF\-8\s0 is a variable length encoding that
239encodes Unicode characters as 1 to 6 bytes (only 4 with the currently
240defined characters). Other encodings include \s-1UTF\-16\s0 and \s-1UTF\-32\s0 and their
241big\- and little-endian variants (\s-1UTF\-8\s0 is byte-order independent).
242The \s-1ISO/IEC\s0 10646 defines the \s-1UCS\-2\s0 and \s-1UCS\-4\s0 encoding forms.
243.PP
244For more information about encodings\*(--for instance, to learn what
245\&\fIsurrogates\fR and \fIbyte order marks\fR (BOMs) are\*(--see perlunicode.
246.Sh "Perl's Unicode Support"
247.IX Subsection "Perl's Unicode Support"
248Starting from Perl 5.6.0, Perl has had the capacity to handle Unicode
249natively. Perl 5.8.0, however, is the first recommended release for
250serious Unicode work. The maintenance release 5.6.1 fixed many of the
251problems of the initial Unicode implementation, but for example
252regular expressions still do not work with Unicode in 5.6.1.
253.PP
254\&\fBStarting from Perl 5.8.0, the use of \f(CB\*(C`use utf8\*(C'\fB is no longer
255necessary.\fR In earlier releases the \f(CW\*(C`utf8\*(C'\fR pragma was used to declare
256that operations in the current block or file would be Unicode\-aware.
257This model was found to be wrong, or at least clumsy: the \*(L"Unicodeness\*(R"
258is now carried with the data, instead of being attached to the
259operations. Only one case remains where an explicit \f(CW\*(C`use utf8\*(C'\fR is
260needed: if your Perl script itself is encoded in \s-1UTF\-8\s0, you can use
261\&\s-1UTF\-8\s0 in your identifier names, and in string and regular expression
262literals, by saying \f(CW\*(C`use utf8\*(C'\fR. This is not the default because
263scripts with legacy 8\-bit data in them would break. See utf8.
264.Sh "Perl's Unicode Model"
265.IX Subsection "Perl's Unicode Model"
266Perl supports both pre\-5.6 strings of eight-bit native bytes, and
267strings of Unicode characters. The principle is that Perl tries to
268keep its data as eight-bit bytes for as long as possible, but as soon
269as Unicodeness cannot be avoided, the data is transparently upgraded
270to Unicode.
271.PP
272Internally, Perl currently uses either the native eight-bit character
273set of the platform (for example Latin\-1) or, when that is not sufficient,
274\&\s-1UTF\-8\s0 to encode Unicode strings. Specifically, if all code points in
275the string are \f(CW0xFF\fR or less, Perl uses the native eight-bit
276character set. Otherwise, it uses \s-1UTF\-8\s0.
277.PP
278A user of Perl does not normally need to know nor care how Perl
279happens to encode its internal strings, but it becomes relevant when
280outputting Unicode strings to a stream without a PerlIO layer \*(-- one with
281the \*(L"default\*(R" encoding. In such a case, the raw bytes used internally
282(the native character set or \s-1UTF\-8\s0, as appropriate for each string)
283will be used, and a \*(L"Wide character\*(R" warning will be issued if those
284strings contain a character beyond 0x00FF.
285.PP
286For example,
287.PP
288.Vb 1
289\& perl -e 'print "\ex{DF}\en", "\ex{0100}\ex{DF}\en"'
290.Ve
291.PP
292produces a fairly useless mixture of native bytes and \s-1UTF\-8\s0, as well
293as a warning:
294.PP
295.Vb 1
296\& Wide character in print at ...
297.Ve
298.PP
299To output \s-1UTF\-8\s0, use the \f(CW\*(C`:utf8\*(C'\fR output layer. Prepending
300.PP
301.Vb 1
302\& binmode(STDOUT, ":utf8");
303.Ve
304.PP
305to this sample program ensures that the output is completely \s-1UTF\-8\s0,
306and removes the program's warning.
307.PP
308If your locale environment variables (\f(CW\*(C`LANGUAGE\*(C'\fR, \f(CW\*(C`LC_ALL\*(C'\fR,
309\&\f(CW\*(C`LC_CTYPE\*(C'\fR, \f(CW\*(C`LANG\*(C'\fR) contain the strings '\s-1UTF\-8\s0' or '\s-1UTF8\s0',
310regardless of case, then the default encoding of your \s-1STDIN\s0, \s-1STDOUT\s0,
311and \s-1STDERR\s0 and of \fBany subsequent file open\fR, is \s-1UTF\-8\s0. Note that
312this means that Perl expects other software to work, too: if Perl has
313been led to believe that \s-1STDIN\s0 should be \s-1UTF\-8\s0, but then \s-1STDIN\s0 coming
314in from another command is not \s-1UTF\-8\s0, Perl will complain about the
315malformed \s-1UTF\-8\s0.
316.PP
317All features that combine Unicode and I/O also require using the new
318PerlIO feature. Almost all Perl 5.8 platforms do use PerlIO, though:
319you can see whether yours is by running \*(L"perl \-V\*(R" and looking for
320\&\f(CW\*(C`useperlio=define\*(C'\fR.
321.Sh "Unicode and \s-1EBCDIC\s0"
322.IX Subsection "Unicode and EBCDIC"
323Perl 5.8.0 also supports Unicode on \s-1EBCDIC\s0 platforms. There,
324Unicode support is somewhat more complex to implement since
325additional conversions are needed at every step. Some problems
326remain, see perlebcdic for details.
327.PP
328In any case, the Unicode support on \s-1EBCDIC\s0 platforms is better than
329in the 5.6 series, which didn't work much at all for \s-1EBCDIC\s0 platforms.
330On \s-1EBCDIC\s0 platforms, the internal Unicode encoding form is UTF-EBCDIC
331instead of \s-1UTF\-8\s0. The difference is that \s-1UTF\-8\s0 is \*(L"ASCII\-safe\*(R" in
332that \s-1ASCII\s0 characters encode to \s-1UTF\-8\s0 as\-is, while UTF-EBCDIC is
333\&\*(L"EBCDIC\-safe\*(R".
334.Sh "Creating Unicode"
335.IX Subsection "Creating Unicode"
336To create Unicode characters in literals for code points above \f(CW0xFF\fR,
337use the \f(CW\*(C`\ex{...}\*(C'\fR notation in double-quoted strings:
338.PP
339.Vb 1
340\& my $smiley = "\ex{263a}";
341.Ve
342.PP
343Similarly, it can be used in regular expression literals
344.PP
345.Vb 1
346\& $smiley =~ /\ex{263a}/;
347.Ve
348.PP
349At run-time you can use \f(CW\*(C`chr()\*(C'\fR:
350.PP
351.Vb 1
352\& my $hebrew_alef = chr(0x05d0);
353.Ve
354.PP
355See \*(L"Further Resources\*(R" for how to find all these numeric codes.
356.PP
357Naturally, \f(CW\*(C`ord()\*(C'\fR will do the reverse: it turns a character into
358a code point.
359.PP
360Note that \f(CW\*(C`\ex..\*(C'\fR (no \f(CW\*(C`{}\*(C'\fR and only two hexadecimal digits), \f(CW\*(C`\ex{...}\*(C'\fR,
361and \f(CW\*(C`chr(...)\*(C'\fR for arguments less than \f(CW0x100\fR (decimal 256)
362generate an eight-bit character for backward compatibility with older
363Perls. For arguments of \f(CW0x100\fR or more, Unicode characters are
364always produced. If you want to force the production of Unicode
365characters regardless of the numeric value, use \f(CW\*(C`pack("U", ...)\*(C'\fR
366instead of \f(CW\*(C`\ex..\*(C'\fR, \f(CW\*(C`\ex{...}\*(C'\fR, or \f(CW\*(C`chr()\*(C'\fR.
367.PP
368You can also use the \f(CW\*(C`charnames\*(C'\fR pragma to invoke characters
369by name in double-quoted strings:
370.PP
371.Vb 2
372\& use charnames ':full';
373\& my $arabic_alef = "\eN{ARABIC LETTER ALEF}";
374.Ve
375.PP
376And, as mentioned above, you can also \f(CW\*(C`pack()\*(C'\fR numbers into Unicode
377characters:
378.PP
379.Vb 1
380\& my $georgian_an = pack("U", 0x10a0);
381.Ve
382.PP
383Note that both \f(CW\*(C`\ex{...}\*(C'\fR and \f(CW\*(C`\eN{...}\*(C'\fR are compile-time string
384constants: you cannot use variables in them. If you want similar
385run-time functionality, use \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`charnames::vianame()\*(C'\fR.
386.PP
387Also note that if all the code points for pack \*(L"U\*(R" are below 0x100,
388bytes will be generated, just like if you were using \f(CW\*(C`chr()\*(C'\fR.
389.PP
390.Vb 1
391\& my $bytes = pack("U*", 0x80, 0xFF);
392.Ve
393.PP
394If you want to force the result to Unicode characters, use the special
395\&\f(CW"U0"\fR prefix. It consumes no arguments but forces the result to be
396in Unicode characters, instead of bytes.
397.PP
398.Vb 1
399\& my $chars = pack("U0U*", 0x80, 0xFF);
400.Ve
401.Sh "Handling Unicode"
402.IX Subsection "Handling Unicode"
403Handling Unicode is for the most part transparent: just use the
404strings as usual. Functions like \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`length()\*(C'\fR, and
405\&\f(CW\*(C`substr()\*(C'\fR will work on the Unicode characters; regular expressions
406will work on the Unicode characters (see perlunicode and perlretut).
407.PP
408Note that Perl considers combining character sequences to be
409separate characters, so for example
410.PP
411.Vb 2
412\& use charnames ':full';
413\& print length("\eN{LATIN CAPITAL LETTER A}\eN{COMBINING ACUTE ACCENT}"), "\en";
414.Ve
415.PP
416will print 2, not 1. The only exception is that regular expressions
417have \f(CW\*(C`\eX\*(C'\fR for matching a combining character sequence.
418.PP
419Life is not quite so transparent, however, when working with legacy
420encodings, I/O, and certain special cases:
421.Sh "Legacy Encodings"
422.IX Subsection "Legacy Encodings"
423When you combine legacy data and Unicode the legacy data needs
424to be upgraded to Unicode. Normally \s-1ISO\s0 8859\-1 (or \s-1EBCDIC\s0, if
425applicable) is assumed. You can override this assumption by
426using the \f(CW\*(C`encoding\*(C'\fR pragma, for example
427.PP
428.Vb 1
429\& use encoding 'latin2'; # ISO 8859-2
430.Ve
431.PP
432in which case literals (string or regular expressions), \f(CW\*(C`chr()\*(C'\fR,
433and \f(CW\*(C`ord()\*(C'\fR in your whole script are assumed to produce Unicode
434characters from \s-1ISO\s0 8859\-2 code points. Note that the matching for
435encoding names is forgiving: instead of \f(CW\*(C`latin2\*(C'\fR you could have
436said \f(CW\*(C`Latin 2\*(C'\fR, or \f(CW\*(C`iso8859\-2\*(C'\fR, or other variations. With just
437.PP
438.Vb 1
439\& use encoding;
440.Ve
441.PP
442the environment variable \f(CW\*(C`PERL_ENCODING\*(C'\fR will be consulted.
443If that variable isn't set, the encoding pragma will fail.
444.PP
445The \f(CW\*(C`Encode\*(C'\fR module knows about many encodings and has interfaces
446for doing conversions between those encodings:
447.PP
448.Vb 2
449\& use Encode 'from_to';
450\& from_to($data, "iso-8859-3", "utf-8"); # from legacy to utf-8
451.Ve
452.Sh "Unicode I/O"
453.IX Subsection "Unicode I/O"
454Normally, writing out Unicode data
455.PP
456.Vb 1
457\& print FH $some_string_with_unicode, "\en";
458.Ve
459.PP
460produces raw bytes that Perl happens to use to internally encode the
461Unicode string. Perl's internal encoding depends on the system as
462well as what characters happen to be in the string at the time. If
463any of the characters are at code points \f(CW0x100\fR or above, you will get
464a warning. To ensure that the output is explicitly rendered in the
465encoding you desire\*(--and to avoid the warning\*(--open the stream with
466the desired encoding. Some examples:
467.PP
468.Vb 1
469\& open FH, ">:utf8", "file";
470.Ve
471.PP
472.Vb 3
473\& open FH, ">:encoding(ucs2)", "file";
474\& open FH, ">:encoding(UTF-8)", "file";
475\& open FH, ">:encoding(shift_jis)", "file";
476.Ve
477.PP
478and on already open streams, use \f(CW\*(C`binmode()\*(C'\fR:
479.PP
480.Vb 1
481\& binmode(STDOUT, ":utf8");
482.Ve
483.PP
484.Vb 3
485\& binmode(STDOUT, ":encoding(ucs2)");
486\& binmode(STDOUT, ":encoding(UTF-8)");
487\& binmode(STDOUT, ":encoding(shift_jis)");
488.Ve
489.PP
490The matching of encoding names is loose: case does not matter, and
491many encodings have several aliases. Note that the \f(CW\*(C`:utf8\*(C'\fR layer
492must always be specified exactly like that; it is \fInot\fR subject to
493the loose matching of encoding names.
494.PP
495See PerlIO for the \f(CW\*(C`:utf8\*(C'\fR layer, PerlIO::encoding and
496Encode::PerlIO for the \f(CW\*(C`:encoding()\*(C'\fR layer, and
497Encode::Supported for many encodings supported by the \f(CW\*(C`Encode\*(C'\fR
498module.
499.PP
500Reading in a file that you know happens to be encoded in one of the
501Unicode or legacy encodings does not magically turn the data into
502Unicode in Perl's eyes. To do that, specify the appropriate
503layer when opening files
504.PP
505.Vb 2
506\& open(my $fh,'<:utf8', 'anything');
507\& my $line_of_unicode = <$fh>;
508.Ve
509.PP
510.Vb 2
511\& open(my $fh,'<:encoding(Big5)', 'anything');
512\& my $line_of_unicode = <$fh>;
513.Ve
514.PP
515The I/O layers can also be specified more flexibly with
516the \f(CW\*(C`open\*(C'\fR pragma. See open, or look at the following example.
517.PP
518.Vb 7
519\& use open ':utf8'; # input and output default layer will be UTF-8
520\& open X, ">file";
521\& print X chr(0x100), "\en";
522\& close X;
523\& open Y, "<file";
524\& printf "%#x\en", ord(<Y>); # this should print 0x100
525\& close Y;
526.Ve
527.PP
528With the \f(CW\*(C`open\*(C'\fR pragma you can use the \f(CW\*(C`:locale\*(C'\fR layer
529.PP
530.Vb 9
531\& $ENV{LC_ALL} = $ENV{LANG} = 'ru_RU.KOI8-R';
532\& # the :locale will probe the locale environment variables like LC_ALL
533\& use open OUT => ':locale'; # russki parusski
534\& open(O, ">koi8");
535\& print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1
536\& close O;
537\& open(I, "<koi8");
538\& printf "%#x\en", ord(<I>), "\en"; # this should print 0xc1
539\& close I;
540.Ve
541.PP
542or you can also use the \f(CW':encoding(...)'\fR layer
543.PP
544.Vb 2
545\& open(my $epic,'<:encoding(iso-8859-7)','iliad.greek');
546\& my $line_of_unicode = <$epic>;
547.Ve
548.PP
549These methods install a transparent filter on the I/O stream that
550converts data from the specified encoding when it is read in from the
551stream. The result is always Unicode.
552.PP
553The open pragma affects all the \f(CW\*(C`open()\*(C'\fR calls after the pragma by
554setting default layers. If you want to affect only certain
555streams, use explicit layers directly in the \f(CW\*(C`open()\*(C'\fR call.
556.PP
557You can switch encodings on an already opened stream by using
558\&\f(CW\*(C`binmode()\*(C'\fR; see \*(L"binmode\*(R" in perlfunc.
559.PP
560The \f(CW\*(C`:locale\*(C'\fR does not currently (as of Perl 5.8.0) work with
561\&\f(CW\*(C`open()\*(C'\fR and \f(CW\*(C`binmode()\*(C'\fR, only with the \f(CW\*(C`open\*(C'\fR pragma. The
562\&\f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding(...)\*(C'\fR methods do work with all of \f(CW\*(C`open()\*(C'\fR,
563\&\f(CW\*(C`binmode()\*(C'\fR, and the \f(CW\*(C`open\*(C'\fR pragma.
564.PP
565Similarly, you may use these I/O layers on output streams to
566automatically convert Unicode to the specified encoding when it is
567written to the stream. For example, the following snippet copies the
568contents of the file \*(L"text.jis\*(R" (encoded as \s-1ISO\-2022\-JP\s0, aka \s-1JIS\s0) to
569the file \*(L"text.utf8\*(R", encoded as \s-1UTF\-8:\s0
570.PP
571.Vb 3
572\& open(my $nihongo, '<:encoding(iso2022-jp)', 'text.jis');
573\& open(my $unicode, '>:utf8', 'text.utf8');
574\& while (<$nihongo>) { print $unicode }
575.Ve
576.PP
577The naming of encodings, both by the \f(CW\*(C`open()\*(C'\fR and by the \f(CW\*(C`open\*(C'\fR
578pragma, is similar to the \f(CW\*(C`encoding\*(C'\fR pragma in that it allows for
579flexible names: \f(CW\*(C`koi8\-r\*(C'\fR and \f(CW\*(C`KOI8R\*(C'\fR will both be understood.
580.PP
581Common encodings recognized by \s-1ISO\s0, \s-1MIME\s0, \s-1IANA\s0, and various other
582standardisation organisations are recognised; for a more detailed
583list see Encode::Supported.
584.PP
585\&\f(CW\*(C`read()\*(C'\fR reads characters and returns the number of characters.
586\&\f(CW\*(C`seek()\*(C'\fR and \f(CW\*(C`tell()\*(C'\fR operate on byte counts, as do \f(CW\*(C`sysread()\*(C'\fR
587and \f(CW\*(C`sysseek()\*(C'\fR.
588.PP
589Notice that because of the default behaviour of not doing any
590conversion upon input if there is no default layer,
591it is easy to mistakenly write code that keeps on expanding a file
592by repeatedly encoding the data:
593.PP
594.Vb 8
595\& # BAD CODE WARNING
596\& open F, "file";
597\& local $/; ## read in the whole file of 8-bit characters
598\& $t = <F>;
599\& close F;
600\& open F, ">:utf8", "file";
601\& print F $t; ## convert to UTF-8 on output
602\& close F;
603.Ve
604.PP
605If you run this code twice, the contents of the \fIfile\fR will be twice
606\&\s-1UTF\-8\s0 encoded. A \f(CW\*(C`use open ':utf8'\*(C'\fR would have avoided the bug, as would
607explicitly opening the \fIfile\fR for input as \s-1UTF\-8\s0 as well.
608.PP
609\&\fB\s-1NOTE\s0\fR: the \f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding\*(C'\fR features work only if your
610Perl has been built with the new PerlIO feature.
611.Sh "Displaying Unicode As Text"
612.IX Subsection "Displaying Unicode As Text"
613Sometimes you might want to display Perl scalars containing Unicode as
614simple \s-1ASCII\s0 (or \s-1EBCDIC\s0) text. The following subroutine converts
615its argument so that Unicode characters with code points greater than
616255 are displayed as \f(CW\*(C`\ex{...}\*(C'\fR, control characters (like \f(CW\*(C`\en\*(C'\fR) are
617displayed as \f(CW\*(C`\ex..\*(C'\fR, and the rest of the characters as themselves:
618.PP
619.Vb 9
620\& sub nice_string {
621\& join("",
622\& map { $_ > 255 ? # if wide character...
623\& sprintf("\e\ex{%04X}", $_) : # \ex{...}
624\& chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
625\& sprintf("\e\ex%02X", $_) : # \ex..
626\& chr($_) # else as themselves
627\& } unpack("U*", $_[0])); # unpack Unicode characters
628\& }
629.Ve
630.PP
631For example,
632.PP
633.Vb 1
634\& nice_string("foo\ex{100}bar\en")
635.Ve
636.PP
637returns:
638.PP
639.Vb 1
640\& "foo\ex{0100}bar\ex0A"
641.Ve
642.Sh "Special Cases"
643.IX Subsection "Special Cases"
644.IP "\(bu" 4
645Bit Complement Operator ~ And \fIvec()\fR
646.Sp
647The bit complement operator \f(CW\*(C`~\*(C'\fR may produce surprising results if
648used on strings containing characters with ordinal values above
649255. In such a case, the results are consistent with the internal
650encoding of the characters, but not with much else. So don't do
651that. Similarly for \f(CW\*(C`vec()\*(C'\fR: you will be operating on the
652internally-encoded bit patterns of the Unicode characters, not on
653the code point values, which is very probably not what you want.
654.IP "\(bu" 4
655Peeking At Perl's Internal Encoding
656.Sp
657Normal users of Perl should never care how Perl encodes any particular
658Unicode string (because the normal ways to get at the contents of a
659string with Unicode\*(--via input and output\*(--should always be via
660explicitly-defined I/O layers). But if you must, there are two
661ways of looking behind the scenes.
662.Sp
663One way of peeking inside the internal encoding of Unicode characters
664is to use \f(CW\*(C`unpack("C*", ...)\*(C'\fR to get the bytes or \f(CW\*(C`unpack("H*", ...)\*(C'\fR
665to display the bytes:
666.Sp
667.Vb 2
668\& # this prints c4 80 for the UTF-8 bytes 0xc4 0x80
669\& print join(" ", unpack("H*", pack("U", 0x100))), "\en";
670.Ve
671.Sp
672Yet another way would be to use the Devel::Peek module:
673.Sp
674.Vb 1
675\& perl -MDevel::Peek -e 'Dump(chr(0x100))'
676.Ve
677.Sp
678That shows the \s-1UTF8\s0 flag in \s-1FLAGS\s0 and both the \s-1UTF\-8\s0 bytes
679and Unicode characters in \f(CW\*(C`PV\*(C'\fR. See also later in this document
680the discussion about the \f(CW\*(C`is_utf8\*(C'\fR function of the \f(CW\*(C`Encode\*(C'\fR module.
681.Sh "Advanced Topics"
682.IX Subsection "Advanced Topics"
683.IP "\(bu" 4
684String Equivalence
685.Sp
686The question of string equivalence turns somewhat complicated
687in Unicode: what do you mean by \*(L"equal\*(R"?
688.Sp
689(Is \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR equal to
690\&\f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR?)
691.Sp
692The short answer is that by default Perl compares equivalence (\f(CW\*(C`eq\*(C'\fR,
693\&\f(CW\*(C`ne\*(C'\fR) based only on code points of the characters. In the above
694case, the answer is no (because 0x00C1 != 0x0041). But sometimes, any
695\&\s-1CAPITAL\s0 \s-1LETTER\s0 As should be considered equal, or even As of any case.
696.Sp
697The long answer is that you need to consider character normalization
698and casing issues: see Unicode::Normalize, Unicode Technical
699Reports #15 and #21, \fIUnicode Normalization Forms\fR and \fICase
700Mappings\fR, http://www.unicode.org/unicode/reports/tr15/ and
701http://www.unicode.org/unicode/reports/tr21/
702.Sp
703As of Perl 5.8.0, the \*(L"Full\*(R" case-folding of \fICase
704Mappings/SpecialCasing\fR is implemented.
705.IP "\(bu" 4
706String Collation
707.Sp
708People like to see their strings nicely sorted\*(--or as Unicode
709parlance goes, collated. But again, what do you mean by collate?
710.Sp
711(Does \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR come before or after
712\&\f(CW\*(C`LATIN CAPITAL LETTER A WITH GRAVE\*(C'\fR?)
713.Sp
714The short answer is that by default, Perl compares strings (\f(CW\*(C`lt\*(C'\fR,
715\&\f(CW\*(C`le\*(C'\fR, \f(CW\*(C`cmp\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, \f(CW\*(C`gt\*(C'\fR) based only on the code points of the
716characters. In the above case, the answer is \*(L"after\*(R", since
717\&\f(CW0x00C1\fR > \f(CW0x00C0\fR.
718.Sp
719The long answer is that \*(L"it depends\*(R", and a good answer cannot be
720given without knowing (at the very least) the language context.
721See Unicode::Collate, and \fIUnicode Collation Algorithm\fR
722http://www.unicode.org/unicode/reports/tr10/
723.Sh "Miscellaneous"
724.IX Subsection "Miscellaneous"
725.IP "\(bu" 4
726Character Ranges and Classes
727.Sp
728Character ranges in regular expression character classes (\f(CW\*(C`/[a\-z]/\*(C'\fR)
729and in the \f(CW\*(C`tr///\*(C'\fR (also known as \f(CW\*(C`y///\*(C'\fR) operator are not magically
730Unicode\-aware. What this means is that \f(CW\*(C`[A\-Za\-z]\*(C'\fR will not magically start
731to mean \*(L"all alphabetic letters\*(R"; not that it does mean that even for
7328\-bit characters, you should be using \f(CW\*(C`/[[:alpha:]]/\*(C'\fR in that case.
733.Sp
734For specifying character classes like that in regular expressions,
735you can use the various Unicode properties\*(--\f(CW\*(C`\epL\*(C'\fR, or perhaps
736\&\f(CW\*(C`\ep{Alphabetic}\*(C'\fR, in this particular case. You can use Unicode
737code points as the end points of character ranges, but there is no
738magic associated with specifying a certain range. For further
739information\*(--there are dozens of Unicode character classes\*(--see
740perlunicode.
741.IP "\(bu" 4
742String-To-Number Conversions
743.Sp
744Unicode does define several other decimal\*(--and numeric\*(--characters
745besides the familiar 0 to 9, such as the Arabic and Indic digits.
746Perl does not support string-to-number conversion for digits other
747than \s-1ASCII\s0 0 to 9 (and \s-1ASCII\s0 a to f for hexadecimal).
748.Sh "Questions With Answers"
749.IX Subsection "Questions With Answers"
750.IP "\(bu" 4
751Will My Old Scripts Break?
752.Sp
753Very probably not. Unless you are generating Unicode characters
754somehow, old behaviour should be preserved. About the only behaviour
755that has changed and which could start generating Unicode is the old
756behaviour of \f(CW\*(C`chr()\*(C'\fR where supplying an argument more than 255
757produced a character modulo 255. \f(CW\*(C`chr(300)\*(C'\fR, for example, was equal
758to \f(CW\*(C`chr(45)\*(C'\fR or \*(L"\-\*(R" (in \s-1ASCII\s0), now it is \s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 I \s-1WITH\s0
759\&\s-1BREVE\s0.
760.IP "\(bu" 4
761How Do I Make My Scripts Work With Unicode?
762.Sp
763Very little work should be needed since nothing changes until you
764generate Unicode data. The most important thing is getting input as
765Unicode; for that, see the earlier I/O discussion.
766.IP "\(bu" 4
767How Do I Know Whether My String Is In Unicode?
768.Sp
769You shouldn't care. No, you really shouldn't. No, really. If you
770have to care\*(--beyond the cases described above\*(--it means that we
771didn't get the transparency of Unicode quite right.
772.Sp
773Okay, if you insist:
774.Sp
775.Vb 2
776\& use Encode 'is_utf8';
777\& print is_utf8($string) ? 1 : 0, "\en";
778.Ve
779.Sp
780But note that this doesn't mean that any of the characters in the
781string are necessarily \s-1UTF\-8\s0 encoded, or that any of the characters have
782code points greater than 0xFF (255) or even 0x80 (128), or that the
783string has any characters at all. All that \f(CW\*(C`is_utf8()\*(C'\fR does is
784return the value of the internal \*(L"utf8ness\*(R" flag attached to the
785\&\f(CW$string\fR. If the flag is off, the bytes in the scalar are interpreted
786as a single byte encoding. If the flag is on, the bytes in the scalar
787are interpreted as the (multi\-byte, variable\-length) \s-1UTF\-8\s0 encoded code
788points of the characters. Bytes added to a \s-1UTF\-8\s0 encoded string are
789automatically upgraded to \s-1UTF\-8\s0. If mixed non\-UTF8 and \s-1UTF\-8\s0 scalars
790are merged (double\-quoted interpolation, explicit concatenation, and
791printf/sprintf parameter substitution), the result will be \s-1UTF\-8\s0 encoded
792as if copies of the byte strings were upgraded to \s-1UTF\-8:\s0 for example,
793.Sp
794.Vb 3
795\& $a = "ab\ex80c";
796\& $b = "\ex{100}";
797\& print "$a = $b\en";
798.Ve
799.Sp
800the output string will be UTF\-8\-encoded \f(CW\*(C`ab\ex80c = \ex{100}\en\*(C'\fR, but note
801that \f(CW$a\fR will stay byte\-encoded.
802.Sp
803Sometimes you might really need to know the byte length of a string
804instead of the character length. For that use either the
805\&\f(CW\*(C`Encode::encode_utf8()\*(C'\fR function or the \f(CW\*(C`bytes\*(C'\fR pragma and its only
806defined function \f(CW\*(C`length()\*(C'\fR:
807.Sp
808.Vb 7
809\& my $unicode = chr(0x100);
810\& print length($unicode), "\en"; # will print 1
811\& require Encode;
812\& print length(Encode::encode_utf8($unicode)), "\en"; # will print 2
813\& use bytes;
814\& print length($unicode), "\en"; # will also print 2
815\& # (the 0xC4 0x80 of the UTF-8)
816.Ve
817.IP "\(bu" 4
818How Do I Detect Data That's Not Valid In a Particular Encoding?
819.Sp
820Use the \f(CW\*(C`Encode\*(C'\fR package to try converting it.
821For example,
822.Sp
823.Vb 6
824\& use Encode 'decode_utf8';
825\& if (decode_utf8($string_of_bytes_that_I_think_is_utf8)) {
826\& # valid
827\& } else {
828\& # invalid
829\& }
830.Ve
831.Sp
832For \s-1UTF\-8\s0 only, you can use:
833.Sp
834.Vb 2
835\& use warnings;
836\& @chars = unpack("U0U*", $string_of_bytes_that_I_think_is_utf8);
837.Ve
838.Sp
839If invalid, a \f(CW\*(C`Malformed UTF\-8 character (byte 0x##) in unpack\*(C'\fR
840warning is produced. The \*(L"U0\*(R" means \*(L"expect strictly \s-1UTF\-8\s0 encoded
841Unicode\*(R". Without that the \f(CW\*(C`unpack("U*", ...)\*(C'\fR would accept also
842data like \f(CW\*(C`chr(0xFF)\*(C'\fR, similarly to the \f(CW\*(C`pack\*(C'\fR as we saw earlier.
843.IP "\(bu" 4
844How Do I Convert Binary Data Into a Particular Encoding, Or Vice Versa?
845.Sp
846This probably isn't as useful as you might think.
847Normally, you shouldn't need to.
848.Sp
849In one sense, what you are asking doesn't make much sense: encodings
850are for characters, and binary data are not \*(L"characters\*(R", so converting
851\&\*(L"data\*(R" into some encoding isn't meaningful unless you know what
852character set and encoding the binary data is in, in which case it's
853not just binary data, now is it?
854.Sp
855If you have a raw sequence of bytes that you know should be
856interpreted via a particular encoding, you can use \f(CW\*(C`Encode\*(C'\fR:
857.Sp
858.Vb 2
859\& use Encode 'from_to';
860\& from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8
861.Ve
862.Sp
863The call to \f(CW\*(C`from_to()\*(C'\fR changes the bytes in \f(CW$data\fR, but nothing
864material about the nature of the string has changed as far as Perl is
865concerned. Both before and after the call, the string \f(CW$data\fR
866contains just a bunch of 8\-bit bytes. As far as Perl is concerned,
867the encoding of the string remains as \*(L"system\-native 8\-bit bytes\*(R".
868.Sp
869You might relate this to a fictional 'Translate' module:
870.Sp
871.Vb 4
872\& use Translate;
873\& my $phrase = "Yes";
874\& Translate::from_to($phrase, 'english', 'deutsch');
875\& ## phrase now contains "Ja"
876.Ve
877.Sp
878The contents of the string change, but not the nature of the string.
879Perl doesn't know any more after the call than before that the
880contents of the string indicate the affirmative.
881.Sp
882Back to converting data. If you have (or want) data in your system's
883native 8\-bit encoding (e.g. Latin\-1, \s-1EBCDIC\s0, etc.), you can use
884pack/unpack to convert to/from Unicode.
885.Sp
886.Vb 2
887\& $native_string = pack("C*", unpack("U*", $Unicode_string));
888\& $Unicode_string = pack("U*", unpack("C*", $native_string));
889.Ve
890.Sp
891If you have a sequence of bytes you \fBknow\fR is valid \s-1UTF\-8\s0,
892but Perl doesn't know it yet, you can make Perl a believer, too:
893.Sp
894.Vb 2
895\& use Encode 'decode_utf8';
896\& $Unicode = decode_utf8($bytes);
897.Ve
898.Sp
899You can convert well-formed \s-1UTF\-8\s0 to a sequence of bytes, but if
900you just want to convert random binary data into \s-1UTF\-8\s0, you can't.
901\&\fBNot every random collection of bytes is well-formed \s-1UTF\-8\s0\fR. You can
902use \f(CW\*(C`unpack("C*", $string)\*(C'\fR for the former, and you can create
903well-formed Unicode data by \f(CW\*(C`pack("U*", 0xff, ...)\*(C'\fR.
904.IP "\(bu" 4
905How Do I Display Unicode? How Do I Input Unicode?
906.Sp
907See http://www.alanwood.net/unicode/ and
908http://www.cl.cam.ac.uk/~mgk25/unicode.html
909.IP "\(bu" 4
910How Does Unicode Work With Traditional Locales?
911.Sp
912In Perl, not very well. Avoid using locales through the \f(CW\*(C`locale\*(C'\fR
913pragma. Use either Unicode or locales, but not both.
914.Sh "Hexadecimal Notation"
915.IX Subsection "Hexadecimal Notation"
916The Unicode standard prefers using hexadecimal notation because
917that more clearly shows the division of Unicode into blocks of 256 characters.
918Hexadecimal is also simply shorter than decimal. You can use decimal
919notation, too, but learning to use hexadecimal just makes life easier
920with the Unicode standard. The \f(CW\*(C`U+HHHH\*(C'\fR notation uses hexadecimal,
921for example.
922.PP
923The \f(CW\*(C`0x\*(C'\fR prefix means a hexadecimal number, the digits are 0\-9 \fIand\fR
924a\-f (or A\-F, case doesn't matter). Each hexadecimal digit represents
925four bits, or half a byte. \f(CW\*(C`print 0x..., "\en"\*(C'\fR will show a
926hexadecimal number in decimal, and \f(CW\*(C`printf "%x\en", $decimal\*(C'\fR will
927show a decimal number in hexadecimal. If you have just the
928\&\*(L"hex digits\*(R" of a hexadecimal number, you can use the \f(CW\*(C`hex()\*(C'\fR function.
929.PP
930.Vb 6
931\& print 0x0009, "\en"; # 9
932\& print 0x000a, "\en"; # 10
933\& print 0x000f, "\en"; # 15
934\& print 0x0010, "\en"; # 16
935\& print 0x0011, "\en"; # 17
936\& print 0x0100, "\en"; # 256
937.Ve
938.PP
939.Vb 1
940\& print 0x0041, "\en"; # 65
941.Ve
942.PP
943.Vb 2
944\& printf "%x\en", 65; # 41
945\& printf "%#x\en", 65; # 0x41
946.Ve
947.PP
948.Vb 1
949\& print hex("41"), "\en"; # 65
950.Ve
951.Sh "Further Resources"
952.IX Subsection "Further Resources"
953.IP "\(bu" 4
954Unicode Consortium
955.Sp
956.Vb 1
957\& http://www.unicode.org/
958.Ve
959.IP "\(bu" 4
960Unicode \s-1FAQ\s0
961.Sp
962.Vb 1
963\& http://www.unicode.org/unicode/faq/
964.Ve
965.IP "\(bu" 4
966Unicode Glossary
967.Sp
968.Vb 1
969\& http://www.unicode.org/glossary/
970.Ve
971.IP "\(bu" 4
972Unicode Useful Resources
973.Sp
974.Vb 1
975\& http://www.unicode.org/unicode/onlinedat/resources.html
976.Ve
977.IP "\(bu" 4
978Unicode and Multilingual Support in \s-1HTML\s0, Fonts, Web Browsers and Other Applications
979.Sp
980.Vb 1
981\& http://www.alanwood.net/unicode/
982.Ve
983.IP "\(bu" 4
984\&\s-1UTF\-8\s0 and Unicode \s-1FAQ\s0 for Unix/Linux
985.Sp
986.Vb 1
987\& http://www.cl.cam.ac.uk/~mgk25/unicode.html
988.Ve
989.IP "\(bu" 4
990Legacy Character Sets
991.Sp
992.Vb 2
993\& http://www.czyborra.com/
994\& http://www.eki.ee/letter/
995.Ve
996.IP "\(bu" 4
997The Unicode support files live within the Perl installation in the
998directory
999.Sp
1000.Vb 1
1001\& $Config{installprivlib}/unicore
1002.Ve
1003.Sp
1004in Perl 5.8.0 or newer, and
1005.Sp
1006.Vb 1
1007\& $Config{installprivlib}/unicode
1008.Ve
1009.Sp
1010in the Perl 5.6 series. (The renaming to \fIlib/unicore\fR was done to
1011avoid naming conflicts with lib/Unicode in case-insensitive filesystems.)
1012The main Unicode data file is \fIUnicodeData.txt\fR (or \fIUnicode.301\fR in
1013Perl 5.6.1.) You can find the \f(CW$Config{installprivlib}\fR by
1014.Sp
1015.Vb 1
1016\& perl "-V:installprivlib"
1017.Ve
1018.Sp
1019You can explore various information from the Unicode data files using
1020the \f(CW\*(C`Unicode::UCD\*(C'\fR module.
1021.SH "UNICODE IN OLDER PERLS"
1022.IX Header "UNICODE IN OLDER PERLS"
1023If you cannot upgrade your Perl to 5.8.0 or later, you can still
1024do some Unicode processing by using the modules \f(CW\*(C`Unicode::String\*(C'\fR,
1025\&\f(CW\*(C`Unicode::Map8\*(C'\fR, and \f(CW\*(C`Unicode::Map\*(C'\fR, available from \s-1CPAN\s0.
1026If you have the \s-1GNU\s0 recode installed, you can also use the
1027Perl front-end \f(CW\*(C`Convert::Recode\*(C'\fR for character conversions.
1028.PP
1029The following are fast conversions from \s-1ISO\s0 8859\-1 (Latin\-1) bytes
1030to \s-1UTF\-8\s0 bytes and back; the code works even with older Perl 5 versions.
1031.PP
1032.Vb 2
1033\& # ISO 8859-1 to UTF-8
1034\& s/([\ex80-\exFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg;
1035.Ve
1036.PP
1037.Vb 2
1038\& # UTF-8 to ISO 8859-1
1039\& s/([\exC2\exC3])([\ex80-\exBF])/chr(ord($1)<<6&0xC0|ord($2)&0x3F)/eg;
1040.Ve
1041.SH "SEE ALSO"
1042.IX Header "SEE ALSO"
1043perlunicode, Encode, encoding, open, utf8, bytes,
1044perlretut, Unicode::Collate, Unicode::Normalize, Unicode::UCD
1045.SH "ACKNOWLEDGMENTS"
1046.IX Header "ACKNOWLEDGMENTS"
1047Thanks to the kind readers of the perl5\-porters@perl.org,
1048perl\-unicode@perl.org, linux\-utf8@nl.linux.org, and unicore@unicode.org
1049mailing lists for their valuable feedback.
1050.SH "AUTHOR, COPYRIGHT, AND LICENSE"
1051.IX Header "AUTHOR, COPYRIGHT, AND LICENSE"
1052Copyright 2001\-2002 Jarkko Hietaniemi <jhi@iki.fi>
1053.PP
1054This document may be distributed under the same terms as Perl itself.