Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v9 / man / man1 / perluniintro.1
CommitLineData
920dae64
AT
1.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "PERLUNIINTRO 1"
132.TH PERLUNIINTRO 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide"
133.SH "NAME"
134perluniintro \- Perl Unicode introduction
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137This document gives a general idea of Unicode and how to use Unicode
138in Perl.
139.Sh "Unicode"
140.IX Subsection "Unicode"
141Unicode is a character set standard which plans to codify all of the
142writing systems of the world, plus many other symbols.
143.PP
144Unicode and \s-1ISO/IEC\s0 10646 are coordinated standards that provide code
145points for characters in almost all modern character set standards,
146covering more than 30 writing systems and hundreds of languages,
147including all commercially-important modern languages. All characters
148in the largest Chinese, Japanese, and Korean dictionaries are also
149encoded. The standards will eventually cover almost all characters in
150more than 250 writing systems and thousands of languages.
151Unicode 1.0 was released in October 1991, and 4.0 in April 2003.
152.PP
153A Unicode \fIcharacter\fR is an abstract entity. It is not bound to any
154particular integer width, especially not to the C language \f(CW\*(C`char\*(C'\fR.
155Unicode is language-neutral and display\-neutral: it does not encode the
156language of the text and it does not define fonts or other graphical
157layout details. Unicode operates on characters and on text built from
158those characters.
159.PP
160Unicode defines characters like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR or \f(CW\*(C`GREEK
161SMALL LETTER ALPHA\*(C'\fR and unique numbers for the characters, in this
162case 0x0041 and 0x03B1, respectively. These unique numbers are called
163\&\fIcode points\fR.
164.PP
165The Unicode standard prefers using hexadecimal notation for the code
166points. If numbers like \f(CW0x0041\fR are unfamiliar to you, take a peek
167at a later section, \*(L"Hexadecimal Notation\*(R". The Unicode standard
168uses the notation \f(CW\*(C`U+0041 LATIN CAPITAL LETTER A\*(C'\fR, to give the
169hexadecimal code point and the normative name of the character.
170.PP
171Unicode also defines various \fIproperties\fR for the characters, like
172\&\*(L"uppercase\*(R" or \*(L"lowercase\*(R", \*(L"decimal digit\*(R", or \*(L"punctuation\*(R";
173these properties are independent of the names of the characters.
174Furthermore, various operations on the characters like uppercasing,
175lowercasing, and collating (sorting) are defined.
176.PP
177A Unicode character consists either of a single code point, or a
178\&\fIbase character\fR (like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR), followed by one or
179more \fImodifiers\fR (like \f(CW\*(C`COMBINING ACUTE ACCENT\*(C'\fR). This sequence of
180base character and modifiers is called a \fIcombining character
181sequence\fR.
182.PP
183Whether to call these combining character sequences \*(L"characters\*(R"
184depends on your point of view. If you are a programmer, you probably
185would tend towards seeing each element in the sequences as one unit,
186or \*(L"character\*(R". The whole sequence could be seen as one \*(L"character\*(R",
187however, from the user's point of view, since that's probably what it
188looks like in the context of the user's language.
189.PP
190With this \*(L"whole sequence\*(R" view of characters, the total number of
191characters is open\-ended. But in the programmer's \*(L"one unit is one
192character\*(R" point of view, the concept of \*(L"characters\*(R" is more
193deterministic. In this document, we take that second point of view:
194one \*(L"character\*(R" is one Unicode code point, be it a base character or
195a combining character.
196.PP
197For some combinations, there are \fIprecomposed\fR characters.
198\&\f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR, for example, is defined as
199a single code point. These precomposed characters are, however,
200only available for some combinations, and are mainly
201meant to support round-trip conversions between Unicode and legacy
202standards (like the \s-1ISO\s0 8859). In the general case, the composing
203method is more extensible. To support conversion between
204different compositions of the characters, various \fInormalization
205forms\fR to standardize representations are also defined.
206.PP
207Because of backward compatibility with legacy encodings, the \*(L"a unique
208number for every character\*(R" idea breaks down a bit: instead, there is
209\&\*(L"at least one number for every character\*(R". The same character could
210be represented differently in several legacy encodings. The
211converse is also not true: some code points do not have an assigned
212character. Firstly, there are unallocated code points within
213otherwise used blocks. Secondly, there are special Unicode control
214characters that do not represent true characters.
215.PP
216A common myth about Unicode is that it would be \*(L"16\-bit\*(R", that is,
217Unicode is only represented as \f(CW0x10000\fR (or 65536) characters from
218\&\f(CW0x0000\fR to \f(CW0xFFFF\fR. \fBThis is untrue.\fR Since Unicode 2.0 (July
2191996), Unicode has been defined all the way up to 21 bits (\f(CW0x10FFFF\fR),
220and since Unicode 3.1 (March 2001), characters have been defined
221beyond \f(CW0xFFFF\fR. The first \f(CW0x10000\fR characters are called the
222\&\fIPlane 0\fR, or the \fIBasic Multilingual Plane\fR (\s-1BMP\s0). With Unicode
2233.1, 17 (yes, seventeen) planes in all were defined\*(--but they are
224nowhere near full of defined characters, yet.
225.PP
226Another myth is that the 256\-character blocks have something to
227do with languages\*(--that each block would define the characters used
228by a language or a set of languages. \fBThis is also untrue.\fR
229The division into blocks exists, but it is almost completely
230accidental\*(--an artifact of how the characters have been and
231still are allocated. Instead, there is a concept called \fIscripts\fR,
232which is more useful: there is \f(CW\*(C`Latin\*(C'\fR script, \f(CW\*(C`Greek\*(C'\fR script, and
233so on. Scripts usually span varied parts of several blocks.
234For further information see Unicode::UCD.
235.PP
236The Unicode code points are just abstract numbers. To input and
237output these abstract numbers, the numbers must be \fIencoded\fR or
238\&\fIserialised\fR somehow. Unicode defines several \fIcharacter encoding
239forms\fR, of which \fI\s-1UTF\-8\s0\fR is perhaps the most popular. \s-1UTF\-8\s0 is a
240variable length encoding that encodes Unicode characters as 1 to 6
241bytes (only 4 with the currently defined characters). Other encodings
242include \s-1UTF\-16\s0 and \s-1UTF\-32\s0 and their big\- and little-endian variants
243(\s-1UTF\-8\s0 is byte-order independent). The \s-1ISO/IEC\s0 10646 defines the \s-1UCS\-2\s0
244and \s-1UCS\-4\s0 encoding forms.
245.PP
246For more information about encodings\*(--for instance, to learn what
247\&\fIsurrogates\fR and \fIbyte order marks\fR (BOMs) are\*(--see perlunicode.
248.Sh "Perl's Unicode Support"
249.IX Subsection "Perl's Unicode Support"
250Starting from Perl 5.6.0, Perl has had the capacity to handle Unicode
251natively. Perl 5.8.0, however, is the first recommended release for
252serious Unicode work. The maintenance release 5.6.1 fixed many of the
253problems of the initial Unicode implementation, but for example
254regular expressions still do not work with Unicode in 5.6.1.
255.PP
256\&\fBStarting from Perl 5.8.0, the use of \f(CB\*(C`use utf8\*(C'\fB is no longer
257necessary.\fR In earlier releases the \f(CW\*(C`utf8\*(C'\fR pragma was used to declare
258that operations in the current block or file would be Unicode\-aware.
259This model was found to be wrong, or at least clumsy: the \*(L"Unicodeness\*(R"
260is now carried with the data, instead of being attached to the
261operations. Only one case remains where an explicit \f(CW\*(C`use utf8\*(C'\fR is
262needed: if your Perl script itself is encoded in \s-1UTF\-8\s0, you can use
263\&\s-1UTF\-8\s0 in your identifier names, and in string and regular expression
264literals, by saying \f(CW\*(C`use utf8\*(C'\fR. This is not the default because
265scripts with legacy 8\-bit data in them would break. See utf8.
266.Sh "Perl's Unicode Model"
267.IX Subsection "Perl's Unicode Model"
268Perl supports both pre\-5.6 strings of eight-bit native bytes, and
269strings of Unicode characters. The principle is that Perl tries to
270keep its data as eight-bit bytes for as long as possible, but as soon
271as Unicodeness cannot be avoided, the data is transparently upgraded
272to Unicode.
273.PP
274Internally, Perl currently uses either the native eight-bit
275character set of the platform (for example Latin\-1) or
276\&\s-1UTF\-8\s0 to encode Unicode strings. Specifically, if all code points in
277the string are \f(CW0xFF\fR or less, Perl uses the native eight-bit
278character set. Otherwise, it uses \s-1UTF\-8\s0.
279.PP
280A user of Perl does not normally need to know nor care how Perl
281happens to encode its internal strings, but it becomes relevant when
282outputting Unicode strings to a stream without a PerlIO layer \*(-- one with
283the \*(L"default\*(R" encoding. In such a case, the raw bytes used internally
284(the native character set or \s-1UTF\-8\s0, as appropriate for each string)
285will be used, and a \*(L"Wide character\*(R" warning will be issued if those
286strings contain a character beyond 0x00FF.
287.PP
288For example,
289.PP
290.Vb 1
291\& perl -e 'print "\ex{DF}\en", "\ex{0100}\ex{DF}\en"'
292.Ve
293.PP
294produces a fairly useless mixture of native bytes and \s-1UTF\-8\s0, as well
295as a warning:
296.PP
297.Vb 1
298\& Wide character in print at ...
299.Ve
300.PP
301To output \s-1UTF\-8\s0, use the \f(CW\*(C`:utf8\*(C'\fR output layer. Prepending
302.PP
303.Vb 1
304\& binmode(STDOUT, ":utf8");
305.Ve
306.PP
307to this sample program ensures that the output is completely \s-1UTF\-8\s0,
308and removes the program's warning.
309.PP
310You can enable automatic UTF\-8\-ification of your standard file
311handles, default \f(CW\*(C`open()\*(C'\fR layer, and \f(CW@ARGV\fR by using either
312the \f(CW\*(C`\-C\*(C'\fR command line switch or the \f(CW\*(C`PERL_UNICODE\*(C'\fR environment
313variable, see perlrun for the documentation of the \f(CW\*(C`\-C\*(C'\fR switch.
314.PP
315Note that this means that Perl expects other software to work, too:
316if Perl has been led to believe that \s-1STDIN\s0 should be \s-1UTF\-8\s0, but then
317\&\s-1STDIN\s0 coming in from another command is not \s-1UTF\-8\s0, Perl will complain
318about the malformed \s-1UTF\-8\s0.
319.PP
320All features that combine Unicode and I/O also require using the new
321PerlIO feature. Almost all Perl 5.8 platforms do use PerlIO, though:
322you can see whether yours is by running \*(L"perl \-V\*(R" and looking for
323\&\f(CW\*(C`useperlio=define\*(C'\fR.
324.Sh "Unicode and \s-1EBCDIC\s0"
325.IX Subsection "Unicode and EBCDIC"
326Perl 5.8.0 also supports Unicode on \s-1EBCDIC\s0 platforms. There,
327Unicode support is somewhat more complex to implement since
328additional conversions are needed at every step. Some problems
329remain, see perlebcdic for details.
330.PP
331In any case, the Unicode support on \s-1EBCDIC\s0 platforms is better than
332in the 5.6 series, which didn't work much at all for \s-1EBCDIC\s0 platforms.
333On \s-1EBCDIC\s0 platforms, the internal Unicode encoding form is UTF-EBCDIC
334instead of \s-1UTF\-8\s0. The difference is that \s-1UTF\-8\s0 is \*(L"ASCII\-safe\*(R" in
335that \s-1ASCII\s0 characters encode to \s-1UTF\-8\s0 as\-is, while UTF-EBCDIC is
336\&\*(L"EBCDIC\-safe\*(R".
337.Sh "Creating Unicode"
338.IX Subsection "Creating Unicode"
339To create Unicode characters in literals for code points above \f(CW0xFF\fR,
340use the \f(CW\*(C`\ex{...}\*(C'\fR notation in double-quoted strings:
341.PP
342.Vb 1
343\& my $smiley = "\ex{263a}";
344.Ve
345.PP
346Similarly, it can be used in regular expression literals
347.PP
348.Vb 1
349\& $smiley =~ /\ex{263a}/;
350.Ve
351.PP
352At run-time you can use \f(CW\*(C`chr()\*(C'\fR:
353.PP
354.Vb 1
355\& my $hebrew_alef = chr(0x05d0);
356.Ve
357.PP
358See \*(L"Further Resources\*(R" for how to find all these numeric codes.
359.PP
360Naturally, \f(CW\*(C`ord()\*(C'\fR will do the reverse: it turns a character into
361a code point.
362.PP
363Note that \f(CW\*(C`\ex..\*(C'\fR (no \f(CW\*(C`{}\*(C'\fR and only two hexadecimal digits), \f(CW\*(C`\ex{...}\*(C'\fR,
364and \f(CW\*(C`chr(...)\*(C'\fR for arguments less than \f(CW0x100\fR (decimal 256)
365generate an eight-bit character for backward compatibility with older
366Perls. For arguments of \f(CW0x100\fR or more, Unicode characters are
367always produced. If you want to force the production of Unicode
368characters regardless of the numeric value, use \f(CW\*(C`pack("U", ...)\*(C'\fR
369instead of \f(CW\*(C`\ex..\*(C'\fR, \f(CW\*(C`\ex{...}\*(C'\fR, or \f(CW\*(C`chr()\*(C'\fR.
370.PP
371You can also use the \f(CW\*(C`charnames\*(C'\fR pragma to invoke characters
372by name in double-quoted strings:
373.PP
374.Vb 2
375\& use charnames ':full';
376\& my $arabic_alef = "\eN{ARABIC LETTER ALEF}";
377.Ve
378.PP
379And, as mentioned above, you can also \f(CW\*(C`pack()\*(C'\fR numbers into Unicode
380characters:
381.PP
382.Vb 1
383\& my $georgian_an = pack("U", 0x10a0);
384.Ve
385.PP
386Note that both \f(CW\*(C`\ex{...}\*(C'\fR and \f(CW\*(C`\eN{...}\*(C'\fR are compile-time string
387constants: you cannot use variables in them. If you want similar
388run-time functionality, use \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`charnames::vianame()\*(C'\fR.
389.PP
390If you want to force the result to Unicode characters, use the special
391\&\f(CW"U0"\fR prefix. It consumes no arguments but forces the result to be
392in Unicode characters, instead of bytes.
393.PP
394.Vb 1
395\& my $chars = pack("U0C*", 0x80, 0x42);
396.Ve
397.PP
398Likewise, you can force the result to be bytes by using the special
399\&\f(CW"C0"\fR prefix.
400.Sh "Handling Unicode"
401.IX Subsection "Handling Unicode"
402Handling Unicode is for the most part transparent: just use the
403strings as usual. Functions like \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`length()\*(C'\fR, and
404\&\f(CW\*(C`substr()\*(C'\fR will work on the Unicode characters; regular expressions
405will work on the Unicode characters (see perlunicode and perlretut).
406.PP
407Note that Perl considers combining character sequences to be
408separate characters, so for example
409.PP
410.Vb 2
411\& use charnames ':full';
412\& print length("\eN{LATIN CAPITAL LETTER A}\eN{COMBINING ACUTE ACCENT}"), "\en";
413.Ve
414.PP
415will print 2, not 1. The only exception is that regular expressions
416have \f(CW\*(C`\eX\*(C'\fR for matching a combining character sequence.
417.PP
418Life is not quite so transparent, however, when working with legacy
419encodings, I/O, and certain special cases:
420.Sh "Legacy Encodings"
421.IX Subsection "Legacy Encodings"
422When you combine legacy data and Unicode the legacy data needs
423to be upgraded to Unicode. Normally \s-1ISO\s0 8859\-1 (or \s-1EBCDIC\s0, if
424applicable) is assumed. You can override this assumption by
425using the \f(CW\*(C`encoding\*(C'\fR pragma, for example
426.PP
427.Vb 1
428\& use encoding 'latin2'; # ISO 8859-2
429.Ve
430.PP
431in which case literals (string or regular expressions), \f(CW\*(C`chr()\*(C'\fR,
432and \f(CW\*(C`ord()\*(C'\fR in your whole script are assumed to produce Unicode
433characters from \s-1ISO\s0 8859\-2 code points. Note that the matching for
434encoding names is forgiving: instead of \f(CW\*(C`latin2\*(C'\fR you could have
435said \f(CW\*(C`Latin 2\*(C'\fR, or \f(CW\*(C`iso8859\-2\*(C'\fR, or other variations. With just
436.PP
437.Vb 1
438\& use encoding;
439.Ve
440.PP
441the environment variable \f(CW\*(C`PERL_ENCODING\*(C'\fR will be consulted.
442If that variable isn't set, the encoding pragma will fail.
443.PP
444The \f(CW\*(C`Encode\*(C'\fR module knows about many encodings and has interfaces
445for doing conversions between those encodings:
446.PP
447.Vb 2
448\& use Encode 'decode';
449\& $data = decode("iso-8859-3", $data); # convert from legacy to utf-8
450.Ve
451.Sh "Unicode I/O"
452.IX Subsection "Unicode I/O"
453Normally, writing out Unicode data
454.PP
455.Vb 1
456\& print FH $some_string_with_unicode, "\en";
457.Ve
458.PP
459produces raw bytes that Perl happens to use to internally encode the
460Unicode string. Perl's internal encoding depends on the system as
461well as what characters happen to be in the string at the time. If
462any of the characters are at code points \f(CW0x100\fR or above, you will get
463a warning. To ensure that the output is explicitly rendered in the
464encoding you desire\*(--and to avoid the warning\*(--open the stream with
465the desired encoding. Some examples:
466.PP
467.Vb 1
468\& open FH, ">:utf8", "file";
469.Ve
470.PP
471.Vb 3
472\& open FH, ">:encoding(ucs2)", "file";
473\& open FH, ">:encoding(UTF-8)", "file";
474\& open FH, ">:encoding(shift_jis)", "file";
475.Ve
476.PP
477and on already open streams, use \f(CW\*(C`binmode()\*(C'\fR:
478.PP
479.Vb 1
480\& binmode(STDOUT, ":utf8");
481.Ve
482.PP
483.Vb 3
484\& binmode(STDOUT, ":encoding(ucs2)");
485\& binmode(STDOUT, ":encoding(UTF-8)");
486\& binmode(STDOUT, ":encoding(shift_jis)");
487.Ve
488.PP
489The matching of encoding names is loose: case does not matter, and
490many encodings have several aliases. Note that the \f(CW\*(C`:utf8\*(C'\fR layer
491must always be specified exactly like that; it is \fInot\fR subject to
492the loose matching of encoding names.
493.PP
494See PerlIO for the \f(CW\*(C`:utf8\*(C'\fR layer, PerlIO::encoding and
495Encode::PerlIO for the \f(CW\*(C`:encoding()\*(C'\fR layer, and
496Encode::Supported for many encodings supported by the \f(CW\*(C`Encode\*(C'\fR
497module.
498.PP
499Reading in a file that you know happens to be encoded in one of the
500Unicode or legacy encodings does not magically turn the data into
501Unicode in Perl's eyes. To do that, specify the appropriate
502layer when opening files
503.PP
504.Vb 2
505\& open(my $fh,'<:utf8', 'anything');
506\& my $line_of_unicode = <$fh>;
507.Ve
508.PP
509.Vb 2
510\& open(my $fh,'<:encoding(Big5)', 'anything');
511\& my $line_of_unicode = <$fh>;
512.Ve
513.PP
514The I/O layers can also be specified more flexibly with
515the \f(CW\*(C`open\*(C'\fR pragma. See open, or look at the following example.
516.PP
517.Vb 7
518\& use open ':utf8'; # input and output default layer will be UTF-8
519\& open X, ">file";
520\& print X chr(0x100), "\en";
521\& close X;
522\& open Y, "<file";
523\& printf "%#x\en", ord(<Y>); # this should print 0x100
524\& close Y;
525.Ve
526.PP
527With the \f(CW\*(C`open\*(C'\fR pragma you can use the \f(CW\*(C`:locale\*(C'\fR layer
528.PP
529.Vb 9
530\& BEGIN { $ENV{LC_ALL} = $ENV{LANG} = 'ru_RU.KOI8-R' }
531\& # the :locale will probe the locale environment variables like LC_ALL
532\& use open OUT => ':locale'; # russki parusski
533\& open(O, ">koi8");
534\& print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1
535\& close O;
536\& open(I, "<koi8");
537\& printf "%#x\en", ord(<I>), "\en"; # this should print 0xc1
538\& close I;
539.Ve
540.PP
541or you can also use the \f(CW':encoding(...)'\fR layer
542.PP
543.Vb 2
544\& open(my $epic,'<:encoding(iso-8859-7)','iliad.greek');
545\& my $line_of_unicode = <$epic>;
546.Ve
547.PP
548These methods install a transparent filter on the I/O stream that
549converts data from the specified encoding when it is read in from the
550stream. The result is always Unicode.
551.PP
552The open pragma affects all the \f(CW\*(C`open()\*(C'\fR calls after the pragma by
553setting default layers. If you want to affect only certain
554streams, use explicit layers directly in the \f(CW\*(C`open()\*(C'\fR call.
555.PP
556You can switch encodings on an already opened stream by using
557\&\f(CW\*(C`binmode()\*(C'\fR; see \*(L"binmode\*(R" in perlfunc.
558.PP
559The \f(CW\*(C`:locale\*(C'\fR does not currently (as of Perl 5.8.0) work with
560\&\f(CW\*(C`open()\*(C'\fR and \f(CW\*(C`binmode()\*(C'\fR, only with the \f(CW\*(C`open\*(C'\fR pragma. The
561\&\f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding(...)\*(C'\fR methods do work with all of \f(CW\*(C`open()\*(C'\fR,
562\&\f(CW\*(C`binmode()\*(C'\fR, and the \f(CW\*(C`open\*(C'\fR pragma.
563.PP
564Similarly, you may use these I/O layers on output streams to
565automatically convert Unicode to the specified encoding when it is
566written to the stream. For example, the following snippet copies the
567contents of the file \*(L"text.jis\*(R" (encoded as \s-1ISO\-2022\-JP\s0, aka \s-1JIS\s0) to
568the file \*(L"text.utf8\*(R", encoded as \s-1UTF\-8:\s0
569.PP
570.Vb 3
571\& open(my $nihongo, '<:encoding(iso-2022-jp)', 'text.jis');
572\& open(my $unicode, '>:utf8', 'text.utf8');
573\& while (<$nihongo>) { print $unicode $_ }
574.Ve
575.PP
576The naming of encodings, both by the \f(CW\*(C`open()\*(C'\fR and by the \f(CW\*(C`open\*(C'\fR
577pragma, is similar to the \f(CW\*(C`encoding\*(C'\fR pragma in that it allows for
578flexible names: \f(CW\*(C`koi8\-r\*(C'\fR and \f(CW\*(C`KOI8R\*(C'\fR will both be understood.
579.PP
580Common encodings recognized by \s-1ISO\s0, \s-1MIME\s0, \s-1IANA\s0, and various other
581standardisation organisations are recognised; for a more detailed
582list see Encode::Supported.
583.PP
584\&\f(CW\*(C`read()\*(C'\fR reads characters and returns the number of characters.
585\&\f(CW\*(C`seek()\*(C'\fR and \f(CW\*(C`tell()\*(C'\fR operate on byte counts, as do \f(CW\*(C`sysread()\*(C'\fR
586and \f(CW\*(C`sysseek()\*(C'\fR.
587.PP
588Notice that because of the default behaviour of not doing any
589conversion upon input if there is no default layer,
590it is easy to mistakenly write code that keeps on expanding a file
591by repeatedly encoding the data:
592.PP
593.Vb 8
594\& # BAD CODE WARNING
595\& open F, "file";
596\& local $/; ## read in the whole file of 8-bit characters
597\& $t = <F>;
598\& close F;
599\& open F, ">:utf8", "file";
600\& print F $t; ## convert to UTF-8 on output
601\& close F;
602.Ve
603.PP
604If you run this code twice, the contents of the \fIfile\fR will be twice
605\&\s-1UTF\-8\s0 encoded. A \f(CW\*(C`use open ':utf8'\*(C'\fR would have avoided the bug, as would
606also opening the \fIfile\fR for input explicitly as \s-1UTF\-8\s0.
607.PP
608\&\fB\s-1NOTE\s0\fR: the \f(CW\*(C`:utf8\*(C'\fR and \f(CW\*(C`:encoding\*(C'\fR features work only if your
609Perl has been built with the new PerlIO feature (which is the default
610on most systems).
611.Sh "Displaying Unicode As Text"
612.IX Subsection "Displaying Unicode As Text"
613Sometimes you might want to display Perl scalars containing Unicode as
614simple \s-1ASCII\s0 (or \s-1EBCDIC\s0) text. The following subroutine converts
615its argument so that Unicode characters with code points greater than
616255 are displayed as \f(CW\*(C`\ex{...}\*(C'\fR, control characters (like \f(CW\*(C`\en\*(C'\fR) are
617displayed as \f(CW\*(C`\ex..\*(C'\fR, and the rest of the characters as themselves:
618.PP
619.Vb 9
620\& sub nice_string {
621\& join("",
622\& map { $_ > 255 ? # if wide character...
623\& sprintf("\e\ex{%04X}", $_) : # \ex{...}
624\& chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
625\& sprintf("\e\ex%02X", $_) : # \ex..
626\& quotemeta(chr($_)) # else quoted or as themselves
627\& } unpack("U*", $_[0])); # unpack Unicode characters
628\& }
629.Ve
630.PP
631For example,
632.PP
633.Vb 1
634\& nice_string("foo\ex{100}bar\en")
635.Ve
636.PP
637returns the string
638.PP
639.Vb 1
640\& 'foo\ex{0100}bar\ex0A'
641.Ve
642.PP
643which is ready to be printed.
644.Sh "Special Cases"
645.IX Subsection "Special Cases"
646.IP "\(bu" 4
647Bit Complement Operator ~ And \fIvec()\fR
648.Sp
649The bit complement operator \f(CW\*(C`~\*(C'\fR may produce surprising results if
650used on strings containing characters with ordinal values above
651255. In such a case, the results are consistent with the internal
652encoding of the characters, but not with much else. So don't do
653that. Similarly for \f(CW\*(C`vec()\*(C'\fR: you will be operating on the
654internally-encoded bit patterns of the Unicode characters, not on
655the code point values, which is very probably not what you want.
656.IP "\(bu" 4
657Peeking At Perl's Internal Encoding
658.Sp
659Normal users of Perl should never care how Perl encodes any particular
660Unicode string (because the normal ways to get at the contents of a
661string with Unicode\*(--via input and output\*(--should always be via
662explicitly-defined I/O layers). But if you must, there are two
663ways of looking behind the scenes.
664.Sp
665One way of peeking inside the internal encoding of Unicode characters
666is to use \f(CW\*(C`unpack("C*", ...)\*(C'\fR to get the bytes or \f(CW\*(C`unpack("H*", ...)\*(C'\fR
667to display the bytes:
668.Sp
669.Vb 2
670\& # this prints c4 80 for the UTF-8 bytes 0xc4 0x80
671\& print join(" ", unpack("H*", pack("U", 0x100))), "\en";
672.Ve
673.Sp
674Yet another way would be to use the Devel::Peek module:
675.Sp
676.Vb 1
677\& perl -MDevel::Peek -e 'Dump(chr(0x100))'
678.Ve
679.Sp
680That shows the \f(CW\*(C`UTF8\*(C'\fR flag in \s-1FLAGS\s0 and both the \s-1UTF\-8\s0 bytes
681and Unicode characters in \f(CW\*(C`PV\*(C'\fR. See also later in this document
682the discussion about the \f(CW\*(C`utf8::is_utf8()\*(C'\fR function.
683.Sh "Advanced Topics"
684.IX Subsection "Advanced Topics"
685.IP "\(bu" 4
686String Equivalence
687.Sp
688The question of string equivalence turns somewhat complicated
689in Unicode: what do you mean by \*(L"equal\*(R"?
690.Sp
691(Is \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR equal to
692\&\f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR?)
693.Sp
694The short answer is that by default Perl compares equivalence (\f(CW\*(C`eq\*(C'\fR,
695\&\f(CW\*(C`ne\*(C'\fR) based only on code points of the characters. In the above
696case, the answer is no (because 0x00C1 != 0x0041). But sometimes, any
697\&\s-1CAPITAL\s0 \s-1LETTER\s0 As should be considered equal, or even As of any case.
698.Sp
699The long answer is that you need to consider character normalization
700and casing issues: see Unicode::Normalize, Unicode Technical
701Reports #15 and #21, \fIUnicode Normalization Forms\fR and \fICase
702Mappings\fR, http://www.unicode.org/unicode/reports/tr15/ and
703http://www.unicode.org/unicode/reports/tr21/
704.Sp
705As of Perl 5.8.0, the \*(L"Full\*(R" case-folding of \fICase
706Mappings/SpecialCasing\fR is implemented.
707.IP "\(bu" 4
708String Collation
709.Sp
710People like to see their strings nicely sorted\*(--or as Unicode
711parlance goes, collated. But again, what do you mean by collate?
712.Sp
713(Does \f(CW\*(C`LATIN CAPITAL LETTER A WITH ACUTE\*(C'\fR come before or after
714\&\f(CW\*(C`LATIN CAPITAL LETTER A WITH GRAVE\*(C'\fR?)
715.Sp
716The short answer is that by default, Perl compares strings (\f(CW\*(C`lt\*(C'\fR,
717\&\f(CW\*(C`le\*(C'\fR, \f(CW\*(C`cmp\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, \f(CW\*(C`gt\*(C'\fR) based only on the code points of the
718characters. In the above case, the answer is \*(L"after\*(R", since
719\&\f(CW0x00C1\fR > \f(CW0x00C0\fR.
720.Sp
721The long answer is that \*(L"it depends\*(R", and a good answer cannot be
722given without knowing (at the very least) the language context.
723See Unicode::Collate, and \fIUnicode Collation Algorithm\fR
724http://www.unicode.org/unicode/reports/tr10/
725.Sh "Miscellaneous"
726.IX Subsection "Miscellaneous"
727.IP "\(bu" 4
728Character Ranges and Classes
729.Sp
730Character ranges in regular expression character classes (\f(CW\*(C`/[a\-z]/\*(C'\fR)
731and in the \f(CW\*(C`tr///\*(C'\fR (also known as \f(CW\*(C`y///\*(C'\fR) operator are not magically
732Unicode\-aware. What this means is that \f(CW\*(C`[A\-Za\-z]\*(C'\fR will not magically start
733to mean \*(L"all alphabetic letters\*(R"; not that it does mean that even for
7348\-bit characters; you should be using \f(CW\*(C`/[[:alpha:]]/\*(C'\fR in that case.
735.Sp
736For specifying character classes like that in regular expressions,
737you can use the various Unicode properties\*(--\f(CW\*(C`\epL\*(C'\fR, or perhaps
738\&\f(CW\*(C`\ep{Alphabetic}\*(C'\fR, in this particular case. You can use Unicode
739code points as the end points of character ranges, but there is no
740magic associated with specifying a certain range. For further
741information\*(--there are dozens of Unicode character classes\*(--see
742perlunicode.
743.IP "\(bu" 4
744String-To-Number Conversions
745.Sp
746Unicode does define several other decimal\*(--and numeric\*(--characters
747besides the familiar 0 to 9, such as the Arabic and Indic digits.
748Perl does not support string-to-number conversion for digits other
749than \s-1ASCII\s0 0 to 9 (and \s-1ASCII\s0 a to f for hexadecimal).
750.Sh "Questions With Answers"
751.IX Subsection "Questions With Answers"
752.IP "\(bu" 4
753Will My Old Scripts Break?
754.Sp
755Very probably not. Unless you are generating Unicode characters
756somehow, old behaviour should be preserved. About the only behaviour
757that has changed and which could start generating Unicode is the old
758behaviour of \f(CW\*(C`chr()\*(C'\fR where supplying an argument more than 255
759produced a character modulo 255. \f(CW\*(C`chr(300)\*(C'\fR, for example, was equal
760to \f(CW\*(C`chr(45)\*(C'\fR or \*(L"\-\*(R" (in \s-1ASCII\s0), now it is \s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 I \s-1WITH\s0
761\&\s-1BREVE\s0.
762.IP "\(bu" 4
763How Do I Make My Scripts Work With Unicode?
764.Sp
765Very little work should be needed since nothing changes until you
766generate Unicode data. The most important thing is getting input as
767Unicode; for that, see the earlier I/O discussion.
768.IP "\(bu" 4
769How Do I Know Whether My String Is In Unicode?
770.Sp
771You shouldn't care. No, you really shouldn't. No, really. If you
772have to care\*(--beyond the cases described above\*(--it means that we
773didn't get the transparency of Unicode quite right.
774.Sp
775Okay, if you insist:
776.Sp
777.Vb 1
778\& print utf8::is_utf8($string) ? 1 : 0, "\en";
779.Ve
780.Sp
781But note that this doesn't mean that any of the characters in the
782string are necessarily \s-1UTF\-8\s0 encoded, or that any of the characters have
783code points greater than 0xFF (255) or even 0x80 (128), or that the
784string has any characters at all. All the \f(CW\*(C`is_utf8()\*(C'\fR does is to
785return the value of the internal \*(L"utf8ness\*(R" flag attached to the
786\&\f(CW$string\fR. If the flag is off, the bytes in the scalar are interpreted
787as a single byte encoding. If the flag is on, the bytes in the scalar
788are interpreted as the (multi\-byte, variable\-length) \s-1UTF\-8\s0 encoded code
789points of the characters. Bytes added to a \s-1UTF\-8\s0 encoded string are
790automatically upgraded to \s-1UTF\-8\s0. If mixed non\-UTF\-8 and \s-1UTF\-8\s0 scalars
791are merged (double\-quoted interpolation, explicit concatenation, and
792printf/sprintf parameter substitution), the result will be \s-1UTF\-8\s0 encoded
793as if copies of the byte strings were upgraded to \s-1UTF\-8:\s0 for example,
794.Sp
795.Vb 3
796\& $a = "ab\ex80c";
797\& $b = "\ex{100}";
798\& print "$a = $b\en";
799.Ve
800.Sp
801the output string will be UTF\-8\-encoded \f(CW\*(C`ab\ex80c = \ex{100}\en\*(C'\fR, but
802\&\f(CW$a\fR will stay byte\-encoded.
803.Sp
804Sometimes you might really need to know the byte length of a string
805instead of the character length. For that use either the
806\&\f(CW\*(C`Encode::encode_utf8()\*(C'\fR function or the \f(CW\*(C`bytes\*(C'\fR pragma and its only
807defined function \f(CW\*(C`length()\*(C'\fR:
808.Sp
809.Vb 7
810\& my $unicode = chr(0x100);
811\& print length($unicode), "\en"; # will print 1
812\& require Encode;
813\& print length(Encode::encode_utf8($unicode)), "\en"; # will print 2
814\& use bytes;
815\& print length($unicode), "\en"; # will also print 2
816\& # (the 0xC4 0x80 of the UTF-8)
817.Ve
818.IP "\(bu" 4
819How Do I Detect Data That's Not Valid In a Particular Encoding?
820.Sp
821Use the \f(CW\*(C`Encode\*(C'\fR package to try converting it.
822For example,
823.Sp
824.Vb 6
825\& use Encode 'decode_utf8';
826\& if (decode_utf8($string_of_bytes_that_I_think_is_utf8)) {
827\& # valid
828\& } else {
829\& # invalid
830\& }
831.Ve
832.Sp
833For \s-1UTF\-8\s0 only, you can use:
834.Sp
835.Vb 2
836\& use warnings;
837\& @chars = unpack("U0U*", $string_of_bytes_that_I_think_is_utf8);
838.Ve
839.Sp
840If invalid, a \f(CW\*(C`Malformed UTF\-8 character (byte 0x##) in unpack\*(C'\fR
841warning is produced. The \*(L"U0\*(R" means \*(L"expect strictly \s-1UTF\-8\s0 encoded
842Unicode\*(R". Without that, the \f(CW\*(C`unpack("U*", ...)\*(C'\fR would also accept
843data like \f(CW\*(C`chr(0xFF)\*(C'\fR, similarly to the \f(CW\*(C`pack\*(C'\fR as we saw earlier.
844.IP "\(bu" 4
845How Do I Convert Binary Data Into a Particular Encoding, Or Vice Versa?
846.Sp
847This probably isn't as useful as you might think.
848Normally, you shouldn't need to.
849.Sp
850In one sense, what you are asking doesn't make much sense: encodings
851are for characters, and binary data are not \*(L"characters\*(R", so converting
852\&\*(L"data\*(R" into some encoding isn't meaningful unless you know what
853character set and encoding the binary data is in, in which case it's
854not just binary data, now is it?
855.Sp
856If you have a raw sequence of bytes that you know should be
857interpreted via a particular encoding, you can use \f(CW\*(C`Encode\*(C'\fR:
858.Sp
859.Vb 2
860\& use Encode 'from_to';
861\& from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8
862.Ve
863.Sp
864The call to \f(CW\*(C`from_to()\*(C'\fR changes the bytes in \f(CW$data\fR, but nothing
865material about the nature of the string has changed as far as Perl is
866concerned. Both before and after the call, the string \f(CW$data\fR
867contains just a bunch of 8\-bit bytes. As far as Perl is concerned,
868the encoding of the string remains as \*(L"system\-native 8\-bit bytes\*(R".
869.Sp
870You might relate this to a fictional 'Translate' module:
871.Sp
872.Vb 4
873\& use Translate;
874\& my $phrase = "Yes";
875\& Translate::from_to($phrase, 'english', 'deutsch');
876\& ## phrase now contains "Ja"
877.Ve
878.Sp
879The contents of the string change, but not the nature of the string.
880Perl doesn't know any more after the call than before that the
881contents of the string indicate the affirmative.
882.Sp
883Back to converting data. If you have (or want) data in your system's
884native 8\-bit encoding (e.g. Latin\-1, \s-1EBCDIC\s0, etc.), you can use
885pack/unpack to convert to/from Unicode.
886.Sp
887.Vb 2
888\& $native_string = pack("C*", unpack("U*", $Unicode_string));
889\& $Unicode_string = pack("U*", unpack("C*", $native_string));
890.Ve
891.Sp
892If you have a sequence of bytes you \fBknow\fR is valid \s-1UTF\-8\s0,
893but Perl doesn't know it yet, you can make Perl a believer, too:
894.Sp
895.Vb 2
896\& use Encode 'decode_utf8';
897\& $Unicode = decode_utf8($bytes);
898.Ve
899.Sp
900You can convert well-formed \s-1UTF\-8\s0 to a sequence of bytes, but if
901you just want to convert random binary data into \s-1UTF\-8\s0, you can't.
902\&\fBNot every random collection of bytes is well-formed \s-1UTF\-8\s0\fR. You can
903use \f(CW\*(C`unpack("C*", $string)\*(C'\fR for the former, and you can create
904well-formed Unicode data by \f(CW\*(C`pack("U*", 0xff, ...)\*(C'\fR.
905.IP "\(bu" 4
906How Do I Display Unicode? How Do I Input Unicode?
907.Sp
908See http://www.alanwood.net/unicode/ and
909http://www.cl.cam.ac.uk/~mgk25/unicode.html
910.IP "\(bu" 4
911How Does Unicode Work With Traditional Locales?
912.Sp
913In Perl, not very well. Avoid using locales through the \f(CW\*(C`locale\*(C'\fR
914pragma. Use only one or the other. But see perlrun for the
915description of the \f(CW\*(C`\-C\*(C'\fR switch and its environment counterpart,
916\&\f(CW$ENV{PERL_UNICODE}\fR to see how to enable various Unicode features,
917for example by using locale settings.
918.Sh "Hexadecimal Notation"
919.IX Subsection "Hexadecimal Notation"
920The Unicode standard prefers using hexadecimal notation because
921that more clearly shows the division of Unicode into blocks of 256 characters.
922Hexadecimal is also simply shorter than decimal. You can use decimal
923notation, too, but learning to use hexadecimal just makes life easier
924with the Unicode standard. The \f(CW\*(C`U+HHHH\*(C'\fR notation uses hexadecimal,
925for example.
926.PP
927The \f(CW\*(C`0x\*(C'\fR prefix means a hexadecimal number; the digits are 0\-9 \fIand\fR
928a\-f (or A\-F, case doesn't matter). Each hexadecimal digit represents
929four bits, or half a byte. \f(CW\*(C`print 0x..., "\en"\*(C'\fR will show a
930hexadecimal number in decimal, and \f(CW\*(C`printf "%x\en", $decimal\*(C'\fR will
931show a decimal number in hexadecimal. If you have just the
932\&\*(L"hex digits\*(R" of a hexadecimal number, you can use the \f(CW\*(C`hex()\*(C'\fR function.
933.PP
934.Vb 6
935\& print 0x0009, "\en"; # 9
936\& print 0x000a, "\en"; # 10
937\& print 0x000f, "\en"; # 15
938\& print 0x0010, "\en"; # 16
939\& print 0x0011, "\en"; # 17
940\& print 0x0100, "\en"; # 256
941.Ve
942.PP
943.Vb 1
944\& print 0x0041, "\en"; # 65
945.Ve
946.PP
947.Vb 2
948\& printf "%x\en", 65; # 41
949\& printf "%#x\en", 65; # 0x41
950.Ve
951.PP
952.Vb 1
953\& print hex("41"), "\en"; # 65
954.Ve
955.Sh "Further Resources"
956.IX Subsection "Further Resources"
957.IP "\(bu" 4
958Unicode Consortium
959.Sp
960.Vb 1
961\& http://www.unicode.org/
962.Ve
963.IP "\(bu" 4
964Unicode \s-1FAQ\s0
965.Sp
966.Vb 1
967\& http://www.unicode.org/unicode/faq/
968.Ve
969.IP "\(bu" 4
970Unicode Glossary
971.Sp
972.Vb 1
973\& http://www.unicode.org/glossary/
974.Ve
975.IP "\(bu" 4
976Unicode Useful Resources
977.Sp
978.Vb 1
979\& http://www.unicode.org/unicode/onlinedat/resources.html
980.Ve
981.IP "\(bu" 4
982Unicode and Multilingual Support in \s-1HTML\s0, Fonts, Web Browsers and Other Applications
983.Sp
984.Vb 1
985\& http://www.alanwood.net/unicode/
986.Ve
987.IP "\(bu" 4
988\&\s-1UTF\-8\s0 and Unicode \s-1FAQ\s0 for Unix/Linux
989.Sp
990.Vb 1
991\& http://www.cl.cam.ac.uk/~mgk25/unicode.html
992.Ve
993.IP "\(bu" 4
994Legacy Character Sets
995.Sp
996.Vb 2
997\& http://www.czyborra.com/
998\& http://www.eki.ee/letter/
999.Ve
1000.IP "\(bu" 4
1001The Unicode support files live within the Perl installation in the
1002directory
1003.Sp
1004.Vb 1
1005\& $Config{installprivlib}/unicore
1006.Ve
1007.Sp
1008in Perl 5.8.0 or newer, and
1009.Sp
1010.Vb 1
1011\& $Config{installprivlib}/unicode
1012.Ve
1013.Sp
1014in the Perl 5.6 series. (The renaming to \fIlib/unicore\fR was done to
1015avoid naming conflicts with lib/Unicode in case-insensitive filesystems.)
1016The main Unicode data file is \fIUnicodeData.txt\fR (or \fIUnicode.301\fR in
1017Perl 5.6.1). You can find the \f(CW$Config{installprivlib}\fR by
1018.Sp
1019.Vb 1
1020\& perl "-V:installprivlib"
1021.Ve
1022.Sp
1023You can explore various information from the Unicode data files using
1024the \f(CW\*(C`Unicode::UCD\*(C'\fR module.
1025.SH "UNICODE IN OLDER PERLS"
1026.IX Header "UNICODE IN OLDER PERLS"
1027If you cannot upgrade your Perl to 5.8.0 or later, you can still
1028do some Unicode processing by using the modules \f(CW\*(C`Unicode::String\*(C'\fR,
1029\&\f(CW\*(C`Unicode::Map8\*(C'\fR, and \f(CW\*(C`Unicode::Map\*(C'\fR, available from \s-1CPAN\s0.
1030If you have the \s-1GNU\s0 recode installed, you can also use the
1031Perl front-end \f(CW\*(C`Convert::Recode\*(C'\fR for character conversions.
1032.PP
1033The following are fast conversions from \s-1ISO\s0 8859\-1 (Latin\-1) bytes
1034to \s-1UTF\-8\s0 bytes and back; the code works even with older Perl 5 versions.
1035.PP
1036.Vb 2
1037\& # ISO 8859-1 to UTF-8
1038\& s/([\ex80-\exFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg;
1039.Ve
1040.PP
1041.Vb 2
1042\& # UTF-8 to ISO 8859-1
1043\& s/([\exC2\exC3])([\ex80-\exBF])/chr(ord($1)<<6&0xC0|ord($2)&0x3F)/eg;
1044.Ve
1045.SH "SEE ALSO"
1046.IX Header "SEE ALSO"
1047perlunicode, Encode, encoding, open, utf8, bytes,
1048perlretut, perlrun, Unicode::Collate, Unicode::Normalize,
1049Unicode::UCD
1050.SH "ACKNOWLEDGMENTS"
1051.IX Header "ACKNOWLEDGMENTS"
1052Thanks to the kind readers of the perl5\-porters@perl.org,
1053perl\-unicode@perl.org, linux\-utf8@nl.linux.org, and unicore@unicode.org
1054mailing lists for their valuable feedback.
1055.SH "AUTHOR, COPYRIGHT, AND LICENSE"
1056.IX Header "AUTHOR, COPYRIGHT, AND LICENSE"
1057Copyright 2001\-2002 Jarkko Hietaniemi <jhi@iki.fi>
1058.PP
1059This document may be distributed under the same terms as Perl itself.