Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man1 / perlunicode.1
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "PERLUNICODE 1"
132.TH PERLUNICODE 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134perlunicode \- Unicode support in Perl
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137.Sh "Important Caveats"
138.IX Subsection "Important Caveats"
139Unicode support is an extensive requirement. While Perl does not
140implement the Unicode standard or the accompanying technical reports
141from cover to cover, Perl does support many Unicode features.
142.IP "Input and Output Layers" 4
143.IX Item "Input and Output Layers"
144Perl knows when a filehandle uses Perl's internal Unicode encodings
145(\s-1UTF\-8\s0, or UTF-EBCDIC if in \s-1EBCDIC\s0) if the filehandle is opened with
146the \*(L":utf8\*(R" layer. Other encodings can be converted to Perl's
147encoding on input or from Perl's encoding on output by use of the
148\&\*(L":encoding(...)\*(R" layer. See open.
149.Sp
150To indicate that Perl source itself is using a particular encoding,
151see encoding.
152.IP "Regular Expressions" 4
153.IX Item "Regular Expressions"
154The regular expression compiler produces polymorphic opcodes. That is,
155the pattern adapts to the data and automatically switches to the Unicode
156character scheme when presented with Unicode data\*(--or instead uses
157a traditional byte scheme when presented with byte data.
158.ie n .IP """use utf8"" still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4
159.el .IP "\f(CWuse utf8\fR still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4
160.IX Item "use utf8 still needed to enable UTF-8/UTF-EBCDIC in scripts"
161As a compatibility measure, the \f(CW\*(C`use utf8\*(C'\fR pragma must be explicitly
162included to enable recognition of \s-1UTF\-8\s0 in the Perl scripts themselves
163(in string or regular expression literals, or in identifier names) on
164ASCII-based machines or to recognize UTF-EBCDIC on EBCDIC-based
165machines. \fBThese are the only times when an explicit \f(CB\*(C`use utf8\*(C'\fB
166is needed.\fR See utf8.
167.Sp
168You can also use the \f(CW\*(C`encoding\*(C'\fR pragma to change the default encoding
169of the data in your script; see encoding.
170.Sh "Byte and Character Semantics"
171.IX Subsection "Byte and Character Semantics"
172Beginning with version 5.6, Perl uses logically-wide characters to
173represent strings internally.
174.PP
175In future, Perl-level operations will be expected to work with
176characters rather than bytes.
177.PP
178However, as an interim compatibility measure, Perl aims to
179provide a safe migration path from byte semantics to character
180semantics for programs. For operations where Perl can unambiguously
181decide that the input data are characters, Perl switches to
182character semantics. For operations where this determination cannot
183be made without additional information from the user, Perl decides in
184favor of compatibility and chooses to use byte semantics.
185.PP
186This behavior preserves compatibility with earlier versions of Perl,
187which allowed byte semantics in Perl operations only if
188none of the program's inputs were marked as being a source of Unicode
189character data. Such data may come from filehandles, from calls to
190external programs, from information provided by the system (such as \f(CW%ENV\fR),
191or from literals and constants in the source text.
192.PP
193On Windows platforms, if the \f(CW\*(C`\-C\*(C'\fR command line switch is used or the
194${^WIDE_SYSTEM_CALLS} global flag is set to \f(CW1\fR, all system calls
195will use the corresponding wide-character APIs. This feature is
196available only on Windows to conform to the \s-1API\s0 standard already
197established for that platform\*(--and there are very few non-Windows
198platforms that have Unicode-aware APIs.
199.PP
200The \f(CW\*(C`bytes\*(C'\fR pragma will always, regardless of platform, force byte
201semantics in a particular lexical scope. See bytes.
202.PP
203The \f(CW\*(C`utf8\*(C'\fR pragma is primarily a compatibility device that enables
204recognition of \s-1UTF\-\s0(8|EBCDIC) in literals encountered by the parser.
205Note that this pragma is only required while Perl defaults to byte
206semantics; when character semantics become the default, this pragma
207may become a no\-op. See utf8.
208.PP
209Unless explicitly stated, Perl operators use character semantics
210for Unicode data and byte semantics for non-Unicode data.
211The decision to use character semantics is made transparently. If
212input data comes from a Unicode source\*(--for example, if a character
213encoding layer is added to a filehandle or a literal Unicode
214string constant appears in a program\*(--character semantics apply.
215Otherwise, byte semantics are in effect. The \f(CW\*(C`bytes\*(C'\fR pragma should
216be used to force byte semantics on Unicode data.
217.PP
218If strings operating under byte semantics and strings with Unicode
219character data are concatenated, the new string will be upgraded to
220\&\fI\s-1ISO\s0 8859\-1 (Latin\-1)\fR, even if the old Unicode string used \s-1EBCDIC\s0.
221This translation is done without regard to the system's native 8\-bit
222encoding, so to change this for systems with non\-Latin\-1 and
223non-EBCDIC native encodings use the \f(CW\*(C`encoding\*(C'\fR pragma. See
224encoding.
225.PP
226Under character semantics, many operations that formerly operated on
227bytes now operate on characters. A character in Perl is
228logically just a number ranging from 0 to 2**31 or so. Larger
229characters may encode into longer sequences of bytes internally, but
230this internal detail is mostly hidden for Perl code.
231See perluniintro for more.
232.Sh "Effects of Character Semantics"
233.IX Subsection "Effects of Character Semantics"
234Character semantics have the following effects:
235.IP "\(bu" 4
236Strings\*(--including hash keys\*(--and regular expression patterns may
237contain characters that have an ordinal value larger than 255.
238.Sp
239If you use a Unicode editor to edit your program, Unicode characters
240may occur directly within the literal strings in one of the various
241Unicode encodings (\s-1UTF\-8\s0, \s-1UTF\-EBCDIC\s0, \s-1UCS\-2\s0, etc.), but will be recognized
242as such and converted to Perl's internal representation only if the
243appropriate encoding is specified.
244.Sp
245Unicode characters can also be added to a string by using the
246\&\f(CW\*(C`\ex{...}\*(C'\fR notation. The Unicode code for the desired character, in
247hexadecimal, should be placed in the braces. For instance, a smiley
248face is \f(CW\*(C`\ex{263A}\*(C'\fR. This encoding scheme only works for characters
249with a code of 0x100 or above.
250.Sp
251Additionally, if you
252.Sp
253.Vb 1
254\& use charnames ':full';
255.Ve
256.Sp
257you can use the \f(CW\*(C`\eN{...}\*(C'\fR notation and put the official Unicode
258character name within the braces, such as \f(CW\*(C`\eN{WHITE SMILING FACE}\*(C'\fR.
259.IP "\(bu" 4
260If an appropriate encoding is specified, identifiers within the
261Perl script may contain Unicode alphanumeric characters, including
262ideographs. Perl does not currently attempt to canonicalize variable
263names.
264.IP "\(bu" 4
265Regular expressions match characters instead of bytes. \*(L".\*(R" matches
266a character instead of a byte. The \f(CW\*(C`\eC\*(C'\fR pattern is provided to force
267a match of a single byte\*(--a \f(CW\*(C`char\*(C'\fR in C, hence \f(CW\*(C`\eC\*(C'\fR.
268.IP "\(bu" 4
269Character classes in regular expressions match characters instead of
270bytes and match against the character properties specified in the
271Unicode properties database. \f(CW\*(C`\ew\*(C'\fR can be used to match a Japanese
272ideograph, for instance.
273.IP "\(bu" 4
274Named Unicode properties, scripts, and block ranges may be used like
275character classes via the \f(CW\*(C`\ep{}\*(C'\fR \*(L"matches property\*(R" construct and
276the \f(CW\*(C`\eP{}\*(C'\fR negation, \*(L"doesn't match property\*(R".
277.Sp
278For instance, \f(CW\*(C`\ep{Lu}\*(C'\fR matches any character with the Unicode \*(L"Lu\*(R"
279(Letter, uppercase) property, while \f(CW\*(C`\ep{M}\*(C'\fR matches any character
280with an \*(L"M\*(R" (mark\*(--accents and such) property. Braces are not
281required for single letter properties, so \f(CW\*(C`\ep{M}\*(C'\fR is equivalent to
282\&\f(CW\*(C`\epM\*(C'\fR. Many predefined properties are available, such as
283\&\f(CW\*(C`\ep{Mirrored}\*(C'\fR and \f(CW\*(C`\ep{Tibetan}\*(C'\fR.
284.Sp
285The official Unicode script and block names have spaces and dashes as
286separators, but for convenience you can use dashes, spaces, or
287underbars, and case is unimportant. It is recommended, however, that
288for consistency you use the following naming: the official Unicode
289script, property, or block name (see below for the additional rules
290that apply to block names) with whitespace and dashes removed, and the
291words \*(L"uppercase\-first\-lowercase\-rest\*(R". \f(CW\*(C`Latin\-1 Supplement\*(C'\fR thus
292becomes \f(CW\*(C`Latin1Supplement\*(C'\fR.
293.Sp
294You can also use negation in both \f(CW\*(C`\ep{}\*(C'\fR and \f(CW\*(C`\eP{}\*(C'\fR by introducing a caret
295(^) between the first brace and the property name: \f(CW\*(C`\ep{^Tamil}\*(C'\fR is
296equal to \f(CW\*(C`\eP{Tamil}\*(C'\fR.
297.Sp
298Here are the basic Unicode General Category properties, followed by their
299long form. You can use either; \f(CW\*(C`\ep{Lu}\*(C'\fR and \f(CW\*(C`\ep{UppercaseLetter}\*(C'\fR,
300for instance, are identical.
301.Sp
302.Vb 1
303\& Short Long
304.Ve
305.Sp
306.Vb 6
307\& L Letter
308\& Lu UppercaseLetter
309\& Ll LowercaseLetter
310\& Lt TitlecaseLetter
311\& Lm ModifierLetter
312\& Lo OtherLetter
313.Ve
314.Sp
315.Vb 4
316\& M Mark
317\& Mn NonspacingMark
318\& Mc SpacingMark
319\& Me EnclosingMark
320.Ve
321.Sp
322.Vb 4
323\& N Number
324\& Nd DecimalNumber
325\& Nl LetterNumber
326\& No OtherNumber
327.Ve
328.Sp
329.Vb 10
330\& P Punctuation
331\& Pc ConnectorPunctuation
332\& Pd DashPunctuation
333\& Ps OpenPunctuation
334\& Pe ClosePunctuation
335\& Pi InitialPunctuation
336\& (may behave like Ps or Pe depending on usage)
337\& Pf FinalPunctuation
338\& (may behave like Ps or Pe depending on usage)
339\& Po OtherPunctuation
340.Ve
341.Sp
342.Vb 5
343\& S Symbol
344\& Sm MathSymbol
345\& Sc CurrencySymbol
346\& Sk ModifierSymbol
347\& So OtherSymbol
348.Ve
349.Sp
350.Vb 4
351\& Z Separator
352\& Zs SpaceSeparator
353\& Zl LineSeparator
354\& Zp ParagraphSeparator
355.Ve
356.Sp
357.Vb 6
358\& C Other
359\& Cc Control
360\& Cf Format
361\& Cs Surrogate (not usable)
362\& Co PrivateUse
363\& Cn Unassigned
364.Ve
365.Sp
366Single-letter properties match all characters in any of the
367two-letter sub-properties starting with the same letter.
368\&\f(CW\*(C`L&\*(C'\fR is a special case, which is an alias for \f(CW\*(C`Ll\*(C'\fR, \f(CW\*(C`Lu\*(C'\fR, and \f(CW\*(C`Lt\*(C'\fR.
369.Sp
370Because Perl hides the need for the user to understand the internal
371representation of Unicode characters, there is no need to implement
372the somewhat messy concept of surrogates. \f(CW\*(C`Cs\*(C'\fR is therefore not
373supported.
374.Sp
375Because scripts differ in their directionality\*(--Hebrew is
376written right to left, for example\*(--Unicode supplies these properties:
377.Sp
378.Vb 1
379\& Property Meaning
380.Ve
381.Sp
382.Vb 19
383\& BidiL Left-to-Right
384\& BidiLRE Left-to-Right Embedding
385\& BidiLRO Left-to-Right Override
386\& BidiR Right-to-Left
387\& BidiAL Right-to-Left Arabic
388\& BidiRLE Right-to-Left Embedding
389\& BidiRLO Right-to-Left Override
390\& BidiPDF Pop Directional Format
391\& BidiEN European Number
392\& BidiES European Number Separator
393\& BidiET European Number Terminator
394\& BidiAN Arabic Number
395\& BidiCS Common Number Separator
396\& BidiNSM Non-Spacing Mark
397\& BidiBN Boundary Neutral
398\& BidiB Paragraph Separator
399\& BidiS Segment Separator
400\& BidiWS Whitespace
401\& BidiON Other Neutrals
402.Ve
403.Sp
404For example, \f(CW\*(C`\ep{BidiR}\*(C'\fR matches characters that are normally
405written right to left.
406.Sh "Scripts"
407.IX Subsection "Scripts"
408The script names which can be used by \f(CW\*(C`\ep{...}\*(C'\fR and \f(CW\*(C`\eP{...}\*(C'\fR,
409such as in \f(CW\*(C`\ep{Latin}\*(C'\fR or \f(CW\*(C`\ep{Cyrillic}\*(C'\fR, are as follows:
410.PP
411.Vb 44
412\& Arabic
413\& Armenian
414\& Bengali
415\& Bopomofo
416\& Buhid
417\& CanadianAboriginal
418\& Cherokee
419\& Cyrillic
420\& Deseret
421\& Devanagari
422\& Ethiopic
423\& Georgian
424\& Gothic
425\& Greek
426\& Gujarati
427\& Gurmukhi
428\& Han
429\& Hangul
430\& Hanunoo
431\& Hebrew
432\& Hiragana
433\& Inherited
434\& Kannada
435\& Katakana
436\& Khmer
437\& Lao
438\& Latin
439\& Malayalam
440\& Mongolian
441\& Myanmar
442\& Ogham
443\& OldItalic
444\& Oriya
445\& Runic
446\& Sinhala
447\& Syriac
448\& Tagalog
449\& Tagbanwa
450\& Tamil
451\& Telugu
452\& Thaana
453\& Thai
454\& Tibetan
455\& Yi
456.Ve
457.PP
458Extended property classes can supplement the basic
459properties, defined by the \fIPropList\fR Unicode database:
460.PP
461.Vb 27
462\& ASCIIHexDigit
463\& BidiControl
464\& Dash
465\& Deprecated
466\& Diacritic
467\& Extender
468\& GraphemeLink
469\& HexDigit
470\& Hyphen
471\& Ideographic
472\& IDSBinaryOperator
473\& IDSTrinaryOperator
474\& JoinControl
475\& LogicalOrderException
476\& NoncharacterCodePoint
477\& OtherAlphabetic
478\& OtherDefaultIgnorableCodePoint
479\& OtherGraphemeExtend
480\& OtherLowercase
481\& OtherMath
482\& OtherUppercase
483\& QuotationMark
484\& Radical
485\& SoftDotted
486\& TerminalPunctuation
487\& UnifiedIdeograph
488\& WhiteSpace
489.Ve
490.PP
491and there are further derived properties:
492.PP
493.Vb 4
494\& Alphabetic Lu + Ll + Lt + Lm + Lo + OtherAlphabetic
495\& Lowercase Ll + OtherLowercase
496\& Uppercase Lu + OtherUppercase
497\& Math Sm + OtherMath
498.Ve
499.PP
500.Vb 2
501\& ID_Start Lu + Ll + Lt + Lm + Lo + Nl
502\& ID_Continue ID_Start + Mn + Mc + Nd + Pc
503.Ve
504.PP
505.Vb 5
506\& Any Any character
507\& Assigned Any non-Cn character (i.e. synonym for \eP{Cn})
508\& Unassigned Synonym for \ep{Cn}
509\& Common Any character (or unassigned code point)
510\& not explicitly assigned to a script
511.Ve
512.PP
513For backward compatibility (with Perl 5.6), all properties mentioned
514so far may have \f(CW\*(C`Is\*(C'\fR prepended to their name, so \f(CW\*(C`\eP{IsLu}\*(C'\fR, for
515example, is equal to \f(CW\*(C`\eP{Lu}\*(C'\fR.
516.Sh "Blocks"
517.IX Subsection "Blocks"
518In addition to \fBscripts\fR, Unicode also defines \fBblocks\fR of
519characters. The difference between scripts and blocks is that the
520concept of scripts is closer to natural languages, while the concept
521of blocks is more of an artificial grouping based on groups of 256
522Unicode characters. For example, the \f(CW\*(C`Latin\*(C'\fR script contains letters
523from many blocks but does not contain all the characters from those
524blocks. It does not, for example, contain digits, because digits are
525shared across many scripts. Digits and similar groups, like
526punctuation, are in a category called \f(CW\*(C`Common\*(C'\fR.
527.PP
528For more about scripts, see the \s-1UTR\s0 #24:
529.PP
530.Vb 1
531\& http://www.unicode.org/unicode/reports/tr24/
532.Ve
533.PP
534For more about blocks, see:
535.PP
536.Vb 1
537\& http://www.unicode.org/Public/UNIDATA/Blocks.txt
538.Ve
539.PP
540Block names are given with the \f(CW\*(C`In\*(C'\fR prefix. For example, the
541Katakana block is referenced via \f(CW\*(C`\ep{InKatakana}\*(C'\fR. The \f(CW\*(C`In\*(C'\fR
542prefix may be omitted if there is no naming conflict with a script
543or any other property, but it is recommended that \f(CW\*(C`In\*(C'\fR always be used
544for block tests to avoid confusion.
545.PP
546These block names are supported:
547.PP
548.Vb 110
549\& InAlphabeticPresentationForms
550\& InArabic
551\& InArabicPresentationFormsA
552\& InArabicPresentationFormsB
553\& InArmenian
554\& InArrows
555\& InBasicLatin
556\& InBengali
557\& InBlockElements
558\& InBopomofo
559\& InBopomofoExtended
560\& InBoxDrawing
561\& InBraillePatterns
562\& InBuhid
563\& InByzantineMusicalSymbols
564\& InCJKCompatibility
565\& InCJKCompatibilityForms
566\& InCJKCompatibilityIdeographs
567\& InCJKCompatibilityIdeographsSupplement
568\& InCJKRadicalsSupplement
569\& InCJKSymbolsAndPunctuation
570\& InCJKUnifiedIdeographs
571\& InCJKUnifiedIdeographsExtensionA
572\& InCJKUnifiedIdeographsExtensionB
573\& InCherokee
574\& InCombiningDiacriticalMarks
575\& InCombiningDiacriticalMarksforSymbols
576\& InCombiningHalfMarks
577\& InControlPictures
578\& InCurrencySymbols
579\& InCyrillic
580\& InCyrillicSupplementary
581\& InDeseret
582\& InDevanagari
583\& InDingbats
584\& InEnclosedAlphanumerics
585\& InEnclosedCJKLettersAndMonths
586\& InEthiopic
587\& InGeneralPunctuation
588\& InGeometricShapes
589\& InGeorgian
590\& InGothic
591\& InGreekExtended
592\& InGreekAndCoptic
593\& InGujarati
594\& InGurmukhi
595\& InHalfwidthAndFullwidthForms
596\& InHangulCompatibilityJamo
597\& InHangulJamo
598\& InHangulSyllables
599\& InHanunoo
600\& InHebrew
601\& InHighPrivateUseSurrogates
602\& InHighSurrogates
603\& InHiragana
604\& InIPAExtensions
605\& InIdeographicDescriptionCharacters
606\& InKanbun
607\& InKangxiRadicals
608\& InKannada
609\& InKatakana
610\& InKatakanaPhoneticExtensions
611\& InKhmer
612\& InLao
613\& InLatin1Supplement
614\& InLatinExtendedA
615\& InLatinExtendedAdditional
616\& InLatinExtendedB
617\& InLetterlikeSymbols
618\& InLowSurrogates
619\& InMalayalam
620\& InMathematicalAlphanumericSymbols
621\& InMathematicalOperators
622\& InMiscellaneousMathematicalSymbolsA
623\& InMiscellaneousMathematicalSymbolsB
624\& InMiscellaneousSymbols
625\& InMiscellaneousTechnical
626\& InMongolian
627\& InMusicalSymbols
628\& InMyanmar
629\& InNumberForms
630\& InOgham
631\& InOldItalic
632\& InOpticalCharacterRecognition
633\& InOriya
634\& InPrivateUseArea
635\& InRunic
636\& InSinhala
637\& InSmallFormVariants
638\& InSpacingModifierLetters
639\& InSpecials
640\& InSuperscriptsAndSubscripts
641\& InSupplementalArrowsA
642\& InSupplementalArrowsB
643\& InSupplementalMathematicalOperators
644\& InSupplementaryPrivateUseAreaA
645\& InSupplementaryPrivateUseAreaB
646\& InSyriac
647\& InTagalog
648\& InTagbanwa
649\& InTags
650\& InTamil
651\& InTelugu
652\& InThaana
653\& InThai
654\& InTibetan
655\& InUnifiedCanadianAboriginalSyllabics
656\& InVariationSelectors
657\& InYiRadicals
658\& InYiSyllables
659.Ve
660.IP "\(bu" 4
661The special pattern \f(CW\*(C`\eX\*(C'\fR matches any extended Unicode
662sequence\*(--\*(L"a combining character sequence\*(R" in Standardese\*(--where the
663first character is a base character and subsequent characters are mark
664characters that apply to the base character. \f(CW\*(C`\eX\*(C'\fR is equivalent to
665\&\f(CW\*(C`(?:\ePM\epM*)\*(C'\fR.
666.IP "\(bu" 4
667The \f(CW\*(C`tr///\*(C'\fR operator translates characters instead of bytes. Note
668that the \f(CW\*(C`tr///CU\*(C'\fR functionality has been removed. For similar
669functionality see pack('U0', ...) and pack('C0', ...).
670.IP "\(bu" 4
671Case translation operators use the Unicode case translation tables
672when character input is provided. Note that \f(CW\*(C`uc()\*(C'\fR, or \f(CW\*(C`\eU\*(C'\fR in
673interpolated strings, translates to uppercase, while \f(CW\*(C`ucfirst\*(C'\fR,
674or \f(CW\*(C`\eu\*(C'\fR in interpolated strings, translates to titlecase in languages
675that make the distinction.
676.IP "\(bu" 4
677Most operators that deal with positions or lengths in a string will
678automatically switch to using character positions, including
679\&\f(CW\*(C`chop()\*(C'\fR, \f(CW\*(C`substr()\*(C'\fR, \f(CW\*(C`pos()\*(C'\fR, \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`rindex()\*(C'\fR,
680\&\f(CW\*(C`sprintf()\*(C'\fR, \f(CW\*(C`write()\*(C'\fR, and \f(CW\*(C`length()\*(C'\fR. Operators that
681specifically do not switch include \f(CW\*(C`vec()\*(C'\fR, \f(CW\*(C`pack()\*(C'\fR, and
682\&\f(CW\*(C`unpack()\*(C'\fR. Operators that really don't care include \f(CW\*(C`chomp()\*(C'\fR,
683operators that treat strings as a bucket of bits such as \f(CW\*(C`sort()\*(C'\fR,
684and operators dealing with filenames.
685.IP "\(bu" 4
686The \f(CW\*(C`pack()\*(C'\fR/\f(CW\*(C`unpack()\*(C'\fR letters \f(CW\*(C`c\*(C'\fR and \f(CW\*(C`C\*(C'\fR do \fInot\fR change,
687since they are often used for byte-oriented formats. Again, think
688\&\f(CW\*(C`char\*(C'\fR in the C language.
689.Sp
690There is a new \f(CW\*(C`U\*(C'\fR specifier that converts between Unicode characters
691and code points.
692.IP "\(bu" 4
693The \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR functions work on characters, similar to
694\&\f(CW\*(C`pack("U")\*(C'\fR and \f(CW\*(C`unpack("U")\*(C'\fR, \fInot\fR \f(CW\*(C`pack("C")\*(C'\fR and
695\&\f(CW\*(C`unpack("C")\*(C'\fR. \f(CW\*(C`pack("C")\*(C'\fR and \f(CW\*(C`unpack("C")\*(C'\fR are methods for
696emulating byte-oriented \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR on Unicode strings.
697While these methods reveal the internal encoding of Unicode strings,
698that is not something one normally needs to care about at all.
699.IP "\(bu" 4
700The bit string operators, \f(CW\*(C`& | ^ ~\*(C'\fR, can operate on character data.
701However, for backward compatibility, such as when using bit string
702operations when characters are all less than 256 in ordinal value, one
703should not use \f(CW\*(C`~\*(C'\fR (the bit complement) with characters of both
704values less than 256 and values greater than 256. Most importantly,
705DeMorgan's laws (\f(CW\*(C`~($x|$y) eq ~$x&~$y\*(C'\fR and \f(CW\*(C`~($x&$y) eq ~$x|~$y\*(C'\fR)
706will not hold. The reason for this mathematical \fIfaux pas\fR is that
707the complement cannot return \fBboth\fR the 8\-bit (byte\-wide) bit
708complement \fBand\fR the full character-wide bit complement.
709.IP "\(bu" 4
710\&\fIlc()\fR, \fIuc()\fR, \fIlcfirst()\fR, and \fIucfirst()\fR work for the following cases:
711.RS 4
712.IP "\(bu" 8
713the case mapping is from a single Unicode character to another
714single Unicode character, or
715.IP "\(bu" 8
716the case mapping is from a single Unicode character to more
717than one Unicode character.
718.RE
719.RS 4
720.Sp
721The following cases do not yet work:
722.IP "\(bu" 8
723the \*(L"final sigma\*(R" (Greek), and
724.IP "\(bu" 8
725anything to do with locales (Lithuanian, Turkish, Azeri).
726.RE
727.RS 4
728.Sp
729See the Unicode Technical Report #21, Case Mappings, for more details.
730.RE
731.IP "\(bu" 4
732And finally, \f(CW\*(C`scalar reverse()\*(C'\fR reverses by character rather than by byte.
733.Sh "User-Defined Character Properties"
734.IX Subsection "User-Defined Character Properties"
735You can define your own character properties by defining subroutines
736whose names begin with \*(L"In\*(R" or \*(L"Is\*(R". The subroutines must be
737visible in the package that uses the properties. The user-defined
738properties can be used in the regular expression \f(CW\*(C`\ep\*(C'\fR and \f(CW\*(C`\eP\*(C'\fR
739constructs.
740.PP
741The subroutines must return a specially-formatted string, with one
742or more newline-separated lines. Each line must be one of the following:
743.IP "\(bu" 4
744Two hexadecimal numbers separated by horizontal whitespace (space or
745tab characters) denoting a range of Unicode code points to include.
746.IP "\(bu" 4
747Something to include, prefixed by \*(L"+\*(R": a built-in character
748property (prefixed by \*(L"utf8::\*(R"), to represent all the characters in that
749property; two hexadecimal code points for a range; or a single
750hexadecimal code point.
751.IP "\(bu" 4
752Something to exclude, prefixed by \*(L"\-\*(R": an existing character
753property (prefixed by \*(L"utf8::\*(R"), for all the characters in that
754property; two hexadecimal code points for a range; or a single
755hexadecimal code point.
756.IP "\(bu" 4
757Something to negate, prefixed by \*(L"!\*(R": an existing character
758property (prefixed by \*(L"utf8::\*(R") for all the characters except the
759characters in the property; two hexadecimal code points for a range;
760or a single hexadecimal code point.
761.PP
762For example, to define a property that covers both the Japanese
763syllabaries (hiragana and katakana), you can define
764.PP
765.Vb 6
766\& sub InKana {
767\& return <<END;
768\& 3040\et309F
769\& 30A0\et30FF
770\& END
771\& }
772.Ve
773.PP
774Imagine that the here-doc end marker is at the beginning of the line.
775Now you can use \f(CW\*(C`\ep{InKana}\*(C'\fR and \f(CW\*(C`\eP{InKana}\*(C'\fR.
776.PP
777You could also have used the existing block property names:
778.PP
779.Vb 6
780\& sub InKana {
781\& return <<'END';
782\& +utf8::InHiragana
783\& +utf8::InKatakana
784\& END
785\& }
786.Ve
787.PP
788Suppose you wanted to match only the allocated characters,
789not the raw block ranges: in other words, you want to remove
790the non\-characters:
791.PP
792.Vb 7
793\& sub InKana {
794\& return <<'END';
795\& +utf8::InHiragana
796\& +utf8::InKatakana
797\& -utf8::IsCn
798\& END
799\& }
800.Ve
801.PP
802The negation is useful for defining (surprise!) negated classes.
803.PP
804.Vb 7
805\& sub InNotKana {
806\& return <<'END';
807\& !utf8::InHiragana
808\& -utf8::InKatakana
809\& +utf8::IsCn
810\& END
811\& }
812.Ve
813.Sh "Character Encodings for Input and Output"
814.IX Subsection "Character Encodings for Input and Output"
815See Encode.
816.Sh "Unicode Regular Expression Support Level"
817.IX Subsection "Unicode Regular Expression Support Level"
818The following list of Unicode support for regular expressions describes
819all the features currently supported. The references to \*(L"Level N\*(R"
820and the section numbers refer to the Unicode Technical Report 18,
821\&\*(L"Unicode Regular Expression Guidelines\*(R".
822.IP "\(bu" 4
823Level 1 \- Basic Unicode Support
824.Sp
825.Vb 7
826\& 2.1 Hex Notation - done [1]
827\& Named Notation - done [2]
828\& 2.2 Categories - done [3][4]
829\& 2.3 Subtraction - MISSING [5][6]
830\& 2.4 Simple Word Boundaries - done [7]
831\& 2.5 Simple Loose Matches - done [8]
832\& 2.6 End of Line - MISSING [9][10]
833.Ve
834.Sp
835.Vb 18
836\& [ 1] \ex{...}
837\& [ 2] \eN{...}
838\& [ 3] . \ep{...} \eP{...}
839\& [ 4] now scripts (see UTR#24 Script Names) in addition to blocks
840\& [ 5] have negation
841\& [ 6] can use regular expression look-ahead [a]
842\& or user-defined character properties [b] to emulate subtraction
843\& [ 7] include Letters in word characters
844\& [ 8] note that Perl does Full case-folding in matching, not Simple:
845\& for example U+1F88 is equivalent with U+1F00 U+03B9,
846\& not with U+1F80. This difference matters for certain Greek
847\& capital letters with certain modifiers: the Full case-folding
848\& decomposes the letter, while the Simple case-folding would map
849\& it to a single character.
850\& [ 9] see UTR#13 Unicode Newline Guidelines
851\& [10] should do ^ and $ also on \ex{85}, \ex{2028} and \ex{2029}
852\& (should also affect <>, $., and script line numbers)
853\& (the \ex{85}, \ex{2028} and \ex{2029} do match \es)
854.Ve
855.Sp
856[a] You can mimic class subtraction using lookahead.
857For example, what \s-1TR18\s0 might write as
858.Sp
859.Vb 1
860\& [{Greek}-[{UNASSIGNED}]]
861.Ve
862.Sp
863in Perl can be written as:
864.Sp
865.Vb 2
866\& (?!\ep{Unassigned})\ep{InGreekAndCoptic}
867\& (?=\ep{Assigned})\ep{InGreekAndCoptic}
868.Ve
869.Sp
870But in this particular example, you probably really want
871.Sp
872.Vb 1
873\& \ep{Greek}
874.Ve
875.Sp
876which will match assigned characters known to be part of the Greek script.
877.Sp
878[b] See \*(L"User\-Defined Character Properties\*(R".
879.IP "\(bu" 4
880Level 2 \- Extended Unicode Support
881.Sp
882.Vb 5
883\& 3.1 Surrogates - MISSING
884\& 3.2 Canonical Equivalents - MISSING [11][12]
885\& 3.3 Locale-Independent Graphemes - MISSING [13]
886\& 3.4 Locale-Independent Words - MISSING [14]
887\& 3.5 Locale-Independent Loose Matches - MISSING [15]
888.Ve
889.Sp
890.Vb 5
891\& [11] see UTR#15 Unicode Normalization
892\& [12] have Unicode::Normalize but not integrated to regexes
893\& [13] have \eX but at this level . should equal that
894\& [14] need three classes, not just \ew and \eW
895\& [15] see UTR#21 Case Mappings
896.Ve
897.IP "\(bu" 4
898Level 3 \- Locale-Sensitive Support
899.Sp
900.Vb 5
901\& 4.1 Locale-Dependent Categories - MISSING
902\& 4.2 Locale-Dependent Graphemes - MISSING [16][17]
903\& 4.3 Locale-Dependent Words - MISSING
904\& 4.4 Locale-Dependent Loose Matches - MISSING
905\& 4.5 Locale-Dependent Ranges - MISSING
906.Ve
907.Sp
908.Vb 2
909\& [16] see UTR#10 Unicode Collation Algorithms
910\& [17] have Unicode::Collate but not integrated to regexes
911.Ve
912.Sh "Unicode Encodings"
913.IX Subsection "Unicode Encodings"
914Unicode characters are assigned to \fIcode points\fR, which are abstract
915numbers. To use these numbers, various encodings are needed.
916.IP "\(bu" 4
917\&\s-1UTF\-8\s0
918.Sp
919\&\s-1UTF\-8\s0 is a variable-length (1 to 6 bytes, current character allocations
920require 4 bytes), byte-order independent encoding. For \s-1ASCII\s0 (and we
921really do mean 7\-bit \s-1ASCII\s0, not another 8\-bit encoding), \s-1UTF\-8\s0 is
922transparent.
923.Sp
924The following table is from Unicode 3.2.
925.Sp
926.Vb 1
927\& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
928.Ve
929.Sp
930.Vb 10
931\& U+0000..U+007F 00..7F
932\& U+0080..U+07FF C2..DF 80..BF
933\& U+0800..U+0FFF E0 A0..BF 80..BF
934\& U+1000..U+CFFF E1..EC 80..BF 80..BF
935\& U+D000..U+D7FF ED 80..9F 80..BF
936\& U+D800..U+DFFF ******* ill-formed *******
937\& U+E000..U+FFFF EE..EF 80..BF 80..BF
938\& U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
939\& U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
940\& U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
941.Ve
942.Sp
943Note the \f(CW\*(C`A0..BF\*(C'\fR in \f(CW\*(C`U+0800..U+0FFF\*(C'\fR, the \f(CW\*(C`80..9F\*(C'\fR in
944\&\f(CW\*(C`U+D000..U+D7FF\*(C'\fR, the \f(CW\*(C`90..BF\*(C'\fR in \f(CW\*(C`U+10000..U+3FFFF\*(C'\fR, and the
945\&\f(CW\*(C`80..8F\*(C'\fR in \f(CW\*(C`U+100000..U+10FFFF\*(C'\fR. The \*(L"gaps\*(R" are caused by legal
946\&\s-1UTF\-8\s0 avoiding non-shortest encodings: it is technically possible to
947UTF\-8\-encode a single code point in different ways, but that is
948explicitly forbidden, and the shortest possible encoding should always
949be used. So that's what Perl does.
950.Sp
951Another way to look at it is via bits:
952.Sp
953.Vb 1
954\& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
955.Ve
956.Sp
957.Vb 4
958\& 0aaaaaaa 0aaaaaaa
959\& 00000bbbbbaaaaaa 110bbbbb 10aaaaaa
960\& ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa
961\& 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
962.Ve
963.Sp
964As you can see, the continuation bytes all begin with \f(CW10\fR, and the
965leading bits of the start byte tell how many bytes there are in the
966encoded character.
967.IP "\(bu" 4
968UTF-EBCDIC
969.Sp
970Like \s-1UTF\-8\s0 but EBCDIC\-safe, in the way that \s-1UTF\-8\s0 is ASCII\-safe.
971.IP "\(bu" 4
972\&\s-1UTF\-16\s0, \s-1UTF\-16BE\s0, \s-1UTF\-16LE\s0, Surrogates, and BOMs (Byte Order Marks)
973.Sp
974The following items are mostly for reference and general Unicode
975knowledge; Perl doesn't use these constructs internally.
976.Sp
977\&\s-1UTF\-16\s0 is a 2 or 4 byte encoding. The Unicode code points
978\&\f(CW\*(C`U+0000..U+FFFF\*(C'\fR are stored in a single 16\-bit unit, and the code
979points \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR in two 16\-bit units. The latter case is
980using \fIsurrogates\fR, the first 16\-bit unit being the \fIhigh
981surrogate\fR, and the second being the \fIlow surrogate\fR.
982.Sp
983Surrogates are code points set aside to encode the \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR
984range of Unicode code points in pairs of 16\-bit units. The \fIhigh
985surrogates\fR are the range \f(CW\*(C`U+D800..U+DBFF\*(C'\fR, and the \fIlow surrogates\fR
986are the range \f(CW\*(C`U+DC00..U+DFFF\*(C'\fR. The surrogate encoding is
987.Sp
988.Vb 2
989\& $hi = ($uni - 0x10000) / 0x400 + 0xD800;
990\& $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
991.Ve
992.Sp
993and the decoding is
994.Sp
995.Vb 1
996\& $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
997.Ve
998.Sp
999If you try to generate surrogates (for example by using \fIchr()\fR), you
1000will get a warning if warnings are turned on, because those code
1001points are not valid for a Unicode character.
1002.Sp
1003Because of the 16\-bitness, \s-1UTF\-16\s0 is byte-order dependent. \s-1UTF\-16\s0
1004itself can be used for in-memory computations, but if storage or
1005transfer is required either \s-1UTF\-16BE\s0 (big\-endian) or \s-1UTF\-16LE\s0
1006(little\-endian) encodings must be chosen.
1007.Sp
1008This introduces another problem: what if you just know that your data
1009is \s-1UTF\-16\s0, but you don't know which endianness? Byte Order Marks, or
1010BOMs, are a solution to this. A special character has been reserved
1011in Unicode to function as a byte order marker: the character with the
1012code point \f(CW\*(C`U+FEFF\*(C'\fR is the \s-1BOM\s0.
1013.Sp
1014The trick is that if you read a \s-1BOM\s0, you will know the byte order,
1015since if it was written on a big-endian platform, you will read the
1016bytes \f(CW\*(C`0xFE 0xFF\*(C'\fR, but if it was written on a little-endian platform,
1017you will read the bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR. (And if the originating platform
1018was writing in \s-1UTF\-8\s0, you will read the bytes \f(CW\*(C`0xEF 0xBB 0xBF\*(C'\fR.)
1019.Sp
1020The way this trick works is that the character with the code point
1021\&\f(CW\*(C`U+FFFE\*(C'\fR is guaranteed not to be a valid Unicode character, so the
1022sequence of bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR is unambiguously \*(L"\s-1BOM\s0, represented in
1023little-endian format\*(R" and cannot be \*(L"\f(CW\*(C`U+FFFE\*(C'\fR, represented in big-endian
1024format\*(R".
1025.IP "\(bu" 4
1026\&\s-1UTF\-32\s0, \s-1UTF\-32BE\s0, \s-1UTF\-32LE\s0
1027.Sp
1028The \s-1UTF\-32\s0 family is pretty much like the \s-1UTF\-16\s0 family, except that
1029the units are 32\-bit, and therefore the surrogate scheme is not
1030needed. The \s-1BOM\s0 signatures will be \f(CW\*(C`0x00 0x00 0xFE 0xFF\*(C'\fR for \s-1BE\s0 and
1031\&\f(CW\*(C`0xFF 0xFE 0x00 0x00\*(C'\fR for \s-1LE\s0.
1032.IP "\(bu" 4
1033\&\s-1UCS\-2\s0, \s-1UCS\-4\s0
1034.Sp
1035Encodings defined by the \s-1ISO\s0 10646 standard. \s-1UCS\-2\s0 is a 16\-bit
1036encoding. Unlike \s-1UTF\-16\s0, \s-1UCS\-2\s0 is not extensible beyond \f(CW\*(C`U+FFFF\*(C'\fR,
1037because it does not use surrogates. \s-1UCS\-4\s0 is a 32\-bit encoding,
1038functionally identical to \s-1UTF\-32\s0.
1039.IP "\(bu" 4
1040\&\s-1UTF\-7\s0
1041.Sp
1042A seven-bit safe (non\-eight\-bit) encoding, which is useful if the
1043transport or storage is not eight-bit safe. Defined by \s-1RFC\s0 2152.
1044.Sh "Security Implications of Unicode"
1045.IX Subsection "Security Implications of Unicode"
1046.IP "\(bu" 4
1047Malformed \s-1UTF\-8\s0
1048.Sp
1049Unfortunately, the specification of \s-1UTF\-8\s0 leaves some room for
1050interpretation of how many bytes of encoded output one should generate
1051from one input Unicode character. Strictly speaking, the shortest
1052possible sequence of \s-1UTF\-8\s0 bytes should be generated,
1053because otherwise there is potential for an input buffer overflow at
1054the receiving end of a \s-1UTF\-8\s0 connection. Perl always generates the
1055shortest length \s-1UTF\-8\s0, and with warnings on Perl will warn about
1056non-shortest length \s-1UTF\-8\s0 along with other malformations, such as the
1057surrogates, which are not real Unicode code points.
1058.IP "\(bu" 4
1059Regular expressions behave slightly differently between byte data and
1060character (Unicode) data. For example, the \*(L"word character\*(R" character
1061class \f(CW\*(C`\ew\*(C'\fR will work differently depending on if data is eight-bit bytes
1062or Unicode.
1063.Sp
1064In the first case, the set of \f(CW\*(C`\ew\*(C'\fR characters is either small\*(--the
1065default set of alphabetic characters, digits, and the \*(L"_\*(R"\*(--or, if you
1066are using a locale (see perllocale), the \f(CW\*(C`\ew\*(C'\fR might contain a few
1067more letters according to your language and country.
1068.Sp
1069In the second case, the \f(CW\*(C`\ew\*(C'\fR set of characters is much, much larger.
1070Most importantly, even in the set of the first 256 characters, it will
1071probably match different characters: unlike most locales, which are
1072specific to a language and country pair, Unicode classifies all the
1073characters that are letters \fIsomewhere\fR as \f(CW\*(C`\ew\*(C'\fR. For example, your
1074locale might not think that \s-1LATIN\s0 \s-1SMALL\s0 \s-1LETTER\s0 \s-1ETH\s0 is a letter (unless
1075you happen to speak Icelandic), but Unicode does.
1076.Sp
1077As discussed elsewhere, Perl has one foot (two hooves?) planted in
1078each of two worlds: the old world of bytes and the new world of
1079characters, upgrading from bytes to characters when necessary.
1080If your legacy code does not explicitly use Unicode, no automatic
1081switch-over to characters should happen. Characters shouldn't get
1082downgraded to bytes, either. It is possible to accidentally mix bytes
1083and characters, however (see perluniintro), in which case \f(CW\*(C`\ew\*(C'\fR in
1084regular expressions might start behaving differently. Review your
1085code. Use warnings and the \f(CW\*(C`strict\*(C'\fR pragma.
1086.Sh "Unicode in Perl on \s-1EBCDIC\s0"
1087.IX Subsection "Unicode in Perl on EBCDIC"
1088The way Unicode is handled on \s-1EBCDIC\s0 platforms is still
1089experimental. On such platforms, references to \s-1UTF\-8\s0 encoding in this
1090document and elsewhere should be read as meaning the UTF-EBCDIC
1091specified in Unicode Technical Report 16, unless \s-1ASCII\s0 vs. \s-1EBCDIC\s0 issues
1092are specifically discussed. There is no \f(CW\*(C`utfebcdic\*(C'\fR pragma or
1093\&\*(L":utfebcdic\*(R" layer; rather, \*(L"utf8\*(R" and \*(L":utf8\*(R" are reused to mean
1094the platform's \*(L"natural\*(R" 8\-bit encoding of Unicode. See perlebcdic
1095for more discussion of the issues.
1096.Sh "Locales"
1097.IX Subsection "Locales"
1098Usually locale settings and Unicode do not affect each other, but
1099there are a couple of exceptions:
1100.IP "\(bu" 4
1101If your locale environment variables (\s-1LANGUAGE\s0, \s-1LC_ALL\s0, \s-1LC_CTYPE\s0, \s-1LANG\s0)
1102contain the strings '\s-1UTF\-8\s0' or '\s-1UTF8\s0' (case\-insensitive matching),
1103the default encodings of your \s-1STDIN\s0, \s-1STDOUT\s0, and \s-1STDERR\s0, and of
1104\&\fBany subsequent file open\fR, are considered to be \s-1UTF\-8\s0.
1105.IP "\(bu" 4
1106Perl tries really hard to work both with Unicode and the old
1107byte-oriented world. Most often this is nice, but sometimes Perl's
1108straddling of the proverbial fence causes problems.
1109.Sh "Using Unicode in \s-1XS\s0"
1110.IX Subsection "Using Unicode in XS"
1111If you want to handle Perl Unicode in \s-1XS\s0 extensions, you may find
1112the following C APIs useful. See perlapi for details.
1113.IP "\(bu" 4
1114\&\f(CW\*(C`DO_UTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR flag is on and the bytes
1115pragma is not in effect. \f(CW\*(C`SvUTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR
1116flag is on; the bytes pragma is ignored. The \f(CW\*(C`UTF8\*(C'\fR flag being on
1117does \fBnot\fR mean that there are any characters of code points greater
1118than 255 (or 127) in the scalar or that there are even any characters
1119in the scalar. What the \f(CW\*(C`UTF8\*(C'\fR flag means is that the sequence of
1120octets in the representation of the scalar is the sequence of \s-1UTF\-8\s0
1121encoded code points of the characters of a string. The \f(CW\*(C`UTF8\*(C'\fR flag
1122being off means that each octet in this representation encodes a
1123single character with code point 0..255 within the string. Perl's
1124Unicode model is not to use \s-1UTF\-8\s0 until it is absolutely necessary.
1125.IP "\(bu" 4
1126\&\f(CW\*(C`uvuni_to_utf8(buf, chr)\*(C'\fR writes a Unicode character code point into
1127a buffer encoding the code point as \s-1UTF\-8\s0, and returns a pointer
1128pointing after the \s-1UTF\-8\s0 bytes.
1129.IP "\(bu" 4
1130\&\f(CW\*(C`utf8_to_uvuni(buf, lenp)\*(C'\fR reads \s-1UTF\-8\s0 encoded bytes from a buffer and
1131returns the Unicode character code point and, optionally, the length of
1132the \s-1UTF\-8\s0 byte sequence.
1133.IP "\(bu" 4
1134\&\f(CW\*(C`utf8_length(start, end)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded buffer
1135in characters. \f(CW\*(C`sv_len_utf8(sv)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded
1136scalar.
1137.IP "\(bu" 4
1138\&\f(CW\*(C`sv_utf8_upgrade(sv)\*(C'\fR converts the string of the scalar to its \s-1UTF\-8\s0
1139encoded form. \f(CW\*(C`sv_utf8_downgrade(sv)\*(C'\fR does the opposite, if
1140possible. \f(CW\*(C`sv_utf8_encode(sv)\*(C'\fR is like sv_utf8_upgrade except that
1141it does not set the \f(CW\*(C`UTF8\*(C'\fR flag. \f(CW\*(C`sv_utf8_decode()\*(C'\fR does the
1142opposite of \f(CW\*(C`sv_utf8_encode()\*(C'\fR. Note that none of these are to be
1143used as general-purpose encoding or decoding interfaces: \f(CW\*(C`use Encode\*(C'\fR
1144for that. \f(CW\*(C`sv_utf8_upgrade()\*(C'\fR is affected by the encoding pragma
1145but \f(CW\*(C`sv_utf8_downgrade()\*(C'\fR is not (since the encoding pragma is
1146designed to be a one-way street).
1147.IP "\(bu" 4
1148\&\f(CWis_utf8_char(s)\fR returns true if the pointer points to a valid \s-1UTF\-8\s0
1149character.
1150.IP "\(bu" 4
1151\&\f(CW\*(C`is_utf8_string(buf, len)\*(C'\fR returns true if \f(CW\*(C`len\*(C'\fR bytes of the buffer
1152are valid \s-1UTF\-8\s0.
1153.IP "\(bu" 4
1154\&\f(CW\*(C`UTF8SKIP(buf)\*(C'\fR will return the number of bytes in the \s-1UTF\-8\s0 encoded
1155character in the buffer. \f(CW\*(C`UNISKIP(chr)\*(C'\fR will return the number of bytes
1156required to UTF\-8\-encode the Unicode character code point. \f(CW\*(C`UTF8SKIP()\*(C'\fR
1157is useful for example for iterating over the characters of a \s-1UTF\-8\s0
1158encoded buffer; \f(CW\*(C`UNISKIP()\*(C'\fR is useful, for example, in computing
1159the size required for a \s-1UTF\-8\s0 encoded buffer.
1160.IP "\(bu" 4
1161\&\f(CW\*(C`utf8_distance(a, b)\*(C'\fR will tell the distance in characters between the
1162two pointers pointing to the same \s-1UTF\-8\s0 encoded buffer.
1163.IP "\(bu" 4
1164\&\f(CW\*(C`utf8_hop(s, off)\*(C'\fR will return a pointer to a \s-1UTF\-8\s0 encoded buffer
1165that is \f(CW\*(C`off\*(C'\fR (positive or negative) Unicode characters displaced
1166from the \s-1UTF\-8\s0 buffer \f(CW\*(C`s\*(C'\fR. Be careful not to overstep the buffer:
1167\&\f(CW\*(C`utf8_hop()\*(C'\fR will merrily run off the end or the beginning of the
1168buffer if told to do so.
1169.IP "\(bu" 4
1170\&\f(CW\*(C`pv_uni_display(dsv, spv, len, pvlim, flags)\*(C'\fR and
1171\&\f(CW\*(C`sv_uni_display(dsv, ssv, pvlim, flags)\*(C'\fR are useful for debugging the
1172output of Unicode strings and scalars. By default they are useful
1173only for debugging\*(--they display \fBall\fR characters as hexadecimal code
1174points\*(--but with the flags \f(CW\*(C`UNI_DISPLAY_ISPRINT\*(C'\fR,
1175\&\f(CW\*(C`UNI_DISPLAY_BACKSLASH\*(C'\fR, and \f(CW\*(C`UNI_DISPLAY_QQ\*(C'\fR you can make the
1176output more readable.
1177.IP "\(bu" 4
1178\&\f(CW\*(C`ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)\*(C'\fR can be used to
1179compare two strings case-insensitively in Unicode. For case-sensitive
1180comparisons you can just use \f(CW\*(C`memEQ()\*(C'\fR and \f(CW\*(C`memNE()\*(C'\fR as usual.
1181.PP
1182For more information, see perlapi, and \fIutf8.c\fR and \fIutf8.h\fR
1183in the Perl source code distribution.
1184.SH "BUGS"
1185.IX Header "BUGS"
1186.Sh "Interaction with Locales"
1187.IX Subsection "Interaction with Locales"
1188Use of locales with Unicode data may lead to odd results. Currently,
1189Perl attempts to attach 8\-bit locale info to characters in the range
11900..255, but this technique is demonstrably incorrect for locales that
1191use characters above that range when mapped into Unicode. Perl's
1192Unicode support will also tend to run slower. Use of locales with
1193Unicode is discouraged.
1194.Sh "Interaction with Extensions"
1195.IX Subsection "Interaction with Extensions"
1196When Perl exchanges data with an extension, the extension should be
1197able to understand the \s-1UTF\-8\s0 flag and act accordingly. If the
1198extension doesn't know about the flag, it's likely that the extension
1199will return incorrectly-flagged data.
1200.PP
1201So if you're working with Unicode data, consult the documentation of
1202every module you're using if there are any issues with Unicode data
1203exchange. If the documentation does not talk about Unicode at all,
1204suspect the worst and probably look at the source to learn how the
1205module is implemented. Modules written completely in Perl shouldn't
1206cause problems. Modules that directly or indirectly access code written
1207in other programming languages are at risk.
1208.PP
1209For affected functions, the simple strategy to avoid data corruption is
1210to always make the encoding of the exchanged data explicit. Choose an
1211encoding that you know the extension can handle. Convert arguments passed
1212to the extensions to that encoding and convert results back from that
1213encoding. Write wrapper functions that do the conversions for you, so
1214you can later change the functions when the extension catches up.
1215.PP
1216To provide an example, let's say the popular Foo::Bar::escape_html
1217function doesn't deal with Unicode data yet. The wrapper function
1218would convert the argument to raw \s-1UTF\-8\s0 and convert the result back to
1219Perl's internal representation like so:
1220.PP
1221.Vb 5
1222\& sub my_escape_html ($) {
1223\& my($what) = shift;
1224\& return unless defined $what;
1225\& Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what)));
1226\& }
1227.Ve
1228.PP
1229Sometimes, when the extension does not convert data but just stores
1230and retrieves them, you will be in a position to use the otherwise
1231dangerous \fIEncode::_utf8_on()\fR function. Let's say the popular
1232\&\f(CW\*(C`Foo::Bar\*(C'\fR extension, written in C, provides a \f(CW\*(C`param\*(C'\fR method that
1233lets you store and retrieve data according to these prototypes:
1234.PP
1235.Vb 2
1236\& $self->param($name, $value); # set a scalar
1237\& $value = $self->param($name); # retrieve a scalar
1238.Ve
1239.PP
1240If it does not yet provide support for any encoding, one could write a
1241derived class with such a \f(CW\*(C`param\*(C'\fR method:
1242.PP
1243.Vb 12
1244\& sub param {
1245\& my($self,$name,$value) = @_;
1246\& utf8::upgrade($name); # make sure it is UTF-8 encoded
1247\& if (defined $value) {
1248\& utf8::upgrade($value); # make sure it is UTF-8 encoded
1249\& return $self->SUPER::param($name,$value);
1250\& } else {
1251\& my $ret = $self->SUPER::param($name);
1252\& Encode::_utf8_on($ret); # we know, it is UTF-8 encoded
1253\& return $ret;
1254\& }
1255\& }
1256.Ve
1257.PP
1258Some extensions provide filters on data entry/exit points, such as
1259DB_File::filter_store_key and family. Look out for such filters in
1260the documentation of your extensions, they can make the transition to
1261Unicode data much easier.
1262.Sh "Speed"
1263.IX Subsection "Speed"
1264Some functions are slower when working on \s-1UTF\-8\s0 encoded strings than
1265on byte encoded strings. All functions that need to hop over
1266characters such as \fIlength()\fR, \fIsubstr()\fR or \fIindex()\fR can work \fBmuch\fR
1267faster when the underlying data are byte\-encoded. Witness the
1268following benchmark:
1269.PP
1270.Vb 18
1271\& % perl -e '
1272\& use Benchmark;
1273\& use strict;
1274\& our $l = 10000;
1275\& our $u = our $b = "x" x $l;
1276\& substr($u,0,1) = "\ex{100}";
1277\& timethese(-2,{
1278\& LENGTH_B => q{ length($b) },
1279\& LENGTH_U => q{ length($u) },
1280\& SUBSTR_B => q{ substr($b, $l/4, $l/2) },
1281\& SUBSTR_U => q{ substr($u, $l/4, $l/2) },
1282\& });
1283\& '
1284\& Benchmark: running LENGTH_B, LENGTH_U, SUBSTR_B, SUBSTR_U for at least 2 CPU seconds...
1285\& LENGTH_B: 2 wallclock secs ( 2.36 usr + 0.00 sys = 2.36 CPU) @ 5649983.05/s (n=13333960)
1286\& LENGTH_U: 2 wallclock secs ( 2.11 usr + 0.00 sys = 2.11 CPU) @ 12155.45/s (n=25648)
1287\& SUBSTR_B: 3 wallclock secs ( 2.16 usr + 0.00 sys = 2.16 CPU) @ 374480.09/s (n=808877)
1288\& SUBSTR_U: 2 wallclock secs ( 2.11 usr + 0.00 sys = 2.11 CPU) @ 6791.00/s (n=14329)
1289.Ve
1290.PP
1291The numbers show an incredible slowness on long \s-1UTF\-8\s0 strings. You
1292should carefully avoid using these functions in tight loops. If you
1293want to iterate over characters, the superior coding technique would
1294split the characters into an array instead of using substr, as the following
1295benchmark shows:
1296.PP
1297.Vb 18
1298\& % perl -e '
1299\& use Benchmark;
1300\& use strict;
1301\& our $l = 10000;
1302\& our $u = our $b = "x" x $l;
1303\& substr($u,0,1) = "\ex{100}";
1304\& timethese(-5,{
1305\& SPLIT_B => q{ for my $c (split //, $b){} },
1306\& SPLIT_U => q{ for my $c (split //, $u){} },
1307\& SUBSTR_B => q{ for my $i (0..length($b)-1){my $c = substr($b,$i,1);} },
1308\& SUBSTR_U => q{ for my $i (0..length($u)-1){my $c = substr($u,$i,1);} },
1309\& });
1310\& '
1311\& Benchmark: running SPLIT_B, SPLIT_U, SUBSTR_B, SUBSTR_U for at least 5 CPU seconds...
1312\& SPLIT_B: 6 wallclock secs ( 5.29 usr + 0.00 sys = 5.29 CPU) @ 56.14/s (n=297)
1313\& SPLIT_U: 5 wallclock secs ( 5.17 usr + 0.01 sys = 5.18 CPU) @ 55.21/s (n=286)
1314\& SUBSTR_B: 5 wallclock secs ( 5.34 usr + 0.00 sys = 5.34 CPU) @ 123.22/s (n=658)
1315\& SUBSTR_U: 7 wallclock secs ( 6.20 usr + 0.00 sys = 6.20 CPU) @ 0.81/s (n=5)
1316.Ve
1317.PP
1318Even though the algorithm based on \f(CW\*(C`substr()\*(C'\fR is faster than
1319\&\f(CW\*(C`split()\*(C'\fR for byte-encoded data, it pales in comparison to the speed
1320of \f(CW\*(C`split()\*(C'\fR when used with \s-1UTF\-8\s0 data.
1321.SH "SEE ALSO"
1322.IX Header "SEE ALSO"
1323perluniintro, encoding, Encode, open, utf8, bytes,
1324perlretut, \*(L"${^WIDE_SYSTEM_CALLS}\*(R" in perlvar