Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLUNICODE 1" | |
132 | .TH PERLUNICODE 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perlunicode \- Unicode support in Perl | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | .Sh "Important Caveats" | |
138 | .IX Subsection "Important Caveats" | |
139 | Unicode support is an extensive requirement. While Perl does not | |
140 | implement the Unicode standard or the accompanying technical reports | |
141 | from cover to cover, Perl does support many Unicode features. | |
142 | .IP "Input and Output Layers" 4 | |
143 | .IX Item "Input and Output Layers" | |
144 | Perl knows when a filehandle uses Perl's internal Unicode encodings | |
145 | (\s-1UTF\-8\s0, or UTF-EBCDIC if in \s-1EBCDIC\s0) if the filehandle is opened with | |
146 | the \*(L":utf8\*(R" layer. Other encodings can be converted to Perl's | |
147 | encoding on input or from Perl's encoding on output by use of the | |
148 | \&\*(L":encoding(...)\*(R" layer. See open. | |
149 | .Sp | |
150 | To indicate that Perl source itself is using a particular encoding, | |
151 | see encoding. | |
152 | .IP "Regular Expressions" 4 | |
153 | .IX Item "Regular Expressions" | |
154 | The regular expression compiler produces polymorphic opcodes. That is, | |
155 | the pattern adapts to the data and automatically switches to the Unicode | |
156 | character scheme when presented with Unicode data\*(--or instead uses | |
157 | a traditional byte scheme when presented with byte data. | |
158 | .ie n .IP """use utf8"" still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4 | |
159 | .el .IP "\f(CWuse utf8\fR still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4 | |
160 | .IX Item "use utf8 still needed to enable UTF-8/UTF-EBCDIC in scripts" | |
161 | As a compatibility measure, the \f(CW\*(C`use utf8\*(C'\fR pragma must be explicitly | |
162 | included to enable recognition of \s-1UTF\-8\s0 in the Perl scripts themselves | |
163 | (in string or regular expression literals, or in identifier names) on | |
164 | ASCII-based machines or to recognize UTF-EBCDIC on EBCDIC-based | |
165 | machines. \fBThese are the only times when an explicit \f(CB\*(C`use utf8\*(C'\fB | |
166 | is needed.\fR See utf8. | |
167 | .Sp | |
168 | You can also use the \f(CW\*(C`encoding\*(C'\fR pragma to change the default encoding | |
169 | of the data in your script; see encoding. | |
170 | .Sh "Byte and Character Semantics" | |
171 | .IX Subsection "Byte and Character Semantics" | |
172 | Beginning with version 5.6, Perl uses logically-wide characters to | |
173 | represent strings internally. | |
174 | .PP | |
175 | In future, Perl-level operations will be expected to work with | |
176 | characters rather than bytes. | |
177 | .PP | |
178 | However, as an interim compatibility measure, Perl aims to | |
179 | provide a safe migration path from byte semantics to character | |
180 | semantics for programs. For operations where Perl can unambiguously | |
181 | decide that the input data are characters, Perl switches to | |
182 | character semantics. For operations where this determination cannot | |
183 | be made without additional information from the user, Perl decides in | |
184 | favor of compatibility and chooses to use byte semantics. | |
185 | .PP | |
186 | This behavior preserves compatibility with earlier versions of Perl, | |
187 | which allowed byte semantics in Perl operations only if | |
188 | none of the program's inputs were marked as being a source of Unicode | |
189 | character data. Such data may come from filehandles, from calls to | |
190 | external programs, from information provided by the system (such as \f(CW%ENV\fR), | |
191 | or from literals and constants in the source text. | |
192 | .PP | |
193 | On Windows platforms, if the \f(CW\*(C`\-C\*(C'\fR command line switch is used or the | |
194 | ${^WIDE_SYSTEM_CALLS} global flag is set to \f(CW1\fR, all system calls | |
195 | will use the corresponding wide-character APIs. This feature is | |
196 | available only on Windows to conform to the \s-1API\s0 standard already | |
197 | established for that platform\*(--and there are very few non-Windows | |
198 | platforms that have Unicode-aware APIs. | |
199 | .PP | |
200 | The \f(CW\*(C`bytes\*(C'\fR pragma will always, regardless of platform, force byte | |
201 | semantics in a particular lexical scope. See bytes. | |
202 | .PP | |
203 | The \f(CW\*(C`utf8\*(C'\fR pragma is primarily a compatibility device that enables | |
204 | recognition of \s-1UTF\-\s0(8|EBCDIC) in literals encountered by the parser. | |
205 | Note that this pragma is only required while Perl defaults to byte | |
206 | semantics; when character semantics become the default, this pragma | |
207 | may become a no\-op. See utf8. | |
208 | .PP | |
209 | Unless explicitly stated, Perl operators use character semantics | |
210 | for Unicode data and byte semantics for non-Unicode data. | |
211 | The decision to use character semantics is made transparently. If | |
212 | input data comes from a Unicode source\*(--for example, if a character | |
213 | encoding layer is added to a filehandle or a literal Unicode | |
214 | string constant appears in a program\*(--character semantics apply. | |
215 | Otherwise, byte semantics are in effect. The \f(CW\*(C`bytes\*(C'\fR pragma should | |
216 | be used to force byte semantics on Unicode data. | |
217 | .PP | |
218 | If strings operating under byte semantics and strings with Unicode | |
219 | character data are concatenated, the new string will be upgraded to | |
220 | \&\fI\s-1ISO\s0 8859\-1 (Latin\-1)\fR, even if the old Unicode string used \s-1EBCDIC\s0. | |
221 | This translation is done without regard to the system's native 8\-bit | |
222 | encoding, so to change this for systems with non\-Latin\-1 and | |
223 | non-EBCDIC native encodings use the \f(CW\*(C`encoding\*(C'\fR pragma. See | |
224 | encoding. | |
225 | .PP | |
226 | Under character semantics, many operations that formerly operated on | |
227 | bytes now operate on characters. A character in Perl is | |
228 | logically just a number ranging from 0 to 2**31 or so. Larger | |
229 | characters may encode into longer sequences of bytes internally, but | |
230 | this internal detail is mostly hidden from Perl code. | |
231 | See perluniintro for more. | |
232 | .Sh "Effects of Character Semantics" | |
233 | .IX Subsection "Effects of Character Semantics" | |
234 | Character semantics have the following effects: | |
235 | .IP "\(bu" 4 | |
236 | Strings\*(--including hash keys\*(--and regular expression patterns may | |
237 | contain characters that have an ordinal value larger than 255. | |
238 | .Sp | |
239 | If you use a Unicode editor to edit your program, Unicode characters | |
240 | may occur directly within the literal strings in one of the various | |
241 | Unicode encodings (\s-1UTF\-8\s0, \s-1UTF\-EBCDIC\s0, \s-1UCS\-2\s0, etc.), but will be recognized | |
242 | as such and converted to Perl's internal representation only if the | |
243 | appropriate encoding is specified. | |
244 | .Sp | |
245 | Unicode characters can also be added to a string by using the | |
246 | \&\f(CW\*(C`\ex{...}\*(C'\fR notation. The Unicode code for the desired character, in | |
247 | hexadecimal, should be placed in the braces. For instance, a smiley | |
248 | face is \f(CW\*(C`\ex{263A}\*(C'\fR. This encoding scheme only works for characters | |
249 | with a code of 0x100 or above. | |
250 | .Sp | |
251 | Additionally, if you | |
252 | .Sp | |
253 | .Vb 1 | |
254 | \& use charnames ':full'; | |
255 | .Ve | |
256 | .Sp | |
257 | you can use the \f(CW\*(C`\eN{...}\*(C'\fR notation and put the official Unicode | |
258 | character name within the braces, such as \f(CW\*(C`\eN{WHITE SMILING FACE}\*(C'\fR. | |
259 | .IP "\(bu" 4 | |
260 | If an appropriate encoding is specified, identifiers within the | |
261 | Perl script may contain Unicode alphanumeric characters, including | |
262 | ideographs. Perl does not currently attempt to canonicalize variable | |
263 | names. | |
264 | .IP "\(bu" 4 | |
265 | Regular expressions match characters instead of bytes. \*(L".\*(R" matches | |
266 | a character instead of a byte. The \f(CW\*(C`\eC\*(C'\fR pattern is provided to force | |
267 | a match of a single byte\*(--a \f(CW\*(C`char\*(C'\fR in C, hence \f(CW\*(C`\eC\*(C'\fR. | |
268 | .IP "\(bu" 4 | |
269 | Character classes in regular expressions match characters instead of | |
270 | bytes and match against the character properties specified in the | |
271 | Unicode properties database. \f(CW\*(C`\ew\*(C'\fR can be used to match a Japanese | |
272 | ideograph, for instance. | |
273 | .IP "\(bu" 4 | |
274 | Named Unicode properties, scripts, and block ranges may be used like | |
275 | character classes via the \f(CW\*(C`\ep{}\*(C'\fR \*(L"matches property\*(R" construct and | |
276 | the \f(CW\*(C`\eP{}\*(C'\fR negation, \*(L"doesn't match property\*(R". | |
277 | .Sp | |
278 | For instance, \f(CW\*(C`\ep{Lu}\*(C'\fR matches any character with the Unicode \*(L"Lu\*(R" | |
279 | (Letter, uppercase) property, while \f(CW\*(C`\ep{M}\*(C'\fR matches any character | |
280 | with an \*(L"M\*(R" (mark\*(--accents and such) property. Brackets are not | |
281 | required for single letter properties, so \f(CW\*(C`\ep{M}\*(C'\fR is equivalent to | |
282 | \&\f(CW\*(C`\epM\*(C'\fR. Many predefined properties are available, such as | |
283 | \&\f(CW\*(C`\ep{Mirrored}\*(C'\fR and \f(CW\*(C`\ep{Tibetan}\*(C'\fR. | |
284 | .Sp | |
285 | The official Unicode script and block names have spaces and dashes as | |
286 | separators, but for convenience you can use dashes, spaces, or | |
287 | underbars, and case is unimportant. It is recommended, however, that | |
288 | for consistency you use the following naming: the official Unicode | |
289 | script, property, or block name (see below for the additional rules | |
290 | that apply to block names) with whitespace and dashes removed, and the | |
291 | words \*(L"uppercase\-first\-lowercase\-rest\*(R". \f(CW\*(C`Latin\-1 Supplement\*(C'\fR thus | |
292 | becomes \f(CW\*(C`Latin1Supplement\*(C'\fR. | |
293 | .Sp | |
294 | You can also use negation in both \f(CW\*(C`\ep{}\*(C'\fR and \f(CW\*(C`\eP{}\*(C'\fR by introducing a caret | |
295 | (^) between the first brace and the property name: \f(CW\*(C`\ep{^Tamil}\*(C'\fR is | |
296 | equal to \f(CW\*(C`\eP{Tamil}\*(C'\fR. | |
297 | .Sp | |
298 | Here are the basic Unicode General Category properties, followed by their | |
299 | long form. You can use either; \f(CW\*(C`\ep{Lu}\*(C'\fR and \f(CW\*(C`\ep{UppercaseLetter}\*(C'\fR, | |
300 | for instance, are identical. | |
301 | .Sp | |
302 | .Vb 1 | |
303 | \& Short Long | |
304 | .Ve | |
305 | .Sp | |
306 | .Vb 6 | |
307 | \& L Letter | |
308 | \& Lu UppercaseLetter | |
309 | \& Ll LowercaseLetter | |
310 | \& Lt TitlecaseLetter | |
311 | \& Lm ModifierLetter | |
312 | \& Lo OtherLetter | |
313 | .Ve | |
314 | .Sp | |
315 | .Vb 4 | |
316 | \& M Mark | |
317 | \& Mn NonspacingMark | |
318 | \& Mc SpacingMark | |
319 | \& Me EnclosingMark | |
320 | .Ve | |
321 | .Sp | |
322 | .Vb 4 | |
323 | \& N Number | |
324 | \& Nd DecimalNumber | |
325 | \& Nl LetterNumber | |
326 | \& No OtherNumber | |
327 | .Ve | |
328 | .Sp | |
329 | .Vb 10 | |
330 | \& P Punctuation | |
331 | \& Pc ConnectorPunctuation | |
332 | \& Pd DashPunctuation | |
333 | \& Ps OpenPunctuation | |
334 | \& Pe ClosePunctuation | |
335 | \& Pi InitialPunctuation | |
336 | \& (may behave like Ps or Pe depending on usage) | |
337 | \& Pf FinalPunctuation | |
338 | \& (may behave like Ps or Pe depending on usage) | |
339 | \& Po OtherPunctuation | |
340 | .Ve | |
341 | .Sp | |
342 | .Vb 5 | |
343 | \& S Symbol | |
344 | \& Sm MathSymbol | |
345 | \& Sc CurrencySymbol | |
346 | \& Sk ModifierSymbol | |
347 | \& So OtherSymbol | |
348 | .Ve | |
349 | .Sp | |
350 | .Vb 4 | |
351 | \& Z Separator | |
352 | \& Zs SpaceSeparator | |
353 | \& Zl LineSeparator | |
354 | \& Zp ParagraphSeparator | |
355 | .Ve | |
356 | .Sp | |
357 | .Vb 6 | |
358 | \& C Other | |
359 | \& Cc Control | |
360 | \& Cf Format | |
361 | \& Cs Surrogate (not usable) | |
362 | \& Co PrivateUse | |
363 | \& Cn Unassigned | |
364 | .Ve | |
365 | .Sp | |
366 | Single-letter properties match all characters in any of the | |
367 | two-letter sub-properties starting with the same letter. | |
368 | \&\f(CW\*(C`L&\*(C'\fR is a special case, which is an alias for \f(CW\*(C`Ll\*(C'\fR, \f(CW\*(C`Lu\*(C'\fR, and \f(CW\*(C`Lt\*(C'\fR. | |
369 | .Sp | |
370 | Because Perl hides the need for the user to understand the internal | |
371 | representation of Unicode characters, there is no need to implement | |
372 | the somewhat messy concept of surrogates. \f(CW\*(C`Cs\*(C'\fR is therefore not | |
373 | supported. | |
374 | .Sp | |
375 | Because scripts differ in their directionality\*(--Hebrew is | |
376 | written right to left, for example\*(--Unicode supplies these properties: | |
377 | .Sp | |
378 | .Vb 1 | |
379 | \& Property Meaning | |
380 | .Ve | |
381 | .Sp | |
382 | .Vb 19 | |
383 | \& BidiL Left-to-Right | |
384 | \& BidiLRE Left-to-Right Embedding | |
385 | \& BidiLRO Left-to-Right Override | |
386 | \& BidiR Right-to-Left | |
387 | \& BidiAL Right-to-Left Arabic | |
388 | \& BidiRLE Right-to-Left Embedding | |
389 | \& BidiRLO Right-to-Left Override | |
390 | \& BidiPDF Pop Directional Format | |
391 | \& BidiEN European Number | |
392 | \& BidiES European Number Separator | |
393 | \& BidiET European Number Terminator | |
394 | \& BidiAN Arabic Number | |
395 | \& BidiCS Common Number Separator | |
396 | \& BidiNSM Non-Spacing Mark | |
397 | \& BidiBN Boundary Neutral | |
398 | \& BidiB Paragraph Separator | |
399 | \& BidiS Segment Separator | |
400 | \& BidiWS Whitespace | |
401 | \& BidiON Other Neutrals | |
402 | .Ve | |
403 | .Sp | |
404 | For example, \f(CW\*(C`\ep{BidiR}\*(C'\fR matches characters that are normally | |
405 | written right to left. | |
406 | .Sh "Scripts" | |
407 | .IX Subsection "Scripts" | |
408 | The script names which can be used by \f(CW\*(C`\ep{...}\*(C'\fR and \f(CW\*(C`\eP{...}\*(C'\fR, | |
409 | such as in \f(CW\*(C`\ep{Latin}\*(C'\fR or \f(CW\*(C`\ep{Cyrillic}\*(C'\fR, are as follows: | |
410 | .PP | |
411 | .Vb 44 | |
412 | \& Arabic | |
413 | \& Armenian | |
414 | \& Bengali | |
415 | \& Bopomofo | |
416 | \& Buhid | |
417 | \& CanadianAboriginal | |
418 | \& Cherokee | |
419 | \& Cyrillic | |
420 | \& Deseret | |
421 | \& Devanagari | |
422 | \& Ethiopic | |
423 | \& Georgian | |
424 | \& Gothic | |
425 | \& Greek | |
426 | \& Gujarati | |
427 | \& Gurmukhi | |
428 | \& Han | |
429 | \& Hangul | |
430 | \& Hanunoo | |
431 | \& Hebrew | |
432 | \& Hiragana | |
433 | \& Inherited | |
434 | \& Kannada | |
435 | \& Katakana | |
436 | \& Khmer | |
437 | \& Lao | |
438 | \& Latin | |
439 | \& Malayalam | |
440 | \& Mongolian | |
441 | \& Myanmar | |
442 | \& Ogham | |
443 | \& OldItalic | |
444 | \& Oriya | |
445 | \& Runic | |
446 | \& Sinhala | |
447 | \& Syriac | |
448 | \& Tagalog | |
449 | \& Tagbanwa | |
450 | \& Tamil | |
451 | \& Telugu | |
452 | \& Thaana | |
453 | \& Thai | |
454 | \& Tibetan | |
455 | \& Yi | |
456 | .Ve | |
457 | .PP | |
458 | Extended property classes can supplement the basic | |
459 | properties, defined by the \fIPropList\fR Unicode database: | |
460 | .PP | |
461 | .Vb 27 | |
462 | \& ASCIIHexDigit | |
463 | \& BidiControl | |
464 | \& Dash | |
465 | \& Deprecated | |
466 | \& Diacritic | |
467 | \& Extender | |
468 | \& GraphemeLink | |
469 | \& HexDigit | |
470 | \& Hyphen | |
471 | \& Ideographic | |
472 | \& IDSBinaryOperator | |
473 | \& IDSTrinaryOperator | |
474 | \& JoinControl | |
475 | \& LogicalOrderException | |
476 | \& NoncharacterCodePoint | |
477 | \& OtherAlphabetic | |
478 | \& OtherDefaultIgnorableCodePoint | |
479 | \& OtherGraphemeExtend | |
480 | \& OtherLowercase | |
481 | \& OtherMath | |
482 | \& OtherUppercase | |
483 | \& QuotationMark | |
484 | \& Radical | |
485 | \& SoftDotted | |
486 | \& TerminalPunctuation | |
487 | \& UnifiedIdeograph | |
488 | \& WhiteSpace | |
489 | .Ve | |
490 | .PP | |
491 | and there are further derived properties: | |
492 | .PP | |
493 | .Vb 4 | |
494 | \& Alphabetic Lu + Ll + Lt + Lm + Lo + OtherAlphabetic | |
495 | \& Lowercase Ll + OtherLowercase | |
496 | \& Uppercase Lu + OtherUppercase | |
497 | \& Math Sm + OtherMath | |
498 | .Ve | |
499 | .PP | |
500 | .Vb 2 | |
501 | \& ID_Start Lu + Ll + Lt + Lm + Lo + Nl | |
502 | \& ID_Continue ID_Start + Mn + Mc + Nd + Pc | |
503 | .Ve | |
504 | .PP | |
505 | .Vb 5 | |
506 | \& Any Any character | |
507 | \& Assigned Any non-Cn character (i.e. synonym for \eP{Cn}) | |
508 | \& Unassigned Synonym for \ep{Cn} | |
509 | \& Common Any character (or unassigned code point) | |
510 | \& not explicitly assigned to a script | |
511 | .Ve | |
512 | .PP | |
513 | For backward compatibility (with Perl 5.6), all properties mentioned | |
514 | so far may have \f(CW\*(C`Is\*(C'\fR prepended to their name, so \f(CW\*(C`\eP{IsLu}\*(C'\fR, for | |
515 | example, is equal to \f(CW\*(C`\eP{Lu}\*(C'\fR. | |
516 | .Sh "Blocks" | |
517 | .IX Subsection "Blocks" | |
518 | In addition to \fBscripts\fR, Unicode also defines \fBblocks\fR of | |
519 | characters. The difference between scripts and blocks is that the | |
520 | concept of scripts is closer to natural languages, while the concept | |
521 | of blocks is more of an artificial grouping based on groups of 256 | |
522 | Unicode characters. For example, the \f(CW\*(C`Latin\*(C'\fR script contains letters | |
523 | from many blocks but does not contain all the characters from those | |
524 | blocks. It does not, for example, contain digits, because digits are | |
525 | shared across many scripts. Digits and similar groups, like | |
526 | punctuation, are in a category called \f(CW\*(C`Common\*(C'\fR. | |
527 | .PP | |
528 | For more about scripts, see the \s-1UTR\s0 #24: | |
529 | .PP | |
530 | .Vb 1 | |
531 | \& http://www.unicode.org/unicode/reports/tr24/ | |
532 | .Ve | |
533 | .PP | |
534 | For more about blocks, see: | |
535 | .PP | |
536 | .Vb 1 | |
537 | \& http://www.unicode.org/Public/UNIDATA/Blocks.txt | |
538 | .Ve | |
539 | .PP | |
540 | Block names are given with the \f(CW\*(C`In\*(C'\fR prefix. For example, the | |
541 | Katakana block is referenced via \f(CW\*(C`\ep{InKatakana}\*(C'\fR. The \f(CW\*(C`In\*(C'\fR | |
542 | prefix may be omitted if there is no naming conflict with a script | |
543 | or any other property, but it is recommended that \f(CW\*(C`In\*(C'\fR always be used | |
544 | for block tests to avoid confusion. | |
545 | .PP | |
546 | These block names are supported: | |
547 | .PP | |
548 | .Vb 110 | |
549 | \& InAlphabeticPresentationForms | |
550 | \& InArabic | |
551 | \& InArabicPresentationFormsA | |
552 | \& InArabicPresentationFormsB | |
553 | \& InArmenian | |
554 | \& InArrows | |
555 | \& InBasicLatin | |
556 | \& InBengali | |
557 | \& InBlockElements | |
558 | \& InBopomofo | |
559 | \& InBopomofoExtended | |
560 | \& InBoxDrawing | |
561 | \& InBraillePatterns | |
562 | \& InBuhid | |
563 | \& InByzantineMusicalSymbols | |
564 | \& InCJKCompatibility | |
565 | \& InCJKCompatibilityForms | |
566 | \& InCJKCompatibilityIdeographs | |
567 | \& InCJKCompatibilityIdeographsSupplement | |
568 | \& InCJKRadicalsSupplement | |
569 | \& InCJKSymbolsAndPunctuation | |
570 | \& InCJKUnifiedIdeographs | |
571 | \& InCJKUnifiedIdeographsExtensionA | |
572 | \& InCJKUnifiedIdeographsExtensionB | |
573 | \& InCherokee | |
574 | \& InCombiningDiacriticalMarks | |
575 | \& InCombiningDiacriticalMarksforSymbols | |
576 | \& InCombiningHalfMarks | |
577 | \& InControlPictures | |
578 | \& InCurrencySymbols | |
579 | \& InCyrillic | |
580 | \& InCyrillicSupplementary | |
581 | \& InDeseret | |
582 | \& InDevanagari | |
583 | \& InDingbats | |
584 | \& InEnclosedAlphanumerics | |
585 | \& InEnclosedCJKLettersAndMonths | |
586 | \& InEthiopic | |
587 | \& InGeneralPunctuation | |
588 | \& InGeometricShapes | |
589 | \& InGeorgian | |
590 | \& InGothic | |
591 | \& InGreekExtended | |
592 | \& InGreekAndCoptic | |
593 | \& InGujarati | |
594 | \& InGurmukhi | |
595 | \& InHalfwidthAndFullwidthForms | |
596 | \& InHangulCompatibilityJamo | |
597 | \& InHangulJamo | |
598 | \& InHangulSyllables | |
599 | \& InHanunoo | |
600 | \& InHebrew | |
601 | \& InHighPrivateUseSurrogates | |
602 | \& InHighSurrogates | |
603 | \& InHiragana | |
604 | \& InIPAExtensions | |
605 | \& InIdeographicDescriptionCharacters | |
606 | \& InKanbun | |
607 | \& InKangxiRadicals | |
608 | \& InKannada | |
609 | \& InKatakana | |
610 | \& InKatakanaPhoneticExtensions | |
611 | \& InKhmer | |
612 | \& InLao | |
613 | \& InLatin1Supplement | |
614 | \& InLatinExtendedA | |
615 | \& InLatinExtendedAdditional | |
616 | \& InLatinExtendedB | |
617 | \& InLetterlikeSymbols | |
618 | \& InLowSurrogates | |
619 | \& InMalayalam | |
620 | \& InMathematicalAlphanumericSymbols | |
621 | \& InMathematicalOperators | |
622 | \& InMiscellaneousMathematicalSymbolsA | |
623 | \& InMiscellaneousMathematicalSymbolsB | |
624 | \& InMiscellaneousSymbols | |
625 | \& InMiscellaneousTechnical | |
626 | \& InMongolian | |
627 | \& InMusicalSymbols | |
628 | \& InMyanmar | |
629 | \& InNumberForms | |
630 | \& InOgham | |
631 | \& InOldItalic | |
632 | \& InOpticalCharacterRecognition | |
633 | \& InOriya | |
634 | \& InPrivateUseArea | |
635 | \& InRunic | |
636 | \& InSinhala | |
637 | \& InSmallFormVariants | |
638 | \& InSpacingModifierLetters | |
639 | \& InSpecials | |
640 | \& InSuperscriptsAndSubscripts | |
641 | \& InSupplementalArrowsA | |
642 | \& InSupplementalArrowsB | |
643 | \& InSupplementalMathematicalOperators | |
644 | \& InSupplementaryPrivateUseAreaA | |
645 | \& InSupplementaryPrivateUseAreaB | |
646 | \& InSyriac | |
647 | \& InTagalog | |
648 | \& InTagbanwa | |
649 | \& InTags | |
650 | \& InTamil | |
651 | \& InTelugu | |
652 | \& InThaana | |
653 | \& InThai | |
654 | \& InTibetan | |
655 | \& InUnifiedCanadianAboriginalSyllabics | |
656 | \& InVariationSelectors | |
657 | \& InYiRadicals | |
658 | \& InYiSyllables | |
659 | .Ve | |
660 | .IP "\(bu" 4 | |
661 | The special pattern \f(CW\*(C`\eX\*(C'\fR matches any extended Unicode | |
662 | sequence\*(--\*(L"a combining character sequence\*(R" in Standardese\*(--where the | |
663 | first character is a base character and subsequent characters are mark | |
664 | characters that apply to the base character. \f(CW\*(C`\eX\*(C'\fR is equivalent to | |
665 | \&\f(CW\*(C`(?:\ePM\epM*)\*(C'\fR. | |
666 | .IP "\(bu" 4 | |
667 | The \f(CW\*(C`tr///\*(C'\fR operator translates characters instead of bytes. Note | |
668 | that the \f(CW\*(C`tr///CU\*(C'\fR functionality has been removed. For similar | |
669 | functionality see pack('U0', ...) and pack('C0', ...). | |
670 | .IP "\(bu" 4 | |
671 | Case translation operators use the Unicode case translation tables | |
672 | when character input is provided. Note that \f(CW\*(C`uc()\*(C'\fR, or \f(CW\*(C`\eU\*(C'\fR in | |
673 | interpolated strings, translates to uppercase, while \f(CW\*(C`ucfirst\*(C'\fR, | |
674 | or \f(CW\*(C`\eu\*(C'\fR in interpolated strings, translates to titlecase in languages | |
675 | that make the distinction. | |
676 | .IP "\(bu" 4 | |
677 | Most operators that deal with positions or lengths in a string will | |
678 | automatically switch to using character positions, including | |
679 | \&\f(CW\*(C`chop()\*(C'\fR, \f(CW\*(C`substr()\*(C'\fR, \f(CW\*(C`pos()\*(C'\fR, \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`rindex()\*(C'\fR, | |
680 | \&\f(CW\*(C`sprintf()\*(C'\fR, \f(CW\*(C`write()\*(C'\fR, and \f(CW\*(C`length()\*(C'\fR. Operators that | |
681 | specifically do not switch include \f(CW\*(C`vec()\*(C'\fR, \f(CW\*(C`pack()\*(C'\fR, and | |
682 | \&\f(CW\*(C`unpack()\*(C'\fR. Operators that really don't care include \f(CW\*(C`chomp()\*(C'\fR, | |
683 | operators that treat strings as a bucket of bits such as \f(CW\*(C`sort()\*(C'\fR, | |
684 | and operators dealing with filenames. | |
685 | .IP "\(bu" 4 | |
686 | The \f(CW\*(C`pack()\*(C'\fR/\f(CW\*(C`unpack()\*(C'\fR letters \f(CW\*(C`c\*(C'\fR and \f(CW\*(C`C\*(C'\fR do \fInot\fR change, | |
687 | since they are often used for byte-oriented formats. Again, think | |
688 | \&\f(CW\*(C`char\*(C'\fR in the C language. | |
689 | .Sp | |
690 | There is a new \f(CW\*(C`U\*(C'\fR specifier that converts between Unicode characters | |
691 | and code points. | |
692 | .IP "\(bu" 4 | |
693 | The \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR functions work on characters, similar to | |
694 | \&\f(CW\*(C`pack("U")\*(C'\fR and \f(CW\*(C`unpack("U")\*(C'\fR, \fInot\fR \f(CW\*(C`pack("C")\*(C'\fR and | |
695 | \&\f(CW\*(C`unpack("C")\*(C'\fR. \f(CW\*(C`pack("C")\*(C'\fR and \f(CW\*(C`unpack("C")\*(C'\fR are methods for | |
696 | emulating byte-oriented \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR on Unicode strings. | |
697 | While these methods reveal the internal encoding of Unicode strings, | |
698 | that is not something one normally needs to care about at all. | |
699 | .IP "\(bu" 4 | |
700 | The bit string operators, \f(CW\*(C`& | ^ ~\*(C'\fR, can operate on character data. | |
701 | However, for backward compatibility, such as when using bit string | |
702 | operations when characters are all less than 256 in ordinal value, one | |
703 | should not use \f(CW\*(C`~\*(C'\fR (the bit complement) with characters of both | |
704 | values less than 256 and values greater than 256. Most importantly, | |
705 | DeMorgan's laws (\f(CW\*(C`~($x|$y) eq ~$x&~$y\*(C'\fR and \f(CW\*(C`~($x&$y) eq ~$x|~$y\*(C'\fR) | |
706 | will not hold. The reason for this mathematical \fIfaux pas\fR is that | |
707 | the complement cannot return \fBboth\fR the 8\-bit (byte\-wide) bit | |
708 | complement \fBand\fR the full character-wide bit complement. | |
709 | .IP "\(bu" 4 | |
710 | \&\fIlc()\fR, \fIuc()\fR, \fIlcfirst()\fR, and \fIucfirst()\fR work for the following cases: | |
711 | .RS 4 | |
712 | .IP "\(bu" 8 | |
713 | the case mapping is from a single Unicode character to another | |
714 | single Unicode character, or | |
715 | .IP "\(bu" 8 | |
716 | the case mapping is from a single Unicode character to more | |
717 | than one Unicode character. | |
718 | .RE | |
719 | .RS 4 | |
720 | .Sp | |
721 | The following cases do not yet work: | |
722 | .IP "\(bu" 8 | |
723 | the \*(L"final sigma\*(R" (Greek), and | |
724 | .IP "\(bu" 8 | |
725 | anything to do with locales (Lithuanian, Turkish, Azeri). | |
726 | .RE | |
727 | .RS 4 | |
728 | .Sp | |
729 | See the Unicode Technical Report #21, Case Mappings, for more details. | |
730 | .RE | |
731 | .IP "\(bu" 4 | |
732 | And finally, \f(CW\*(C`scalar reverse()\*(C'\fR reverses by character rather than by byte. | |
733 | .Sh "User-Defined Character Properties" | |
734 | .IX Subsection "User-Defined Character Properties" | |
735 | You can define your own character properties by defining subroutines | |
736 | whose names begin with \*(L"In\*(R" or \*(L"Is\*(R". The subroutines must be | |
737 | visible in the package that uses the properties. The user-defined | |
738 | properties can be used in the regular expression \f(CW\*(C`\ep\*(C'\fR and \f(CW\*(C`\eP\*(C'\fR | |
739 | constructs. | |
740 | .PP | |
741 | The subroutines must return a specially-formatted string, with one | |
742 | or more newline-separated lines. Each line must be one of the following: | |
743 | .IP "\(bu" 4 | |
744 | Two hexadecimal numbers separated by horizontal whitespace (space or | |
745 | tab characters) denoting a range of Unicode code points to include. | |
746 | .IP "\(bu" 4 | |
747 | Something to include, prefixed by \*(L"+\*(R": a built-in character | |
748 | property (prefixed by \*(L"utf8::\*(R"), to represent all the characters in that | |
749 | property; two hexadecimal code points for a range; or a single | |
750 | hexadecimal code point. | |
751 | .IP "\(bu" 4 | |
752 | Something to exclude, prefixed by \*(L"\-\*(R": an existing character | |
753 | property (prefixed by \*(L"utf8::\*(R"), for all the characters in that | |
754 | property; two hexadecimal code points for a range; or a single | |
755 | hexadecimal code point. | |
756 | .IP "\(bu" 4 | |
757 | Something to negate, prefixed by \*(L"!\*(R": an existing character | |
758 | property (prefixed by \*(L"utf8::\*(R") for all the characters except the | |
759 | characters in the property; two hexadecimal code points for a range; | |
760 | or a single hexadecimal code point. | |
761 | .PP | |
762 | For example, to define a property that covers both the Japanese | |
763 | syllabaries (hiragana and katakana), you can define | |
764 | .PP | |
765 | .Vb 6 | |
766 | \& sub InKana { | |
767 | \& return <<END; | |
768 | \& 3040\et309F | |
769 | \& 30A0\et30FF | |
770 | \& END | |
771 | \& } | |
772 | .Ve | |
773 | .PP | |
774 | Imagine that the here-doc end marker is at the beginning of the line. | |
775 | Now you can use \f(CW\*(C`\ep{InKana}\*(C'\fR and \f(CW\*(C`\eP{InKana}\*(C'\fR. | |
776 | .PP | |
777 | You could also have used the existing block property names: | |
778 | .PP | |
779 | .Vb 6 | |
780 | \& sub InKana { | |
781 | \& return <<'END'; | |
782 | \& +utf8::InHiragana | |
783 | \& +utf8::InKatakana | |
784 | \& END | |
785 | \& } | |
786 | .Ve | |
787 | .PP | |
788 | Suppose you wanted to match only the allocated characters, | |
789 | not the raw block ranges: in other words, you want to remove | |
790 | the non\-characters: | |
791 | .PP | |
792 | .Vb 7 | |
793 | \& sub InKana { | |
794 | \& return <<'END'; | |
795 | \& +utf8::InHiragana | |
796 | \& +utf8::InKatakana | |
797 | \& -utf8::IsCn | |
798 | \& END | |
799 | \& } | |
800 | .Ve | |
801 | .PP | |
802 | The negation is useful for defining (surprise!) negated classes. | |
803 | .PP | |
804 | .Vb 7 | |
805 | \& sub InNotKana { | |
806 | \& return <<'END'; | |
807 | \& !utf8::InHiragana | |
808 | \& -utf8::InKatakana | |
809 | \& +utf8::IsCn | |
810 | \& END | |
811 | \& } | |
812 | .Ve | |
813 | .Sh "Character Encodings for Input and Output" | |
814 | .IX Subsection "Character Encodings for Input and Output" | |
815 | See Encode. | |
816 | .Sh "Unicode Regular Expression Support Level" | |
817 | .IX Subsection "Unicode Regular Expression Support Level" | |
818 | The following list of Unicode support for regular expressions describes | |
819 | all the features currently supported. The references to \*(L"Level N\*(R" | |
820 | and the section numbers refer to the Unicode Technical Report 18, | |
821 | \&\*(L"Unicode Regular Expression Guidelines\*(R". | |
822 | .IP "\(bu" 4 | |
823 | Level 1 \- Basic Unicode Support | |
824 | .Sp | |
825 | .Vb 7 | |
826 | \& 2.1 Hex Notation - done [1] | |
827 | \& Named Notation - done [2] | |
828 | \& 2.2 Categories - done [3][4] | |
829 | \& 2.3 Subtraction - MISSING [5][6] | |
830 | \& 2.4 Simple Word Boundaries - done [7] | |
831 | \& 2.5 Simple Loose Matches - done [8] | |
832 | \& 2.6 End of Line - MISSING [9][10] | |
833 | .Ve | |
834 | .Sp | |
835 | .Vb 18 | |
836 | \& [ 1] \ex{...} | |
837 | \& [ 2] \eN{...} | |
838 | \& [ 3] . \ep{...} \eP{...} | |
839 | \& [ 4] now scripts (see UTR#24 Script Names) in addition to blocks | |
840 | \& [ 5] have negation | |
841 | \& [ 6] can use regular expression look-ahead [a] | |
842 | \& or user-defined character properties [b] to emulate subtraction | |
843 | \& [ 7] include Letters in word characters | |
844 | \& [ 8] note that Perl does Full case-folding in matching, not Simple: | |
845 | \&        for example U+1F88 is equivalent with U+1F00 U+03B9, | |
846 | \& not with 1F80. This difference matters for certain Greek | |
847 | \& capital letters with certain modifiers: the Full case-folding | |
848 | \& decomposes the letter, while the Simple case-folding would map | |
849 | \& it to a single character. | |
850 | \& [ 9] see UTR#13 Unicode Newline Guidelines | |
851 | \&   [10] should do ^ and $ also on \ex{85}, \ex{2028} and \ex{2029} | |
852 | \& (should also affect <>, $., and script line numbers) | |
853 | \& (the \ex{85}, \ex{2028} and \ex{2029} do match \es) | |
854 | .Ve | |
855 | .Sp | |
856 | [a] You can mimic class subtraction using lookahead. | |
857 | For example, what \s-1TR18\s0 might write as | |
858 | .Sp | |
859 | .Vb 1 | |
860 | \& [{Greek}-[{UNASSIGNED}]] | |
861 | .Ve | |
862 | .Sp | |
863 | in Perl can be written as: | |
864 | .Sp | |
865 | .Vb 2 | |
866 | \& (?!\ep{Unassigned})\ep{InGreekAndCoptic} | |
867 | \& (?=\ep{Assigned})\ep{InGreekAndCoptic} | |
868 | .Ve | |
869 | .Sp | |
870 | But in this particular example, you probably really want | |
871 | .Sp | |
872 | .Vb 1 | |
873 | \& \ep{GreekAndCoptic} | |
874 | .Ve | |
875 | .Sp | |
876 | which will match assigned characters known to be part of the Greek script. | |
877 | .Sp | |
878 | [b] See \*(L"User\-Defined Character Properties\*(R". | |
879 | .IP "\(bu" 4 | |
880 | Level 2 \- Extended Unicode Support | |
881 | .Sp | |
882 | .Vb 5 | |
883 | \& 3.1 Surrogates - MISSING | |
884 | \& 3.2 Canonical Equivalents - MISSING [11][12] | |
885 | \& 3.3 Locale-Independent Graphemes - MISSING [13] | |
886 | \& 3.4 Locale-Independent Words - MISSING [14] | |
887 | \& 3.5 Locale-Independent Loose Matches - MISSING [15] | |
888 | .Ve | |
889 | .Sp | |
890 | .Vb 5 | |
891 | \& [11] see UTR#15 Unicode Normalization | |
892 | \& [12] have Unicode::Normalize but not integrated to regexes | |
893 | \& [13] have \eX but at this level . should equal that | |
894 | \& [14] need three classes, not just \ew and \eW | |
895 | \& [15] see UTR#21 Case Mappings | |
896 | .Ve | |
897 | .IP "\(bu" 4 | |
898 | Level 3 \- Locale-Sensitive Support | |
899 | .Sp | |
900 | .Vb 5 | |
901 | \& 4.1 Locale-Dependent Categories - MISSING | |
902 | \& 4.2 Locale-Dependent Graphemes - MISSING [16][17] | |
903 | \& 4.3 Locale-Dependent Words - MISSING | |
904 | \& 4.4 Locale-Dependent Loose Matches - MISSING | |
905 | \& 4.5 Locale-Dependent Ranges - MISSING | |
906 | .Ve | |
907 | .Sp | |
908 | .Vb 2 | |
909 | \& [16] see UTR#10 Unicode Collation Algorithms | |
910 | \& [17] have Unicode::Collate but not integrated to regexes | |
911 | .Ve | |
912 | .Sh "Unicode Encodings" | |
913 | .IX Subsection "Unicode Encodings" | |
914 | Unicode characters are assigned to \fIcode points\fR, which are abstract | |
915 | numbers. To use these numbers, various encodings are needed. | |
916 | .IP "\(bu" 4 | |
917 | \&\s-1UTF\-8\s0 | |
918 | .Sp | |
919 | \&\s-1UTF\-8\s0 is a variable-length (1 to 6 bytes, current character allocations | |
920 | require 4 bytes), byte-order independent encoding. For \s-1ASCII\s0 (and we | |
921 | really do mean 7\-bit \s-1ASCII\s0, not another 8\-bit encoding), \s-1UTF\-8\s0 is | |
922 | transparent. | |
923 | .Sp | |
924 | The following table is from Unicode 3.2. | |
925 | .Sp | |
926 | .Vb 1 | |
927 | \& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte | |
928 | .Ve | |
929 | .Sp | |
930 | .Vb 10 | |
931 | \& U+0000..U+007F 00..7F | |
932 | \& U+0080..U+07FF C2..DF 80..BF | |
933 | \& U+0800..U+0FFF E0 A0..BF 80..BF | |
934 | \& U+1000..U+CFFF E1..EC 80..BF 80..BF | |
935 | \& U+D000..U+D7FF ED 80..9F 80..BF | |
936 | \& U+D800..U+DFFF ******* ill-formed ******* | |
937 | \& U+E000..U+FFFF EE..EF 80..BF 80..BF | |
938 | \& U+10000..U+3FFFF F0 90..BF 80..BF 80..BF | |
939 | \& U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF | |
940 | \& U+100000..U+10FFFF F4 80..8F 80..BF 80..BF | |
941 | .Ve | |
942 | .Sp | |
943 | Note the \f(CW\*(C`A0..BF\*(C'\fR in \f(CW\*(C`U+0800..U+0FFF\*(C'\fR, the \f(CW\*(C`80..9F\*(C'\fR in | |
944 | \&\f(CW\*(C`U+D000..U+D7FF\*(C'\fR, the \f(CW\*(C`90..BF\*(C'\fR in \f(CW\*(C`U+10000..U+3FFFF\*(C'\fR, and the | |
945 | \&\f(CW\*(C`80..8F\*(C'\fR in \f(CW\*(C`U+100000..U+10FFFF\*(C'\fR. The \*(L"gaps\*(R" are caused by legal | |
946 | \&\s-1UTF\-8\s0 avoiding non-shortest encodings: it is technically possible to | |
947 | UTF\-8\-encode a single code point in different ways, but that is | |
948 | explicitly forbidden, and the shortest possible encoding should always | |
949 | be used. So that's what Perl does. | |
950 | .Sp | |
951 | Another way to look at it is via bits: | |
952 | .Sp | |
953 | .Vb 1 | |
954 | \& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte | |
955 | .Ve | |
956 | .Sp | |
957 | .Vb 4 | |
958 | \& 0aaaaaaa 0aaaaaaa | |
959 | \& 00000bbbbbaaaaaa 110bbbbb 10aaaaaa | |
960 | \& ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa | |
961 | \& 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa | |
962 | .Ve | |
963 | .Sp | |
964 | As you can see, the continuation bytes all begin with \f(CW10\fR, and the | |
965 | leading bits of the start byte tell how many bytes there are in the | |
966 | encoded character. | |
967 | .IP "\(bu" 4 | |
968 | UTF-EBCDIC | |
969 | .Sp | |
970 | Like \s-1UTF\-8\s0 but EBCDIC\-safe, in the way that \s-1UTF\-8\s0 is ASCII\-safe. | |
971 | .IP "\(bu" 4 | |
972 | \&\s-1UTF\-16\s0, \s-1UTF\-16BE\s0, \s-1UTF\-16LE\s0, Surrogates, and BOMs (Byte Order Marks) | |
973 | .Sp | |
974 | The following items are mostly for reference and general Unicode | |
975 | knowledge; Perl doesn't use these constructs internally. | |
976 | .Sp | |
977 | \&\s-1UTF\-16\s0 is a 2 or 4 byte encoding. The Unicode code points | |
978 | \&\f(CW\*(C`U+0000..U+FFFF\*(C'\fR are stored in a single 16\-bit unit, and the code | |
979 | points \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR in two 16\-bit units. The latter case is | |
980 | using \fIsurrogates\fR, the first 16\-bit unit being the \fIhigh | |
981 | surrogate\fR, and the second being the \fIlow surrogate\fR. | |
982 | .Sp | |
983 | Surrogates are code points set aside to encode the \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR | |
984 | range of Unicode code points in pairs of 16\-bit units. The \fIhigh | |
985 | surrogates\fR are the range \f(CW\*(C`U+D800..U+DBFF\*(C'\fR, and the \fIlow surrogates\fR | |
986 | are the range \f(CW\*(C`U+DC00..U+DFFF\*(C'\fR. The surrogate encoding is | |
987 | .Sp | |
988 | .Vb 2 | |
989 | \& $hi = ($uni - 0x10000) / 0x400 + 0xD800; | |
990 | \& $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | |
991 | .Ve | |
992 | .Sp | |
993 | and the decoding is | |
994 | .Sp | |
995 | .Vb 1 | |
996 | \& $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); | |
997 | .Ve | |
998 | .Sp | |
999 | If you try to generate surrogates (for example by using \fIchr()\fR), you | |
1000 | will get a warning if warnings are turned on, because those code | |
1001 | points are not valid for a Unicode character. | |
1002 | .Sp | |
1003 | Because of the 16\-bitness, \s-1UTF\-16\s0 is byte-order dependent. \s-1UTF\-16\s0 | |
1004 | itself can be used for in-memory computations, but if storage or | |
1005 | transfer is required either \s-1UTF\-16BE\s0 (big\-endian) or \s-1UTF\-16LE\s0 | |
1006 | (little\-endian) encodings must be chosen. | |
1007 | .Sp | |
1008 | This introduces another problem: what if you just know that your data | |
1009 | is \s-1UTF\-16\s0, but you don't know which endianness? Byte Order Marks, or | |
1010 | BOMs, are a solution to this. A special character has been reserved | |
1011 | in Unicode to function as a byte order marker: the character with the | |
1012 | code point \f(CW\*(C`U+FEFF\*(C'\fR is the \s-1BOM\s0. | |
1013 | .Sp | |
1014 | The trick is that if you read a \s-1BOM\s0, you will know the byte order, | |
1015 | since if it was written on a big-endian platform, you will read the | |
1016 | bytes \f(CW\*(C`0xFE 0xFF\*(C'\fR, but if it was written on a little-endian platform, | |
1017 | you will read the bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR. (And if the originating platform | |
1018 | was writing in \s-1UTF\-8\s0, you will read the bytes \f(CW\*(C`0xEF 0xBB 0xBF\*(C'\fR.) | |
1019 | .Sp | |
1020 | The way this trick works is that the character with the code point | |
1021 | \&\f(CW\*(C`U+FFFE\*(C'\fR is guaranteed not to be a valid Unicode character, so the | |
1022 | sequence of bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR is unambiguously \*(L"\s-1BOM\s0, represented in | |
1023 | little-endian format\*(R" and cannot be \*(L"\f(CW\*(C`U+FFFE\*(C'\fR, represented in big-endian | |
1024 | format\*(R". | |
1025 | .IP "\(bu" 4 | |
1026 | \&\s-1UTF\-32\s0, \s-1UTF\-32BE\s0, \s-1UTF\-32LE\s0 | |
1027 | .Sp | |
1028 | The \s-1UTF\-32\s0 family is pretty much like the \s-1UTF\-16\s0 family, except that | |
1029 | the units are 32\-bit, and therefore the surrogate scheme is not | |
1030 | needed. The \s-1BOM\s0 signatures will be \f(CW\*(C`0x00 0x00 0xFE 0xFF\*(C'\fR for \s-1BE\s0 and | |
1031 | \&\f(CW\*(C`0xFF 0xFE 0x00 0x00\*(C'\fR for \s-1LE\s0. | |
1032 | .IP "\(bu" 4 | |
1033 | \&\s-1UCS\-2\s0, \s-1UCS\-4\s0 | |
1034 | .Sp | |
1035 | Encodings defined by the \s-1ISO\s0 10646 standard. \s-1UCS\-2\s0 is a 16\-bit | |
1036 | encoding. Unlike \s-1UTF\-16\s0, \s-1UCS\-2\s0 is not extensible beyond \f(CW\*(C`U+FFFF\*(C'\fR, | |
1037 | because it does not use surrogates. \s-1UCS\-4\s0 is a 32\-bit encoding, | |
1038 | functionally identical to \s-1UTF\-32\s0. | |
1039 | .IP "\(bu" 4 | |
1040 | \&\s-1UTF\-7\s0 | |
1041 | .Sp | |
1042 | A seven-bit safe (non\-eight\-bit) encoding, which is useful if the | |
1043 | transport or storage is not eight-bit safe. Defined by \s-1RFC\s0 2152. | |
1044 | .Sh "Security Implications of Unicode" | |
1045 | .IX Subsection "Security Implications of Unicode" | |
1046 | .IP "\(bu" 4 | |
1047 | Malformed \s-1UTF\-8\s0 | |
1048 | .Sp | |
1049 | Unfortunately, the specification of \s-1UTF\-8\s0 leaves some room for | |
1050 | interpretation of how many bytes of encoded output one should generate | |
1051 | from one input Unicode character. Strictly speaking, the shortest | |
1052 | possible sequence of \s-1UTF\-8\s0 bytes should be generated, | |
1053 | because otherwise there is potential for an input buffer overflow at | |
1054 | the receiving end of a \s-1UTF\-8\s0 connection. Perl always generates the | |
1055 | shortest length \s-1UTF\-8\s0, and with warnings on Perl will warn about | |
1056 | non-shortest length \s-1UTF\-8\s0 along with other malformations, such as the | |
1057 | surrogates, which are not real Unicode code points. | |
1058 | .IP "\(bu" 4 | |
1059 | Regular expressions behave slightly differently between byte data and | |
1060 | character (Unicode) data. For example, the \*(L"word character\*(R" character | |
1061 | class \f(CW\*(C`\ew\*(C'\fR will work differently depending on if data is eight-bit bytes | |
1062 | or Unicode. | |
1063 | .Sp | |
1064 | In the first case, the set of \f(CW\*(C`\ew\*(C'\fR characters is either small\*(--the | |
1065 | default set of alphabetic characters, digits, and the \*(L"_\*(R"\*(--or, if you | |
1066 | are using a locale (see perllocale), the \f(CW\*(C`\ew\*(C'\fR might contain a few | |
1067 | more letters according to your language and country. | |
1068 | .Sp | |
1069 | In the second case, the \f(CW\*(C`\ew\*(C'\fR set of characters is much, much larger. | |
1070 | Most importantly, even in the set of the first 256 characters, it will | |
1071 | probably match different characters: unlike most locales, which are | |
1072 | specific to a language and country pair, Unicode classifies all the | |
1073 | characters that are letters \fIsomewhere\fR as \f(CW\*(C`\ew\*(C'\fR. For example, your | |
1074 | locale might not think that \s-1LATIN\s0 \s-1SMALL\s0 \s-1LETTER\s0 \s-1ETH\s0 is a letter (unless | |
1075 | you happen to speak Icelandic), but Unicode does. | |
1076 | .Sp | |
1077 | As discussed elsewhere, Perl has one foot (two hooves?) planted in | |
1078 | each of two worlds: the old world of bytes and the new world of | |
1079 | characters, upgrading from bytes to characters when necessary. | |
1080 | If your legacy code does not explicitly use Unicode, no automatic | |
1081 | switch-over to characters should happen. Characters shouldn't get | |
1082 | downgraded to bytes, either. It is possible to accidentally mix bytes | |
1083 | and characters, however (see perluniintro), in which case \f(CW\*(C`\ew\*(C'\fR in | |
1084 | regular expressions might start behaving differently. Review your | |
1085 | code. Use warnings and the \f(CW\*(C`strict\*(C'\fR pragma. | |
1086 | .Sh "Unicode in Perl on \s-1EBCDIC\s0" | |
1087 | .IX Subsection "Unicode in Perl on EBCDIC" | |
1088 | The way Unicode is handled on \s-1EBCDIC\s0 platforms is still | |
1089 | experimental. On such platforms, references to \s-1UTF\-8\s0 encoding in this | |
1090 | document and elsewhere should be read as meaning the UTF-EBCDIC | |
1091 | specified in Unicode Technical Report 16, unless \s-1ASCII\s0 vs. \s-1EBCDIC\s0 issues | |
1092 | are specifically discussed. There is no \f(CW\*(C`utfebcdic\*(C'\fR pragma or | |
1093 | \&\*(L":utfebcdic\*(R" layer; rather, \*(L"utf8\*(R" and \*(L":utf8\*(R" are reused to mean | |
1094 | the platform's \*(L"natural\*(R" 8\-bit encoding of Unicode. See perlebcdic | |
1095 | for more discussion of the issues. | |
1096 | .Sh "Locales" | |
1097 | .IX Subsection "Locales" | |
1098 | Usually locale settings and Unicode do not affect each other, but | |
1099 | there are a couple of exceptions: | |
1100 | .IP "\(bu" 4 | |
1101 | If your locale environment variables (\s-1LANGUAGE\s0, \s-1LC_ALL\s0, \s-1LC_CTYPE\s0, \s-1LANG\s0) | |
1102 | contain the strings '\s-1UTF\-8\s0' or '\s-1UTF8\s0' (case\-insensitive matching), | |
1103 | the default encodings of your \s-1STDIN\s0, \s-1STDOUT\s0, and \s-1STDERR\s0, and of | |
1104 | \&\fBany subsequent file open\fR, are considered to be \s-1UTF\-8\s0. | |
1105 | .IP "\(bu" 4 | |
1106 | Perl tries really hard to work both with Unicode and the old | |
1107 | byte-oriented world. Most often this is nice, but sometimes Perl's | |
1108 | straddling of the proverbial fence causes problems. | |
1109 | .Sh "Using Unicode in \s-1XS\s0" | |
1110 | .IX Subsection "Using Unicode in XS" | |
1111 | If you want to handle Perl Unicode in \s-1XS\s0 extensions, you may find | |
1112 | the following C APIs useful. See perlapi for details. | |
1113 | .IP "\(bu" 4 | |
1114 | \&\f(CW\*(C`DO_UTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR flag is on and the bytes | |
1115 | pragma is not in effect. \f(CW\*(C`SvUTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR | |
1116 | flag is on; the bytes pragma is ignored. The \f(CW\*(C`UTF8\*(C'\fR flag being on | |
1117 | does \fBnot\fR mean that there are any characters of code points greater | |
1118 | than 255 (or 127) in the scalar or that there are even any characters | |
1119 | in the scalar. What the \f(CW\*(C`UTF8\*(C'\fR flag means is that the sequence of | |
1120 | octets in the representation of the scalar is the sequence of \s-1UTF\-8\s0 | |
1121 | encoded code points of the characters of a string. The \f(CW\*(C`UTF8\*(C'\fR flag | |
1122 | being off means that each octet in this representation encodes a | |
1123 | single character with code point 0..255 within the string. Perl's | |
1124 | Unicode model is not to use \s-1UTF\-8\s0 until it is absolutely necessary. | |
1125 | .IP "\(bu" 4 | |
1126 | \&\f(CW\*(C`uvuni_to_utf8(buf, chr)\*(C'\fR writes a Unicode character code point into | |
1127 | a buffer encoding the code point as \s-1UTF\-8\s0, and returns a pointer | |
1128 | pointing after the \s-1UTF\-8\s0 bytes. | |
1129 | .IP "\(bu" 4 | |
1130 | \&\f(CW\*(C`utf8_to_uvuni(buf, lenp)\*(C'\fR reads \s-1UTF\-8\s0 encoded bytes from a buffer and | |
1131 | returns the Unicode character code point and, optionally, the length of | |
1132 | the \s-1UTF\-8\s0 byte sequence. | |
1133 | .IP "\(bu" 4 | |
1134 | \&\f(CW\*(C`utf8_length(start, end)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded buffer | |
1135 | in characters. \f(CW\*(C`sv_len_utf8(sv)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded | |
1136 | scalar. | |
1137 | .IP "\(bu" 4 | |
1138 | \&\f(CW\*(C`sv_utf8_upgrade(sv)\*(C'\fR converts the string of the scalar to its \s-1UTF\-8\s0 | |
1139 | encoded form. \f(CW\*(C`sv_utf8_downgrade(sv)\*(C'\fR does the opposite, if | |
1140 | possible. \f(CW\*(C`sv_utf8_encode(sv)\*(C'\fR is like sv_utf8_upgrade except that | |
1141 | it does not set the \f(CW\*(C`UTF8\*(C'\fR flag. \f(CW\*(C`sv_utf8_decode()\*(C'\fR does the | |
1142 | opposite of \f(CW\*(C`sv_utf8_encode()\*(C'\fR. Note that none of these are to be | |
1143 | used as general-purpose encoding or decoding interfaces: \f(CW\*(C`use Encode\*(C'\fR | |
1144 | for that. \f(CW\*(C`sv_utf8_upgrade()\*(C'\fR is affected by the encoding pragma | |
1145 | but \f(CW\*(C`sv_utf8_downgrade()\*(C'\fR is not (since the encoding pragma is | |
1146 | designed to be a one-way street). | |
1147 | .IP "\(bu" 4 | |
1148 | \&\f(CWis_utf8_char(s)\fR returns true if the pointer points to a valid \s-1UTF\-8\s0 | |
1149 | character. | |
1150 | .IP "\(bu" 4 | |
1151 | \&\f(CW\*(C`is_utf8_string(buf, len)\*(C'\fR returns true if \f(CW\*(C`len\*(C'\fR bytes of the buffer | |
1152 | are valid \s-1UTF\-8\s0. | |
1153 | .IP "\(bu" 4 | |
1154 | \&\f(CW\*(C`UTF8SKIP(buf)\*(C'\fR will return the number of bytes in the \s-1UTF\-8\s0 encoded | |
1155 | character in the buffer. \f(CW\*(C`UNISKIP(chr)\*(C'\fR will return the number of bytes | |
1156 | required to UTF\-8\-encode the Unicode character code point. \f(CW\*(C`UTF8SKIP()\*(C'\fR | |
1157 | is useful for example for iterating over the characters of a \s-1UTF\-8\s0 | |
1158 | encoded buffer; \f(CW\*(C`UNISKIP()\*(C'\fR is useful, for example, in computing | |
1159 | the size required for a \s-1UTF\-8\s0 encoded buffer. | |
1160 | .IP "\(bu" 4 | |
1161 | \&\f(CW\*(C`utf8_distance(a, b)\*(C'\fR will tell the distance in characters between the | |
1162 | two pointers pointing to the same \s-1UTF\-8\s0 encoded buffer. | |
1163 | .IP "\(bu" 4 | |
1164 | \&\f(CW\*(C`utf8_hop(s, off)\*(C'\fR will return a pointer to a \s-1UTF\-8\s0 encoded buffer | |
1165 | that is \f(CW\*(C`off\*(C'\fR (positive or negative) Unicode characters displaced | |
1166 | from the \s-1UTF\-8\s0 buffer \f(CW\*(C`s\*(C'\fR. Be careful not to overstep the buffer: | |
1167 | \&\f(CW\*(C`utf8_hop()\*(C'\fR will merrily run off the end or the beginning of the | |
1168 | buffer if told to do so. | |
1169 | .IP "\(bu" 4 | |
1170 | \&\f(CW\*(C`pv_uni_display(dsv, spv, len, pvlim, flags)\*(C'\fR and | |
1171 | \&\f(CW\*(C`sv_uni_display(dsv, ssv, pvlim, flags)\*(C'\fR are useful for debugging the | |
1172 | output of Unicode strings and scalars. By default they are useful | |
1173 | only for debugging\*(--they display \fBall\fR characters as hexadecimal code | |
1174 | points\*(--but with the flags \f(CW\*(C`UNI_DISPLAY_ISPRINT\*(C'\fR, | |
1175 | \&\f(CW\*(C`UNI_DISPLAY_BACKSLASH\*(C'\fR, and \f(CW\*(C`UNI_DISPLAY_QQ\*(C'\fR you can make the | |
1176 | output more readable. | |
1177 | .IP "\(bu" 4 | |
1178 | \&\f(CW\*(C`ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)\*(C'\fR can be used to | |
1179 | compare two strings case-insensitively in Unicode. For case-sensitive | |
1180 | comparisons you can just use \f(CW\*(C`memEQ()\*(C'\fR and \f(CW\*(C`memNE()\*(C'\fR as usual. | |
1181 | .PP | |
1182 | For more information, see perlapi, and \fIutf8.c\fR and \fIutf8.h\fR | |
1183 | in the Perl source code distribution. | |
1184 | .SH "BUGS" | |
1185 | .IX Header "BUGS" | |
1186 | .Sh "Interaction with Locales" | |
1187 | .IX Subsection "Interaction with Locales" | |
1188 | Use of locales with Unicode data may lead to odd results. Currently, | |
1189 | Perl attempts to attach 8\-bit locale info to characters in the range | |
1190 | 0..255, but this technique is demonstrably incorrect for locales that | |
1191 | use characters above that range when mapped into Unicode. Perl's | |
1192 | Unicode support will also tend to run slower. Use of locales with | |
1193 | Unicode is discouraged. | |
1194 | .Sh "Interaction with Extensions" | |
1195 | .IX Subsection "Interaction with Extensions" | |
1196 | When Perl exchanges data with an extension, the extension should be | |
1197 | able to understand the \s-1UTF\-8\s0 flag and act accordingly. If the | |
1198 | extension doesn't know about the flag, it's likely that the extension | |
1199 | will return incorrectly-flagged data. | |
1200 | .PP | |
1201 | So if you're working with Unicode data, consult the documentation of | |
1202 | every module you're using if there are any issues with Unicode data | |
1203 | exchange. If the documentation does not talk about Unicode at all, | |
1204 | suspect the worst and probably look at the source to learn how the | |
1205 | module is implemented. Modules written completely in Perl shouldn't | |
1206 | cause problems. Modules that directly or indirectly access code written | |
1207 | in other programming languages are at risk. | |
1208 | .PP | |
1209 | For affected functions, the simple strategy to avoid data corruption is | |
1210 | to always make the encoding of the exchanged data explicit. Choose an | |
1211 | encoding that you know the extension can handle. Convert arguments passed | |
1212 | to the extensions to that encoding and convert results back from that | |
1213 | encoding. Write wrapper functions that do the conversions for you, so | |
1214 | you can later change the functions when the extension catches up. | |
1215 | .PP | |
1216 | To provide an example, let's say the popular Foo::Bar::escape_html | |
1217 | function doesn't deal with Unicode data yet. The wrapper function | |
1218 | would convert the argument to raw \s-1UTF\-8\s0 and convert the result back to | |
1219 | Perl's internal representation like so: | |
1220 | .PP | |
1221 | .Vb 5 | |
1222 | \& sub my_escape_html ($) { | |
1223 | \& my($what) = shift; | |
1224 | \& return unless defined $what; | |
1225 | \& Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what))); | |
1226 | \& } | |
1227 | .Ve | |
1228 | .PP | |
1229 | Sometimes, when the extension does not convert data but just stores | |
1230 | and retrieves them, you will be in a position to use the otherwise | |
1231 | dangerous \fIEncode::_utf8_on()\fR function. Let's say the popular | |
1232 | \&\f(CW\*(C`Foo::Bar\*(C'\fR extension, written in C, provides a \f(CW\*(C`param\*(C'\fR method that | |
1233 | lets you store and retrieve data according to these prototypes: | |
1234 | .PP | |
1235 | .Vb 2 | |
1236 | \& $self->param($name, $value); # set a scalar | |
1237 | \& $value = $self->param($name); # retrieve a scalar | |
1238 | .Ve | |
1239 | .PP | |
1240 | If it does not yet provide support for any encoding, one could write a | |
1241 | derived class with such a \f(CW\*(C`param\*(C'\fR method: | |
1242 | .PP | |
1243 | .Vb 12 | |
1244 | \& sub param { | |
1245 | \& my($self,$name,$value) = @_; | |
1246 | \& utf8::upgrade($name); # make sure it is UTF-8 encoded | |
1247 | \&      if (defined $value) { | |
1248 | \& utf8::upgrade($value); # make sure it is UTF-8 encoded | |
1249 | \& return $self->SUPER::param($name,$value); | |
1250 | \& } else { | |
1251 | \& my $ret = $self->SUPER::param($name); | |
1252 | \& Encode::_utf8_on($ret); # we know, it is UTF-8 encoded | |
1253 | \& return $ret; | |
1254 | \& } | |
1255 | \& } | |
1256 | .Ve | |
1257 | .PP | |
1258 | Some extensions provide filters on data entry/exit points, such as | |
1259 | DB_File::filter_store_key and family. Look out for such filters in | |
1260 | the documentation of your extensions; they can make the transition to | |
1261 | Unicode data much easier. | |
1262 | .Sh "Speed" | |
1263 | .IX Subsection "Speed" | |
1264 | Some functions are slower when working on \s-1UTF\-8\s0 encoded strings than | |
1265 | on byte encoded strings. All functions that need to hop over | |
1266 | characters such as \fIlength()\fR, \fIsubstr()\fR or \fIindex()\fR can work \fBmuch\fR | |
1267 | faster when the underlying data are byte\-encoded. Witness the | |
1268 | following benchmark: | |
1269 | .PP | |
1270 | .Vb 18 | |
1271 | \& % perl -e ' | |
1272 | \& use Benchmark; | |
1273 | \& use strict; | |
1274 | \& our $l = 10000; | |
1275 | \& our $u = our $b = "x" x $l; | |
1276 | \& substr($u,0,1) = "\ex{100}"; | |
1277 | \& timethese(-2,{ | |
1278 | \& LENGTH_B => q{ length($b) }, | |
1279 | \& LENGTH_U => q{ length($u) }, | |
1280 | \& SUBSTR_B => q{ substr($b, $l/4, $l/2) }, | |
1281 | \& SUBSTR_U => q{ substr($u, $l/4, $l/2) }, | |
1282 | \& }); | |
1283 | \& ' | |
1284 | \& Benchmark: running LENGTH_B, LENGTH_U, SUBSTR_B, SUBSTR_U for at least 2 CPU seconds... | |
1285 | \& LENGTH_B: 2 wallclock secs ( 2.36 usr + 0.00 sys = 2.36 CPU) @ 5649983.05/s (n=13333960) | |
1286 | \& LENGTH_U: 2 wallclock secs ( 2.11 usr + 0.00 sys = 2.11 CPU) @ 12155.45/s (n=25648) | |
1287 | \& SUBSTR_B: 3 wallclock secs ( 2.16 usr + 0.00 sys = 2.16 CPU) @ 374480.09/s (n=808877) | |
1288 | \& SUBSTR_U: 2 wallclock secs ( 2.11 usr + 0.00 sys = 2.11 CPU) @ 6791.00/s (n=14329) | |
1289 | .Ve | |
1290 | .PP | |
1291 | The numbers show an incredible slowness on long \s-1UTF\-8\s0 strings. You | |
1292 | should carefully avoid using these functions in tight loops. If you | |
1293 | want to iterate over characters, the superior coding technique would | |
1294 | split the characters into an array instead of using substr, as the following | |
1295 | benchmark shows: | |
1296 | .PP | |
1297 | .Vb 18 | |
1298 | \& % perl -e ' | |
1299 | \& use Benchmark; | |
1300 | \& use strict; | |
1301 | \& our $l = 10000; | |
1302 | \& our $u = our $b = "x" x $l; | |
1303 | \& substr($u,0,1) = "\ex{100}"; | |
1304 | \& timethese(-5,{ | |
1305 | \& SPLIT_B => q{ for my $c (split //, $b){} }, | |
1306 | \& SPLIT_U => q{ for my $c (split //, $u){} }, | |
1307 | \& SUBSTR_B => q{ for my $i (0..length($b)-1){my $c = substr($b,$i,1);} }, | |
1308 | \& SUBSTR_U => q{ for my $i (0..length($u)-1){my $c = substr($u,$i,1);} }, | |
1309 | \& }); | |
1310 | \& ' | |
1311 | \& Benchmark: running SPLIT_B, SPLIT_U, SUBSTR_B, SUBSTR_U for at least 5 CPU seconds... | |
1312 | \& SPLIT_B: 6 wallclock secs ( 5.29 usr + 0.00 sys = 5.29 CPU) @ 56.14/s (n=297) | |
1313 | \& SPLIT_U: 5 wallclock secs ( 5.17 usr + 0.01 sys = 5.18 CPU) @ 55.21/s (n=286) | |
1314 | \& SUBSTR_B: 5 wallclock secs ( 5.34 usr + 0.00 sys = 5.34 CPU) @ 123.22/s (n=658) | |
1315 | \& SUBSTR_U: 7 wallclock secs ( 6.20 usr + 0.00 sys = 6.20 CPU) @ 0.81/s (n=5) | |
1316 | .Ve | |
1317 | .PP | |
1318 | Even though the algorithm based on \f(CW\*(C`substr()\*(C'\fR is faster than | |
1319 | \&\f(CW\*(C`split()\*(C'\fR for byte-encoded data, it pales in comparison to the speed | |
1320 | of \f(CW\*(C`split()\*(C'\fR when used with \s-1UTF\-8\s0 data. | |
1321 | .SH "SEE ALSO" | |
1322 | .IX Header "SEE ALSO" | |
1323 | perluniintro, encoding, Encode, open, utf8, bytes, | |
1324 | perlretut, \*(L"${^WIDE_SYSTEM_CALLS}\*(R" in perlvar |