Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLUNICODE 1" | |
132 | .TH PERLUNICODE 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perlunicode \- Unicode support in Perl | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | .Sh "Important Caveats" | |
138 | .IX Subsection "Important Caveats" | |
139 | Unicode support is an extensive requirement. While Perl does not | |
140 | implement the Unicode standard or the accompanying technical reports | |
141 | from cover to cover, Perl does support many Unicode features. | |
142 | .IP "Input and Output Layers" 4 | |
143 | .IX Item "Input and Output Layers" | |
144 | Perl knows when a filehandle uses Perl's internal Unicode encodings | |
145 | (\s-1UTF\-8\s0, or UTF-EBCDIC if in \s-1EBCDIC\s0) if the filehandle is opened with | |
146 | the \*(L":utf8\*(R" layer. Other encodings can be converted to Perl's | |
147 | encoding on input or from Perl's encoding on output by use of the | |
148 | \&\*(L":encoding(...)\*(R" layer. See open. | |
149 | .Sp | |
150 | To indicate that Perl source itself is using a particular encoding, | |
151 | see encoding. | |
152 | .IP "Regular Expressions" 4 | |
153 | .IX Item "Regular Expressions" | |
154 | The regular expression compiler produces polymorphic opcodes. That is, | |
155 | the pattern adapts to the data and automatically switches to the Unicode | |
156 | character scheme when presented with Unicode data\*(--or instead uses | |
157 | a traditional byte scheme when presented with byte data. | |
158 | .ie n .IP """use utf8"" still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4 | |
159 | .el .IP "\f(CWuse utf8\fR still needed to enable \s-1UTF\-8/UTF\-EBCDIC\s0 in scripts" 4 | |
160 | .IX Item "use utf8 still needed to enable UTF-8/UTF-EBCDIC in scripts" | |
161 | As a compatibility measure, the \f(CW\*(C`use utf8\*(C'\fR pragma must be explicitly | |
162 | included to enable recognition of \s-1UTF\-8\s0 in the Perl scripts themselves | |
163 | (in string or regular expression literals, or in identifier names) on | |
164 | ASCII-based machines or to recognize UTF-EBCDIC on EBCDIC-based | |
165 | machines. \fBThese are the only times when an explicit \f(CB\*(C`use utf8\*(C'\fB | |
166 | is needed.\fR See utf8. | |
167 | .Sp | |
168 | You can also use the \f(CW\*(C`encoding\*(C'\fR pragma to change the default encoding | |
169 | of the data in your script; see encoding. | |
170 | .IP "BOM-marked scripts and \s-1UTF\-16\s0 scripts autodetected" 4 | |
171 | .IX Item "BOM-marked scripts and UTF-16 scripts autodetected" | |
172 | If a Perl script begins marked with the Unicode \s-1BOM\s0 (\s-1UTF\-16LE\s0, \s-1UTF\-16BE\s0, | |
173 | or \s-1UTF\-8\s0), or if the script looks like non-BOM-marked \s-1UTF\-16\s0 of either | |
174 | endianness, Perl will correctly read in the script as Unicode. | |
175 | (BOMless \s-1UTF\-8\s0 cannot be effectively recognized or differentiated from | |
176 | \&\s-1ISO\s0 8859\-1 or other eight-bit encodings.) | |
177 | .ie n .IP """use encoding"" needed to upgrade non\-Latin\-1 byte strings" 4 | |
178 | .el .IP "\f(CWuse encoding\fR needed to upgrade non\-Latin\-1 byte strings" 4 | |
179 | .IX Item "use encoding needed to upgrade non-Latin-1 byte strings" | |
180 | By default, there is a fundamental asymmetry in Perl's Unicode model: | |
181 | implicit upgrading from byte strings to Unicode strings assumes that | |
182 | they were encoded in \fI\s-1ISO\s0 8859\-1 (Latin\-1)\fR, but Unicode strings are | |
183 | downgraded with \s-1UTF\-8\s0 encoding. This happens because the first 256 | |
184 | codepoints in Unicode happen to agree with Latin\-1. | |
185 | .Sp | |
186 | If you wish to interpret byte strings as \s-1UTF\-8\s0 instead, use the | |
187 | \&\f(CW\*(C`encoding\*(C'\fR pragma: | |
188 | .Sp | |
189 | .Vb 1 | |
190 | \& use encoding 'utf8'; | |
191 | .Ve | |
192 | .Sp | |
193 | See \*(L"Byte and Character Semantics\*(R" for more details. | |
194 | .Sh "Byte and Character Semantics" | |
195 | .IX Subsection "Byte and Character Semantics" | |
196 | Beginning with version 5.6, Perl uses logically-wide characters to | |
197 | represent strings internally. | |
198 | .PP | |
199 | In future, Perl-level operations will be expected to work with | |
200 | characters rather than bytes. | |
201 | .PP | |
202 | However, as an interim compatibility measure, Perl aims to | |
203 | provide a safe migration path from byte semantics to character | |
204 | semantics for programs. For operations where Perl can unambiguously | |
205 | decide that the input data are characters, Perl switches to | |
206 | character semantics. For operations where this determination cannot | |
207 | be made without additional information from the user, Perl decides in | |
208 | favor of compatibility and chooses to use byte semantics. | |
209 | .PP | |
210 | This behavior preserves compatibility with earlier versions of Perl, | |
211 | which allowed byte semantics in Perl operations only if | |
212 | none of the program's inputs were marked as being a source of Unicode | |
213 | character data. Such data may come from filehandles, from calls to | |
214 | external programs, from information provided by the system (such as \f(CW%ENV\fR), | |
215 | or from literals and constants in the source text. | |
216 | .PP | |
217 | The \f(CW\*(C`bytes\*(C'\fR pragma will always, regardless of platform, force byte | |
218 | semantics in a particular lexical scope. See bytes. | |
219 | .PP | |
220 | The \f(CW\*(C`utf8\*(C'\fR pragma is primarily a compatibility device that enables | |
221 | recognition of \s-1UTF\-\s0(8|EBCDIC) in literals encountered by the parser. | |
222 | Note that this pragma is only required while Perl defaults to byte | |
223 | semantics; when character semantics become the default, this pragma | |
224 | may become a no\-op. See utf8. | |
225 | .PP | |
226 | Unless explicitly stated, Perl operators use character semantics | |
227 | for Unicode data and byte semantics for non-Unicode data. | |
228 | The decision to use character semantics is made transparently. If | |
229 | input data comes from a Unicode source\*(--for example, if a character | |
230 | encoding layer is added to a filehandle or a literal Unicode | |
231 | string constant appears in a program\*(--character semantics apply. | |
232 | Otherwise, byte semantics are in effect. The \f(CW\*(C`bytes\*(C'\fR pragma should | |
233 | be used to force byte semantics on Unicode data. | |
234 | .PP | |
235 | If strings operating under byte semantics and strings with Unicode | |
236 | character data are concatenated, the new string will be created by | |
237 | decoding the byte strings as \fI\s-1ISO\s0 8859\-1 (Latin\-1)\fR, even if the | |
238 | old Unicode string used \s-1EBCDIC\s0. This translation is done without | |
239 | regard to the system's native 8\-bit encoding. To change this for | |
240 | systems with non\-Latin\-1 and non-EBCDIC native encodings, use the | |
241 | \&\f(CW\*(C`encoding\*(C'\fR pragma. See encoding. | |
242 | .PP | |
243 | Under character semantics, many operations that formerly operated on | |
244 | bytes now operate on characters. A character in Perl is | |
245 | logically just a number ranging from 0 to 2**31 or so. Larger | |
246 | characters may encode into longer sequences of bytes internally, but | |
247 | this internal detail is mostly hidden from Perl code. | |
248 | See perluniintro for more. | |
249 | .Sh "Effects of Character Semantics" | |
250 | .IX Subsection "Effects of Character Semantics" | |
251 | Character semantics have the following effects: | |
252 | .IP "\(bu" 4 | |
253 | Strings\*(--including hash keys\*(--and regular expression patterns may | |
254 | contain characters that have an ordinal value larger than 255. | |
255 | .Sp | |
256 | If you use a Unicode editor to edit your program, Unicode characters | |
257 | may occur directly within the literal strings in one of the various | |
258 | Unicode encodings (\s-1UTF\-8\s0, \s-1UTF\-EBCDIC\s0, \s-1UCS\-2\s0, etc.), but will be recognized | |
259 | as such and converted to Perl's internal representation only if the | |
260 | appropriate encoding is specified. | |
261 | .Sp | |
262 | Unicode characters can also be added to a string by using the | |
263 | \&\f(CW\*(C`\ex{...}\*(C'\fR notation. The Unicode code for the desired character, in | |
264 | hexadecimal, should be placed in the braces. For instance, a smiley | |
265 | face is \f(CW\*(C`\ex{263A}\*(C'\fR. This encoding scheme only works for characters | |
266 | with a code of 0x100 or above. | |
267 | .Sp | |
268 | Additionally, if you | |
269 | .Sp | |
270 | .Vb 1 | |
271 | \& use charnames ':full'; | |
272 | .Ve | |
273 | .Sp | |
274 | you can use the \f(CW\*(C`\eN{...}\*(C'\fR notation and put the official Unicode | |
275 | character name within the braces, such as \f(CW\*(C`\eN{WHITE SMILING FACE}\*(C'\fR. | |
276 | .IP "\(bu" 4 | |
277 | If an appropriate encoding is specified, identifiers within the | |
278 | Perl script may contain Unicode alphanumeric characters, including | |
279 | ideographs. Perl does not currently attempt to canonicalize variable | |
280 | names. | |
281 | .IP "\(bu" 4 | |
282 | Regular expressions match characters instead of bytes. \*(L".\*(R" matches | |
283 | a character instead of a byte. The \f(CW\*(C`\eC\*(C'\fR pattern is provided to force | |
284 | a match of a single byte\*(--a \f(CW\*(C`char\*(C'\fR in C, hence \f(CW\*(C`\eC\*(C'\fR. | |
285 | .IP "\(bu" 4 | |
286 | Character classes in regular expressions match characters instead of | |
287 | bytes and match against the character properties specified in the | |
288 | Unicode properties database. \f(CW\*(C`\ew\*(C'\fR can be used to match a Japanese | |
289 | ideograph, for instance. | |
290 | .Sp | |
291 | (However, and as a limitation of the current implementation, using | |
292 | \&\f(CW\*(C`\ew\*(C'\fR or \f(CW\*(C`\eW\*(C'\fR \fIinside\fR a \f(CW\*(C`[...]\*(C'\fR character class will still match | |
293 | with byte semantics.) | |
294 | .IP "\(bu" 4 | |
295 | Named Unicode properties, scripts, and block ranges may be used like | |
296 | character classes via the \f(CW\*(C`\ep{}\*(C'\fR \*(L"matches property\*(R" construct and | |
297 | the \f(CW\*(C`\eP{}\*(C'\fR negation, \*(L"doesn't match property\*(R". | |
298 | .Sp | |
299 | For instance, \f(CW\*(C`\ep{Lu}\*(C'\fR matches any character with the Unicode \*(L"Lu\*(R" | |
300 | (Letter, uppercase) property, while \f(CW\*(C`\ep{M}\*(C'\fR matches any character | |
301 | with an \*(L"M\*(R" (mark\*(--accents and such) property. Brackets are not | |
302 | required for single letter properties, so \f(CW\*(C`\ep{M}\*(C'\fR is equivalent to | |
303 | \&\f(CW\*(C`\epM\*(C'\fR. Many predefined properties are available, such as | |
304 | \&\f(CW\*(C`\ep{Mirrored}\*(C'\fR and \f(CW\*(C`\ep{Tibetan}\*(C'\fR. | |
305 | .Sp | |
306 | The official Unicode script and block names have spaces and dashes as | |
307 | separators, but for convenience you can use dashes, spaces, or | |
308 | underbars, and case is unimportant. It is recommended, however, that | |
309 | for consistency you use the following naming: the official Unicode | |
310 | script, property, or block name (see below for the additional rules | |
311 | that apply to block names) with whitespace and dashes removed, and the | |
312 | words \*(L"uppercase\-first\-lowercase\-rest\*(R". \f(CW\*(C`Latin\-1 Supplement\*(C'\fR thus | |
313 | becomes \f(CW\*(C`Latin1Supplement\*(C'\fR. | |
314 | .Sp | |
315 | You can also use negation in both \f(CW\*(C`\ep{}\*(C'\fR and \f(CW\*(C`\eP{}\*(C'\fR by introducing a caret | |
316 | (^) between the first brace and the property name: \f(CW\*(C`\ep{^Tamil}\*(C'\fR is | |
317 | equal to \f(CW\*(C`\eP{Tamil}\*(C'\fR. | |
318 | .Sp | |
319 | \&\fB\s-1NOTE:\s0 the properties, scripts, and blocks listed here are as of | |
320 | Unicode 3.2.0, March 2002, or Perl 5.8.0, July 2002. Unicode 4.0.0 | |
321 | came out in April 2003, and Perl 5.8.1 in September 2003.\fR | |
322 | .Sp | |
323 | Here are the basic Unicode General Category properties, followed by their | |
324 | long form. You can use either; \f(CW\*(C`\ep{Lu}\*(C'\fR and \f(CW\*(C`\ep{UppercaseLetter}\*(C'\fR, | |
325 | for instance, are identical. | |
326 | .Sp | |
327 | .Vb 1 | |
328 | \& Short Long | |
329 | .Ve | |
330 | .Sp | |
331 | .Vb 7 | |
332 | \& L Letter | |
333 | \& LC CasedLetter | |
334 | \& Lu UppercaseLetter | |
335 | \& Ll LowercaseLetter | |
336 | \& Lt TitlecaseLetter | |
337 | \& Lm ModifierLetter | |
338 | \& Lo OtherLetter | |
339 | .Ve | |
340 | .Sp | |
341 | .Vb 4 | |
342 | \& M Mark | |
343 | \& Mn NonspacingMark | |
344 | \& Mc SpacingMark | |
345 | \& Me EnclosingMark | |
346 | .Ve | |
347 | .Sp | |
348 | .Vb 4 | |
349 | \& N Number | |
350 | \& Nd DecimalNumber | |
351 | \& Nl LetterNumber | |
352 | \& No OtherNumber | |
353 | .Ve | |
354 | .Sp | |
355 | .Vb 10 | |
356 | \& P Punctuation | |
357 | \& Pc ConnectorPunctuation | |
358 | \& Pd DashPunctuation | |
359 | \& Ps OpenPunctuation | |
360 | \& Pe ClosePunctuation | |
361 | \& Pi InitialPunctuation | |
362 | \& (may behave like Ps or Pe depending on usage) | |
363 | \& Pf FinalPunctuation | |
364 | \& (may behave like Ps or Pe depending on usage) | |
365 | \& Po OtherPunctuation | |
366 | .Ve | |
367 | .Sp | |
368 | .Vb 5 | |
369 | \& S Symbol | |
370 | \& Sm MathSymbol | |
371 | \& Sc CurrencySymbol | |
372 | \& Sk ModifierSymbol | |
373 | \& So OtherSymbol | |
374 | .Ve | |
375 | .Sp | |
376 | .Vb 4 | |
377 | \& Z Separator | |
378 | \& Zs SpaceSeparator | |
379 | \& Zl LineSeparator | |
380 | \& Zp ParagraphSeparator | |
381 | .Ve | |
382 | .Sp | |
383 | .Vb 6 | |
384 | \& C Other | |
385 | \& Cc Control | |
386 | \& Cf Format | |
387 | \& Cs Surrogate (not usable) | |
388 | \& Co PrivateUse | |
389 | \& Cn Unassigned | |
390 | .Ve | |
391 | .Sp | |
392 | Single-letter properties match all characters in any of the | |
393 | two-letter sub-properties starting with the same letter. | |
394 | \&\f(CW\*(C`LC\*(C'\fR and \f(CW\*(C`L&\*(C'\fR are special cases, which are aliases for the set of | |
395 | \&\f(CW\*(C`Ll\*(C'\fR, \f(CW\*(C`Lu\*(C'\fR, and \f(CW\*(C`Lt\*(C'\fR. | |
396 | .Sp | |
397 | Because Perl hides the need for the user to understand the internal | |
398 | representation of Unicode characters, there is no need to implement | |
399 | the somewhat messy concept of surrogates. \f(CW\*(C`Cs\*(C'\fR is therefore not | |
400 | supported. | |
401 | .Sp | |
402 | Because scripts differ in their directionality\*(--Hebrew is | |
403 | written right to left, for example\*(--Unicode supplies these properties in | |
404 | the BidiClass class: | |
405 | .Sp | |
406 | .Vb 1 | |
407 | \& Property Meaning | |
408 | .Ve | |
409 | .Sp | |
410 | .Vb 19 | |
411 | \& L Left-to-Right | |
412 | \& LRE Left-to-Right Embedding | |
413 | \& LRO Left-to-Right Override | |
414 | \& R Right-to-Left | |
415 | \& AL Right-to-Left Arabic | |
416 | \& RLE Right-to-Left Embedding | |
417 | \& RLO Right-to-Left Override | |
418 | \& PDF Pop Directional Format | |
419 | \& EN European Number | |
420 | \& ES European Number Separator | |
421 | \& ET European Number Terminator | |
422 | \& AN Arabic Number | |
423 | \& CS Common Number Separator | |
424 | \& NSM Non-Spacing Mark | |
425 | \& BN Boundary Neutral | |
426 | \& B Paragraph Separator | |
427 | \& S Segment Separator | |
428 | \& WS Whitespace | |
429 | \& ON Other Neutrals | |
430 | .Ve | |
431 | .Sp | |
432 | For example, \f(CW\*(C`\ep{BidiClass:R}\*(C'\fR matches characters that are normally | |
433 | written right to left. | |
434 | .Sh "Scripts" | |
435 | .IX Subsection "Scripts" | |
436 | The script names which can be used by \f(CW\*(C`\ep{...}\*(C'\fR and \f(CW\*(C`\eP{...}\*(C'\fR, | |
437 | such as in \f(CW\*(C`\ep{Latin}\*(C'\fR or \f(CW\*(C`\ep{Cyrillic}\*(C'\fR, are as follows: | |
438 | .PP | |
439 | .Vb 44 | |
440 | \& Arabic | |
441 | \& Armenian | |
442 | \& Bengali | |
443 | \& Bopomofo | |
444 | \& Buhid | |
445 | \& CanadianAboriginal | |
446 | \& Cherokee | |
447 | \& Cyrillic | |
448 | \& Deseret | |
449 | \& Devanagari | |
450 | \& Ethiopic | |
451 | \& Georgian | |
452 | \& Gothic | |
453 | \& Greek | |
454 | \& Gujarati | |
455 | \& Gurmukhi | |
456 | \& Han | |
457 | \& Hangul | |
458 | \& Hanunoo | |
459 | \& Hebrew | |
460 | \& Hiragana | |
461 | \& Inherited | |
462 | \& Kannada | |
463 | \& Katakana | |
464 | \& Khmer | |
465 | \& Lao | |
466 | \& Latin | |
467 | \& Malayalam | |
468 | \& Mongolian | |
469 | \& Myanmar | |
470 | \& Ogham | |
471 | \& OldItalic | |
472 | \& Oriya | |
473 | \& Runic | |
474 | \& Sinhala | |
475 | \& Syriac | |
476 | \& Tagalog | |
477 | \& Tagbanwa | |
478 | \& Tamil | |
479 | \& Telugu | |
480 | \& Thaana | |
481 | \& Thai | |
482 | \& Tibetan | |
483 | \& Yi | |
484 | .Ve | |
485 | .PP | |
486 | Extended property classes can supplement the basic | |
487 | properties, defined by the \fIPropList\fR Unicode database: | |
488 | .PP | |
489 | .Vb 27 | |
490 | \& ASCIIHexDigit | |
491 | \& BidiControl | |
492 | \& Dash | |
493 | \& Deprecated | |
494 | \& Diacritic | |
495 | \& Extender | |
496 | \& GraphemeLink | |
497 | \& HexDigit | |
498 | \& Hyphen | |
499 | \& Ideographic | |
500 | \& IDSBinaryOperator | |
501 | \& IDSTrinaryOperator | |
502 | \& JoinControl | |
503 | \& LogicalOrderException | |
504 | \& NoncharacterCodePoint | |
505 | \& OtherAlphabetic | |
506 | \& OtherDefaultIgnorableCodePoint | |
507 | \& OtherGraphemeExtend | |
508 | \& OtherLowercase | |
509 | \& OtherMath | |
510 | \& OtherUppercase | |
511 | \& QuotationMark | |
512 | \& Radical | |
513 | \& SoftDotted | |
514 | \& TerminalPunctuation | |
515 | \& UnifiedIdeograph | |
516 | \& WhiteSpace | |
517 | .Ve | |
518 | .PP | |
519 | and there are further derived properties: | |
520 | .PP | |
521 | .Vb 4 | |
522 | \& Alphabetic Lu + Ll + Lt + Lm + Lo + OtherAlphabetic | |
523 | \& Lowercase Ll + OtherLowercase | |
524 | \& Uppercase Lu + OtherUppercase | |
525 | \& Math Sm + OtherMath | |
526 | .Ve | |
527 | .PP | |
528 | .Vb 2 | |
529 | \& ID_Start Lu + Ll + Lt + Lm + Lo + Nl | |
530 | \& ID_Continue ID_Start + Mn + Mc + Nd + Pc | |
531 | .Ve | |
532 | .PP | |
533 | .Vb 5 | |
534 | \& Any Any character | |
535 | \& Assigned Any non-Cn character (i.e. synonym for \eP{Cn}) | |
536 | \& Unassigned Synonym for \ep{Cn} | |
537 | \& Common Any character (or unassigned code point) | |
538 | \& not explicitly assigned to a script | |
539 | .Ve | |
540 | .PP | |
541 | For backward compatibility (with Perl 5.6), all properties mentioned | |
542 | so far may have \f(CW\*(C`Is\*(C'\fR prepended to their name, so \f(CW\*(C`\eP{IsLu}\*(C'\fR, for | |
543 | example, is equal to \f(CW\*(C`\eP{Lu}\*(C'\fR. | |
544 | .Sh "Blocks" | |
545 | .IX Subsection "Blocks" | |
546 | In addition to \fBscripts\fR, Unicode also defines \fBblocks\fR of | |
547 | characters. The difference between scripts and blocks is that the | |
548 | concept of scripts is closer to natural languages, while the concept | |
549 | of blocks is more of an artificial grouping based on groups of 256 | |
550 | Unicode characters. For example, the \f(CW\*(C`Latin\*(C'\fR script contains letters | |
551 | from many blocks but does not contain all the characters from those | |
552 | blocks. It does not, for example, contain digits, because digits are | |
553 | shared across many scripts. Digits and similar groups, like | |
554 | punctuation, are in a category called \f(CW\*(C`Common\*(C'\fR. | |
555 | .PP | |
556 | For more about scripts, see the \s-1UTR\s0 #24: | |
557 | .PP | |
558 | .Vb 1 | |
559 | \& http://www.unicode.org/unicode/reports/tr24/ | |
560 | .Ve | |
561 | .PP | |
562 | For more about blocks, see: | |
563 | .PP | |
564 | .Vb 1 | |
565 | \& http://www.unicode.org/Public/UNIDATA/Blocks.txt | |
566 | .Ve | |
567 | .PP | |
568 | Block names are given with the \f(CW\*(C`In\*(C'\fR prefix. For example, the | |
569 | Katakana block is referenced via \f(CW\*(C`\ep{InKatakana}\*(C'\fR. The \f(CW\*(C`In\*(C'\fR | |
570 | prefix may be omitted if there is no naming conflict with a script | |
571 | or any other property, but it is recommended that \f(CW\*(C`In\*(C'\fR always be used | |
572 | for block tests to avoid confusion. | |
573 | .PP | |
574 | These block names are supported: | |
575 | .PP | |
576 | .Vb 110 | |
577 | \& InAlphabeticPresentationForms | |
578 | \& InArabic | |
579 | \& InArabicPresentationFormsA | |
580 | \& InArabicPresentationFormsB | |
581 | \& InArmenian | |
582 | \& InArrows | |
583 | \& InBasicLatin | |
584 | \& InBengali | |
585 | \& InBlockElements | |
586 | \& InBopomofo | |
587 | \& InBopomofoExtended | |
588 | \& InBoxDrawing | |
589 | \& InBraillePatterns | |
590 | \& InBuhid | |
591 | \& InByzantineMusicalSymbols | |
592 | \& InCJKCompatibility | |
593 | \& InCJKCompatibilityForms | |
594 | \& InCJKCompatibilityIdeographs | |
595 | \& InCJKCompatibilityIdeographsSupplement | |
596 | \& InCJKRadicalsSupplement | |
597 | \& InCJKSymbolsAndPunctuation | |
598 | \& InCJKUnifiedIdeographs | |
599 | \& InCJKUnifiedIdeographsExtensionA | |
600 | \& InCJKUnifiedIdeographsExtensionB | |
601 | \& InCherokee | |
602 | \& InCombiningDiacriticalMarks | |
603 | \& InCombiningDiacriticalMarksforSymbols | |
604 | \& InCombiningHalfMarks | |
605 | \& InControlPictures | |
606 | \& InCurrencySymbols | |
607 | \& InCyrillic | |
608 | \& InCyrillicSupplementary | |
609 | \& InDeseret | |
610 | \& InDevanagari | |
611 | \& InDingbats | |
612 | \& InEnclosedAlphanumerics | |
613 | \& InEnclosedCJKLettersAndMonths | |
614 | \& InEthiopic | |
615 | \& InGeneralPunctuation | |
616 | \& InGeometricShapes | |
617 | \& InGeorgian | |
618 | \& InGothic | |
619 | \& InGreekExtended | |
620 | \& InGreekAndCoptic | |
621 | \& InGujarati | |
622 | \& InGurmukhi | |
623 | \& InHalfwidthAndFullwidthForms | |
624 | \& InHangulCompatibilityJamo | |
625 | \& InHangulJamo | |
626 | \& InHangulSyllables | |
627 | \& InHanunoo | |
628 | \& InHebrew | |
629 | \& InHighPrivateUseSurrogates | |
630 | \& InHighSurrogates | |
631 | \& InHiragana | |
632 | \& InIPAExtensions | |
633 | \& InIdeographicDescriptionCharacters | |
634 | \& InKanbun | |
635 | \& InKangxiRadicals | |
636 | \& InKannada | |
637 | \& InKatakana | |
638 | \& InKatakanaPhoneticExtensions | |
639 | \& InKhmer | |
640 | \& InLao | |
641 | \& InLatin1Supplement | |
642 | \& InLatinExtendedA | |
643 | \& InLatinExtendedAdditional | |
644 | \& InLatinExtendedB | |
645 | \& InLetterlikeSymbols | |
646 | \& InLowSurrogates | |
647 | \& InMalayalam | |
648 | \& InMathematicalAlphanumericSymbols | |
649 | \& InMathematicalOperators | |
650 | \& InMiscellaneousMathematicalSymbolsA | |
651 | \& InMiscellaneousMathematicalSymbolsB | |
652 | \& InMiscellaneousSymbols | |
653 | \& InMiscellaneousTechnical | |
654 | \& InMongolian | |
655 | \& InMusicalSymbols | |
656 | \& InMyanmar | |
657 | \& InNumberForms | |
658 | \& InOgham | |
659 | \& InOldItalic | |
660 | \& InOpticalCharacterRecognition | |
661 | \& InOriya | |
662 | \& InPrivateUseArea | |
663 | \& InRunic | |
664 | \& InSinhala | |
665 | \& InSmallFormVariants | |
666 | \& InSpacingModifierLetters | |
667 | \& InSpecials | |
668 | \& InSuperscriptsAndSubscripts | |
669 | \& InSupplementalArrowsA | |
670 | \& InSupplementalArrowsB | |
671 | \& InSupplementalMathematicalOperators | |
672 | \& InSupplementaryPrivateUseAreaA | |
673 | \& InSupplementaryPrivateUseAreaB | |
674 | \& InSyriac | |
675 | \& InTagalog | |
676 | \& InTagbanwa | |
677 | \& InTags | |
678 | \& InTamil | |
679 | \& InTelugu | |
680 | \& InThaana | |
681 | \& InThai | |
682 | \& InTibetan | |
683 | \& InUnifiedCanadianAboriginalSyllabics | |
684 | \& InVariationSelectors | |
685 | \& InYiRadicals | |
686 | \& InYiSyllables | |
687 | .Ve | |
688 | .IP "\(bu" 4 | |
689 | The special pattern \f(CW\*(C`\eX\*(C'\fR matches any extended Unicode | |
690 | sequence\*(--\*(L"a combining character sequence\*(R" in Standardese\*(--where the | |
691 | first character is a base character and subsequent characters are mark | |
692 | characters that apply to the base character. \f(CW\*(C`\eX\*(C'\fR is equivalent to | |
693 | \&\f(CW\*(C`(?:\ePM\epM*)\*(C'\fR. | |
694 | .IP "\(bu" 4 | |
695 | The \f(CW\*(C`tr///\*(C'\fR operator translates characters instead of bytes. Note | |
696 | that the \f(CW\*(C`tr///CU\*(C'\fR functionality has been removed. For similar | |
697 | functionality see pack('U0', ...) and pack('C0', ...). | |
698 | .IP "\(bu" 4 | |
699 | Case translation operators use the Unicode case translation tables | |
700 | when character input is provided. Note that \f(CW\*(C`uc()\*(C'\fR, or \f(CW\*(C`\eU\*(C'\fR in | |
701 | interpolated strings, translates to uppercase, while \f(CW\*(C`ucfirst\*(C'\fR, | |
702 | or \f(CW\*(C`\eu\*(C'\fR in interpolated strings, translates to titlecase in languages | |
703 | that make the distinction. | |
704 | .IP "\(bu" 4 | |
705 | Most operators that deal with positions or lengths in a string will | |
706 | automatically switch to using character positions, including | |
707 | \&\f(CW\*(C`chop()\*(C'\fR, \f(CW\*(C`chomp()\*(C'\fR, \f(CW\*(C`substr()\*(C'\fR, \f(CW\*(C`pos()\*(C'\fR, \f(CW\*(C`index()\*(C'\fR, \f(CW\*(C`rindex()\*(C'\fR, | |
708 | \&\f(CW\*(C`sprintf()\*(C'\fR, \f(CW\*(C`write()\*(C'\fR, and \f(CW\*(C`length()\*(C'\fR. Operators that | |
709 | specifically do not switch include \f(CW\*(C`vec()\*(C'\fR, \f(CW\*(C`pack()\*(C'\fR, and | |
710 | \&\f(CW\*(C`unpack()\*(C'\fR. Operators that really don't care include | |
711 | operators that treat strings as a bucket of bits such as \f(CW\*(C`sort()\*(C'\fR, | |
712 | and operators dealing with filenames. | |
713 | .IP "\(bu" 4 | |
714 | The \f(CW\*(C`pack()\*(C'\fR/\f(CW\*(C`unpack()\*(C'\fR letters \f(CW\*(C`c\*(C'\fR and \f(CW\*(C`C\*(C'\fR do \fInot\fR change, | |
715 | since they are often used for byte-oriented formats. Again, think | |
716 | \&\f(CW\*(C`char\*(C'\fR in the C language. | |
717 | .Sp | |
718 | There is a new \f(CW\*(C`U\*(C'\fR specifier that converts between Unicode characters | |
719 | and code points. | |
720 | .IP "\(bu" 4 | |
721 | The \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR functions work on characters, similar to | |
722 | \&\f(CW\*(C`pack("U")\*(C'\fR and \f(CW\*(C`unpack("U")\*(C'\fR, \fInot\fR \f(CW\*(C`pack("C")\*(C'\fR and | |
723 | \&\f(CW\*(C`unpack("C")\*(C'\fR. \f(CW\*(C`pack("C")\*(C'\fR and \f(CW\*(C`unpack("C")\*(C'\fR are methods for | |
724 | emulating byte-oriented \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`ord()\*(C'\fR on Unicode strings. | |
725 | While these methods reveal the internal encoding of Unicode strings, | |
726 | that is not something one normally needs to care about at all. | |
727 | .IP "\(bu" 4 | |
728 | The bit string operators, \f(CW\*(C`& | ^ ~\*(C'\fR, can operate on character data. | |
729 | However, for backward compatibility, such as when using bit string | |
730 | operations when characters are all less than 256 in ordinal value, one | |
731 | should not use \f(CW\*(C`~\*(C'\fR (the bit complement) with characters of both | |
732 | values less than 256 and values greater than 256. Most importantly, | |
733 | DeMorgan's laws (\f(CW\*(C`~($x|$y) eq ~$x&~$y\*(C'\fR and \f(CW\*(C`~($x&$y) eq ~$x|~$y\*(C'\fR) | |
734 | will not hold. The reason for this mathematical \fIfaux pas\fR is that | |
735 | the complement cannot return \fBboth\fR the 8\-bit (byte\-wide) bit | |
736 | complement \fBand\fR the full character-wide bit complement. | |
737 | .IP "\(bu" 4 | |
738 | \&\fIlc()\fR, \fIuc()\fR, \fIlcfirst()\fR, and \fIucfirst()\fR work for the following cases: | |
739 | .RS 4 | |
740 | .IP "\(bu" 8 | |
741 | the case mapping is from a single Unicode character to another | |
742 | single Unicode character, or | |
743 | .IP "\(bu" 8 | |
744 | the case mapping is from a single Unicode character to more | |
745 | than one Unicode character. | |
746 | .RE | |
747 | .RS 4 | |
748 | .Sp | |
749 | Things to do with locales (Lithuanian, Turkish, Azeri) do \fBnot\fR work | |
750 | since Perl does not understand the concept of Unicode locales. | |
751 | .Sp | |
752 | See the Unicode Technical Report #21, Case Mappings, for more details. | |
753 | .RE | |
754 | .IP "\(bu" 4 | |
755 | And finally, \f(CW\*(C`scalar reverse()\*(C'\fR reverses by character rather than by byte. | |
756 | .Sh "User-Defined Character Properties" | |
757 | .IX Subsection "User-Defined Character Properties" | |
758 | You can define your own character properties by defining subroutines | |
759 | whose names begin with \*(L"In\*(R" or \*(L"Is\*(R". The subroutines can be defined in | |
760 | any package. The user-defined properties can be used in the regular | |
761 | expression \f(CW\*(C`\ep\*(C'\fR and \f(CW\*(C`\eP\*(C'\fR constructs; if you are using a user-defined | |
762 | property from a package other than the one you are in, you must specify | |
763 | its package in the \f(CW\*(C`\ep\*(C'\fR or \f(CW\*(C`\eP\*(C'\fR construct. | |
764 | .PP | |
765 | .Vb 3 | |
766 | \& # assuming property IsForeign defined in Lang:: | |
767 | \& package main; # property package name required | |
768 | \& if ($txt =~ /\ep{Lang::IsForeign}+/) { ... } | |
769 | .Ve | |
770 | .PP | |
771 | .Vb 2 | |
772 | \& package Lang; # property package name not required | |
773 | \& if ($txt =~ /\ep{IsForeign}+/) { ... } | |
774 | .Ve | |
775 | .PP | |
776 | Note that the effect is compile-time and immutable once defined. | |
777 | .PP | |
778 | The subroutines must return a specially-formatted string, with one | |
779 | or more newline-separated lines. Each line must be one of the following: | |
780 | .IP "\(bu" 4 | |
781 | Two hexadecimal numbers separated by horizontal whitespace (space or | |
782 | tabular characters) denoting a range of Unicode code points to include. | |
783 | .IP "\(bu" 4 | |
784 | Something to include, prefixed by \*(L"+\*(R": a built-in character | |
785 | property (prefixed by \*(L"utf8::\*(R") or a user-defined character property, | |
786 | to represent all the characters in that property; two hexadecimal code | |
787 | points for a range; or a single hexadecimal code point. | |
788 | .IP "\(bu" 4 | |
789 | Something to exclude, prefixed by \*(L"\-\*(R": an existing character | |
790 | property (prefixed by \*(L"utf8::\*(R") or a user-defined character property, | |
791 | to represent all the characters in that property; two hexadecimal code | |
792 | points for a range; or a single hexadecimal code point. | |
793 | .IP "\(bu" 4 | |
794 | Something to negate, prefixed by \*(L"!\*(R": an existing character | |
795 | property (prefixed by \*(L"utf8::\*(R") or a user-defined character property, | |
796 | to represent all the characters in that property; two hexadecimal code | |
797 | points for a range; or a single hexadecimal code point. | |
798 | .IP "\(bu" 4 | |
799 | Something to intersect with, prefixed by \*(L"&\*(R": an existing character | |
800 | property (prefixed by \*(L"utf8::\*(R") or a user-defined character property, | |
801 | to keep only the characters that are also in that property; two | |
802 | hexadecimal code points for a range; or a single hexadecimal code point. | |
803 | .PP | |
804 | For example, to define a property that covers both the Japanese | |
805 | syllabaries (hiragana and katakana), you can define | |
806 | .PP | |
807 | .Vb 6 | |
808 | \& sub InKana { | |
809 | \& return <<END; | |
810 | \& 3040\et309F | |
811 | \& 30A0\et30FF | |
812 | \& END | |
813 | \& } | |
814 | .Ve | |
815 | .PP | |
816 | Imagine that the here-doc end marker is at the beginning of the line. | |
817 | Now you can use \f(CW\*(C`\ep{InKana}\*(C'\fR and \f(CW\*(C`\eP{InKana}\*(C'\fR. | |
818 | .PP | |
819 | You could also have used the existing block property names: | |
820 | .PP | |
821 | .Vb 6 | |
822 | \& sub InKana { | |
823 | \& return <<'END'; | |
824 | \& +utf8::InHiragana | |
825 | \& +utf8::InKatakana | |
826 | \& END | |
827 | \& } | |
828 | .Ve | |
829 | .PP | |
830 | Suppose you wanted to match only the allocated characters, | |
831 | not the raw block ranges: in other words, you want to remove | |
832 | the non\-characters: | |
833 | .PP | |
834 | .Vb 7 | |
835 | \& sub InKana { | |
836 | \& return <<'END'; | |
837 | \& +utf8::InHiragana | |
838 | \& +utf8::InKatakana | |
839 | \& -utf8::IsCn | |
840 | \& END | |
841 | \& } | |
842 | .Ve | |
843 | .PP | |
844 | The negation is useful for defining (surprise!) negated classes. | |
845 | .PP | |
846 | .Vb 7 | |
847 | \& sub InNotKana { | |
848 | \& return <<'END'; | |
849 | \& !utf8::InHiragana | |
850 | \& -utf8::InKatakana | |
851 | \& +utf8::IsCn | |
852 | \& END | |
853 | \& } | |
854 | .Ve | |
855 | .PP | |
856 | Intersection is useful for getting the common characters matched by | |
857 | two (or more) classes. | |
858 | .PP | |
859 | .Vb 6 | |
860 | \& sub InFooAndBar { | |
861 | \& return <<'END'; | |
862 | \& +main::Foo | |
863 | \& &main::Bar | |
864 | \& END | |
865 | \& } | |
866 | .Ve | |
867 | .PP | |
868 | It's important to remember not to use \*(L"&\*(R" for the first set \*(-- that | |
869 | would be intersecting with nothing (resulting in an empty set). | |
870 | .PP | |
871 | You can also define your own mappings to be used in the \fIlc()\fR, | |
872 | \&\fIlcfirst()\fR, \fIuc()\fR, and \fIucfirst()\fR (or their string-inlined versions). | |
873 | The principle is the same: define subroutines in the \f(CW\*(C`main\*(C'\fR package | |
874 | with names like \f(CW\*(C`ToLower\*(C'\fR (for \fIlc()\fR and \fIlcfirst()\fR), \f(CW\*(C`ToTitle\*(C'\fR (for | |
875 | the first character in \fIucfirst()\fR), and \f(CW\*(C`ToUpper\*(C'\fR (for \fIuc()\fR, and the | |
876 | rest of the characters in \fIucfirst()\fR). | |
877 | .PP | |
878 | The string returned by the subroutines needs now to be three | |
879 | hexadecimal numbers separated by tabulators: start of the source | |
880 | range, end of the source range, and start of the destination range. | |
881 | For example: | |
882 | .PP | |
883 | .Vb 5 | |
884 | \& sub ToUpper { | |
885 | \& return <<END; | |
886 | \& 0061\et0063\et0041 | |
887 | \& END | |
888 | \& } | |
889 | .Ve | |
890 | .PP | |
891 | defines a \fIuc()\fR mapping that causes only the characters \*(L"a\*(R", \*(L"b\*(R", and | |
892 | \&\*(L"c\*(R" to be mapped to \*(L"A\*(R", \*(L"B\*(R", \*(L"C\*(R"; all other characters will remain | |
893 | unchanged. | |
894 | .PP | |
895 | If there is no source range to speak of, that is, the mapping is from | |
896 | a single character to another single character, leave the end of the | |
897 | source range empty, but the two tabulator characters are still needed. | |
898 | For example: | |
899 | .PP | |
900 | .Vb 5 | |
901 | \& sub ToLower { | |
902 | \& return <<END; | |
903 | \& 0041\et\et0061 | |
904 | \& END | |
905 | \& } | |
906 | .Ve | |
907 | .PP | |
908 | defines a \fIlc()\fR mapping that causes only \*(L"A\*(R" to be mapped to \*(L"a\*(R"; all | |
909 | other characters will remain unchanged. | |
910 | .PP | |
911 | (For serious hackers only) If you want to introspect the default | |
912 | mappings, you can find the data in the directory | |
913 | \&\f(CW$Config{privlib}\fR/\fIunicore/To/\fR. The mapping data is returned as | |
914 | the here\-document, and the \f(CW\*(C`utf8::ToSpecFoo\*(C'\fR are special exception | |
915 | mappings derived from \f(CW$Config{privlib}\fR/\fIunicore/SpecialCasing.txt\fR. | |
916 | The \f(CW\*(C`Digit\*(C'\fR and \f(CW\*(C`Fold\*(C'\fR mappings that one can see in the directory | |
917 | are not directly user\-accessible, one can use either the | |
918 | \&\f(CW\*(C`Unicode::UCD\*(C'\fR module, or just match case-insensitively (that's when | |
919 | the \f(CW\*(C`Fold\*(C'\fR mapping is used). | |
920 | .PP | |
921 | A final note on the user-defined property tests and mappings: they | |
922 | will be used only if the scalar has been marked as having Unicode | |
923 | characters. Old byte-style strings will not be affected. | |
924 | .Sh "Character Encodings for Input and Output" | |
925 | .IX Subsection "Character Encodings for Input and Output" | |
926 | See Encode. | |
927 | .Sh "Unicode Regular Expression Support Level" | |
928 | .IX Subsection "Unicode Regular Expression Support Level" | |
929 | The following list of Unicode support for regular expressions describes | |
930 | all the features currently supported. The references to \*(L"Level N\*(R" | |
931 | and the section numbers refer to the Unicode Technical Report 18, | |
932 | \&\*(L"Unicode Regular Expression Guidelines\*(R", version 6 (Unicode 3.2.0, | |
933 | Perl 5.8.0). | |
934 | .IP "\(bu" 4 | |
935 | Level 1 \- Basic Unicode Support | |
936 | .Sp | |
937 | .Vb 7 | |
938 | \& 2.1 Hex Notation - done [1] | |
939 | \& Named Notation - done [2] | |
940 | \& 2.2 Categories - done [3][4] | |
941 | \& 2.3 Subtraction - MISSING [5][6] | |
942 | \& 2.4 Simple Word Boundaries - done [7] | |
943 | \& 2.5 Simple Loose Matches - done [8] | |
944 | \& 2.6 End of Line - MISSING [9][10] | |
945 | .Ve | |
946 | .Sp | |
947 | .Vb 20 | |
948 | \& [ 1] \ex{...} | |
949 | \& [ 2] \eN{...} | |
950 | \& [ 3] . \ep{...} \eP{...} | |
951 | \& [ 4] support for scripts (see UTR#24 Script Names), blocks, | |
952 | \& binary properties, enumerated non-binary properties, and | |
953 | \& numeric properties (as listed in UTR#18 Other Properties) | |
954 | \& [ 5] have negation | |
955 | \& [ 6] can use regular expression look-ahead [a] | |
956 | \& or user-defined character properties [b] to emulate subtraction | |
957 | \& [ 7] include Letters in word characters | |
958 | \& [ 8] note that Perl does Full case-folding in matching, not Simple: | |
959 | \& for example U+1F88 is equivalent with U+1F00 U+03B9, | |
960 | \& not with 1F80. This difference matters for certain Greek | |
961 | \& capital letters with certain modifiers: the Full case-folding | |
962 | \& decomposes the letter, while the Simple case-folding would map | |
963 | \& it to a single character. | |
964 | \& [ 9] see UTR #13 Unicode Newline Guidelines | |
965 | \& [10] should do ^ and $ also on \ex{85}, \ex{2028} and \ex{2029} | |
966 | \& (should also affect <>, $., and script line numbers) | |
967 | \& (the \ex{85}, \ex{2028} and \ex{2029} do match \es) | |
968 | .Ve | |
969 | .Sp | |
970 | [a] You can mimic class subtraction using lookahead. | |
971 | For example, what \s-1UTR\s0 #18 might write as | |
972 | .Sp | |
973 | .Vb 1 | |
974 | \& [{Greek}-[{UNASSIGNED}]] | |
975 | .Ve | |
976 | .Sp | |
977 | in Perl can be written as: | |
978 | .Sp | |
979 | .Vb 2 | |
980 | \& (?!\ep{Unassigned})\ep{InGreekAndCoptic} | |
981 | \& (?=\ep{Assigned})\ep{InGreekAndCoptic} | |
982 | .Ve | |
983 | .Sp | |
984 | But in this particular example, you probably really want | |
985 | .Sp | |
986 | .Vb 1 | |
987 | \& \ep{GreekAndCoptic} | |
988 | .Ve | |
989 | .Sp | |
990 | which will match assigned characters known to be part of the Greek script. | |
991 | .Sp | |
992 | Also see the Unicode::Regex::Set module, it does implement the full | |
993 | \&\s-1UTR\s0 #18 grouping, intersection, union, and removal (subtraction) syntax. | |
994 | .Sp | |
995 | [b] See \*(L"User\-Defined Character Properties\*(R". | |
996 | .IP "\(bu" 4 | |
997 | Level 2 \- Extended Unicode Support | |
998 | .Sp | |
999 | .Vb 5 | |
1000 | \& 3.1 Surrogates - MISSING [11] | |
1001 | \& 3.2 Canonical Equivalents - MISSING [12][13] | |
1002 | \& 3.3 Locale-Independent Graphemes - MISSING [14] | |
1003 | \& 3.4 Locale-Independent Words - MISSING [15] | |
1004 | \& 3.5 Locale-Independent Loose Matches - MISSING [16] | |
1005 | .Ve | |
1006 | .Sp | |
1007 | .Vb 7 | |
1008 | \& [11] Surrogates are solely a UTF-16 concept and Perl's internal | |
1009 | \& representation is UTF-8. The Encode module does UTF-16, though. | |
1010 | \& [12] see UTR#15 Unicode Normalization | |
1011 | \& [13] have Unicode::Normalize but not integrated to regexes | |
1012 | \& [14] have \eX but at this level . should equal that | |
1013 | \& [15] need three classes, not just \ew and \eW | |
1014 | \& [16] see UTR#21 Case Mappings | |
1015 | .Ve | |
1016 | .IP "\(bu" 4 | |
1017 | Level 3 \- Locale-Sensitive Support | |
1018 | .Sp | |
1019 | .Vb 5 | |
1020 | \& 4.1 Locale-Dependent Categories - MISSING | |
1021 | \& 4.2 Locale-Dependent Graphemes - MISSING [16][17] | |
1022 | \& 4.3 Locale-Dependent Words - MISSING | |
1023 | \& 4.4 Locale-Dependent Loose Matches - MISSING | |
1024 | \& 4.5 Locale-Dependent Ranges - MISSING | |
1025 | .Ve | |
1026 | .Sp | |
1027 | .Vb 2 | |
1028 | \& [16] see UTR#10 Unicode Collation Algorithms | |
1029 | \& [17] have Unicode::Collate but not integrated to regexes | |
1030 | .Ve | |
1031 | .Sh "Unicode Encodings" | |
1032 | .IX Subsection "Unicode Encodings" | |
1033 | Unicode characters are assigned to \fIcode points\fR, which are abstract | |
1034 | numbers. To use these numbers, various encodings are needed. | |
1035 | .IP "\(bu" 4 | |
1036 | \&\s-1UTF\-8\s0 | |
1037 | .Sp | |
1038 | \&\s-1UTF\-8\s0 is a variable-length (1 to 6 bytes, current character allocations | |
1039 | require 4 bytes), byte-order independent encoding. For \s-1ASCII\s0 (and we | |
1040 | really do mean 7\-bit \s-1ASCII\s0, not another 8\-bit encoding), \s-1UTF\-8\s0 is | |
1041 | transparent. | |
1042 | .Sp | |
1043 | The following table is from Unicode 3.2. | |
1044 | .Sp | |
1045 | .Vb 1 | |
1046 | \& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte | |
1047 | .Ve | |
1048 | .Sp | |
1049 | .Vb 10 | |
1050 | \& U+0000..U+007F 00..7F | |
1051 | \& U+0080..U+07FF C2..DF 80..BF | |
1052 | \& U+0800..U+0FFF E0 A0..BF 80..BF | |
1053 | \& U+1000..U+CFFF E1..EC 80..BF 80..BF | |
1054 | \& U+D000..U+D7FF ED 80..9F 80..BF | |
1055 | \& U+D800..U+DFFF ******* ill-formed ******* | |
1056 | \& U+E000..U+FFFF EE..EF 80..BF 80..BF | |
1057 | \& U+10000..U+3FFFF F0 90..BF 80..BF 80..BF | |
1058 | \& U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF | |
1059 | \& U+100000..U+10FFFF F4 80..8F 80..BF 80..BF | |
1060 | .Ve | |
1061 | .Sp | |
1062 | Note the \f(CW\*(C`A0..BF\*(C'\fR in \f(CW\*(C`U+0800..U+0FFF\*(C'\fR, the \f(CW\*(C`80..9F\*(C'\fR in | |
1063 | \&\f(CW\*(C`U+D000..U+D7FF\*(C'\fR, the \f(CW\*(C`90..BF\*(C'\fR in \f(CW\*(C`U+10000..U+3FFFF\*(C'\fR, and the | |
1064 | \&\f(CW\*(C`80..8F\*(C'\fR in \f(CW\*(C`U+100000..U+10FFFF\*(C'\fR. The \*(L"gaps\*(R" are caused by legal | |
1065 | \&\s-1UTF\-8\s0 avoiding non-shortest encodings: it is technically possible to | |
1066 | UTF\-8\-encode a single code point in different ways, but that is | |
1067 | explicitly forbidden, and the shortest possible encoding should always | |
1068 | be used. So that's what Perl does. | |
1069 | .Sp | |
1070 | Another way to look at it is via bits: | |
1071 | .Sp | |
1072 | .Vb 1 | |
1073 | \& Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte | |
1074 | .Ve | |
1075 | .Sp | |
1076 | .Vb 4 | |
1077 | \& 0aaaaaaa 0aaaaaaa | |
1078 | \& 00000bbbbbaaaaaa 110bbbbb 10aaaaaa | |
1079 | \& ccccbbbbbbaaaaaa 1110cccc 10bbbbbb 10aaaaaa | |
1080 | \& 00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa | |
1081 | .Ve | |
1082 | .Sp | |
1083 | As you can see, the continuation bytes all begin with \f(CW10\fR, and the | |
1084 | leading bits of the start byte tell how many bytes there are in the | |
1085 | encoded character. | |
1086 | .IP "\(bu" 4 | |
1087 | UTF-EBCDIC | |
1088 | .Sp | |
1089 | Like \s-1UTF\-8\s0 but EBCDIC\-safe, in the way that \s-1UTF\-8\s0 is ASCII\-safe. | |
1090 | .IP "\(bu" 4 | |
1091 | \&\s-1UTF\-16\s0, \s-1UTF\-16BE\s0, \s-1UTF\-16LE\s0, Surrogates, and BOMs (Byte Order Marks) | |
1092 | .Sp | |
1093 | The following items are mostly for reference and general Unicode | |
1094 | knowledge, Perl doesn't use these constructs internally. | |
1095 | .Sp | |
1096 | \&\s-1UTF\-16\s0 is a 2 or 4 byte encoding. The Unicode code points | |
1097 | \&\f(CW\*(C`U+0000..U+FFFF\*(C'\fR are stored in a single 16\-bit unit, and the code | |
1098 | points \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR in two 16\-bit units. The latter case is | |
1099 | using \fIsurrogates\fR, the first 16\-bit unit being the \fIhigh | |
1100 | surrogate\fR, and the second being the \fIlow surrogate\fR. | |
1101 | .Sp | |
1102 | Surrogates are code points set aside to encode the \f(CW\*(C`U+10000..U+10FFFF\*(C'\fR | |
1103 | range of Unicode code points in pairs of 16\-bit units. The \fIhigh | |
1104 | surrogates\fR are the range \f(CW\*(C`U+D800..U+DBFF\*(C'\fR, and the \fIlow surrogates\fR | |
1105 | are the range \f(CW\*(C`U+DC00..U+DFFF\*(C'\fR. The surrogate encoding is | |
1106 | .Sp | |
1107 | .Vb 2 | |
1108 | \& $hi = ($uni - 0x10000) / 0x400 + 0xD800; | |
1109 | \& $lo = ($uni - 0x10000) % 0x400 + 0xDC00; | |
1110 | .Ve | |
1111 | .Sp | |
1112 | and the decoding is | |
1113 | .Sp | |
1114 | .Vb 1 | |
1115 | \& $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00); | |
1116 | .Ve | |
1117 | .Sp | |
1118 | If you try to generate surrogates (for example by using \fIchr()\fR), you | |
1119 | will get a warning if warnings are turned on, because those code | |
1120 | points are not valid for a Unicode character. | |
1121 | .Sp | |
1122 | Because of the 16\-bitness, \s-1UTF\-16\s0 is byte-order dependent. \s-1UTF\-16\s0 | |
1123 | itself can be used for in-memory computations, but if storage or | |
1124 | transfer is required either \s-1UTF\-16BE\s0 (big\-endian) or \s-1UTF\-16LE\s0 | |
1125 | (little\-endian) encodings must be chosen. | |
1126 | .Sp | |
1127 | This introduces another problem: what if you just know that your data | |
1128 | is \s-1UTF\-16\s0, but you don't know which endianness? Byte Order Marks, or | |
1129 | BOMs, are a solution to this. A special character has been reserved | |
1130 | in Unicode to function as a byte order marker: the character with the | |
1131 | code point \f(CW\*(C`U+FEFF\*(C'\fR is the \s-1BOM\s0. | |
1132 | .Sp | |
1133 | The trick is that if you read a \s-1BOM\s0, you will know the byte order, | |
1134 | since if it was written on a big-endian platform, you will read the | |
1135 | bytes \f(CW\*(C`0xFE 0xFF\*(C'\fR, but if it was written on a little-endian platform, | |
1136 | you will read the bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR. (And if the originating platform | |
1137 | was writing in \s-1UTF\-8\s0, you will read the bytes \f(CW\*(C`0xEF 0xBB 0xBF\*(C'\fR.) | |
1138 | .Sp | |
1139 | The way this trick works is that the character with the code point | |
1140 | \&\f(CW\*(C`U+FFFE\*(C'\fR is guaranteed not to be a valid Unicode character, so the | |
1141 | sequence of bytes \f(CW\*(C`0xFF 0xFE\*(C'\fR is unambiguously \*(L"\s-1BOM\s0, represented in | |
1142 | little-endian format\*(R" and cannot be \*(L"\f(CW\*(C`U+FFFE\*(C'\fR, represented in big-endian | |
1143 | format\*(R". | |
1144 | .IP "\(bu" 4 | |
1145 | \&\s-1UTF\-32\s0, \s-1UTF\-32BE\s0, \s-1UTF\-32LE\s0 | |
1146 | .Sp | |
1147 | The \s-1UTF\-32\s0 family is pretty much like the \s-1UTF\-16\s0 family, except that | |
1148 | the units are 32\-bit, and therefore the surrogate scheme is not | |
1149 | needed. The \s-1BOM\s0 signatures will be \f(CW\*(C`0x00 0x00 0xFE 0xFF\*(C'\fR for \s-1BE\s0 and | |
1150 | \&\f(CW\*(C`0xFF 0xFE 0x00 0x00\*(C'\fR for \s-1LE\s0. | |
1151 | .IP "\(bu" 4 | |
1152 | \&\s-1UCS\-2\s0, \s-1UCS\-4\s0 | |
1153 | .Sp | |
1154 | Encodings defined by the \s-1ISO\s0 10646 standard. \s-1UCS\-2\s0 is a 16\-bit | |
1155 | encoding. Unlike \s-1UTF\-16\s0, \s-1UCS\-2\s0 is not extensible beyond \f(CW\*(C`U+FFFF\*(C'\fR, | |
1156 | because it does not use surrogates. \s-1UCS\-4\s0 is a 32\-bit encoding, | |
1157 | functionally identical to \s-1UTF\-32\s0. | |
1158 | .IP "\(bu" 4 | |
1159 | \&\s-1UTF\-7\s0 | |
1160 | .Sp | |
1161 | A seven-bit safe (non\-eight\-bit) encoding, which is useful if the | |
1162 | transport or storage is not eight-bit safe. Defined by \s-1RFC\s0 2152. | |
1163 | .Sh "Security Implications of Unicode" | |
1164 | .IX Subsection "Security Implications of Unicode" | |
1165 | .IP "\(bu" 4 | |
1166 | Malformed \s-1UTF\-8\s0 | |
1167 | .Sp | |
1168 | Unfortunately, the specification of \s-1UTF\-8\s0 leaves some room for | |
1169 | interpretation of how many bytes of encoded output one should generate | |
1170 | from one input Unicode character. Strictly speaking, the shortest | |
1171 | possible sequence of \s-1UTF\-8\s0 bytes should be generated, | |
1172 | because otherwise there is potential for an input buffer overflow at | |
1173 | the receiving end of a \s-1UTF\-8\s0 connection. Perl always generates the | |
1174 | shortest length \s-1UTF\-8\s0, and with warnings on Perl will warn about | |
1175 | non-shortest length \s-1UTF\-8\s0 along with other malformations, such as the | |
1176 | surrogates, which are not real Unicode code points. | |
1177 | .IP "\(bu" 4 | |
1178 | Regular expressions behave slightly differently between byte data and | |
1179 | character (Unicode) data. For example, the \*(L"word character\*(R" character | |
1180 | class \f(CW\*(C`\ew\*(C'\fR will work differently depending on if data is eight-bit bytes | |
1181 | or Unicode. | |
1182 | .Sp | |
1183 | In the first case, the set of \f(CW\*(C`\ew\*(C'\fR characters is either small\*(--the | |
1184 | default set of alphabetic characters, digits, and the \*(L"_\*(R"\*(--or, if you | |
1185 | are using a locale (see perllocale), the \f(CW\*(C`\ew\*(C'\fR might contain a few | |
1186 | more letters according to your language and country. | |
1187 | .Sp | |
1188 | In the second case, the \f(CW\*(C`\ew\*(C'\fR set of characters is much, much larger. | |
1189 | Most importantly, even in the set of the first 256 characters, it will | |
1190 | probably match different characters: unlike most locales, which are | |
1191 | specific to a language and country pair, Unicode classifies all the | |
1192 | characters that are letters \fIsomewhere\fR as \f(CW\*(C`\ew\*(C'\fR. For example, your | |
1193 | locale might not think that \s-1LATIN\s0 \s-1SMALL\s0 \s-1LETTER\s0 \s-1ETH\s0 is a letter (unless | |
1194 | you happen to speak Icelandic), but Unicode does. | |
1195 | .Sp | |
1196 | As discussed elsewhere, Perl has one foot (two hooves?) planted in | |
1197 | each of two worlds: the old world of bytes and the new world of | |
1198 | characters, upgrading from bytes to characters when necessary. | |
1199 | If your legacy code does not explicitly use Unicode, no automatic | |
1200 | switch-over to characters should happen. Characters shouldn't get | |
1201 | downgraded to bytes, either. It is possible to accidentally mix bytes | |
1202 | and characters, however (see perluniintro), in which case \f(CW\*(C`\ew\*(C'\fR in | |
1203 | regular expressions might start behaving differently. Review your | |
1204 | code. Use warnings and the \f(CW\*(C`strict\*(C'\fR pragma. | |
1205 | .Sh "Unicode in Perl on \s-1EBCDIC\s0" | |
1206 | .IX Subsection "Unicode in Perl on EBCDIC" | |
1207 | The way Unicode is handled on \s-1EBCDIC\s0 platforms is still | |
1208 | experimental. On such platforms, references to \s-1UTF\-8\s0 encoding in this | |
1209 | document and elsewhere should be read as meaning the UTF-EBCDIC | |
1210 | specified in Unicode Technical Report 16, unless \s-1ASCII\s0 vs. \s-1EBCDIC\s0 issues | |
1211 | are specifically discussed. There is no \f(CW\*(C`utfebcdic\*(C'\fR pragma or | |
1212 | \&\*(L":utfebcdic\*(R" layer; rather, \*(L"utf8\*(R" and \*(L":utf8\*(R" are reused to mean | |
1213 | the platform's \*(L"natural\*(R" 8\-bit encoding of Unicode. See perlebcdic | |
1214 | for more discussion of the issues. | |
1215 | .Sh "Locales" | |
1216 | .IX Subsection "Locales" | |
1217 | Usually locale settings and Unicode do not affect each other, but | |
1218 | there are a couple of exceptions: | |
1219 | .IP "\(bu" 4 | |
1220 | You can enable automatic UTF\-8\-ification of your standard file | |
1221 | handles, default \f(CW\*(C`open()\*(C'\fR layer, and \f(CW@ARGV\fR by using either | |
1222 | the \f(CW\*(C`\-C\*(C'\fR command line switch or the \f(CW\*(C`PERL_UNICODE\*(C'\fR environment | |
1223 | variable, see perlrun for the documentation of the \f(CW\*(C`\-C\*(C'\fR switch. | |
1224 | .IP "\(bu" 4 | |
1225 | Perl tries really hard to work both with Unicode and the old | |
1226 | byte-oriented world. Most often this is nice, but sometimes Perl's | |
1227 | straddling of the proverbial fence causes problems. | |
1228 | .Sh "When Unicode Does Not Happen" | |
1229 | .IX Subsection "When Unicode Does Not Happen" | |
1230 | While Perl does have extensive ways to input and output in Unicode, | |
1231 | and a few other 'entry points' like the \f(CW@ARGV\fR which can be interpreted | |
1232 | as Unicode (\s-1UTF\-8\s0), there still are many places where Unicode (in some | |
1233 | encoding or another) could be given as arguments or received as | |
1234 | results, or both, but it is not. | |
1235 | .PP | |
1236 | The following are such interfaces. For all of these interfaces Perl | |
1237 | currently (as of 5.8.3) simply assumes byte strings both as arguments | |
1238 | and results, or \s-1UTF\-8\s0 strings if the \f(CW\*(C`encoding\*(C'\fR pragma has been used. | |
1239 | .PP | |
1240 | One reason why Perl does not attempt to resolve the role of Unicode in | |
1241 | these cases is that the answers are highly dependent on the operating | |
1242 | system and the file system(s). For example, whether filenames can be | |
1243 | in Unicode, and in exactly what kind of encoding, is not exactly a | |
1244 | portable concept. Similarly for the qx and system: how well will the | |
1245 | \&'command line interface' (and which of them?) handle Unicode? | |
1246 | .IP "\(bu" 4 | |
1247 | chdir, chmod, chown, chroot, exec, link, lstat, mkdir, | |
1248 | rename, rmdir, stat, symlink, truncate, unlink, utime, \-X | |
1249 | .IP "\(bu" 4 | |
1250 | %ENV | |
1251 | .IP "\(bu" 4 | |
1252 | glob (aka the <*>) | |
1253 | .IP "\(bu" 4 | |
1254 | open, opendir, sysopen | |
1255 | .IP "\(bu" 4 | |
1256 | qx (aka the backtick operator), system | |
1257 | .IP "\(bu" 4 | |
1258 | readdir, readlink | |
1259 | .Sh "Forcing Unicode in Perl (Or Unforcing Unicode in Perl)" | |
1260 | .IX Subsection "Forcing Unicode in Perl (Or Unforcing Unicode in Perl)" | |
1261 | Sometimes (see \*(L"When Unicode Does Not Happen\*(R") there are | |
1262 | situations where you simply need to force Perl to believe that a byte | |
1263 | string is \s-1UTF\-8\s0, or vice versa. The low-level calls | |
1264 | utf8::upgrade($bytestring) and utf8::downgrade($utf8string) are | |
1265 | the answers. | |
1266 | .PP | |
1267 | Do not use them without careful thought, though: Perl may easily get | |
1268 | very confused, angry, or even crash, if you suddenly change the 'nature' | |
1269 | of a scalar like that. You have to be especially careful if you use | |
1270 | \&\fIutf8::upgrade()\fR: not every random byte string is valid \s-1UTF\-8\s0. | |
1271 | .Sh "Using Unicode in \s-1XS\s0" | |
1272 | .IX Subsection "Using Unicode in XS" | |
1273 | If you want to handle Perl Unicode in \s-1XS\s0 extensions, you may find the | |
1274 | following C APIs useful. See also \*(L"Unicode Support\*(R" in perlguts for an | |
1275 | explanation about Unicode at the \s-1XS\s0 level, and perlapi for the \s-1API\s0 | |
1276 | details. | |
1277 | .IP "\(bu" 4 | |
1278 | \&\f(CW\*(C`DO_UTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR flag is on and the bytes | |
1279 | pragma is not in effect. \f(CW\*(C`SvUTF8(sv)\*(C'\fR returns true if the \f(CW\*(C`UTF8\*(C'\fR | |
1280 | flag is on; the bytes pragma is ignored. The \f(CW\*(C`UTF8\*(C'\fR flag being on | |
1281 | does \fBnot\fR mean that there are any characters of code points greater | |
1282 | than 255 (or 127) in the scalar or that there are even any characters | |
1283 | in the scalar. What the \f(CW\*(C`UTF8\*(C'\fR flag means is that the sequence of | |
1284 | octets in the representation of the scalar is the sequence of \s-1UTF\-8\s0 | |
1285 | encoded code points of the characters of a string. The \f(CW\*(C`UTF8\*(C'\fR flag | |
1286 | being off means that each octet in this representation encodes a | |
1287 | single character with code point 0..255 within the string. Perl's | |
1288 | Unicode model is not to use \s-1UTF\-8\s0 until it is absolutely necessary. | |
1289 | .IP "\(bu" 4 | |
1290 | \&\f(CW\*(C`uvuni_to_utf8(buf, chr)\*(C'\fR writes a Unicode character code point into | |
1291 | a buffer encoding the code point as \s-1UTF\-8\s0, and returns a pointer | |
1292 | pointing after the \s-1UTF\-8\s0 bytes. | |
1293 | .IP "\(bu" 4 | |
1294 | \&\f(CW\*(C`utf8_to_uvuni(buf, lenp)\*(C'\fR reads \s-1UTF\-8\s0 encoded bytes from a buffer and | |
1295 | returns the Unicode character code point and, optionally, the length of | |
1296 | the \s-1UTF\-8\s0 byte sequence. | |
1297 | .IP "\(bu" 4 | |
1298 | \&\f(CW\*(C`utf8_length(start, end)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded buffer | |
1299 | in characters. \f(CW\*(C`sv_len_utf8(sv)\*(C'\fR returns the length of the \s-1UTF\-8\s0 encoded | |
1300 | scalar. | |
1301 | .IP "\(bu" 4 | |
1302 | \&\f(CW\*(C`sv_utf8_upgrade(sv)\*(C'\fR converts the string of the scalar to its \s-1UTF\-8\s0 | |
1303 | encoded form. \f(CW\*(C`sv_utf8_downgrade(sv)\*(C'\fR does the opposite, if | |
1304 | possible. \f(CW\*(C`sv_utf8_encode(sv)\*(C'\fR is like sv_utf8_upgrade except that | |
1305 | it does not set the \f(CW\*(C`UTF8\*(C'\fR flag. \f(CW\*(C`sv_utf8_decode()\*(C'\fR does the | |
1306 | opposite of \f(CW\*(C`sv_utf8_encode()\*(C'\fR. Note that none of these are to be | |
1307 | used as general-purpose encoding or decoding interfaces: \f(CW\*(C`use Encode\*(C'\fR | |
1308 | for that. \f(CW\*(C`sv_utf8_upgrade()\*(C'\fR is affected by the encoding pragma | |
1309 | but \f(CW\*(C`sv_utf8_downgrade()\*(C'\fR is not (since the encoding pragma is | |
1310 | designed to be a one-way street). | |
1311 | .IP "\(bu" 4 | |
1312 | \&\f(CWis_utf8_char(s)\fR returns true if the pointer points to a valid \s-1UTF\-8\s0 | |
1313 | character. | |
1314 | .IP "\(bu" 4 | |
1315 | \&\f(CW\*(C`is_utf8_string(buf, len)\*(C'\fR returns true if \f(CW\*(C`len\*(C'\fR bytes of the buffer | |
1316 | are valid \s-1UTF\-8\s0. | |
1317 | .IP "\(bu" 4 | |
1318 | \&\f(CW\*(C`UTF8SKIP(buf)\*(C'\fR will return the number of bytes in the \s-1UTF\-8\s0 encoded | |
1319 | character in the buffer. \f(CW\*(C`UNISKIP(chr)\*(C'\fR will return the number of bytes | |
1320 | required to UTF\-8\-encode the Unicode character code point. \f(CW\*(C`UTF8SKIP()\*(C'\fR | |
1321 | is useful for example for iterating over the characters of a \s-1UTF\-8\s0 | |
1322 | encoded buffer; \f(CW\*(C`UNISKIP()\*(C'\fR is useful, for example, in computing | |
1323 | the size required for a \s-1UTF\-8\s0 encoded buffer. | |
1324 | .IP "\(bu" 4 | |
1325 | \&\f(CW\*(C`utf8_distance(a, b)\*(C'\fR will tell the distance in characters between the | |
1326 | two pointers pointing to the same \s-1UTF\-8\s0 encoded buffer. | |
1327 | .IP "\(bu" 4 | |
1328 | \&\f(CW\*(C`utf8_hop(s, off)\*(C'\fR will return a pointer to a \s-1UTF\-8\s0 encoded buffer | |
1329 | that is \f(CW\*(C`off\*(C'\fR (positive or negative) Unicode characters displaced | |
1330 | from the \s-1UTF\-8\s0 buffer \f(CW\*(C`s\*(C'\fR. Be careful not to overstep the buffer: | |
1331 | \&\f(CW\*(C`utf8_hop()\*(C'\fR will merrily run off the end or the beginning of the | |
1332 | buffer if told to do so. | |
1333 | .IP "\(bu" 4 | |
1334 | \&\f(CW\*(C`pv_uni_display(dsv, spv, len, pvlim, flags)\*(C'\fR and | |
1335 | \&\f(CW\*(C`sv_uni_display(dsv, ssv, pvlim, flags)\*(C'\fR are useful for debugging the | |
1336 | output of Unicode strings and scalars. By default they are useful | |
1337 | only for debugging\*(--they display \fBall\fR characters as hexadecimal code | |
1338 | points\*(--but with the flags \f(CW\*(C`UNI_DISPLAY_ISPRINT\*(C'\fR, | |
1339 | \&\f(CW\*(C`UNI_DISPLAY_BACKSLASH\*(C'\fR, and \f(CW\*(C`UNI_DISPLAY_QQ\*(C'\fR you can make the | |
1340 | output more readable. | |
1341 | .IP "\(bu" 4 | |
1342 | \&\f(CW\*(C`ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2)\*(C'\fR can be used to | |
1343 | compare two strings case-insensitively in Unicode. For case-sensitive | |
1344 | comparisons you can just use \f(CW\*(C`memEQ()\*(C'\fR and \f(CW\*(C`memNE()\*(C'\fR as usual. | |
1345 | .PP | |
1346 | For more information, see perlapi, and \fIutf8.c\fR and \fIutf8.h\fR | |
1347 | in the Perl source code distribution. | |
1348 | .SH "BUGS" | |
1349 | .IX Header "BUGS" | |
1350 | .Sh "Interaction with Locales" | |
1351 | .IX Subsection "Interaction with Locales" | |
1352 | Use of locales with Unicode data may lead to odd results. Currently, | |
1353 | Perl attempts to attach 8\-bit locale info to characters in the range | |
1354 | 0..255, but this technique is demonstrably incorrect for locales that | |
1355 | use characters above that range when mapped into Unicode. Perl's | |
1356 | Unicode support will also tend to run slower. Use of locales with | |
1357 | Unicode is discouraged. | |
1358 | .Sh "Interaction with Extensions" | |
1359 | .IX Subsection "Interaction with Extensions" | |
1360 | When Perl exchanges data with an extension, the extension should be | |
1361 | able to understand the \s-1UTF\-8\s0 flag and act accordingly. If the | |
1362 | extension doesn't know about the flag, it's likely that the extension | |
1363 | will return incorrectly-flagged data. | |
1364 | .PP | |
1365 | So if you're working with Unicode data, consult the documentation of | |
1366 | every module you're using to see if there are any issues with Unicode data | |
1367 | exchange. If the documentation does not talk about Unicode at all, | |
1368 | suspect the worst and probably look at the source to learn how the | |
1369 | module is implemented. Modules written completely in Perl shouldn't | |
1370 | cause problems. Modules that directly or indirectly access code written | |
1371 | in other programming languages are at risk. | |
1372 | .PP | |
1373 | For affected functions, the simple strategy to avoid data corruption is | |
1374 | to always make the encoding of the exchanged data explicit. Choose an | |
1375 | encoding that you know the extension can handle. Convert arguments passed | |
1376 | to the extensions to that encoding and convert results back from that | |
1377 | encoding. Write wrapper functions that do the conversions for you, so | |
1378 | you can later change the functions when the extension catches up. | |
1379 | .PP | |
1380 | To provide an example, let's say the popular Foo::Bar::escape_html | |
1381 | function doesn't deal with Unicode data yet. The wrapper function | |
1382 | would convert the argument to raw \s-1UTF\-8\s0 and convert the result back to | |
1383 | Perl's internal representation like so: | |
1384 | .PP | |
1385 | .Vb 5 | |
1386 | \& sub my_escape_html ($) { | |
1387 | \& my($what) = shift; | |
1388 | \& return unless defined $what; | |
1389 | \& Encode::decode_utf8(Foo::Bar::escape_html(Encode::encode_utf8($what))); | |
1390 | \& } | |
1391 | .Ve | |
1392 | .PP | |
1393 | Sometimes, when the extension does not convert data but just stores | |
1394 | and retrieves them, you will be in a position to use the otherwise | |
1395 | dangerous \fIEncode::_utf8_on()\fR function. Let's say the popular | |
1396 | \&\f(CW\*(C`Foo::Bar\*(C'\fR extension, written in C, provides a \f(CW\*(C`param\*(C'\fR method that | |
1397 | lets you store and retrieve data according to these prototypes: | |
1398 | .PP | |
1399 | .Vb 2 | |
1400 | \& $self->param($name, $value); # set a scalar | |
1401 | \& $value = $self->param($name); # retrieve a scalar | |
1402 | .Ve | |
1403 | .PP | |
1404 | If it does not yet provide support for any encoding, one could write a | |
1405 | derived class with such a \f(CW\*(C`param\*(C'\fR method: | |
1406 | .PP | |
1407 | .Vb 12 | |
1408 | \& sub param { | |
1409 | \& my($self,$name,$value) = @_; | |
1410 | \& utf8::upgrade($name); # make sure it is UTF-8 encoded | |
1411 | \& if (defined $value) { | |
1412 | \& utf8::upgrade($value); # make sure it is UTF-8 encoded | |
1413 | \& return $self->SUPER::param($name,$value); | |
1414 | \& } else { | |
1415 | \& my $ret = $self->SUPER::param($name); | |
1416 | \& Encode::_utf8_on($ret); # we know, it is UTF-8 encoded | |
1417 | \& return $ret; | |
1418 | \& } | |
1419 | \& } | |
1420 | .Ve | |
1421 | .PP | |
1422 | Some extensions provide filters on data entry/exit points, such as | |
1423 | DB_File::filter_store_key and family. Look out for such filters in | |
1424 | the documentation of your extensions; they can make the transition to | |
1425 | Unicode data much easier. | |
1426 | .Sh "Speed" | |
1427 | .IX Subsection "Speed" | |
1428 | Some functions are slower when working on \s-1UTF\-8\s0 encoded strings than | |
1429 | on byte encoded strings. All functions that need to hop over | |
1430 | characters such as \fIlength()\fR, \fIsubstr()\fR or \fIindex()\fR, or matching regular | |
1431 | expressions can work \fBmuch\fR faster when the underlying data are | |
1432 | byte\-encoded. | |
1433 | .PP | |
1434 | In Perl 5.8.0 the slowness was often quite spectacular; in Perl 5.8.1 | |
1435 | a caching scheme was introduced which will hopefully make the slowness | |
1436 | somewhat less spectacular, at least for some operations. In general, | |
1437 | operations with \s-1UTF\-8\s0 encoded strings are still slower. As an example, | |
1438 | the Unicode properties (character classes) like \f(CW\*(C`\ep{Nd}\*(C'\fR are known to | |
1439 | be quite a bit slower (5\-20 times) than their simpler counterparts | |
1440 | like \f(CW\*(C`\ed\*(C'\fR (then again, there are 268 Unicode characters matching \f(CW\*(C`Nd\*(C'\fR | |
1441 | compared with the 10 \s-1ASCII\s0 characters matching \f(CW\*(C`d\*(C'\fR). | |
1442 | .Sh "Porting code from perl\-5.6.X" | |
1443 | .IX Subsection "Porting code from perl-5.6.X" | |
1444 | Perl 5.8 has a different Unicode model from 5.6. In 5.6 the programmer | |
1445 | was required to use the \f(CW\*(C`utf8\*(C'\fR pragma to declare that a given scope | |
1446 | expected to deal with Unicode data and had to make sure that only | |
1447 | Unicode data were reaching that scope. If you have code that is | |
1448 | working with 5.6, you will need some of the following adjustments to | |
1449 | your code. The examples are written such that the code will continue | |
1450 | to work under 5.6, so you should be safe to try them out. | |
1451 | .IP "\(bu" 4 | |
1452 | A filehandle that should read or write \s-1UTF\-8\s0 | |
1453 | .Sp | |
1454 | .Vb 3 | |
1455 | \& if ($] > 5.007) { | |
1456 | \& binmode $fh, ":utf8"; | |
1457 | \& } | |
1458 | .Ve | |
1459 | .IP "\(bu" 4 | |
1460 | A scalar that is going to be passed to some extension | |
1461 | .Sp | |
1462 | Be it Compress::Zlib, Apache::Request or any extension that has no | |
1463 | mention of Unicode in the manpage, you need to make sure that the | |
1464 | \&\s-1UTF\-8\s0 flag is stripped off. Note that at the time of this writing | |
1465 | (October 2002) the mentioned modules are not UTF\-8\-aware. Please | |
1466 | check the documentation to verify if this is still true. | |
1467 | .Sp | |
1468 | .Vb 4 | |
1469 | \& if ($] > 5.007) { | |
1470 | \& require Encode; | |
1471 | \& $val = Encode::encode_utf8($val); # make octets | |
1472 | \& } | |
1473 | .Ve | |
1474 | .IP "\(bu" 4 | |
1475 | A scalar we got back from an extension | |
1476 | .Sp | |
1477 | If you believe the scalar comes back as \s-1UTF\-8\s0, you will most likely | |
1478 | want the \s-1UTF\-8\s0 flag restored: | |
1479 | .Sp | |
1480 | .Vb 4 | |
1481 | \& if ($] > 5.007) { | |
1482 | \& require Encode; | |
1483 | \& $val = Encode::decode_utf8($val); | |
1484 | \& } | |
1485 | .Ve | |
1486 | .IP "\(bu" 4 | |
1487 | Same thing, if you are really sure it is \s-1UTF\-8\s0 | |
1488 | .Sp | |
1489 | .Vb 4 | |
1490 | \& if ($] > 5.007) { | |
1491 | \& require Encode; | |
1492 | \& Encode::_utf8_on($val); | |
1493 | \& } | |
1494 | .Ve | |
1495 | .IP "\(bu" 4 | |
1496 | A wrapper for fetchrow_array and fetchrow_hashref | |
1497 | .Sp | |
1498 | When the database contains only \s-1UTF\-8\s0, a wrapper function or method is | |
1499 | a convenient way to replace all your fetchrow_array and | |
1500 | fetchrow_hashref calls. A wrapper function will also make it easier to | |
1501 | adapt to future enhancements in your database driver. Note that at the | |
1502 | time of this writing (October 2002), the \s-1DBI\s0 has no standardized way | |
1503 | to deal with \s-1UTF\-8\s0 data. Please check the documentation to verify if | |
1504 | that is still true. | |
1505 | .Sp | |
1506 | .Vb 26 | |
1507 | \& sub fetchrow { | |
1508 | \& my($self, $sth, $what) = @_; # $what is one of fetchrow_{array,hashref} | |
1509 | \& if ($] < 5.007) { | |
1510 | \& return $sth->$what; | |
1511 | \& } else { | |
1512 | \& require Encode; | |
1513 | \& if (wantarray) { | |
1514 | \& my @arr = $sth->$what; | |
1515 | \& for (@arr) { | |
1516 | \& defined && /[^\e000-\e177]/ && Encode::_utf8_on($_); | |
1517 | \& } | |
1518 | \& return @arr; | |
1519 | \& } else { | |
1520 | \& my $ret = $sth->$what; | |
1521 | \& if (ref $ret) { | |
1522 | \& for my $k (keys %$ret) { | |
1523 | \& defined && /[^\e000-\e177]/ && Encode::_utf8_on($_) for $ret->{$k}; | |
1524 | \& } | |
1525 | \& return $ret; | |
1526 | \& } else { | |
1527 | \& defined && /[^\e000-\e177]/ && Encode::_utf8_on($_) for $ret; | |
1528 | \& return $ret; | |
1529 | \& } | |
1530 | \& } | |
1531 | \& } | |
1532 | \& } | |
1533 | .Ve | |
1534 | .IP "\(bu" 4 | |
1535 | A large scalar that you know can only contain \s-1ASCII\s0 | |
1536 | .Sp | |
1537 | Scalars that contain only \s-1ASCII\s0 and are marked as \s-1UTF\-8\s0 are sometimes | |
1538 | a drag to your program. If you recognize such a situation, just remove | |
1539 | the \s-1UTF\-8\s0 flag: | |
1540 | .Sp | |
1541 | .Vb 1 | |
1542 | \& utf8::downgrade($val) if $] > 5.007; | |
1543 | .Ve | |
1544 | .SH "SEE ALSO" | |
1545 | .IX Header "SEE ALSO" | |
1546 | perluniintro, encoding, Encode, open, utf8, bytes, | |
1547 | perlretut, \*(L"${^UNICODE}\*(R" in perlvar |