Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "Unicode::UCD 3" | |
132 | .TH Unicode::UCD 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | Unicode::UCD \- Unicode character database | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 2 | |
138 | \& use Unicode::UCD 'charinfo'; | |
139 | \& my $charinfo = charinfo($codepoint); | |
140 | .Ve | |
141 | .PP | |
142 | .Vb 2 | |
143 | \& use Unicode::UCD 'charblock'; | |
144 | \& my $charblock = charblock($codepoint); | |
145 | .Ve | |
146 | .PP | |
147 | .Vb 2 | |
148 | \& use Unicode::UCD 'charscript'; | |
149 | \& my $charscript = charscript($codepoint); | |
150 | .Ve | |
151 | .PP | |
152 | .Vb 2 | |
153 | \& use Unicode::UCD 'charblocks'; | |
154 | \& my $charblocks = charblocks(); | |
155 | .Ve | |
156 | .PP | |
157 | .Vb 2 | |
158 | \& use Unicode::UCD 'charscripts'; | |
159 | \& my %charscripts = charscripts(); | |
160 | .Ve | |
161 | .PP | |
162 | .Vb 3 | |
163 | \& use Unicode::UCD qw(charscript charinrange); | |
164 | \& my $range = charscript($script); | |
165 | \& print "looks like $script\en" if charinrange($range, $codepoint); | |
166 | .Ve | |
167 | .PP | |
168 | .Vb 2 | |
169 | \& use Unicode::UCD 'compexcl'; | |
170 | \& my $compexcl = compexcl($codepoint); | |
171 | .Ve | |
172 | .PP | |
173 | .Vb 2 | |
174 | \& use Unicode::UCD 'namedseq'; | |
175 | \& my $namedseq = namedseq($named_sequence_name); | |
176 | .Ve | |
177 | .PP | |
178 | .Vb 1 | |
179 | \& my $unicode_version = Unicode::UCD::UnicodeVersion(); | |
180 | .Ve | |
181 | .SH "DESCRIPTION" | |
182 | .IX Header "DESCRIPTION" | |
183 | The Unicode::UCD module offers a simple interface to the Unicode | |
184 | Character Database. | |
185 | .Sh "charinfo" | |
186 | .IX Subsection "charinfo" | |
187 | .Vb 1 | |
188 | \& use Unicode::UCD 'charinfo'; | |
189 | .Ve | |
190 | .PP | |
191 | .Vb 1 | |
192 | \& my $charinfo = charinfo(0x41); | |
193 | .Ve | |
194 | .PP | |
195 | \&\fIcharinfo()\fR returns a reference to a hash that has the following fields | |
196 | as defined by the Unicode standard: | |
197 | .PP | |
198 | .Vb 1 | |
199 | \& key | |
200 | .Ve | |
201 | .PP | |
202 | .Vb 15 | |
203 | \& code code point with at least four hexdigits | |
204 | \& name name of the character IN UPPER CASE | |
205 | \& category general category of the character | |
206 | \& combining classes used in the Canonical Ordering Algorithm | |
207 | \& bidi bidirectional category | |
208 | \& decomposition character decomposition mapping | |
209 | \& decimal if decimal digit this is the integer numeric value | |
210 | \& digit if digit this is the numeric value | |
211 | \& numeric if numeric this is the integer or rational numeric value | |
212 | \& mirrored if mirrored in bidirectional text | |
213 | \& unicode10 Unicode 1.0 name if existed and different | |
214 | \& comment ISO 10646 comment field | |
215 | \& upper uppercase equivalent mapping | |
216 | \& lower lowercase equivalent mapping | |
217 | \& title titlecase equivalent mapping | |
218 | .Ve | |
219 | .PP | |
220 | .Vb 2 | |
221 | \& block block the character belongs to (used in \ep{In...}) | |
222 | \& script script the character belongs to | |
223 | .Ve | |
224 | .PP | |
225 | If no match is found, a reference to an empty hash is returned. | |
226 | .PP | |
227 | The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR. It is | |
228 | not defined in the Unicode Character Database proper (Chapter 4 of the | |
229 | Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database | |
230 | (Chapter 14 of \s-1TUS3\s0). Similarly for the \f(CW\*(C`script\*(C'\fR property. | |
231 | .PP | |
232 | Note that you cannot do (de)composition and casing based solely on the | |
233 | above \f(CW\*(C`decomposition\*(C'\fR and \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, \f(CW\*(C`title\*(C'\fR properties; | |
234 | you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions. | |
235 | .Sh "charblock" | |
236 | .IX Subsection "charblock" | |
237 | .Vb 1 | |
238 | \& use Unicode::UCD 'charblock'; | |
239 | .Ve | |
240 | .PP | |
241 | .Vb 4 | |
242 | \& my $charblock = charblock(0x41); | |
243 | \& my $charblock = charblock(1234); | |
244 | \& my $charblock = charblock("0x263a"); | |
245 | \& my $charblock = charblock("U+263a"); | |
246 | .Ve | |
247 | .PP | |
248 | .Vb 1 | |
249 | \& my $range = charblock('Armenian'); | |
250 | .Ve | |
251 | .PP | |
252 | With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character | |
253 | belongs to, e.g. \f(CW\*(C`Basic Latin\*(C'\fR. Note that not all the character | |
254 | positions within all blocks are defined. | |
255 | .PP | |
256 | See also \*(L"Blocks versus Scripts\*(R". | |
257 | .PP | |
258 | If supplied with an argument that can't be a code point, \fIcharblock()\fR tries | |
259 | to do the opposite and interpret the argument as a character block. The | |
260 | return value is a \fIrange\fR: an anonymous list of lists that contain | |
261 | \&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether | |
262 | a code point is in a range using the \*(L"charinrange\*(R" function. If the | |
263 | argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned. | |
264 | .Sh "charscript" | |
265 | .IX Subsection "charscript" | |
266 | .Vb 1 | |
267 | \& use Unicode::UCD 'charscript'; | |
268 | .Ve | |
269 | .PP | |
270 | .Vb 3 | |
271 | \& my $charscript = charscript(0x41); | |
272 | \& my $charscript = charscript(1234); | |
273 | \& my $charscript = charscript("U+263a"); | |
274 | .Ve | |
275 | .PP | |
276 | .Vb 1 | |
277 | \& my $range = charscript('Thai'); | |
278 | .Ve | |
279 | .PP | |
280 | With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the | |
281 | character belongs to, e.g. \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR. | |
282 | .PP | |
283 | See also \*(L"Blocks versus Scripts\*(R". | |
284 | .PP | |
285 | If supplied with an argument that can't be a code point, \fIcharscript()\fR tries | |
286 | to do the opposite and interpret the argument as a character script. The | |
287 | return value is a \fIrange\fR: an anonymous list of lists that contain | |
288 | \&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a | |
289 | code point is in a range using the \*(L"charinrange\*(R" function. If the | |
290 | argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned. | |
291 | .Sh "charblocks" | |
292 | .IX Subsection "charblocks" | |
293 | .Vb 1 | |
294 | \& use Unicode::UCD 'charblocks'; | |
295 | .Ve | |
296 | .PP | |
297 | .Vb 1 | |
298 | \& my $charblocks = charblocks(); | |
299 | .Ve | |
300 | .PP | |
301 | \&\fIcharblocks()\fR returns a reference to a hash with the known block names | |
302 | as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values. | |
303 | .PP | |
304 | See also \*(L"Blocks versus Scripts\*(R". | |
305 | .Sh "charscripts" | |
306 | .IX Subsection "charscripts" | |
307 | .Vb 1 | |
308 | \& use Unicode::UCD 'charscripts'; | |
309 | .Ve | |
310 | .PP | |
311 | .Vb 1 | |
312 | \& my %charscripts = charscripts(); | |
313 | .Ve | |
314 | .PP | |
315 | \&\fIcharscripts()\fR returns a hash with the known script names as the keys, | |
316 | and the code point ranges (see \*(L"charscript\*(R") as the values. | |
317 | .PP | |
318 | See also \*(L"Blocks versus Scripts\*(R". | |
319 | .Sh "Blocks versus Scripts" | |
320 | .IX Subsection "Blocks versus Scripts" | |
321 | The difference between a block and a script is that scripts are closer | |
322 | to the linguistic notion of a set of characters required to present | |
323 | languages, while a block is more of an artifact of the Unicode character | |
324 | numbering and separation into blocks of (mostly) 256 characters. | |
325 | .PP | |
326 | For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such | |
327 | as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and | |
328 | \&\f(CW\*(C`Latin Extended\-B\*(C'\fR. On the other hand, the Latin script does not | |
329 | contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as | |
330 | \s-1ASCII\s0): it includes only the letters, and not, for example, the digits | |
331 | or the punctuation. | |
332 | .PP | |
333 | For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt | |
334 | .PP | |
335 | For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/ | |
336 | .Sh "Matching Scripts and Blocks" | |
337 | .IX Subsection "Matching Scripts and Blocks" | |
338 | Scripts are matched with the regular-expression construct | |
339 | \&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script), | |
340 | while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches | |
341 | any of the 256 code points in the Tibetan block). | |
342 | .Sh "Code Point Arguments" | |
343 | .IX Subsection "Code Point Arguments" | |
344 | A \fIcode point argument\fR is either a decimal or a hexadecimal scalar | |
345 | designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals | |
346 | designating a Unicode character. In other words, if you want a code | |
347 | point to be interpreted as a hexadecimal number, you must prefix it | |
348 | with either \f(CW\*(C`0x\*(C'\fR or \f(CW\*(C`U+\*(C'\fR, because a string such as \f(CW123\fR will | |
349 | be interpreted as a decimal code point. Also note that Unicode is | |
350 | \&\fBnot\fR limited to 16 bits (the number of Unicode characters is | |
351 | open\-ended, in theory unlimited): you may have more than 4 hexdigits. | |
352 | .Sh "charinrange" | |
353 | .IX Subsection "charinrange" | |
354 | In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you | |
355 | can also test whether a code point is in the \fIrange\fR as returned by | |
356 | \&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned | |
357 | by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR: | |
358 | .PP | |
359 | .Vb 1 | |
360 | \& use Unicode::UCD qw(charscript charinrange); | |
361 | .Ve | |
362 | .PP | |
363 | .Vb 2 | |
364 | \& $range = charscript('Hiragana'); | |
365 | \& print "looks like hiragana\en" if charinrange($range, $codepoint); | |
366 | .Ve | |
367 | .Sh "compexcl" | |
368 | .IX Subsection "compexcl" | |
369 | .Vb 1 | |
370 | \& use Unicode::UCD 'compexcl'; | |
371 | .Ve | |
372 | .PP | |
373 | .Vb 1 | |
374 | \& my $compexcl = compexcl("09dc"); | |
375 | .Ve | |
376 | .PP | |
377 | The \fIcompexcl()\fR returns the composition exclusion (that is, if the | |
378 | character should not be produced during a precomposition) of the | |
379 | character specified by a \fBcode point argument\fR. | |
380 | .PP | |
381 | If there is a composition exclusion for the character, true is | |
382 | returned. Otherwise, false is returned. | |
383 | .Sh "casefold" | |
384 | .IX Subsection "casefold" | |
385 | .Vb 1 | |
386 | \& use Unicode::UCD 'casefold'; | |
387 | .Ve | |
388 | .PP | |
389 | .Vb 1 | |
390 | \& my $casefold = casefold("00DF"); | |
391 | .Ve | |
392 | .PP | |
393 | The \fIcasefold()\fR returns the locale-independent case folding of the | |
394 | character specified by a \fBcode point argument\fR. | |
395 | .PP | |
396 | If there is a case folding for that character, a reference to a hash | |
397 | with the following fields is returned: | |
398 | .PP | |
399 | .Vb 1 | |
400 | \& key | |
401 | .Ve | |
402 | .PP | |
403 | .Vb 3 | |
404 | \& code code point with at least four hexdigits | |
405 | \& status "C", "F", "S", or "I" | |
406 | \& mapping one or more codes separated by spaces | |
407 | .Ve | |
408 | .PP | |
409 | The meaning of the \fIstatus\fR is as follows: | |
410 | .PP | |
411 | .Vb 15 | |
412 | \& C common case folding, common mappings shared | |
413 | \& by both simple and full mappings | |
414 | \& F full case folding, mappings that cause strings | |
415 | \& to grow in length. Multiple characters are separated | |
416 | \& by spaces | |
417 | \& S simple case folding, mappings to single characters | |
418 | \& where different from F | |
419 | \& I special case for dotted uppercase I and | |
420 | \& dotless lowercase i | |
421 | \& - If this mapping is included, the result is | |
422 | \& case-insensitive, but dotless and dotted I's | |
423 | \& are not distinguished | |
424 | \& - If this mapping is excluded, the result is not | |
425 | \& fully case-insensitive, but dotless and dotted | |
426 | \& I's are distinguished | |
427 | .Ve | |
428 | .PP | |
429 | If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned. | |
430 | .PP | |
431 | For more information about case mappings see | |
432 | http://www.unicode.org/unicode/reports/tr21/ | |
433 | .Sh "casespec" | |
434 | .IX Subsection "casespec" | |
435 | .Vb 1 | |
436 | \& use Unicode::UCD 'casespec'; | |
437 | .Ve | |
438 | .PP | |
439 | .Vb 1 | |
440 | \& my $casespec = casespec("FB00"); | |
441 | .Ve | |
442 | .PP | |
443 | The \fIcasespec()\fR returns the potentially locale-dependent case mapping | |
444 | of the character specified by a \fBcode point argument\fR. The mapping | |
445 | may change the length of the string (which the basic Unicode case | |
446 | mappings as returned by \fIcharinfo()\fR never do). | |
447 | .PP | |
448 | If there is a case mapping for that character, a reference to a hash | |
449 | with the following fields is returned: | |
450 | .PP | |
451 | .Vb 1 | |
452 | \& key | |
453 | .Ve | |
454 | .PP | |
455 | .Vb 5 | |
456 | \& code code point with at least four hexdigits | |
457 | \& lower lowercase | |
458 | \& title titlecase | |
459 | \& upper uppercase | |
460 | \& condition condition list (may be undef) | |
461 | .Ve | |
462 | .PP | |
463 | The \f(CW\*(C`condition\*(C'\fR is optional. Where present, it consists of one or | |
464 | more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as | |
465 | used to separate elements, spaces are to be ignored). A condition | |
466 | list overrides the normal behavior if all of the listed conditions are | |
467 | true. Case distinctions in the condition list are not significant. | |
468 | Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition. | |
469 | .PP | |
470 | Note that when there are multiple case mapping definitions for a | |
471 | single code point because of different locales, the value returned by | |
472 | \&\fIcasespec()\fR is a hash reference which has the locales as the keys and | |
473 | hash references as described above as the values. | |
474 | .PP | |
475 | A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly | |
476 | followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed | |
477 | by a \*(L"_\*(R" and a variant code). You can find the lists of those codes | |
478 | in Locale::Country and Locale::Language. | |
479 | .PP | |
480 | A \fIcontext\fR is one of the following choices: | |
481 | .PP | |
482 | .Vb 4 | |
483 | \& FINAL The letter is not followed by a letter of | |
484 | \& general category L (e.g. Ll, Lt, Lu, Lm, or Lo) | |
485 | \& MODERN The mapping is only used for modern text | |
486 | \& AFTER_i The last base character was "i" (U+0069) | |
487 | .Ve | |
488 | .PP | |
489 | For more information about case mappings see | |
490 | http://www.unicode.org/unicode/reports/tr21/ | |
491 | .Sh "\fInamedseq()\fP" | |
492 | .IX Subsection "namedseq()" | |
493 | .Vb 1 | |
494 | \& use Unicode::UCD 'namedseq'; | |
495 | .Ve | |
496 | .PP | |
497 | .Vb 3 | |
498 | \& my $namedseq = namedseq("KATAKANA LETTER AINU P"); | |
499 | \& my @namedseq = namedseq("KATAKANA LETTER AINU P"); | |
500 | \& my %namedseq = namedseq(); | |
501 | .Ve | |
502 | .PP | |
503 | If used with a single argument in a scalar context, returns the string | |
504 | consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no | |
505 | named sequence by that name exists. If used with a single argument in | |
506 | a list context, returns a list of the code points. If used with no | |
507 | arguments in a list context, returns a hash with the names of the | |
508 | named sequences as the keys and the named sequences as strings as | |
509 | the values. Otherwise, returns \f(CW\*(C`undef\*(C'\fR or empty list depending | |
510 | on the context. | |
511 | .PP | |
512 | (New from Unicode 4.1.0) | |
513 | .Sh "Unicode::UCD::UnicodeVersion" | |
514 | .IX Subsection "Unicode::UCD::UnicodeVersion" | |
515 | \&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode | |
516 | Character Database, in other words, the version of the Unicode | |
517 | standard the database implements. The version is a string | |
518 | of numbers delimited by dots (\f(CW'.'\fR). | |
519 | .Sh "Implementation Note" | |
520 | .IX Subsection "Implementation Note" | |
521 | The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode | |
522 | Character Database (the database is included in the Perl distribution). | |
523 | The filehandle is then kept open for further queries. In other words, | |
524 | if you are wondering where one of your filehandles went, that's where. | |
525 | .SH "BUGS" | |
526 | .IX Header "BUGS" | |
527 | Does not yet support \s-1EBCDIC\s0 platforms. | |
528 | .SH "AUTHOR" | |
529 | .IX Header "AUTHOR" | |
530 | Jarkko Hietaniemi |