Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Unicode::UCD.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Unicode::UCD 3"
132.TH Unicode::UCD 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134Unicode::UCD \- Unicode character database
135.SH "SYNOPSIS"
136.IX Header "SYNOPSIS"
137.Vb 2
138\& use Unicode::UCD 'charinfo';
139\& my $charinfo = charinfo($codepoint);
140.Ve
141.PP
142.Vb 2
143\& use Unicode::UCD 'charblock';
144\& my $charblock = charblock($codepoint);
145.Ve
146.PP
147.Vb 2
148\& use Unicode::UCD 'charscript';
149\& my $charscript = charscript($codepoint);
150.Ve
151.PP
152.Vb 2
153\& use Unicode::UCD 'charblocks';
154\& my $charblocks = charblocks();
155.Ve
156.PP
157.Vb 2
158\& use Unicode::UCD 'charscripts';
159\& my %charscripts = charscripts();
160.Ve
161.PP
162.Vb 3
163\& use Unicode::UCD qw(charscript charinrange);
164\& my $range = charscript($script);
165\& print "looks like $script\en" if charinrange($range, $codepoint);
166.Ve
167.PP
168.Vb 2
169\& use Unicode::UCD 'compexcl';
170\& my $compexcl = compexcl($codepoint);
171.Ve
172.PP
173.Vb 1
174\& my $unicode_version = Unicode::UCD::UnicodeVersion();
175.Ve
176.SH "DESCRIPTION"
177.IX Header "DESCRIPTION"
178The Unicode::UCD module offers a simple interface to the Unicode
179Character Database.
180.Sh "charinfo"
181.IX Subsection "charinfo"
182.Vb 1
183\& use Unicode::UCD 'charinfo';
184.Ve
185.PP
186.Vb 1
187\& my $charinfo = charinfo(0x41);
188.Ve
189.PP
190\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
191as defined by the Unicode standard:
192.PP
193.Vb 1
194\& key
195.Ve
196.PP
197.Vb 15
198\& code code point with at least four hexdigits
199\& name name of the character IN UPPER CASE
200\& category general category of the character
201\& combining classes used in the Canonical Ordering Algorithm
202\& bidi bidirectional category
203\& decomposition character decomposition mapping
204\& decimal if decimal digit this is the integer numeric value
205\& digit if digit this is the numeric value
206\& numeric if numeric this is the integer or rational numeric value
207\& mirrored if mirrored in bidirectional text
208\& unicode10 Unicode 1.0 name if existed and different
209\& comment ISO 10646 comment field
210\& upper uppercase equivalent mapping
211\& lower lowercase equivalent mapping
212\& title titlecase equivalent mapping
213.Ve
214.PP
215.Vb 2
216\& block block the character belongs to (used in \ep{In...})
217\& script script the character belongs to
218.Ve
219.PP
220If no match is found, a reference to an empty hash is returned.
221.PP
222The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR. It is
223not defined in the Unicode Character Database proper (Chapter 4 of the
224Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
225(Chapter 14 of \s-1TUS3\s0). Similarly for the \f(CW\*(C`script\*(C'\fR property.
226.PP
227Note that you cannot do (de)composition and casing based solely on the
228above \f(CW\*(C`decomposition\*(C'\fR and \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, \f(CW\*(C`title\*(C'\fR properties;
229you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
230.Sh "charblock"
231.IX Subsection "charblock"
232.Vb 1
233\& use Unicode::UCD 'charblock';
234.Ve
235.PP
236.Vb 4
237\& my $charblock = charblock(0x41);
238\& my $charblock = charblock(1234);
239\& my $charblock = charblock("0x263a");
240\& my $charblock = charblock("U+263a");
241.Ve
242.PP
243.Vb 1
244\& my $range = charblock('Armenian');
245.Ve
246.PP
247With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
248belongs to, e.g. \f(CW\*(C`Basic Latin\*(C'\fR. Note that not all the character
249positions within all blocks are defined.
250.PP
251See also \*(L"Blocks versus Scripts\*(R".
252.PP
253If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
254to do the opposite and interpret the argument as a character block. The
255return value is a \fIrange\fR: an anonymous list of lists that contain
256\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
257code point is in a range using the \*(L"charinrange\*(R" function. If the
258argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned.
259.Sh "charscript"
260.IX Subsection "charscript"
261.Vb 1
262\& use Unicode::UCD 'charscript';
263.Ve
264.PP
265.Vb 3
266\& my $charscript = charscript(0x41);
267\& my $charscript = charscript(1234);
268\& my $charscript = charscript("U+263a");
269.Ve
270.PP
271.Vb 1
272\& my $range = charscript('Thai');
273.Ve
274.PP
275With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
276character belongs to, e.g. \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
277.PP
278See also \*(L"Blocks versus Scripts\*(R".
279.PP
280If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
281to do the opposite and interpret the argument as a character script. The
282return value is a \fIrange\fR: an anonymous list of lists that contain
283\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
284code point is in a range using the \*(L"charinrange\*(R" function. If the
285argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned.
286.Sh "charblocks"
287.IX Subsection "charblocks"
288.Vb 1
289\& use Unicode::UCD 'charblocks';
290.Ve
291.PP
292.Vb 1
293\& my $charblocks = charblocks();
294.Ve
295.PP
296\&\fIcharblocks()\fR returns a reference to a hash with the known block names
297as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values.
298.PP
299See also \*(L"Blocks versus Scripts\*(R".
300.Sh "charscripts"
301.IX Subsection "charscripts"
302.Vb 1
303\& use Unicode::UCD 'charscripts';
304.Ve
305.PP
306.Vb 1
307\& my %charscripts = charscripts();
308.Ve
309.PP
310\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
311and the code point ranges (see \*(L"charscript\*(R") as the values.
312.PP
313See also \*(L"Blocks versus Scripts\*(R".
314.Sh "Blocks versus Scripts"
315.IX Subsection "Blocks versus Scripts"
316The difference between a block and a script is that scripts are closer
317to the linguistic notion of a set of characters required to present
318languages, while block is more of an artifact of the Unicode character
319numbering and separation into blocks of (mostly) 256 characters.
320.PP
321For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
322as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
323\&\f(CW\*(C`Latin Extended\-B\*(C'\fR. On the other hand, the Latin script does not
324contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
325the \s-1ASCII\s0): it includes only the letters, and not, for example, the digits
326or the punctuation.
327.PP
328For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
329.PP
330For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
331.Sh "Matching Scripts and Blocks"
332.IX Subsection "Matching Scripts and Blocks"
333Scripts are matched with the regular-expression construct
334\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
335while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches
336any of the 256 code points in the Tibetan block).
337.Sh "Code Point Arguments"
338.IX Subsection "Code Point Arguments"
339A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
340designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals
341designating a Unicode character. Note that Unicode is \fBnot\fR limited
342to 16 bits (the number of Unicode characters is open\-ended, in theory
343unlimited): you may have more than 4 hexdigits.
344.Sh "charinrange"
345.IX Subsection "charinrange"
346In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you
347can also test whether a code point is in the \fIrange\fR as returned by
348\&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned
349by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR:
350.PP
351.Vb 1
352\& use Unicode::UCD qw(charscript charinrange);
353.Ve
354.PP
355.Vb 2
356\& $range = charscript('Hiragana');
357\& print "looks like hiragana\en" if charinrange($range, $codepoint);
358.Ve
359.Sh "compexcl"
360.IX Subsection "compexcl"
361.Vb 1
362\& use Unicode::UCD 'compexcl';
363.Ve
364.PP
365.Vb 1
366\& my $compexcl = compexcl("09dc");
367.Ve
368.PP
369The \fIcompexcl()\fR returns the composition exclusion (that is, if the
370character should not be produced during a precomposition) of the
371character specified by a \fBcode point argument\fR.
372.PP
373If there is a composition exclusion for the character, true is
374returned. Otherwise, false is returned.
375.Sh "casefold"
376.IX Subsection "casefold"
377.Vb 1
378\& use Unicode::UCD 'casefold';
379.Ve
380.PP
381.Vb 1
382\& my $casefold = casefold("09dc");
383.Ve
384.PP
385The \fIcasefold()\fR returns the locale-independent case folding of the
386character specified by a \fBcode point argument\fR.
387.PP
388If there is a case folding for that character, a reference to a hash
389with the following fields is returned:
390.PP
391.Vb 1
392\& key
393.Ve
394.PP
395.Vb 3
396\& code code point with at least four hexdigits
397\& status "C", "F", "S", or "I"
398\& mapping one or more codes separated by spaces
399.Ve
400.PP
401The meaning of the \fIstatus\fR is as follows:
402.PP
403.Vb 15
404\& C common case folding, common mappings shared
405\& by both simple and full mappings
406\& F full case folding, mappings that cause strings
407\& to grow in length. Multiple characters are separated
408\& by spaces
409\& S simple case folding, mappings to single characters
410\& where different from F
411\& I special case for dotted uppercase I and
412\& dotless lowercase i
413\& - If this mapping is included, the result is
414\& case-insensitive, but dotless and dotted I's
415\& are not distinguished
416\& - If this mapping is excluded, the result is not
417\& fully case-insensitive, but dotless and dotted
418\& I's are distinguished
419.Ve
420.PP
421If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned.
422.PP
423For more information about case mappings see
424http://www.unicode.org/unicode/reports/tr21/
425.Sh "casespec"
426.IX Subsection "casespec"
427.Vb 1
428\& use Unicode::UCD 'casespec';
429.Ve
430.PP
431.Vb 1
432\& my $casespec = casespec("09dc");
433.Ve
434.PP
435The \fIcasespec()\fR returns the potentially locale-dependent case mapping
436of the character specified by a \fBcode point argument\fR. The mapping
437may change the length of the string (which the basic Unicode case
438mappings as returned by \fIcharinfo()\fR never do).
439.PP
440If there is a case mapping for that character, a reference to a hash
441with the following fields is returned:
442.PP
443.Vb 1
444\& key
445.Ve
446.PP
447.Vb 5
448\& code code point with at least four hexdigits
449\& lower lowercase
450\& title titlecase
451\& upper uppercase
452\& condition condition list (may be undef)
453.Ve
454.PP
455The \f(CW\*(C`condition\*(C'\fR is optional. Where present, it consists of one or
456more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
457used to separate elements, spaces are to be ignored). A condition
458list overrides the normal behavior if all of the listed conditions are
459true. Case distinctions in the condition list are not significant.
460Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition.
461.PP
462Note that when there are multiple case mapping definitions for a
463single code point because of different locales, the value returned by
464\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
465hash references as described above as the values.
466.PP
467A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
468followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
469by a \*(L"_\*(R" and a variant code). For the lists of those codes,
470see Locale::Country and Locale::Language.
471.PP
472A \fIcontext\fR is one of the following choices:
473.PP
474.Vb 4
475\& FINAL The letter is not followed by a letter of
476\& general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
477\& MODERN The mapping is only used for modern text
478\& AFTER_i The last base character was "i" (U+0069)
479.Ve
480.PP
481For more information about case mappings see
482http://www.unicode.org/unicode/reports/tr21/
483.Sh "Unicode::UCD::UnicodeVersion"
484.IX Subsection "Unicode::UCD::UnicodeVersion"
485\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
486Character Database, in other words, the version of the Unicode
487standard the database implements. The version is a string
488of numbers delimited by dots (\f(CW'.'\fR).
489.Sh "Implementation Note"
490.IX Subsection "Implementation Note"
491The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
492Character Database (the database is included in the Perl distribution).
493The filehandle is then kept open for further queries. In other words,
494if you are wondering where one of your filehandles went, that's where.
495.SH "BUGS"
496.IX Header "BUGS"
497Does not yet support \s-1EBCDIC\s0 platforms.
498.SH "AUTHOR"
499.IX Header "AUTHOR"
500Jarkko Hietaniemi