Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v9 / man / man3 / Unicode::UCD.3
CommitLineData
920dae64
AT
1.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Unicode::UCD 3"
132.TH Unicode::UCD 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
133.SH "NAME"
134Unicode::UCD \- Unicode character database
135.SH "SYNOPSIS"
136.IX Header "SYNOPSIS"
137.Vb 2
138\& use Unicode::UCD 'charinfo';
139\& my $charinfo = charinfo($codepoint);
140.Ve
141.PP
142.Vb 2
143\& use Unicode::UCD 'charblock';
144\& my $charblock = charblock($codepoint);
145.Ve
146.PP
147.Vb 2
148\& use Unicode::UCD 'charscript';
149\& my $charscript = charscript($codepoint);
150.Ve
151.PP
152.Vb 2
153\& use Unicode::UCD 'charblocks';
154\& my $charblocks = charblocks();
155.Ve
156.PP
157.Vb 2
158\& use Unicode::UCD 'charscripts';
159\& my %charscripts = charscripts();
160.Ve
161.PP
162.Vb 3
163\& use Unicode::UCD qw(charscript charinrange);
164\& my $range = charscript($script);
165\& print "looks like $script\en" if charinrange($range, $codepoint);
166.Ve
167.PP
168.Vb 2
169\& use Unicode::UCD 'compexcl';
170\& my $compexcl = compexcl($codepoint);
171.Ve
172.PP
173.Vb 2
174\& use Unicode::UCD 'namedseq';
175\& my $namedseq = namedseq($named_sequence_name);
176.Ve
177.PP
178.Vb 1
179\& my $unicode_version = Unicode::UCD::UnicodeVersion();
180.Ve
181.SH "DESCRIPTION"
182.IX Header "DESCRIPTION"
183The Unicode::UCD module offers a simple interface to the Unicode
184Character Database.
185.Sh "charinfo"
186.IX Subsection "charinfo"
187.Vb 1
188\& use Unicode::UCD 'charinfo';
189.Ve
190.PP
191.Vb 1
192\& my $charinfo = charinfo(0x41);
193.Ve
194.PP
195\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
196as defined by the Unicode standard:
197.PP
198.Vb 1
199\& key
200.Ve
201.PP
202.Vb 15
203\& code code point with at least four hexdigits
204\& name name of the character IN UPPER CASE
205\& category general category of the character
206\& combining classes used in the Canonical Ordering Algorithm
207\& bidi bidirectional category
208\& decomposition character decomposition mapping
209\& decimal if decimal digit this is the integer numeric value
210\& digit if digit this is the numeric value
211\& numeric if numeric this is the integer or rational numeric value
212\& mirrored if mirrored in bidirectional text
213\& unicode10 Unicode 1.0 name if existed and different
214\& comment ISO 10646 comment field
215\& upper uppercase equivalent mapping
216\& lower lowercase equivalent mapping
217\& title titlecase equivalent mapping
218.Ve
219.PP
220.Vb 2
221\& block block the character belongs to (used in \ep{In...})
222\& script script the character belongs to
223.Ve
224.PP
225If no match is found, a reference to an empty hash is returned.
226.PP
227The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR. It is
228not defined in the Unicode Character Database proper (Chapter 4 of the
229Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
230(Chapter 14 of \s-1TUS3\s0). Similarly for the \f(CW\*(C`script\*(C'\fR property.
231.PP
232Note that you cannot do (de)composition and casing based solely on the
233above \f(CW\*(C`decomposition\*(C'\fR and \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, \f(CW\*(C`title\*(C'\fR properties;
234you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
235.Sh "charblock"
236.IX Subsection "charblock"
237.Vb 1
238\& use Unicode::UCD 'charblock';
239.Ve
240.PP
241.Vb 4
242\& my $charblock = charblock(0x41);
243\& my $charblock = charblock(1234);
244\& my $charblock = charblock("0x263a");
245\& my $charblock = charblock("U+263a");
246.Ve
247.PP
248.Vb 1
249\& my $range = charblock('Armenian');
250.Ve
251.PP
252With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
253belongs to, e.g. \f(CW\*(C`Basic Latin\*(C'\fR. Note that not all the character
254positions within all blocks are defined.
255.PP
256See also \*(L"Blocks versus Scripts\*(R".
257.PP
258If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
259to do the opposite and interpret the argument as a character block. The
260return value is a \fIrange\fR: an anonymous list of lists that contain
261\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether
262a code point is in a range using the \*(L"charinrange\*(R" function. If the
263argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned.
264.Sh "charscript"
265.IX Subsection "charscript"
266.Vb 1
267\& use Unicode::UCD 'charscript';
268.Ve
269.PP
270.Vb 3
271\& my $charscript = charscript(0x41);
272\& my $charscript = charscript(1234);
273\& my $charscript = charscript("U+263a");
274.Ve
275.PP
276.Vb 1
277\& my $range = charscript('Thai');
278.Ve
279.PP
280With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
281character belongs to, e.g. \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
282.PP
283See also \*(L"Blocks versus Scripts\*(R".
284.PP
285If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
286to do the opposite and interpret the argument as a character script. The
287return value is a \fIrange\fR: an anonymous list of lists that contain
288\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
289code point is in a range using the \*(L"charinrange\*(R" function. If the
290argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned.
291.Sh "charblocks"
292.IX Subsection "charblocks"
293.Vb 1
294\& use Unicode::UCD 'charblocks';
295.Ve
296.PP
297.Vb 1
298\& my $charblocks = charblocks();
299.Ve
300.PP
301\&\fIcharblocks()\fR returns a reference to a hash with the known block names
302as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values.
303.PP
304See also \*(L"Blocks versus Scripts\*(R".
305.Sh "charscripts"
306.IX Subsection "charscripts"
307.Vb 1
308\& use Unicode::UCD 'charscripts';
309.Ve
310.PP
311.Vb 1
312\& my %charscripts = charscripts();
313.Ve
314.PP
315\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
316and the code point ranges (see \*(L"charscript\*(R") as the values.
317.PP
318See also \*(L"Blocks versus Scripts\*(R".
319.Sh "Blocks versus Scripts"
320.IX Subsection "Blocks versus Scripts"
321The difference between a block and a script is that scripts are closer
322to the linguistic notion of a set of characters required to present
323languages, while a block is more of an artifact of the Unicode character
324numbering and separation into blocks of (mostly) 256 characters.
325.PP
326For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
327as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
328\&\f(CW\*(C`Latin Extended\-B\*(C'\fR. On the other hand, the Latin script does not
329contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
330the \s-1ASCII\s0): it includes only the letters, and not, for example, the digits
331or the punctuation.
332.PP
333For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
334.PP
335For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
336.Sh "Matching Scripts and Blocks"
337.IX Subsection "Matching Scripts and Blocks"
338Scripts are matched with the regular-expression construct
339\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
340while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches
341any of the 256 code points in the Tibetan block).
342.Sh "Code Point Arguments"
343.IX Subsection "Code Point Arguments"
344A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
345designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals
346designating a Unicode character. In other words, if you want a code
347point to be interpreted as a hexadecimal number, you must prefix it
348with either \f(CW\*(C`0x\*(C'\fR or \f(CW\*(C`U+\*(C'\fR, because a string such as \f(CW123\fR will
349be interpreted as a decimal code point. Also note that Unicode is
350\&\fBnot\fR limited to 16 bits (the number of Unicode characters is
351open\-ended, in theory unlimited): you may have more than 4 hexdigits.
352.Sh "charinrange"
353.IX Subsection "charinrange"
354In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you
355can also test whether a code point is in the \fIrange\fR as returned by
356\&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned
357by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR:
358.PP
359.Vb 1
360\& use Unicode::UCD qw(charscript charinrange);
361.Ve
362.PP
363.Vb 2
364\& $range = charscript('Hiragana');
365\& print "looks like hiragana\en" if charinrange($range, $codepoint);
366.Ve
367.Sh "compexcl"
368.IX Subsection "compexcl"
369.Vb 1
370\& use Unicode::UCD 'compexcl';
371.Ve
372.PP
373.Vb 1
374\& my $compexcl = compexcl("09dc");
375.Ve
376.PP
377The \fIcompexcl()\fR returns the composition exclusion (that is, if the
378character should not be produced during a precomposition) of the
379character specified by a \fBcode point argument\fR.
380.PP
381If there is a composition exclusion for the character, true is
382returned. Otherwise, false is returned.
383.Sh "casefold"
384.IX Subsection "casefold"
385.Vb 1
386\& use Unicode::UCD 'casefold';
387.Ve
388.PP
389.Vb 1
390\& my $casefold = casefold("00DF");
391.Ve
392.PP
393The \fIcasefold()\fR returns the locale-independent case folding of the
394character specified by a \fBcode point argument\fR.
395.PP
396If there is a case folding for that character, a reference to a hash
397with the following fields is returned:
398.PP
399.Vb 1
400\& key
401.Ve
402.PP
403.Vb 3
404\& code code point with at least four hexdigits
405\& status "C", "F", "S", or "I"
406\& mapping one or more codes separated by spaces
407.Ve
408.PP
409The meaning of the \fIstatus\fR is as follows:
410.PP
411.Vb 15
412\& C common case folding, common mappings shared
413\& by both simple and full mappings
414\& F full case folding, mappings that cause strings
415\& to grow in length. Multiple characters are separated
416\& by spaces
417\& S simple case folding, mappings to single characters
418\& where different from F
419\& I special case for dotted uppercase I and
420\& dotless lowercase i
421\& - If this mapping is included, the result is
422\& case-insensitive, but dotless and dotted I's
423\& are not distinguished
424\& - If this mapping is excluded, the result is not
425\& fully case-insensitive, but dotless and dotted
426\& I's are distinguished
427.Ve
428.PP
429If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned.
430.PP
431For more information about case mappings see
432http://www.unicode.org/unicode/reports/tr21/
433.Sh "casespec"
434.IX Subsection "casespec"
435.Vb 1
436\& use Unicode::UCD 'casespec';
437.Ve
438.PP
439.Vb 1
440\& my $casespec = casespec("FB00");
441.Ve
442.PP
443The \fIcasespec()\fR returns the potentially locale-dependent case mapping
444of the character specified by a \fBcode point argument\fR. The mapping
445may change the length of the string (which the basic Unicode case
446mappings as returned by \fIcharinfo()\fR never do).
447.PP
448If there is such a case mapping for that character, a reference to a hash
449with the following fields is returned:
450.PP
451.Vb 1
452\& key
453.Ve
454.PP
455.Vb 5
456\& code code point with at least four hexdigits
457\& lower lowercase
458\& title titlecase
459\& upper uppercase
460\& condition condition list (may be undef)
461.Ve
462.PP
463The \f(CW\*(C`condition\*(C'\fR is optional. Where present, it consists of one or
464more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
465used to separate elements, spaces are to be ignored). A condition
466list overrides the normal behavior if all of the listed conditions are
467true. Case distinctions in the condition list are not significant.
468Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition.
469.PP
470Note that when there are multiple case mapping definitions for a
471single code point because of different locales, the value returned by
472\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
473hash references as described above as the values.
474.PP
475A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
476followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
477by a \*(L"_\*(R" and a variant code). For lists of those codes,
478see Locale::Country and Locale::Language.
479.PP
480A \fIcontext\fR is one of the following choices:
481.PP
482.Vb 4
483\& FINAL The letter is not followed by a letter of
484\& general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
485\& MODERN The mapping is only used for modern text
486\& AFTER_i The last base character was "i" (U+0069)
487.Ve
488.PP
489For more information about case mappings see
490http://www.unicode.org/unicode/reports/tr21/
491.Sh "\fInamedseq()\fP"
492.IX Subsection "namedseq()"
493.Vb 1
494\& use Unicode::UCD 'namedseq';
495.Ve
496.PP
497.Vb 3
498\& my $namedseq = namedseq("KATAKANA LETTER AINU P");
499\& my @namedseq = namedseq("KATAKANA LETTER AINU P");
500\& my %namedseq = namedseq();
501.Ve
502.PP
503If used with a single argument in a scalar context, returns the string
504consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no
505named sequence by that name exists. If used with a single argument in
506a list context, returns a list of the code points. If used with no
507arguments in a list context, returns a hash with the names of the
508named sequences as the keys and the named sequences as strings as
509the values. Otherwise, returns \f(CW\*(C`undef\*(C'\fR or empty list depending
510on the context.
511.PP
512(New from Unicode 4.1.0)
513.Sh "Unicode::UCD::UnicodeVersion"
514.IX Subsection "Unicode::UCD::UnicodeVersion"
515\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
516Character Database, in other words, the version of the Unicode
517standard the database implements. The version is a string
518of numbers delimited by dots (\f(CW'.'\fR).
519.Sh "Implementation Note"
520.IX Subsection "Implementation Note"
521The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
522Character Database (the database is included in the Perl distribution).
523The filehandle is then kept open for further queries. In other words,
524if you are wondering where one of your filehandles went, that's where.
525.SH "BUGS"
526.IX Header "BUGS"
527Does not yet support \s-1EBCDIC\s0 platforms.
528.SH "AUTHOR"
529.IX Header "AUTHOR"
530Jarkko Hietaniemi