[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Unicode::UCD.3

.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  | will give a
.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.if \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.\"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Unicode::UCD 3"
.TH Unicode::UCD 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
.SH "NAME"
Unicode::UCD \- Unicode character database
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\&    use Unicode::UCD 'charinfo';
\&    my $charinfo   = charinfo($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblock';
\&    my $charblock  = charblock($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscript';
\&    my $charscript = charscript($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblocks';
\&    my $charblocks = charblocks();
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscripts';
\&    my %charscripts = charscripts();
.Ve
.PP
.Vb 3
\&    use Unicode::UCD qw(charscript charinrange);
\&    my $range = charscript($script);
\&    print "looks like $script\en" if charinrange($range, $codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'compexcl';
\&    my $compexcl = compexcl($codepoint);
.Ve
.PP
.Vb 1
\&    my $unicode_version = Unicode::UCD::UnicodeVersion();
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The Unicode::UCD module offers a simple interface to the Unicode
Character Database.
.Sh "charinfo"
.IX Subsection "charinfo"
.Vb 1
\&    use Unicode::UCD 'charinfo';
.Ve
.PP
.Vb 1
\&    my $charinfo = charinfo(0x41);
.Ve
.PP
\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
as defined by the Unicode standard:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 15
\&    code             code point with at least four hexdigits
\&    name             name of the character IN UPPER CASE
\&    category         general category of the character
\&    combining        classes used in the Canonical Ordering Algorithm
\&    bidi             bidirectional category
\&    decomposition    character decomposition mapping
\&    decimal          if decimal digit this is the integer numeric value
\&    digit            if digit this is the numeric value
\&    numeric          if numeric this is the integer or rational numeric value
\&    mirrored         if mirrored in bidirectional text
\&    unicode10        Unicode 1.0 name if existed and different
\&    comment          ISO 10646 comment field
\&    upper            uppercase equivalent mapping
\&    lower            lowercase equivalent mapping
\&    title            titlecase equivalent mapping
.Ve
.PP
.Vb 2
\&    block            block the character belongs to (used in \ep{In...})
\&    script           script the character belongs to
.Ve
.PP
If no match is found, a reference to an empty hash is returned.
.PP
The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR.  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
(Chapter 14 of \s-1TUS3\s0).  Similarly for the \f(CW\*(C`script\*(C'\fR property.
.PP
Note that you cannot do (de)composition and casing based solely on the
above \f(CW\*(C`decomposition\*(C'\fR and \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, \f(CW\*(C`title\*(C'\fR properties;
you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
.Sh "charblock"
.IX Subsection "charblock"
.Vb 1
\&    use Unicode::UCD 'charblock';
.Ve
.PP
.Vb 4
\&    my $charblock = charblock(0x41);
\&    my $charblock = charblock(1234);
\&    my $charblock = charblock("0x263a");
\&    my $charblock = charblock("U+263a");
.Ve
.PP
.Vb 1
\&    my $range     = charblock('Armenian');
.Ve
.PP
With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
belongs to, e.g.  \f(CW\*(C`Basic Latin\*(C'\fR.  Note that not all the character
positions within all blocks are defined.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
to do the opposite and interpret the argument as a character block. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charscript"
.IX Subsection "charscript"
.Vb 1
\&    use Unicode::UCD 'charscript';
.Ve
.PP
.Vb 3
\&    my $charscript = charscript(0x41);
\&    my $charscript = charscript(1234);
\&    my $charscript = charscript("U+263a");
.Ve
.PP
.Vb 1
\&    my $range      = charscript('Thai');
.Ve
.PP
With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
character belongs to, e.g.  \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
to do the opposite and interpret the argument as a character script. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charblocks"
.IX Subsection "charblocks"
.Vb 1
\&    use Unicode::UCD 'charblocks';
.Ve
.PP
.Vb 1
\&    my $charblocks = charblocks();
.Ve
.PP
\&\fIcharblocks()\fR returns a reference to a hash with the known block names
as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "charscripts"
.IX Subsection "charscripts"
.Vb 1
\&    use Unicode::UCD 'charscripts';
.Ve
.PP
.Vb 1
\&    my %charscripts = charscripts();
.Ve
.PP
\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
and the code point ranges (see \*(L"charscript\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "Blocks versus Scripts"
.IX Subsection "Blocks versus Scripts"
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of (mostly) 256 characters.
.PP
For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
\&\f(CW\*(C`Latin Extended\-B\*(C'\fR.  On the other hand, the Latin script does not
contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
the \s-1ASCII\s0): it includes only the letters, and not, for example, the digits
or the punctuation.
.PP
For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
.PP
For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
.Sh "Matching Scripts and Blocks"
.IX Subsection "Matching Scripts and Blocks"
Scripts are matched with the regular-expression construct
\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches
any of the 256 code points in the Tibetan block).
.Sh "Code Point Arguments"
.IX Subsection "Code Point Arguments"
A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals
designating a Unicode character.  Note that Unicode is \fBnot\fR limited
to 16 bits (the number of Unicode characters is open\-ended, in theory
unlimited): you may have more than 4 hexdigits.
.Sh "charinrange"
.IX Subsection "charinrange"
In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you
can also test whether a code point is in the \fIrange\fR as returned by
\&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned
by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR:
.PP
.Vb 1
\&    use Unicode::UCD qw(charscript charinrange);
.Ve
.PP
.Vb 2
\&    $range = charscript('Hiragana');
\&    print "looks like hiragana\en" if charinrange($range, $codepoint);
.Ve
.Sh "compexcl"
.IX Subsection "compexcl"
.Vb 1
\&    use Unicode::UCD 'compexcl';
.Ve
.PP
.Vb 1
\&    my $compexcl = compexcl("09dc");
.Ve
.PP
The \fIcompexcl()\fR returns the composition exclusion (that is, if the
character should not be produced during a precomposition) of the 
character specified by a \fBcode point argument\fR.
.PP
If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.
.Sh "casefold"
.IX Subsection "casefold"
.Vb 1
\&    use Unicode::UCD 'casefold';
.Ve
.PP
.Vb 1
\&    my $casefold = casefold("09dc");
.Ve
.PP
The \fIcasefold()\fR returns the locale-independent case folding of the
character specified by a \fBcode point argument\fR.
.PP
If there is a case folding for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 3
\&    code             code point with at least four hexdigits
\&    status           "C", "F", "S", or "I"
\&    mapping          one or more codes separated by spaces
.Ve
.PP
The meaning of the \fIstatus\fR is as follows:
.PP
.Vb 15
\&   C                 common case folding, common mappings shared
\&                     by both simple and full mappings
\&   F                 full case folding, mappings that cause strings
\&                     to grow in length. Multiple characters are separated
\&                     by spaces
\&   S                 simple case folding, mappings to single characters
\&                     where different from F
\&   I                 special case for dotted uppercase I and
\&                     dotless lowercase i
\&                     - If this mapping is included, the result is
\&                       case-insensitive, but dotless and dotted I's
\&                       are not distinguished
\&                     - If this mapping is excluded, the result is not
\&                       fully case-insensitive, but dotless and dotted
\&                       I's are distinguished
.Ve
.PP
If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned.
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "casespec"
.IX Subsection "casespec"
.Vb 1
\&    use Unicode::UCD 'casespec';
.Ve
.PP
.Vb 1
\&    my $casespec = casespec("09dc");
.Ve
.PP
The \fIcasespec()\fR returns the potentially locale-dependent case mapping
of the character specified by a \fBcode point argument\fR.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by \fIcharinfo()\fR never do).
.PP
If there is a case mapping for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 5
\&    code             code point with at least four hexdigits
\&    lower            lowercase
\&    title            titlecase
\&    upper            uppercase
\&    condition        condition list (may be undef)
.Ve
.PP
The \f(CW\*(C`condition\*(C'\fR is optional.  Where present, it consists of one or
more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition.
.PP
Note that when there are multiple case mapping definitions for a
single code point because of different locales, the value returned by
\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
hash references as described above as the values.
.PP
A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
by a \*(L"_\*(R" and a variant code).  For the lists of those codes,
see Locale::Country and Locale::Language.
.PP
A \fIcontext\fR is one of the following choices:
.PP
.Vb 4
\&    FINAL            The letter is not followed by a letter of
\&                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
\&    MODERN           The mapping is only used for modern text
\&    AFTER_i          The last base character was "i" (U+0069)
.Ve
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "Unicode::UCD::UnicodeVersion"
.IX Subsection "Unicode::UCD::UnicodeVersion"
\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
Character Database, in other words, the version of the Unicode
standard the database implements.  The version is a string
of numbers delimited by dots (\f(CW'.'\fR).
.Sh "Implementation Note"
.IX Subsection "Implementation Note"
The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.  In other words,
if you are wondering where one of your filehandles went, that's where.
.SH "BUGS"
.IX Header "BUGS"
Does not yet support \s-1EBCDIC\s0 platforms.
.SH "AUTHOR"
.IX Header "AUTHOR"
Jarkko Hietaniemi
Commit	Line	Data
86530b38 AT	1	.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \(PI will give pi, \(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. \| will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \(C` and \(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(W-\|\(bv\(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(W\h'-12u'\(W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(W\h'-12u'\(W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \\|\(em\\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
65	.hy 0
66	.if n .na
67	.\"
68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69	.\" Fear. Run. Save yourself. No user-serviceable parts.
70	. \" fudge factors for nroff and troff
71	.if n \{\
72	. ds #H 0
73	. ds #V .8m
74	. ds #F .3m
75	. ds #[ \f1
76	. ds #] \fP
77	.\}
78	.if t \{\
79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80	. ds #V .6m
81	. ds #F 0
82	. ds #[ \&
83	. ds #] \&
84	.\}
85	. \" simple accents for nroff and troff
86	.if n \{\
87	. ds ' \&
88	. ds ` \&
89	. ds ^ \&
90	. ds , \&
91	. ds ~ ~
92	. ds /
93	.\}
94	.if t \{\
95	. ds ' \\k:\h'-(\\n(.wu8/10-\(#H)'\'\h"\|\\n:u"
96	. ds ` \\k:\h'-(\\n(.wu8/10-\(#H)'\`\h'\|\\n:u'
97	. ds ^ \\k:\h'-(\\n(.wu10/11-\(#H)'^\h'\|\\n:u'
98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'\|\\n:u'
99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'\|\\n:u'
100	. ds / \\k:\h'-(\\n(.wu8/10-\(#H)'\z\(sl\h'\|\\n:u'
101	.\}
102	. \" troff and (daisy-wheel) nroff accents
103	.ds : \\k:\h'-(\\n(.wu8/10-\(#H+.1m+\(#F)'\v'-\(#V'\z.\h'.2m+\(#F'.\h'\|\\n:u'\v'\(#V'
104	.ds 8 \h'\(#H'\(b\h'-\*(#H'
105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\(#H)/2u'\v'-.3n'\(#[\z\(de\v'.3n'\h'\|\\n:u'\*(#]
106	.ds d- \h'\(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\(#H'
107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'\|\\n:u'
108	.ds th \(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u2/3)'\s-1o\s+1\*(#]
109	.ds Th \(#[\s+2I\s-2\h'-\w'I'u3/5'\v'-.3m'o\v'.3m'\*(#]
110	.ds ae a\h'-(\w'a'u*4/10)'e
111	.ds Ae A\h'-(\w'A'u*4/10)'E
112	. \" corrections for vroff
113	.if v .ds ~ \\k:\h'-(\\n(.wu9/10-\(#H)'\s-2\u~\d\s+2\h'\|\\n:u'
114	.if v .ds ^ \\k:\h'-(\\n(.wu10/11-\(#H)'\v'-.4m'^\v'.4m'\h'\|\\n:u'
115	. \" for low resolution devices (crt and lpr)
116	.if \n(.H>23 .if \n(.V>19 \
117	\{\
118	. ds : e
119	. ds 8 ss
120	. ds o a
121	. ds d- d\h'-1'\(ga
122	. ds D- D\h'-1'\(hy
123	. ds th \o'bp'
124	. ds Th \o'LP'
125	. ds ae ae
126	. ds Ae AE
127	.\}
128	.rm #[ #] #H #V #F C
129	.\" ========================================================================
130	.\"
131	.IX Title "Unicode::UCD 3"
132	.TH Unicode::UCD 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133	.SH "NAME"
134	Unicode::UCD \- Unicode character database
135	.SH "SYNOPSIS"
136	.IX Header "SYNOPSIS"
137	.Vb 2
138	\& use Unicode::UCD 'charinfo';
139	\& my $charinfo = charinfo($codepoint);
140	.Ve
141	.PP
142	.Vb 2
143	\& use Unicode::UCD 'charblock';
144	\& my $charblock = charblock($codepoint);
145	.Ve
146	.PP
147	.Vb 2
148	\& use Unicode::UCD 'charscript';
149	\& my $charscript = charblock($codepoint);
150	.Ve
151	.PP
152	.Vb 2
153	\& use Unicode::UCD 'charblocks';
154	\& my $charblocks = charblocks();
155	.Ve
156	.PP
157	.Vb 2
158	\& use Unicode::UCD 'charscripts';
159	\& my %charscripts = charscripts();
160	.Ve
161	.PP
162	.Vb 3
163	\& use Unicode::UCD qw(charscript charinrange);
164	\& my $range = charscript($script);
165	\& print "looks like $script\en" if charinrange($range, $codepoint);
166	.Ve
167	.PP
168	.Vb 2
169	\& use Unicode::UCD 'compexcl';
170	\& my $compexcl = compexcl($codepoint);
171	.Ve
172	.PP
173	.Vb 1
174	\& my $unicode_version = Unicode::UCD::UnicodeVersion();
175	.Ve
176	.SH "DESCRIPTION"
177	.IX Header "DESCRIPTION"
178	The Unicode::UCD module offers a simple interface to the Unicode
179	Character Database.
180	.Sh "charinfo"
181	.IX Subsection "charinfo"
182	.Vb 1
183	\& use Unicode::UCD 'charinfo';
184	.Ve
185	.PP
186	.Vb 1
187	\& my $charinfo = charinfo(0x41);
188	.Ve
189	.PP
190	\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
191	as defined by the Unicode standard:
192	.PP
193	.Vb 1
194	\& key
195	.Ve
196	.PP
197	.Vb 15
198	\& code code point with at least four hexdigits
199	\& name name of the character IN UPPER CASE
200	\& category general category of the character
201	\& combining classes used in the Canonical Ordering Algorithm
202	\& bidi bidirectional category
203	\& decomposition character decomposition mapping
204	\& decimal if decimal digit this is the integer numeric value
205	\& digit if digit this is the numeric value
206	\& numeric if numeric is the integer or rational numeric value
207	\& mirrored if mirrored in bidirectional text
208	\& unicode10 Unicode 1.0 name if existed and different
209	\& comment ISO 10646 comment field
210	\& upper uppercase equivalent mapping
211	\& lower lowercase equivalent mapping
212	\& title titlecase equivalent mapping
213	.Ve
214	.PP
215	.Vb 2
216	\& block block the character belongs to (used in \ep{In...})
217	\& script script the character belongs to
218	.Ve
219	.PP
220	If no match is found, a reference to an empty hash is returned.
221	.PP
222	The \f(CW\(C`block\(C'\fR property is the same as returned by \fIcharinfo()\fR. It is
223	not defined in the Unicode Character Database proper (Chapter 4 of the
224	Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
225	(Chapter 14 of \s-1TUS3\s0). Similarly for the \f(CW\(C`script\(C'\fR property.
226	.PP
227	Note that you cannot do (de)composition and casing based solely on the
228	above \f(CW\(C`decomposition\(C'\fR and \f(CW\(C`lower\(C'\fR, \f(CW\(C`upper\(C'\fR, \f(CW\(C`title\(C'\fR, properties,
229	you will need also the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
230	.Sh "charblock"
231	.IX Subsection "charblock"
232	.Vb 1
233	\& use Unicode::UCD 'charblock';
234	.Ve
235	.PP
236	.Vb 4
237	\& my $charblock = charblock(0x41);
238	\& my $charblock = charblock(1234);
239	\& my $charblock = charblock("0x263a");
240	\& my $charblock = charblock("U+263a");
241	.Ve
242	.PP
243	.Vb 1
244	\& my $range = charblock('Armenian');
245	.Ve
246	.PP
247	With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
248	belongs to, e.g. \f(CW\(C`Basic Latin\(C'\fR. Note that not all the character
249	positions within all blocks are defined.
250	.PP
251	See also \(L"Blocks versus Scripts\(R".
252	.PP
253	If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
254	to do the opposite and interpret the argument as a character block. The
255	return value is a \fIrange\fR: an anonymous list of lists that contain
256	\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
257	code point is in a range using the \(L"charinrange\(R" function. If the
258	argument is not a known charater block, \f(CW\(C`undef\(C'\fR is returned.
259	.Sh "charscript"
260	.IX Subsection "charscript"
261	.Vb 1
262	\& use Unicode::UCD 'charscript';
263	.Ve
264	.PP
265	.Vb 3
266	\& my $charscript = charscript(0x41);
267	\& my $charscript = charscript(1234);
268	\& my $charscript = charscript("U+263a");
269	.Ve
270	.PP
271	.Vb 1
272	\& my $range = charscript('Thai');
273	.Ve
274	.PP
275	With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
276	character belongs to, e.g. \f(CW\(C`Latin\(C'\fR, \f(CW\(C`Greek\(C'\fR, \f(CW\(C`Han\(C'\fR.
277	.PP
278	See also \(L"Blocks versus Scripts\(R".
279	.PP
280	If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
281	to do the opposite and interpret the argument as a character script. The
282	return value is a \fIrange\fR: an anonymous list of lists that contain
283	\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
284	code point is in a range using the \(L"charinrange\(R" function. If the
285	argument is not a known charater script, \f(CW\(C`undef\(C'\fR is returned.
286	.Sh "charblocks"
287	.IX Subsection "charblocks"
288	.Vb 1
289	\& use Unicode::UCD 'charblocks';
290	.Ve
291	.PP
292	.Vb 1
293	\& my $charblocks = charblocks();
294	.Ve
295	.PP
296	\&\fIcharblocks()\fR returns a reference to a hash with the known block names
297	as the keys, and the code point ranges (see \(L"charblock\(R") as the values.
298	.PP
299	See also \(L"Blocks versus Scripts\(R".
300	.Sh "charscripts"
301	.IX Subsection "charscripts"
302	.Vb 1
303	\& use Unicode::UCD 'charscripts';
304	.Ve
305	.PP
306	.Vb 1
307	\& my %charscripts = charscripts();
308	.Ve
309	.PP
310	\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
311	and the code point ranges (see \(L"charscript\(R") as the values.
312	.PP
313	See also \(L"Blocks versus Scripts\(R".
314	.Sh "Blocks versus Scripts"
315	.IX Subsection "Blocks versus Scripts"
316	The difference between a block and a script is that scripts are closer
317	to the linguistic notion of a set of characters required to present
318	languages, while block is more of an artifact of the Unicode character
319	numbering and separation into blocks of (mostly) 256 characters.
320	.PP
321	For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
322	as \f(CW\(C`Basic Latin\(C'\fR, \f(CW\(C`Latin 1 Supplement\(C'\fR, \f(CW\(C`Latin Extended\-A\(C'\fR, and
323	\&\f(CW\(C`Latin Extended\-B\(C'\fR. On the other hand, the Latin script does not
324	contain all the characters of the \f(CW\(C`Basic Latin\(C'\fR block (also known as
325	the \s-1ASCII\s0): it includes only the letters, and not, for example, the digits
326	or the punctuation.
327	.PP
328	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
329	.PP
330	For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
331	.Sh "Matching Scripts and Blocks"
332	.IX Subsection "Matching Scripts and Blocks"
333	Scripts are matched with the regular-expression construct
334	\&\f(CW\(C`\ep{...}\(C'\fR (e.g. \f(CW\(C`\ep{Tibetan}\(C'\fR matches characters of the Tibetan script),
335	while \f(CW\(C`\ep{In...}\(C'\fR is used for blocks (e.g. \f(CW\(C`\ep{InTibetan}\(C'\fR matches
336	any of the 256 code points in the Tibetan block).
337	.Sh "Code Point Arguments"
338	.IX Subsection "Code Point Arguments"
339	A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
340	designating a Unicode character, or \f(CW\(C`U+\(C'\fR followed by hexadecimals
341	designating a Unicode character. Note that Unicode is \fBnot\fR limited
342	to 16 bits (the number of Unicode characters is open\-ended, in theory
343	unlimited): you may have more than 4 hexdigits.
344	.Sh "charinrange"
345	.IX Subsection "charinrange"
346	In addition to using the \f(CW\(C`\ep{In...}\(C'\fR and \f(CW\(C`\eP{In...}\(C'\fR constructs, you
347	can also test whether a code point is in the \fIrange\fR as returned by
348	\&\(L"charblock\(R" and \(L"charscript\(R" or as the values of the hash returned
349	by \(L"charblocks\(R" and \(L"charscripts\(R" by using \fIcharinrange()\fR:
350	.PP
351	.Vb 1
352	\& use Unicode::UCD qw(charscript charinrange);
353	.Ve
354	.PP
355	.Vb 2
356	\& $range = charscript('Hiragana');
357	\& print "looks like hiragana\en" if charinrange($range, $codepoint);
358	.Ve
359	.Sh "compexcl"
360	.IX Subsection "compexcl"
361	.Vb 1
362	\& use Unicode::UCD 'compexcl';
363	.Ve
364	.PP
365	.Vb 1
366	\& my $compexcl = compexcl("09dc");
367	.Ve
368	.PP
369	The \fIcompexcl()\fR returns the composition exclusion (that is, if the
370	character should not be produced during a precomposition) of the
371	character specified by a \fBcode point argument\fR.
372	.PP
373	If there is a composition exclusion for the character, true is
374	returned. Otherwise, false is returned.
375	.Sh "casefold"
376	.IX Subsection "casefold"
377	.Vb 1
378	\& use Unicode::UCD 'casefold';
379	.Ve
380	.PP
381	.Vb 1
382	\& my %casefold = casefold("09dc");
383	.Ve
384	.PP
385	The \fIcasefold()\fR returns the locale-independent case folding of the
386	character specified by a \fBcode point argument\fR.
387	.PP
388	If there is a case folding for that character, a reference to a hash
389	with the following fields is returned:
390	.PP
391	.Vb 1
392	\& key
393	.Ve
394	.PP
395	.Vb 3
396	\& code code point with at least four hexdigits
397	\& status "C", "F", "S", or "I"
398	\& mapping one or more codes separated by spaces
399	.Ve
400	.PP
401	The meaning of the \fIstatus\fR is as follows:
402	.PP
403	.Vb 15
404	\& C common case folding, common mappings shared
405	\& by both simple and full mappings
406	\& F full case folding, mappings that cause strings
407	\& to grow in length. Multiple characters are separated
408	\& by spaces
409	\& S simple case folding, mappings to single characters
410	\& where different from F
411	\& I special case for dotted uppercase I and
412	\& dotless lowercase i
413	\& - If this mapping is included, the result is
414	\& case-insensitive, but dotless and dotted I's
415	\& are not distinguished
416	\& - If this mapping is excluded, the result is not
417	\& fully case-insensitive, but dotless and dotted
418	\& I's are distinguished
419	.Ve
420	.PP
421	If there is no case folding for that character, \f(CW\(C`undef\(C'\fR is returned.
422	.PP
423	For more information about case mappings see
424	http://www.unicode.org/unicode/reports/tr21/
425	.Sh "casespec"
426	.IX Subsection "casespec"
427	.Vb 1
428	\& use Unicode::UCD 'casespec';
429	.Ve
430	.PP
431	.Vb 1
432	\& my %casespec = casespec("09dc");
433	.Ve
434	.PP
435	The \fIcasespec()\fR returns the potentially locale-dependent case mapping
436	of the character specified by a \fBcode point argument\fR. The mapping
437	may change the length of the string (which the basic Unicode case
438	mappings as returned by \fIcharinfo()\fR never do).
439	.PP
440	If there is a case folding for that character, a reference to a hash
441	with the following fields is returned:
442	.PP
443	.Vb 1
444	\& key
445	.Ve
446	.PP
447	.Vb 5
448	\& code code point with at least four hexdigits
449	\& lower lowercase
450	\& title titlecase
451	\& upper uppercase
452	\& condition condition list (may be undef)
453	.Ve
454	.PP
455	The \f(CW\(C`condition\(C'\fR is optional. Where present, it consists of one or
456	more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
457	used to separate elements, spaces are to be ignored). A condition
458	list overrides the normal behavior if all of the listed conditions are
459	true. Case distinctions in the condition list are not significant.
460	Conditions preceded by \(L"\s-1NON_\s0\(R" represent the negation of the condition
461	.PP
462	Note that when there are multiple case folding definitions for a
463	single code point because of different locales, the value returned by
464	\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
465	hash references as described above as the values.
466	.PP
467	A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
468	followed by a \(L"_\(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
469	by a \(L"_\(R" and a variant code). You can find the lists of those codes,
470	see Locale::Country and Locale::Language.
471	.PP
472	A \fIcontext\fR is one of the following choices:
473	.PP
474	.Vb 4
475	\& FINAL The letter is not followed by a letter of
476	\& general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
477	\& MODERN The mapping is only used for modern text
478	\& AFTER_i The last base character was "i" (U+0069)
479	.Ve
480	.PP
481	For more information about case mappings see
482	http://www.unicode.org/unicode/reports/tr21/
483	.Sh "Unicode::UCD::UnicodeVersion"
484	.IX Subsection "Unicode::UCD::UnicodeVersion"
485	\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
486	Character Database, in other words, the version of the Unicode
487	standard the database implements. The version is a string
488	of numbers delimited by dots (\f(CW'.'\fR).
489	.Sh "Implementation Note"
490	.IX Subsection "Implementation Note"
491	The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
492	Character Database (the database is included in the Perl distribution).
493	The filehandle is then kept open for further queries. In other words,
494	if you are wondering where one of your filehandles went, that's where.
495	.SH "BUGS"
496	.IX Header "BUGS"
497	Does not yet support \s-1EBCDIC\s0 platforms.
498	.SH "AUTHOR"
499	.IX Header "AUTHOR"
500	Jarkko Hietaniemi