[OpenSPARC-T2-SAM] / sam-t2 / devtools / v9 / man / man3 / Unicode::UCD.3

.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  | will give a
.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.if \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.\"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Unicode::UCD 3"
.TH Unicode::UCD 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
.SH "NAME"
Unicode::UCD \- Unicode character database
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\&    use Unicode::UCD 'charinfo';
\&    my $charinfo   = charinfo($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblock';
\&    my $charblock  = charblock($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscript';
\&    my $charscript = charscript($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblocks';
\&    my $charblocks = charblocks();
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscripts';
\&    my %charscripts = charscripts();
.Ve
.PP
.Vb 3
\&    use Unicode::UCD qw(charscript charinrange);
\&    my $range = charscript($script);
\&    print "looks like $script\en" if charinrange($range, $codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'compexcl';
\&    my $compexcl = compexcl($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'namedseq';
\&    my $namedseq = namedseq($named_sequence_name);
.Ve
.PP
.Vb 1
\&    my $unicode_version = Unicode::UCD::UnicodeVersion();
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The Unicode::UCD module offers a simple interface to the Unicode
Character Database.
.Sh "charinfo"
.IX Subsection "charinfo"
.Vb 1
\&    use Unicode::UCD 'charinfo';
.Ve
.PP
.Vb 1
\&    my $charinfo = charinfo(0x41);
.Ve
.PP
\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
as defined by the Unicode standard:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 15
\&    code             code point with at least four hexdigits
\&    name             name of the character IN UPPER CASE
\&    category         general category of the character
\&    combining        classes used in the Canonical Ordering Algorithm
\&    bidi             bidirectional category
\&    decomposition    character decomposition mapping
\&    decimal          if decimal digit this is the integer numeric value
\&    digit            if digit this is the numeric value
\&    numeric          if numeric this is the integer or rational numeric value
\&    mirrored         if mirrored in bidirectional text
\&    unicode10        Unicode 1.0 name if existed and different
\&    comment          ISO 10646 comment field
\&    upper            uppercase equivalent mapping
\&    lower            lowercase equivalent mapping
\&    title            titlecase equivalent mapping
.Ve
.PP
.Vb 2
\&    block            block the character belongs to (used in \ep{In...})
\&    script           script the character belongs to
.Ve
.PP
If no match is found, a reference to an empty hash is returned.
.PP
The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR.  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
(Chapter 14 of \s-1TUS3\s0).  Similarly for the \f(CW\*(C`script\*(C'\fR property.
.PP
Note that you cannot do (de)composition and casing based solely on the
above \f(CW\*(C`decomposition\*(C'\fR and \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, \f(CW\*(C`title\*(C'\fR properties;
you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
.Sh "charblock"
.IX Subsection "charblock"
.Vb 1
\&    use Unicode::UCD 'charblock';
.Ve
.PP
.Vb 4
\&    my $charblock = charblock(0x41);
\&    my $charblock = charblock(1234);
\&    my $charblock = charblock("0x263a");
\&    my $charblock = charblock("U+263a");
.Ve
.PP
.Vb 1
\&    my $range     = charblock('Armenian');
.Ve
.PP
With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
belongs to, e.g.  \f(CW\*(C`Basic Latin\*(C'\fR.  Note that not all the character
positions within all blocks are defined.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
to do the opposite and interpret the argument as a character block. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether
a code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charscript"
.IX Subsection "charscript"
.Vb 1
\&    use Unicode::UCD 'charscript';
.Ve
.PP
.Vb 3
\&    my $charscript = charscript(0x41);
\&    my $charscript = charscript(1234);
\&    my $charscript = charscript("U+263a");
.Ve
.PP
.Vb 1
\&    my $range      = charscript('Thai');
.Ve
.PP
With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
character belongs to, e.g.  \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
to do the opposite and interpret the argument as a character script. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charblocks"
.IX Subsection "charblocks"
.Vb 1
\&    use Unicode::UCD 'charblocks';
.Ve
.PP
.Vb 1
\&    my $charblocks = charblocks();
.Ve
.PP
\&\fIcharblocks()\fR returns a reference to a hash with the known block names
as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "charscripts"
.IX Subsection "charscripts"
.Vb 1
\&    use Unicode::UCD 'charscripts';
.Ve
.PP
.Vb 1
\&    my %charscripts = charscripts();
.Ve
.PP
\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
and the code point ranges (see \*(L"charscript\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "Blocks versus Scripts"
.IX Subsection "Blocks versus Scripts"
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of (mostly) 256 characters.
.PP
For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
\&\f(CW\*(C`Latin Extended\-B\*(C'\fR.  On the other hand, the Latin script does not
contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
\&\s-1ASCII\s0): it includes only the letters, and not, for example, the digits
or the punctuation.
.PP
For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
.PP
For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
.Sh "Matching Scripts and Blocks"
.IX Subsection "Matching Scripts and Blocks"
Scripts are matched with the regular-expression construct
\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches
any of the 256 code points in the Tibetan block).
.Sh "Code Point Arguments"
.IX Subsection "Code Point Arguments"
A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals
designating a Unicode character.  In other words, if you want a code
point to be interpreted as a hexadecimal number, you must prefix it
with either \f(CW\*(C`0x\*(C'\fR or \f(CW\*(C`U+\*(C'\fR, because a string like e.g. \f(CW123\fR will
be interpreted as a decimal code point.  Also note that Unicode is
\&\fBnot\fR limited to 16 bits (the number of Unicode characters is
open\-ended, in theory unlimited): you may have more than 4 hexdigits.
.Sh "charinrange"
.IX Subsection "charinrange"
In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you
can also test whether a code point is in the \fIrange\fR as returned by
\&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned
by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR:
.PP
.Vb 1
\&    use Unicode::UCD qw(charscript charinrange);
.Ve
.PP
.Vb 2
\&    $range = charscript('Hiragana');
\&    print "looks like hiragana\en" if charinrange($range, $codepoint);
.Ve
.Sh "compexcl"
.IX Subsection "compexcl"
.Vb 1
\&    use Unicode::UCD 'compexcl';
.Ve
.PP
.Vb 1
\&    my $compexcl = compexcl("09dc");
.Ve
.PP
The \fIcompexcl()\fR returns the composition exclusion (that is, if the
character should not be produced during a precomposition) of the 
character specified by a \fBcode point argument\fR.
.PP
If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.
.Sh "casefold"
.IX Subsection "casefold"
.Vb 1
\&    use Unicode::UCD 'casefold';
.Ve
.PP
.Vb 1
\&    my $casefold = casefold("00DF");
.Ve
.PP
The \fIcasefold()\fR returns the locale-independent case folding of the
character specified by a \fBcode point argument\fR.
.PP
If there is a case folding for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 3
\&    code             code point with at least four hexdigits
\&    status           "C", "F", "S", or "I"
\&    mapping          one or more codes separated by spaces
.Ve
.PP
The meaning of the \fIstatus\fR is as follows:
.PP
.Vb 15
\&   C                 common case folding, common mappings shared
\&                     by both simple and full mappings
\&   F                 full case folding, mappings that cause strings
\&                     to grow in length. Multiple characters are separated
\&                     by spaces
\&   S                 simple case folding, mappings to single characters
\&                     where different from F
\&   I                 special case for dotted uppercase I and
\&                     dotless lowercase i
\&                     - If this mapping is included, the result is
\&                       case-insensitive, but dotless and dotted I's
\&                       are not distinguished
\&                     - If this mapping is excluded, the result is not
\&                       fully case-insensitive, but dotless and dotted
\&                       I's are distinguished
.Ve
.PP
If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned.
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "casespec"
.IX Subsection "casespec"
.Vb 1
\&    use Unicode::UCD 'casespec';
.Ve
.PP
.Vb 1
\&    my $casespec = casespec("FB00");
.Ve
.PP
The \fIcasespec()\fR returns the potentially locale-dependent case mapping
of the character specified by a \fBcode point argument\fR.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by \fIcharinfo()\fR never do).
.PP
If there is a case mapping for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 5
\&    code             code point with at least four hexdigits
\&    lower            lowercase
\&    title            titlecase
\&    upper            uppercase
\&    condition        condition list (may be undef)
.Ve
.PP
The \f(CW\*(C`condition\*(C'\fR is optional.  Where present, it consists of one or
more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition.
.PP
Note that when there are multiple case folding definitions for a
single code point because of different locales, the value returned by
\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
hash references as described above as the values.
.PP
A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
by a \*(L"_\*(R" and a variant code).  You can find the lists of those codes
in Locale::Country and Locale::Language.
.PP
A \fIcontext\fR is one of the following choices:
.PP
.Vb 4
\&    FINAL            The letter is not followed by a letter of
\&                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
\&    MODERN           The mapping is only used for modern text
\&    AFTER_i          The last base character was "i" (U+0069)
.Ve
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "\fInamedseq()\fP"
.IX Subsection "namedseq()"
.Vb 1
\&    use Unicode::UCD 'namedseq';
.Ve
.PP
.Vb 3
\&    my $namedseq = namedseq("KATAKANA LETTER AINU P");
\&    my @namedseq = namedseq("KATAKANA LETTER AINU P");
\&    my %namedseq = namedseq();
.Ve
.PP
If used with a single argument in a scalar context, returns the string
consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no
named sequence by that name exists.  If used with a single argument in
a list context, returns list of the code points.  If used with no
arguments in a list context, returns a hash with the names of the
named sequences as the keys and the named sequences as strings as
the values.  Otherwise, returns \f(CW\*(C`undef\*(C'\fR or empty list depending
on the context.
.PP
(New from Unicode 4.1.0)
.Sh "Unicode::UCD::UnicodeVersion"
.IX Subsection "Unicode::UCD::UnicodeVersion"
\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
Character Database, in other words, the version of the Unicode
standard the database implements.  The version is a string
of numbers delimited by dots (\f(CW'.'\fR).
.Sh "Implementation Note"
.IX Subsection "Implementation Note"
The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.  In other words,
if you are wondering where one of your filehandles went, that's where.
.SH "BUGS"
.IX Header "BUGS"
Does not yet support \s-1EBCDIC\s0 platforms.
.SH "AUTHOR"
.IX Header "AUTHOR"
Jarkko Hietaniemi
Commit	Line	Data
920dae64 AT	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
65	.hy 0
66	.if n .na
67	.\"
68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69	.\" Fear. Run. Save yourself. No user-serviceable parts.
70	. \" fudge factors for nroff and troff
71	.if n \{\
72	. ds #H 0
73	. ds #V .8m
74	. ds #F .3m
75	. ds #[ \f1
76	. ds #] \fP
77	.\}
78	.if t \{\
79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80	. ds #V .6m
81	. ds #F 0
82	. ds #[ \&
83	. ds #] \&
84	.\}
85	. \" simple accents for nroff and troff
86	.if n \{\
87	. ds ' \&
88	. ds ` \&
89	. ds ^ \&
90	. ds , \&
91	. ds ~ ~
92	. ds /
93	.\}
94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101	.\}
102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110	.ds ae a\h'-(\w'a'u*4/10)'e
111	.ds Ae A\h'-(\w'A'u*4/10)'E
112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115	. \" for low resolution devices (crt and lpr)
116	.if \n(.H>23 .if \n(.V>19 \
117	\{\
118	. ds : e
119	. ds 8 ss
120	. ds o a
121	. ds d- d\h'-1'\(ga
122	. ds D- D\h'-1'\(hy
123	. ds th \o'bp'
124	. ds Th \o'LP'
125	. ds ae ae
126	. ds Ae AE
127	.\}
128	.rm #[ #] #H #V #F C
129	.\" ========================================================================
130	.\"
131	.IX Title "Unicode::UCD 3"
132	.TH Unicode::UCD 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
133	.SH "NAME"
134	Unicode::UCD \- Unicode character database
135	.SH "SYNOPSIS"
136	.IX Header "SYNOPSIS"
137	.Vb 2
138	\& use Unicode::UCD 'charinfo';
139	\& my $charinfo = charinfo($codepoint);
140	.Ve
141	.PP
142	.Vb 2
143	\& use Unicode::UCD 'charblock';
144	\& my $charblock = charblock($codepoint);
145	.Ve
146	.PP
147	.Vb 2
148	\& use Unicode::UCD 'charscript';
149	\& my $charscript = charscript($codepoint);
150	.Ve
151	.PP
152	.Vb 2
153	\& use Unicode::UCD 'charblocks';
154	\& my $charblocks = charblocks();
155	.Ve
156	.PP
157	.Vb 2
158	\& use Unicode::UCD 'charscripts';
159	\& my %charscripts = charscripts();
160	.Ve
161	.PP
162	.Vb 3
163	\& use Unicode::UCD qw(charscript charinrange);
164	\& my $range = charscript($script);
165	\& print "looks like $script\en" if charinrange($range, $codepoint);
166	.Ve
167	.PP
168	.Vb 2
169	\& use Unicode::UCD 'compexcl';
170	\& my $compexcl = compexcl($codepoint);
171	.Ve
172	.PP
173	.Vb 2
174	\& use Unicode::UCD 'namedseq';
175	\& my $namedseq = namedseq($named_sequence_name);
176	.Ve
177	.PP
178	.Vb 1
179	\& my $unicode_version = Unicode::UCD::UnicodeVersion();
180	.Ve
181	.SH "DESCRIPTION"
182	.IX Header "DESCRIPTION"
183	The Unicode::UCD module offers a simple interface to the Unicode
184	Character Database.
185	.Sh "charinfo"
186	.IX Subsection "charinfo"
187	.Vb 1
188	\& use Unicode::UCD 'charinfo';
189	.Ve
190	.PP
191	.Vb 1
192	\& my $charinfo = charinfo(0x41);
193	.Ve
194	.PP
195	\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
196	as defined by the Unicode standard:
197	.PP
198	.Vb 1
199	\& key
200	.Ve
201	.PP
202	.Vb 15
203	\& code code point with at least four hexdigits
204	\& name name of the character IN UPPER CASE
205	\& category general category of the character
206	\& combining classes used in the Canonical Ordering Algorithm
207	\& bidi bidirectional category
208	\& decomposition character decomposition mapping
209	\& decimal if decimal digit this is the integer numeric value
210	\& digit if digit this is the numeric value
	211	\& numeric if numeric this is the integer or rational numeric value
212	\& mirrored if mirrored in bidirectional text
213	\& unicode10 Unicode 1.0 name if existed and different
214	\& comment ISO 10646 comment field
215	\& upper uppercase equivalent mapping
216	\& lower lowercase equivalent mapping
217	\& title titlecase equivalent mapping
218	.Ve
219	.PP
220	.Vb 2
221	\& block block the character belongs to (used in \ep{In...})
222	\& script script the character belongs to
223	.Ve
224	.PP
225	If no match is found, a reference to an empty hash is returned.
226	.PP
	227	The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR. It is
228	not defined in the Unicode Character Database proper (Chapter 4 of the
229	Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
230	(Chapter 14 of \s-1TUS3\s0). Similarly for the \f(CW\(C`script\(C'\fR property.
231	.PP
232	Note that you cannot do (de)composition and casing based solely on the
233	above \f(CW\(C`decomposition\(C'\fR and \f(CW\(C`lower\(C'\fR, \f(CW\(C`upper\(C'\fR, \f(CW\(C`title\(C'\fR, properties,
234	you will need also the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
235	.Sh "charblock"
236	.IX Subsection "charblock"
237	.Vb 1
238	\& use Unicode::UCD 'charblock';
239	.Ve
240	.PP
241	.Vb 4
242	\& my $charblock = charblock(0x41);
243	\& my $charblock = charblock(1234);
244	\& my $charblock = charblock("0x263a");
245	\& my $charblock = charblock("U+263a");
246	.Ve
247	.PP
248	.Vb 1
249	\& my $range = charblock('Armenian');
250	.Ve
251	.PP
252	With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
253	belongs to, e.g. \f(CW\(C`Basic Latin\(C'\fR. Note that not all the character
254	positions within all blocks are defined.
255	.PP
256	See also \(L"Blocks versus Scripts\(R".
257	.PP
258	If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
259	to do the opposite and interpret the argument as a character block. The
260	return value is a \fIrange\fR: an anonymous list of lists that contain
261	\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether
262	a code point is in a range using the \(L"charinrange\(R" function. If the
263	argument is not a known character block, \f(CW\(C`undef\(C'\fR is returned.
264	.Sh "charscript"
265	.IX Subsection "charscript"
266	.Vb 1
267	\& use Unicode::UCD 'charscript';
268	.Ve
269	.PP
270	.Vb 3
271	\& my $charscript = charscript(0x41);
272	\& my $charscript = charscript(1234);
273	\& my $charscript = charscript("U+263a");
274	.Ve
275	.PP
276	.Vb 1
277	\& my $range = charscript('Thai');
278	.Ve
279	.PP
280	With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
281	character belongs to, e.g. \f(CW\(C`Latin\(C'\fR, \f(CW\(C`Greek\(C'\fR, \f(CW\(C`Han\(C'\fR.
282	.PP
283	See also \(L"Blocks versus Scripts\(R".
284	.PP
285	If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
286	to do the opposite and interpret the argument as a character script. The
287	return value is a \fIrange\fR: an anonymous list of lists that contain
288	\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
289	code point is in a range using the \(L"charinrange\(R" function. If the
290	argument is not a known character script, \f(CW\(C`undef\(C'\fR is returned.
291	.Sh "charblocks"
292	.IX Subsection "charblocks"
293	.Vb 1
294	\& use Unicode::UCD 'charblocks';
295	.Ve
296	.PP
297	.Vb 1
298	\& my $charblocks = charblocks();
299	.Ve
300	.PP
301	\&\fIcharblocks()\fR returns a reference to a hash with the known block names
302	as the keys, and the code point ranges (see \(L"charblock\(R") as the values.
303	.PP
304	See also \(L"Blocks versus Scripts\(R".
305	.Sh "charscripts"
306	.IX Subsection "charscripts"
307	.Vb 1
308	\& use Unicode::UCD 'charscripts';
309	.Ve
310	.PP
311	.Vb 1
312	\& my %charscripts = charscripts();
313	.Ve
314	.PP
315	\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
316	and the code point ranges (see \(L"charscript\(R") as the values.
317	.PP
318	See also \(L"Blocks versus Scripts\(R".
319	.Sh "Blocks versus Scripts"
320	.IX Subsection "Blocks versus Scripts"
321	The difference between a block and a script is that scripts are closer
322	to the linguistic notion of a set of characters required to present
323	languages, while block is more of an artifact of the Unicode character
324	numbering and separation into blocks of (mostly) 256 characters.
325	.PP
326	For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
327	as \f(CW\(C`Basic Latin\(C'\fR, \f(CW\(C`Latin 1 Supplement\(C'\fR, \f(CW\(C`Latin Extended\-A\(C'\fR, and
328	\&\f(CW\(C`Latin Extended\-B\(C'\fR. On the other hand, the Latin script does not
329	contain all the characters of the \f(CW\(C`Basic Latin\(C'\fR block (also known as
330	the \s-1ASCII\s0): it includes only the letters, and not, for example, the digits
331	or the punctuation.
332	.PP
333	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
334	.PP
335	For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
336	.Sh "Matching Scripts and Blocks"
337	.IX Subsection "Matching Scripts and Blocks"
338	Scripts are matched with the regular-expression construct
339	\&\f(CW\(C`\ep{...}\(C'\fR (e.g. \f(CW\(C`\ep{Tibetan}\(C'\fR matches characters of the Tibetan script),
340	while \f(CW\(C`\ep{In...}\(C'\fR is used for blocks (e.g. \f(CW\(C`\ep{InTibetan}\(C'\fR matches
341	any of the 256 code points in the Tibetan block).
342	.Sh "Code Point Arguments"
343	.IX Subsection "Code Point Arguments"
344	A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
345	designating a Unicode character, or \f(CW\(C`U+\(C'\fR followed by hexadecimals
346	designating a Unicode character. In other words, if you want a code
347	point to be interpreted as a hexadecimal number, you must prefix it
348	with either \f(CW\(C`0x\(C'\fR or \f(CW\(C`U+\(C'\fR, because a string like e.g. \f(CW123\fR will
349	be interpreted as a decimal code point. Also note that Unicode is
350	\&\fBnot\fR limited to 16 bits (the number of Unicode characters is
351	open\-ended, in theory unlimited): you may have more than 4 hexdigits.
352	.Sh "charinrange"
353	.IX Subsection "charinrange"
354	In addition to using the \f(CW\(C`\ep{In...}\(C'\fR and \f(CW\(C`\eP{In...}\(C'\fR constructs, you
355	can also test whether a code point is in the \fIrange\fR as returned by
356	\&\(L"charblock\(R" and \(L"charscript\(R" or as the values of the hash returned
357	by \(L"charblocks\(R" and \(L"charscripts\(R" by using \fIcharinrange()\fR:
358	.PP
359	.Vb 1
360	\& use Unicode::UCD qw(charscript charinrange);
361	.Ve
362	.PP
363	.Vb 2
364	\& $range = charscript('Hiragana');
365	\& print "looks like hiragana\en" if charinrange($range, $codepoint);
366	.Ve
367	.Sh "compexcl"
368	.IX Subsection "compexcl"
369	.Vb 1
370	\& use Unicode::UCD 'compexcl';
371	.Ve
372	.PP
373	.Vb 1
374	\& my $compexcl = compexcl("09dc");
375	.Ve
376	.PP
377	The \fIcompexcl()\fR returns the composition exclusion (that is, if the
378	character should not be produced during a precomposition) of the
379	character specified by a \fBcode point argument\fR.
380	.PP
381	If there is a composition exclusion for the character, true is
382	returned. Otherwise, false is returned.
383	.Sh "casefold"
384	.IX Subsection "casefold"
385	.Vb 1
386	\& use Unicode::UCD 'casefold';
387	.Ve
388	.PP
389	.Vb 1
390	\& my $casefold = casefold("00DF");
391	.Ve
392	.PP
393	\&\fIcasefold()\fR returns the locale-independent case folding of the
394	character specified by a \fBcode point argument\fR.
395	.PP
396	If there is a case folding for that character, a reference to a hash
397	with the following fields is returned:
398	.PP
399	.Vb 1
400	\& key
401	.Ve
402	.PP
403	.Vb 3
404	\& code      code point with at least four hexdigits
405	\& status    "C", "F", "S", or "I"
406	\& mapping   one or more codes separated by spaces
407	.Ve
408	.PP
409	The meaning of the \fIstatus\fR is as follows:
410	.PP
411	.Vb 15
412	\& C   common case folding, common mappings shared
413	\&     by both simple and full mappings
414	\& F   full case folding, mappings that cause strings
415	\&     to grow in length. Multiple characters are separated
416	\&     by spaces
417	\& S   simple case folding, mappings to single characters
418	\&     where different from F
419	\& I   special case for dotted uppercase I and
420	\&     dotless lowercase i
421	\&     - If this mapping is included, the result is
422	\&       case-insensitive, but dotless and dotted I's
423	\&       are not distinguished
424	\&     - If this mapping is excluded, the result is not
425	\&       fully case-insensitive, but dotless and dotted
426	\&       I's are distinguished
427	.Ve
428	.PP
429	If there is no case folding for that character, \f(CW\(C`undef\(C'\fR is returned.
430	.PP
431	For more information about case mappings see
432	http://www.unicode.org/unicode/reports/tr21/
433	.Sh "casespec"
434	.IX Subsection "casespec"
435	.Vb 1
436	\& use Unicode::UCD 'casespec';
437	.Ve
438	.PP
439	.Vb 1
440	\& my $casespec = casespec("FB00");
441	.Ve
442	.PP
443	\&\fIcasespec()\fR returns the potentially locale-dependent case mapping
444	of the character specified by a \fBcode point argument\fR. The mapping
445	may change the length of the string (which the basic Unicode case
446	mappings as returned by \fIcharinfo()\fR never do).
447	.PP
448	If there is a case mapping for that character, a reference to a hash
449	with the following fields is returned:
450	.PP
451	.Vb 1
452	\& key
453	.Ve
454	.PP
455	.Vb 5
456	\& code        code point with at least four hexdigits
457	\& lower       lowercase
458	\& title       titlecase
459	\& upper       uppercase
460	\& condition   condition list (may be undef)
461	.Ve
462	.PP
463	The \f(CW\(C`condition\(C'\fR is optional. Where present, it consists of one or
464	more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
465	used to separate elements, spaces are to be ignored). A condition
466	list overrides the normal behavior if all of the listed conditions are
467	true. Case distinctions in the condition list are not significant.
468	Conditions preceded by \(L"\s-1NON_\s0\(R" represent the negation of the condition.
469	.PP
470	Note that when there are multiple case mapping definitions for a
471	single code point because of different locales, the value returned by
472	\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
473	hash references as described above as the values.
474	.PP
475	A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
476	followed by a \(L"_\(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
477	by a \(L"_\(R" and a variant code). For the lists of those codes,
478	see Locale::Country and Locale::Language.
479	.PP
480	A \fIcontext\fR is one of the following choices:
481	.PP
482	.Vb 4
483	\& FINAL     The letter is not followed by a letter of
484	\&           general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
485	\& MODERN    The mapping is only used for modern text
486	\& AFTER_i   The last base character was "i" (U+0069)
487	.Ve
488	.PP
489	For more information about case mappings see
490	http://www.unicode.org/unicode/reports/tr21/
491	.Sh "\fInamedseq()\fP"
492	.IX Subsection "namedseq()"
493	.Vb 1
494	\& use Unicode::UCD 'namedseq';
495	.Ve
496	.PP
497	.Vb 3
498	\& my $namedseq = namedseq("KATAKANA LETTER AINU P");
499	\& my @namedseq = namedseq("KATAKANA LETTER AINU P");
500	\& my %namedseq = namedseq();
501	.Ve
502	.PP
503	If used with a single argument in a scalar context, returns the string
504	consisting of the code points of the named sequence, or \f(CW\(C`undef\(C'\fR if no
505	named sequence by that name exists. If used with a single argument in
506	a list context, returns a list of the code points. If used with no
507	arguments in a list context, returns a hash with the names of the
508	named sequences as the keys and the named sequences as strings as
509	the values. Otherwise, returns \f(CW\(C`undef\(C'\fR or an empty list depending
510	on the context.
511	.PP
512	(New from Unicode 4.1.0)
513	.Sh "Unicode::UCD::UnicodeVersion"
514	.IX Subsection "Unicode::UCD::UnicodeVersion"
515	\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
516	Character Database, in other words, the version of the Unicode
517	standard the database implements. The version is a string
518	of numbers delimited by dots (\f(CW'.'\fR).
519	.Sh "Implementation Note"
520	.IX Subsection "Implementation Note"
521	The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
522	Character Database (the database is included in the Perl distribution).
523	The filehandle is then kept open for further queries. In other words,
524	if you are wondering where one of your filehandles went, that's where.
525	.SH "BUGS"
526	.IX Header "BUGS"
527	Does not yet support \s-1EBCDIC\s0 platforms.
528	.SH "AUTHOR"
529	.IX Header "AUTHOR"
530	Jarkko Hietaniemi