sam-t2/devtools/amd64/man/man3/Unicode::UCD.3

.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sh \" Subsection heading
.br
.if t .Sp
.ne 5
.PP
\fB\\$1\fR
.PP
..
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  | will give a
.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
.\" expand to `' in nroff, nothing in troff, for use with C<>.
.tr \(*W-|\(bv\*(Tr
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.if \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.\"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.hy 0
.if n .na
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Unicode::UCD 3"
.TH Unicode::UCD 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
.SH "NAME"
Unicode::UCD \- Unicode character database
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 2
\&    use Unicode::UCD 'charinfo';
\&    my $charinfo   = charinfo($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblock';
\&    my $charblock  = charblock($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscript';
\&    my $charscript = charscript($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charblocks';
\&    my $charblocks = charblocks();
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'charscripts';
\&    my %charscripts = charscripts();
.Ve
.PP
.Vb 3
\&    use Unicode::UCD qw(charscript charinrange);
\&    my $range = charscript($script);
\&    print "looks like $script\en" if charinrange($range, $codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'compexcl';
\&    my $compexcl = compexcl($codepoint);
.Ve
.PP
.Vb 2
\&    use Unicode::UCD 'namedseq';
\&    my $namedseq = namedseq($named_sequence_name);
.Ve
.PP
.Vb 1
\&    my $unicode_version = Unicode::UCD::UnicodeVersion();
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The Unicode::UCD module offers a simple interface to the Unicode
Character Database.
.Sh "charinfo"
.IX Subsection "charinfo"
.Vb 1
\&    use Unicode::UCD 'charinfo';
.Ve
.PP
.Vb 1
\&    my $charinfo = charinfo(0x41);
.Ve
.PP
\&\fIcharinfo()\fR returns a reference to a hash that has the following fields
as defined by the Unicode standard:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 15
\&    code             code point with at least four hexdigits
\&    name             name of the character IN UPPER CASE
\&    category         general category of the character
\&    combining        classes used in the Canonical Ordering Algorithm
\&    bidi             bidirectional category
\&    decomposition    character decomposition mapping
\&    decimal          if decimal digit this is the integer numeric value
\&    digit            if digit this is the numeric value
\&    numeric          if numeric this is the integer or rational numeric value
\&    mirrored         if mirrored in bidirectional text
\&    unicode10        Unicode 1.0 name if existed and different
\&    comment          ISO 10646 comment field
\&    upper            uppercase equivalent mapping
\&    lower            lowercase equivalent mapping
\&    title            titlecase equivalent mapping
.Ve
.PP
.Vb 2
\&    block            block the character belongs to (used in \ep{In...})
\&    script           script the character belongs to
.Ve
.PP
If no match is found, a reference to an empty hash is returned.
.PP
The \f(CW\*(C`block\*(C'\fR property is the same as returned by \fIcharblock()\fR.  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka \s-1TUS3\s0) but instead in an auxiliary database
(Chapter 14 of \s-1TUS3\s0).  Similarly for the \f(CW\*(C`script\*(C'\fR property.
.PP
Note that you cannot do (de)composition and casing based solely on the
above \f(CW\*(C`decomposition\*(C'\fR, \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, and \f(CW\*(C`title\*(C'\fR properties;
you will also need the \fIcompexcl()\fR, \fIcasefold()\fR, and \fIcasespec()\fR functions.
.Sh "charblock"
.IX Subsection "charblock"
.Vb 1
\&    use Unicode::UCD 'charblock';
.Ve
.PP
.Vb 4
\&    my $charblock = charblock(0x41);
\&    my $charblock = charblock(1234);
\&    my $charblock = charblock("0x263a");
\&    my $charblock = charblock("U+263a");
.Ve
.PP
.Vb 1
\&    my $range     = charblock('Armenian');
.Ve
.PP
With a \fBcode point argument\fR \fIcharblock()\fR returns the \fIblock\fR the character
belongs to, e.g.  \f(CW\*(C`Basic Latin\*(C'\fR.  Note that not all the character
positions within all blocks are defined.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharblock()\fR tries
to do the opposite and interpret the argument as a character block. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether
a code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character block, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charscript"
.IX Subsection "charscript"
.Vb 1
\&    use Unicode::UCD 'charscript';
.Ve
.PP
.Vb 3
\&    my $charscript = charscript(0x41);
\&    my $charscript = charscript(1234);
\&    my $charscript = charscript("U+263a");
.Ve
.PP
.Vb 1
\&    my $range      = charscript('Thai');
.Ve
.PP
With a \fBcode point argument\fR \fIcharscript()\fR returns the \fIscript\fR the
character belongs to, e.g.  \f(CW\*(C`Latin\*(C'\fR, \f(CW\*(C`Greek\*(C'\fR, \f(CW\*(C`Han\*(C'\fR.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.PP
If supplied with an argument that can't be a code point, \fIcharscript()\fR tries
to do the opposite and interpret the argument as a character script. The
return value is a \fIrange\fR: an anonymous list of lists that contain
\&\fIstart-of-range\fR, \fIend-of-range\fR code point pairs. You can test whether a
code point is in a range using the \*(L"charinrange\*(R" function. If the
argument is not a known character script, \f(CW\*(C`undef\*(C'\fR is returned.
.Sh "charblocks"
.IX Subsection "charblocks"
.Vb 1
\&    use Unicode::UCD 'charblocks';
.Ve
.PP
.Vb 1
\&    my $charblocks = charblocks();
.Ve
.PP
\&\fIcharblocks()\fR returns a reference to a hash with the known block names
as the keys, and the code point ranges (see \*(L"charblock\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "charscripts"
.IX Subsection "charscripts"
.Vb 1
\&    use Unicode::UCD 'charscripts';
.Ve
.PP
.Vb 1
\&    my %charscripts = charscripts();
.Ve
.PP
\&\fIcharscripts()\fR returns a hash with the known script names as the keys,
and the code point ranges (see \*(L"charscript\*(R") as the values.
.PP
See also \*(L"Blocks versus Scripts\*(R".
.Sh "Blocks versus Scripts"
.IX Subsection "Blocks versus Scripts"
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while a block is more of an artifact of the Unicode character
numbering and separation into blocks of (mostly) 256 characters.
.PP
For example the Latin \fBscript\fR is spread over several \fBblocks\fR, such
as \f(CW\*(C`Basic Latin\*(C'\fR, \f(CW\*(C`Latin 1 Supplement\*(C'\fR, \f(CW\*(C`Latin Extended\-A\*(C'\fR, and
\&\f(CW\*(C`Latin Extended\-B\*(C'\fR.  On the other hand, the Latin script does not
contain all the characters of the \f(CW\*(C`Basic Latin\*(C'\fR block (also known as
\&\s-1ASCII\s0): it includes only the letters, and not, for example, the digits
or the punctuation.
.PP
For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
.PP
For scripts see \s-1UTR\s0 #24: http://www.unicode.org/unicode/reports/tr24/
.Sh "Matching Scripts and Blocks"
.IX Subsection "Matching Scripts and Blocks"
Scripts are matched with the regular-expression construct
\&\f(CW\*(C`\ep{...}\*(C'\fR (e.g. \f(CW\*(C`\ep{Tibetan}\*(C'\fR matches characters of the Tibetan script),
while \f(CW\*(C`\ep{In...}\*(C'\fR is used for blocks (e.g. \f(CW\*(C`\ep{InTibetan}\*(C'\fR matches
any of the 256 code points in the Tibetan block).
.Sh "Code Point Arguments"
.IX Subsection "Code Point Arguments"
A \fIcode point argument\fR is either a decimal or a hexadecimal scalar
designating a Unicode character, or \f(CW\*(C`U+\*(C'\fR followed by hexadecimals
designating a Unicode character.  In other words, if you want a code
point to be interpreted as a hexadecimal number, you must prefix it
with either \f(CW\*(C`0x\*(C'\fR or \f(CW\*(C`U+\*(C'\fR, because a string such as \f(CW123\fR will
be interpreted as a decimal code point.  Also note that Unicode is
\&\fBnot\fR limited to 16 bits (the number of Unicode characters is
open\-ended, in theory unlimited): you may have more than 4 hexdigits.
.Sh "charinrange"
.IX Subsection "charinrange"
In addition to using the \f(CW\*(C`\ep{In...}\*(C'\fR and \f(CW\*(C`\eP{In...}\*(C'\fR constructs, you
can also test whether a code point is in the \fIrange\fR as returned by
\&\*(L"charblock\*(R" and \*(L"charscript\*(R" or as the values of the hash returned
by \*(L"charblocks\*(R" and \*(L"charscripts\*(R" by using \fIcharinrange()\fR:
.PP
.Vb 1
\&    use Unicode::UCD qw(charscript charinrange);
.Ve
.PP
.Vb 2
\&    $range = charscript('Hiragana');
\&    print "looks like hiragana\en" if charinrange($range, $codepoint);
.Ve
.Sh "compexcl"
.IX Subsection "compexcl"
.Vb 1
\&    use Unicode::UCD 'compexcl';
.Ve
.PP
.Vb 1
\&    my $compexcl = compexcl("09dc");
.Ve
.PP
\&\fIcompexcl()\fR returns the composition exclusion status (that is, whether
the character should not be produced during a precomposition) of the
character specified by a \fBcode point argument\fR.
.PP
If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.
.Sh "casefold"
.IX Subsection "casefold"
.Vb 1
\&    use Unicode::UCD 'casefold';
.Ve
.PP
.Vb 1
\&    my $casefold = casefold("00DF");
.Ve
.PP
\&\fIcasefold()\fR returns the locale-independent case folding of the
character specified by a \fBcode point argument\fR.
.PP
If there is a case folding for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 3
\&    code             code point with at least four hexdigits
\&    status           "C", "F", "S", or "I"
\&    mapping          one or more codes separated by spaces
.Ve
.PP
The meaning of the \fIstatus\fR is as follows:
.PP
.Vb 15
\&   C                 common case folding, common mappings shared
\&                     by both simple and full mappings
\&   F                 full case folding, mappings that cause strings
\&                     to grow in length. Multiple characters are separated
\&                     by spaces
\&   S                 simple case folding, mappings to single characters
\&                     where different from F
\&   I                 special case for dotted uppercase I and
\&                     dotless lowercase i
\&                     - If this mapping is included, the result is
\&                       case-insensitive, but dotless and dotted I's
\&                       are not distinguished
\&                     - If this mapping is excluded, the result is not
\&                       fully case-insensitive, but dotless and dotted
\&                       I's are distinguished
.Ve
.PP
If there is no case folding for that character, \f(CW\*(C`undef\*(C'\fR is returned.
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "casespec"
.IX Subsection "casespec"
.Vb 1
\&    use Unicode::UCD 'casespec';
.Ve
.PP
.Vb 1
\&    my $casespec = casespec("FB00");
.Ve
.PP
\&\fIcasespec()\fR returns the potentially locale-dependent case mapping
of the character specified by a \fBcode point argument\fR.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by \fIcharinfo()\fR never do).
.PP
If there is a case mapping for that character, a reference to a hash
with the following fields is returned:
.PP
.Vb 1
\&    key
.Ve
.PP
.Vb 5
\&    code             code point with at least four hexdigits
\&    lower            lowercase
\&    title            titlecase
\&    upper            uppercase
\&    condition        condition list (may be undef)
.Ve
.PP
The \f(CW\*(C`condition\*(C'\fR is optional.  Where present, it consists of one or
more \fIlocales\fR or \fIcontexts\fR, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by \*(L"\s-1NON_\s0\*(R" represent the negation of the condition.
.PP
Note that when there are multiple case mapping definitions for a
single code point because of different locales, the value returned by
\&\fIcasespec()\fR is a hash reference which has the locales as the keys and
hash references as described above as the values.
.PP
A \fIlocale\fR is defined as a 2\-letter \s-1ISO\s0 3166 country code, possibly
followed by a \*(L"_\*(R" and a 2\-letter \s-1ISO\s0 language code (possibly followed
by a \*(L"_\*(R" and a variant code).  For lists of those codes,
see Locale::Country and Locale::Language.
.PP
A \fIcontext\fR is one of the following choices:
.PP
.Vb 4
\&    FINAL            The letter is not followed by a letter of
\&                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
\&    MODERN           The mapping is only used for modern text
\&    AFTER_i          The last base character was "i" (U+0069)
.Ve
.PP
For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/
.Sh "\fInamedseq()\fP"
.IX Subsection "namedseq()"
.Vb 1
\&    use Unicode::UCD 'namedseq';
.Ve
.PP
.Vb 3
\&    my $namedseq = namedseq("KATAKANA LETTER AINU P");
\&    my @namedseq = namedseq("KATAKANA LETTER AINU P");
\&    my %namedseq = namedseq();
.Ve
.PP
If used with a single argument in a scalar context, returns the string
consisting of the code points of the named sequence, or \f(CW\*(C`undef\*(C'\fR if no
named sequence by that name exists.  If used with a single argument in
a list context, returns a list of the code points.  If used with no
arguments in a list context, returns a hash with the names of the
named sequences as the keys and the named sequences as strings as
the values.  Otherwise, returns \f(CW\*(C`undef\*(C'\fR or empty list depending
on the context.
.PP
(New from Unicode 4.1.0)
.Sh "Unicode::UCD::UnicodeVersion"
.IX Subsection "Unicode::UCD::UnicodeVersion"
\&\fIUnicode::UCD::UnicodeVersion()\fR returns the version of the Unicode
Character Database, in other words, the version of the Unicode
standard the database implements.  The version is a string
of numbers delimited by dots (\f(CW'.'\fR).
.Sh "Implementation Note"
.IX Subsection "Implementation Note"
The first use of \fIcharinfo()\fR opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.  In other words,
if you are wondering where one of your filehandles went, that's where.
.SH "BUGS"
.IX Header "BUGS"
Does not yet support \s-1EBCDIC\s0 platforms.
.SH "AUTHOR"
.IX Header "AUTHOR"
Jarkko Hietaniemi