[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / 5.8.0 / Unicode / UCD.pm

package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.2';

require Exporter;

our @ISA = qw(Exporter);

our @EXPORT_OK = qw(charinfo
		    charblock charscript
		    charblocks charscripts
		    charinrange
		    compexcl
		    casefold casespec);

use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my $charinfo   = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock  = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charblock($codepoint);

    use Unicode::UCD 'charblocks';
    my $charblocks = charblocks();

    use Unicode::UCD 'charscripts';
    my %charscripts = charscripts();

    use Unicode::UCD qw(charscript charinrange);
    my $range = charscript($script);
    print "looks like $script\n" if charinrange($range, $codepoint);

    use Unicode::UCD 'compexcl';
    my $compexcl = compexcl($codepoint);

    my $unicode_version = Unicode::UCD::UnicodeVersion();

=head1 DESCRIPTION

The Unicode::UCD module offers a simple interface to the Unicode
Character Database.

=cut

my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;

sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
	for my $d (@INC) {
	    use File::Spec;
	    $f = File::Spec->catfile($d, "unicore", @path);
	    last if open($$rfh, $f);
	    undef $f;
	}
	croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
	    unless defined $f;
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my $charinfo = charinfo(0x41);

charinfo() returns a reference to a hash that has the following fields
as defined by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to

If no match is found, a reference to an empty hash is returned.

The C<block> property is the same as returned by charinfo().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
(Chapter 14 of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
you will need also the compexcl(), casefold(), and casespec() functions.

=cut

sub _getcode {
    my $arg = shift;

    if ($arg =~ /^\d+$/) {
	return $arg;
    } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
	return hex($1);
    }

    return;
}

# Lingua::KO::Hangul::Util not part of the standard distribution
# but it will be used if available.

eval { require Lingua::KO::Hangul::Util };
my $hasHangulUtil = ! $@;
if ($hasHangulUtil) {
    Lingua::KO::Hangul::Util->import();
}

sub hangul_decomp { # internal: called from charinfo
    if ($hasHangulUtil) {
	my @tmp = decomposeHangul(shift);
	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
    }
    return;
}

sub hangul_charname { # internal: called from charinfo
    return sprintf("HANGUL SYLLABLE-%04X", shift);
}

sub han_charname { # internal: called from charinfo
    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
}

my @CharinfoRanges = (
# block name
# [ first, last, coderef to name, coderef to decompose ],
# CJK Ideographs Extension A
  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
# CJK Ideographs
  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
# Hangul Syllables
  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
# Non-Private Use High Surrogates
  [ 0xD800,   0xDB7F,   undef,   undef  ],
# Private Use High Surrogates
  [ 0xDB80,   0xDBFF,   undef,   undef  ],
# Low Surrogates
  [ 0xDC00,   0xDFFF,   undef,   undef  ],
# The Private Use Area
  [ 0xE000,   0xF8FF,   undef,   undef  ],
# CJK Ideographs Extension B
  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
# Plane 15 Private Use Area
  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
# Plane 16 Private Use Area
  [ 0x100000, 0x10FFFD, undef,   undef  ],
);

sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	unless defined $code;
    my $hexk = sprintf("%06X", $code);
    my($rcode,$rname,$rdec);
    foreach my $range (@CharinfoRanges){
      if ($range->[0] <= $code && $code <= $range->[1]) {
        $rcode = $hexk;
	$rcode =~ s/^0+//;
	$rcode =  sprintf("%04X", hex($rcode));
        $rname = $range->[2] ? $range->[2]->($code) : '';
        $rdec  = $range->[3] ? $range->[3]->($code) : '';
        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
        last;
      }
    }
    openunicode(\$UNICODEFH, "UnicodeData.txt");
    if (defined $UNICODEFH) {
	use Search::Dict 1.02;
	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
	    my $line = <$UNICODEFH>;
	    chomp $line;
	    my %prop;
	    @prop{qw(
		     code name category
		     combining bidi decomposition
		     decimal digit numeric
		     mirrored unicode10 comment
		     upper lower title
		    )} = split(/;/, $line, -1);
	    $hexk =~ s/^0+//;
	    $hexk =  sprintf("%04X", hex($hexk));
	    if ($prop{code} eq $hexk) {
		$prop{block}  = charblock($code);
		$prop{script} = charscript($code);
		if(defined $rname){
                    $prop{code} = $rcode;
                    $prop{name} = $rname;
                    $prop{decomposition} = $rdec;
                }
		return \%prop;
	    }
	}
    }
    return;
}

sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
    my ($table, $lo, $hi, $code) = @_;

    return if $lo > $hi;

    my $mid = int(($lo+$hi) / 2);

    if ($table->[$mid]->[0] < $code) {
	if ($table->[$mid]->[1] >= $code) {
	    return $table->[$mid]->[2];
	} else {
	    _search($table, $mid + 1, $hi, $code);
	}
    } elsif ($table->[$mid]->[0] > $code) {
	_search($table, $lo, $mid - 1, $code);
    } else {
	return $table->[$mid]->[2];
    }
}

sub charinrange {
    my ($range, $arg) = @_;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	unless defined $code;
    _search($range, 0, $#$range, $code);
}

=head2 charblock

    use Unicode::UCD 'charblock';

    my $charblock = charblock(0x41);
    my $charblock = charblock(1234);
    my $charblock = charblock("0x263a");
    my $charblock = charblock("U+263a");

    my $range     = charblock('Armenian');

With a B<code point argument> charblock() returns the I<block> the character
belongs to, e.g.  C<Basic Latin>.  Note that not all the character
positions within all blocks are defined.

See also L</Blocks versus Scripts>.

If supplied with an argument that can't be a code point, charblock() tries
to do the opposite and interpret the argument as a character block. The
return value is a I<range>: an anonymous list of lists that contain
I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
code point is in a range using the L</charinrange> function. If the
argument is not a known charater block, C<undef> is returned.

=cut

my @BLOCKS;
my %BLOCKS;

sub _charblocks {
    unless (@BLOCKS) {
	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	    while (<$BLOCKSFH>) {
		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
		    my ($lo, $hi) = (hex($1), hex($2));
		    my $subrange = [ $lo, $hi, $3 ];
		    push @BLOCKS, $subrange;
		    push @{$BLOCKS{$3}}, $subrange;
		}
	    }
	    close($BLOCKSFH);
	}
    }
}

sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@BLOCKS, 0, $#BLOCKS, $code);
    } else {
	if (exists $BLOCKS{$arg}) {
	    return $BLOCKS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charscript

    use Unicode::UCD 'charscript';

    my $charscript = charscript(0x41);
    my $charscript = charscript(1234);
    my $charscript = charscript("U+263a");

    my $range      = charscript('Thai');

With a B<code point argument> charscript() returns the I<script> the
character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.

See also L</Blocks versus Scripts>.

If supplied with an argument that can't be a code point, charscript() tries
to do the opposite and interpret the argument as a character script. The
return value is a I<range>: an anonymous list of lists that contain
I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
code point is in a range using the L</charinrange> function. If the
argument is not a known charater script, C<undef> is returned.

=cut

my @SCRIPTS;
my %SCRIPTS;

sub _charscripts {
    unless (@SCRIPTS) {
	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
	    while (<$SCRIPTSFH>) {
		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
		    my $script = lc($3);
		    $script =~ s/\b(\w)/uc($1)/ge;
		    my $subrange = [ $lo, $hi, $script ];
		    push @SCRIPTS, $subrange;
		    push @{$SCRIPTS{$script}}, $subrange;
		}
	    }
	    close($SCRIPTSFH);
	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	}
    }
}

sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);

    if (defined $code) {
	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
    } else {
	if (exists $SCRIPTS{$arg}) {
	    return $SCRIPTS{$arg};
	} else {
	    return;
	}
    }
}

=head2 charblocks

    use Unicode::UCD 'charblocks';

    my $charblocks = charblocks();

charblocks() returns a reference to a hash with the known block names
as the keys, and the code point ranges (see L</charblock>) as the values.

See also L</Blocks versus Scripts>.

=cut

sub charblocks {
    _charblocks() unless %BLOCKS;
    return \%BLOCKS;
}

=head2 charscripts

    use Unicode::UCD 'charscripts';

    my %charscripts = charscripts();

charscripts() returns a hash with the known script names as the keys,
and the code point ranges (see L</charscript>) as the values.

See also L</Blocks versus Scripts>.

=cut

sub charscripts {
    _charscripts() unless %SCRIPTS;
    return \%SCRIPTS;
}

=head2 Blocks versus Scripts

The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
numbering and separation into blocks of (mostly) 256 characters.

For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>.  On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
the ASCII): it includes only the letters, and not, for example, the digits
or the punctuation.

For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt

For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/

=head2 Matching Scripts and Blocks

Scripts are matched with the regular-expression construct
C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
any of the 256 code points in the Tibetan block).

=head2 Code Point Arguments

A I<code point argument> is either a decimal or a hexadecimal scalar
designating a Unicode character, or C<U+> followed by hexadecimals
designating a Unicode character.  Note that Unicode is B<not> limited
to 16 bits (the number of Unicode characters is open-ended, in theory
unlimited): you may have more than 4 hexdigits.

=head2 charinrange

In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock> and L</charscript> or as the values of the hash returned
by L</charblocks> and L</charscripts> by using charinrange():

    use Unicode::UCD qw(charscript charinrange);

    $range = charscript('Hiragana');
    print "looks like hiragana\n" if charinrange($range, $codepoint);

=cut

=head2 compexcl

    use Unicode::UCD 'compexcl';

    my $compexcl = compexcl("09dc");

The compexcl() returns the composition exclusion (that is, if the
character should not be produced during a precomposition) of the 
character specified by a B<code point argument>.

If there is a composition exclusion for the character, true is
returned.  Otherwise, false is returned.

=cut

my %COMPEXCL;

sub _compexcl {
    unless (%COMPEXCL) {
	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
	    while (<$COMPEXCLFH>) {
		if (/^([0-9A-F]+)\s+\#\s+/) {
		    my $code = hex($1);
		    $COMPEXCL{$code} = undef;
		}
	    }
	    close($COMPEXCLFH);
	}
    }
}

sub compexcl {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::compexcl: unknown code '$arg'"
	unless defined $code;

    _compexcl() unless %COMPEXCL;

    return exists $COMPEXCL{$code};
}

=head2 casefold

    use Unicode::UCD 'casefold';

    my %casefold = casefold("09dc");

The casefold() returns the locale-independent case folding of the
character specified by a B<code point argument>.

If there is a case folding for that character, a reference to a hash
with the following fields is returned:

    key

    code             code point with at least four hexdigits
    status           "C", "F", "S", or "I"
    mapping          one or more codes separated by spaces

The meaning of the I<status> is as follows:

   C                 common case folding, common mappings shared
                     by both simple and full mappings
   F                 full case folding, mappings that cause strings
                     to grow in length. Multiple characters are separated
                     by spaces
   S                 simple case folding, mappings to single characters
                     where different from F
   I                 special case for dotted uppercase I and
                     dotless lowercase i
                     - If this mapping is included, the result is
                       case-insensitive, but dotless and dotted I's
                       are not distinguished
                     - If this mapping is excluded, the result is not
                       fully case-insensitive, but dotless and dotted
                       I's are distinguished

If there is no case folding for that character, C<undef> is returned.

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

my %CASEFOLD;

sub _casefold {
    unless (%CASEFOLD) {
	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
	    while (<$CASEFOLDFH>) {
		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
		    my $code = hex($1);
		    $CASEFOLD{$code} = { code    => $1,
					 status  => $2,
					 mapping => $3 };
		}
	    }
	    close($CASEFOLDFH);
	}
    }
}

sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
	unless defined $code;

    _casefold() unless %CASEFOLD;

    return $CASEFOLD{$code};
}

=head2 casespec

    use Unicode::UCD 'casespec';

    my %casespec = casespec("09dc");

The casespec() returns the potentially locale-dependent case mapping
of the character specified by a B<code point argument>.  The mapping
may change the length of the string (which the basic Unicode case
mappings as returned by charinfo() never do).

If there is a case folding for that character, a reference to a hash
with the following fields is returned:

    key

    code             code point with at least four hexdigits
    lower            lowercase
    title            titlecase
    upper            uppercase
    condition        condition list (may be undef)

The C<condition> is optional.  Where present, it consists of one or
more I<locales> or I<contexts>, separated by spaces (other than as
used to separate elements, spaces are to be ignored).  A condition
list overrides the normal behavior if all of the listed conditions are
true.  Case distinctions in the condition list are not significant.
Conditions preceded by "NON_" represent the negation of the condition

Note that when there are multiple case folding definitions for a
single code point because of different locales, the value returned by
casespec() is a hash reference which has the locales as the keys and
hash references as described above as the values.

A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
followed by a "_" and a 2-letter ISO language code (possibly followed
by a "_" and a variant code).  You can find the lists of those codes,
see L<Locale::Country> and L<Locale::Language>.

A I<context> is one of the following choices:

    FINAL            The letter is not followed by a letter of
                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
    MODERN           The mapping is only used for modern text
    AFTER_i          The last base character was "i" (U+0069)

For more information about case mappings see
http://www.unicode.org/unicode/reports/tr21/

=cut

my %CASESPEC;

sub _casespec {
    unless (%CASESPEC) {
	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
	    while (<$CASESPECFH>) {
		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
		    my ($hexcode, $lower, $title, $upper, $condition) =
			($1, $2, $3, $4, $5);
		    my $code = hex($hexcode);
		    if (exists $CASESPEC{$code}) {
			if (exists $CASESPEC{$code}->{code}) {
			    my ($oldlower,
				$oldtitle,
				$oldupper,
				$oldcondition) =
				    @{$CASESPEC{$code}}{qw(lower
							   title
							   upper
							   condition)};
			    if (defined $oldcondition) {
				my ($oldlocale) =
				($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
				delete $CASESPEC{$code};
				$CASESPEC{$code}->{$oldlocale} =
				{ code      => $hexcode,
				  lower     => $oldlower,
				  title     => $oldtitle,
				  upper     => $oldupper,
				  condition => $oldcondition };
			    }
			}
			my ($locale) =
			    ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
			$CASESPEC{$code}->{$locale} =
			{ code      => $hexcode,
			  lower     => $lower,
			  title     => $title,
			  upper     => $upper,
			  condition => $condition };
		    } else {
			$CASESPEC{$code} =
			{ code      => $hexcode,
			  lower     => $lower,
			  title     => $title,
			  upper     => $upper,
			  condition => $condition };
		    }
		}
	    }
	    close($CASESPECFH);
	}
    }
}

sub casespec {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casespec: unknown code '$arg'"
	unless defined $code;

    _casespec() unless %CASESPEC;

    return $CASESPEC{$code};
}

=head2 Unicode::UCD::UnicodeVersion

Unicode::UCD::UnicodeVersion() returns the version of the Unicode
Character Database, in other words, the version of the Unicode
standard the database implements.  The version is a string
of numbers delimited by dots (C<'.'>).

=cut

my $UNICODEVERSION;

sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
	openunicode(\$VERSIONFH, "version");
	chomp($UNICODEVERSION = <$VERSIONFH>);
	close($VERSIONFH);
	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
    }
    return $UNICODEVERSION;
}

=head2 Implementation Note

The first use of charinfo() opens a read-only filehandle to the Unicode
Character Database (the database is included in the Perl distribution).
The filehandle is then kept open for further queries.  In other words,
if you are wondering where one of your filehandles went, that's where.

=head1 BUGS

Does not yet support EBCDIC platforms.

=head1 AUTHOR

Jarkko Hietaniemi

=cut

1;
Commit	Line	Data
86530b38 AT	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
	6	our $VERSION = '0.2';
	7
	8	require Exporter;
	9
	10	our @ISA = qw(Exporter);
	11
	12	our @EXPORT_OK = qw(charinfo
	13	charblock charscript
	14	charblocks charscripts
	15	charinrange
	16	compexcl
	17	casefold casespec);
	18
	19	use Carp;
	20
	21	=head1 NAME
	22
	23	Unicode::UCD - Unicode character database
	24
	25	=head1 SYNOPSIS
	26
	27	use Unicode::UCD 'charinfo';
	28	my $charinfo = charinfo($codepoint);
	29
	30	use Unicode::UCD 'charblock';
	31	my $charblock = charblock($codepoint);
	32
	33	use Unicode::UCD 'charscript';
	34	my $charscript = charblock($codepoint);
	35
	36	use Unicode::UCD 'charblocks';
	37	my $charblocks = charblocks();
	38
	39	use Unicode::UCD 'charscripts';
	40	my %charscripts = charscripts();
	41
	42	use Unicode::UCD qw(charscript charinrange);
	43	my $range = charscript($script);
	44	print "looks like $script\n" if charinrange($range, $codepoint);
	45
	46	use Unicode::UCD 'compexcl';
	47	my $compexcl = compexcl($codepoint);
	48
	49	my $unicode_version = Unicode::UCD::UnicodeVersion();
	50
	51	=head1 DESCRIPTION
	52
	53	The Unicode::UCD module offers a simple interface to the Unicode
	54	Character Database.
	55
	56	=cut
	57
	58	my $UNICODEFH;
	59	my $BLOCKSFH;
	60	my $SCRIPTSFH;
	61	my $VERSIONFH;
	62	my $COMPEXCLFH;
	63	my $CASEFOLDFH;
	64	my $CASESPECFH;
65
66	sub openunicode {
67	my ($rfh, @path) = @_;
68	my $f;
69	unless (defined $$rfh) {
70	for my $d (@INC) {
71	use File::Spec;
72	$f = File::Spec->catfile($d, "unicore", @path);
73	last if open($$rfh, $f);
74	undef $f;
75	}
76	croak __PACKAGE__, ": failed to find ",
77	File::Spec->catfile(@path), " in @INC"
78	unless defined $f;
79	}
80	return $f;
81	}
82
83	=head2 charinfo
84
85	use Unicode::UCD 'charinfo';
86
87	my $charinfo = charinfo(0x41);
88
89	charinfo() returns a reference to a hash that has the following fields
90	as defined by the Unicode standard:
91
92	key
93
94	code code point with at least four hexdigits
95	name name of the character IN UPPER CASE
96	category general category of the character
97	combining classes used in the Canonical Ordering Algorithm
98	bidi bidirectional category
99	decomposition character decomposition mapping
100	decimal if decimal digit this is the integer numeric value
101	digit if digit this is the numeric value
102	numeric if numeric is the integer or rational numeric value
103	mirrored if mirrored in bidirectional text
104	unicode10 Unicode 1.0 name if existed and different
105	comment ISO 10646 comment field
106	upper uppercase equivalent mapping
107	lower lowercase equivalent mapping
108	title titlecase equivalent mapping
109
110	block block the character belongs to (used in \p{In...})
111	script script the character belongs to
112
113	If no match is found, a reference to an empty hash is returned.
114
115	The C<block> property is the same as returned by charinfo(). It is
116	not defined in the Unicode Character Database proper (Chapter 4 of the
117	Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118	(Chapter 14 of TUS3). Similarly for the C<script> property.
119
120	Note that you cannot do (de)composition and casing based solely on the
121	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122	you will need also the compexcl(), casefold(), and casespec() functions.
123
124	=cut
125
126	sub _getcode {
127	my $arg = shift;
128
129	if ($arg =~ /^\d+$/) {
130	return $arg;
131	} elsif ($arg =~ /^(?:U\+\|0x)?([[:xdigit:]]+)$/) {
132	return hex($1);
133	}
134
135	return;
136	}
137
138	# Lingua::KO::Hangul::Util not part of the standard distribution
139	# but it will be used if available.
140
141	eval { require Lingua::KO::Hangul::Util };
142	my $hasHangulUtil = ! $@;
143	if ($hasHangulUtil) {
144	Lingua::KO::Hangul::Util->import();
145	}
146
147	sub hangul_decomp { # internal: called from charinfo
148	if ($hasHangulUtil) {
149	my @tmp = decomposeHangul(shift);
150	return sprintf("%04X %04X", @tmp) if @tmp == 2;
151	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152	}
153	return;
154	}
155
156	sub hangul_charname { # internal: called from charinfo
157	return sprintf("HANGUL SYLLABLE-%04X", shift);
158	}
159
160	sub han_charname { # internal: called from charinfo
161	return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
162	}
163
164	my @CharinfoRanges = (
165	# block name
166	# [ first, last, coderef to name, coderef to decompose ],
167	# CJK Ideographs Extension A
168	[ 0x3400, 0x4DB5, \&han_charname, undef ],
169	# CJK Ideographs
170	[ 0x4E00, 0x9FA5, \&han_charname, undef ],
171	# Hangul Syllables
172	[ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
173	# Non-Private Use High Surrogates
174	[ 0xD800, 0xDB7F, undef, undef ],
175	# Private Use High Surrogates
176	[ 0xDB80, 0xDBFF, undef, undef ],
177	# Low Surrogates
178	[ 0xDC00, 0xDFFF, undef, undef ],
179	# The Private Use Area
180	[ 0xE000, 0xF8FF, undef, undef ],
181	# CJK Ideographs Extension B
182	[ 0x20000, 0x2A6D6, \&han_charname, undef ],
183	# Plane 15 Private Use Area
184	[ 0xF0000, 0xFFFFD, undef, undef ],
185	# Plane 16 Private Use Area
186	[ 0x100000, 0x10FFFD, undef, undef ],
187	);
188
189	sub charinfo {
190	my $arg = shift;
191	my $code = _getcode($arg);
192	croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193	unless defined $code;
194	my $hexk = sprintf("%06X", $code);
195	my($rcode,$rname,$rdec);
196	foreach my $range (@CharinfoRanges){
197	if ($range->[0] <= $code && $code <= $range->[1]) {
198	$rcode = $hexk;
199	$rcode =~ s/^0+//;
200	$rcode = sprintf("%04X", hex($rcode));
201	$rname = $range->[2] ? $range->[2]->($code) : '';
202	$rdec = $range->[3] ? $range->[3]->($code) : '';
203	$hexk = sprintf("%06X", $range->[0]); # replace by the first
204	last;
205	}
206	}
207	openunicode(\$UNICODEFH, "UnicodeData.txt");
208	if (defined $UNICODEFH) {
209	use Search::Dict 1.02;
210	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211	my $line = <$UNICODEFH>;
212	chomp $line;
213	my %prop;
214	@prop{qw(
215	code name category
216	combining bidi decomposition
217	decimal digit numeric
218	mirrored unicode10 comment
219	upper lower title
220	)} = split(/;/, $line, -1);
221	$hexk =~ s/^0+//;
222	$hexk = sprintf("%04X", hex($hexk));
223	if ($prop{code} eq $hexk) {
224	$prop{block} = charblock($code);
225	$prop{script} = charscript($code);
226	if(defined $rname){
227	$prop{code} = $rcode;
228	$prop{name} = $rname;
229	$prop{decomposition} = $rdec;
230	}
231	return \%prop;
232	}
233	}
234	}
235	return;
236	}
237
238	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239	my ($table, $lo, $hi, $code) = @_;
240
241	return if $lo > $hi;
242
243	my $mid = int(($lo+$hi) / 2);
244
245	if ($table->[$mid]->[0] < $code) {
246	if ($table->[$mid]->[1] >= $code) {
247	return $table->[$mid]->[2];
248	} else {
249	_search($table, $mid + 1, $hi, $code);
250	}
251	} elsif ($table->[$mid]->[0] > $code) {
252	_search($table, $lo, $mid - 1, $code);
253	} else {
254	return $table->[$mid]->[2];
255	}
256	}
257
258	sub charinrange {
259	my ($range, $arg) = @_;
260	my $code = _getcode($arg);
261	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262	unless defined $code;
263	_search($range, 0, $#$range, $code);
264	}
265
266	=head2 charblock
267
268	use Unicode::UCD 'charblock';
269
270	my $charblock = charblock(0x41);
271	my $charblock = charblock(1234);
272	my $charblock = charblock("0x263a");
273	my $charblock = charblock("U+263a");
274
275	my $range = charblock('Armenian');
276
277	With a B<code point argument> charblock() returns the I<block> the character
278	belongs to, e.g. C<Basic Latin>. Note that not all the character
279	positions within all blocks are defined.
280
281	See also L</Blocks versus Scripts>.
282
283	If supplied with an argument that can't be a code point, charblock() tries
284	to do the opposite and interpret the argument as a character block. The
285	return value is a I<range>: an anonymous list of lists that contain
286	I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287	code point is in a range using the L</charinrange> function. If the
288	argument is not a known charater block, C<undef> is returned.
289
290	=cut
291
292	my @BLOCKS;
293	my %BLOCKS;
294
295	sub _charblocks {
296	unless (@BLOCKS) {
297	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
298	while (<$BLOCKSFH>) {
299	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
300	my ($lo, $hi) = (hex($1), hex($2));
301	my $subrange = [ $lo, $hi, $3 ];
302	push @BLOCKS, $subrange;
303	push @{$BLOCKS{$3}}, $subrange;
304	}
305	}
306	close($BLOCKSFH);
307	}
308	}
309	}
310
311	sub charblock {
312	my $arg = shift;
313
314	_charblocks() unless @BLOCKS;
315
316	my $code = _getcode($arg);
317
318	if (defined $code) {
319	_search(\@BLOCKS, 0, $#BLOCKS, $code);
320	} else {
321	if (exists $BLOCKS{$arg}) {
322	return $BLOCKS{$arg};
323	} else {
324	return;
325	}
326	}
327	}
328
329	=head2 charscript
330
331	use Unicode::UCD 'charscript';
332
333	my $charscript = charscript(0x41);
334	my $charscript = charscript(1234);
335	my $charscript = charscript("U+263a");
336
337	my $range = charscript('Thai');
338
339	With a B<code point argument> charscript() returns the I<script> the
340	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
341
342	See also L</Blocks versus Scripts>.
343
344	If supplied with an argument that can't be a code point, charscript() tries
345	to do the opposite and interpret the argument as a character script. The
346	return value is a I<range>: an anonymous list of lists that contain
347	I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
348	code point is in a range using the L</charinrange> function. If the
349	argument is not a known charater script, C<undef> is returned.
350
351	=cut
352
353	my @SCRIPTS;
354	my %SCRIPTS;
355
356	sub _charscripts {
357	unless (@SCRIPTS) {
358	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
359	while (<$SCRIPTSFH>) {
360	if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
361	my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
362	my $script = lc($3);
363	$script =~ s/\b(\w)/uc($1)/ge;
364	my $subrange = [ $lo, $hi, $script ];
365	push @SCRIPTS, $subrange;
366	push @{$SCRIPTS{$script}}, $subrange;
367	}
368	}
369	close($SCRIPTSFH);
370	@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
371	}
372	}
373	}
374
375	sub charscript {
376	my $arg = shift;
377
378	_charscripts() unless @SCRIPTS;
379
380	my $code = _getcode($arg);
381
382	if (defined $code) {
383	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
384	} else {
385	if (exists $SCRIPTS{$arg}) {
386	return $SCRIPTS{$arg};
387	} else {
388	return;
389	}
390	}
391	}
392
393	=head2 charblocks
394
395	use Unicode::UCD 'charblocks';
396
397	my $charblocks = charblocks();
398
399	charblocks() returns a reference to a hash with the known block names
400	as the keys, and the code point ranges (see L</charblock>) as the values.
401
402	See also L</Blocks versus Scripts>.
403
404	=cut
405
406	sub charblocks {
407	_charblocks() unless %BLOCKS;
408	return \%BLOCKS;
409	}
410
411	=head2 charscripts
412
413	use Unicode::UCD 'charscripts';
414
415	my %charscripts = charscripts();
416
417	charscripts() returns a hash with the known script names as the keys,
418	and the code point ranges (see L</charscript>) as the values.
419
420	See also L</Blocks versus Scripts>.
421
422	=cut
423
424	sub charscripts {
425	_charscripts() unless %SCRIPTS;
426	return \%SCRIPTS;
427	}
428
429	=head2 Blocks versus Scripts
430
431	The difference between a block and a script is that scripts are closer
432	to the linguistic notion of a set of characters required to present
433	languages, while block is more of an artifact of the Unicode character
434	numbering and separation into blocks of (mostly) 256 characters.
435
436	For example the Latin B<script> is spread over several B<blocks>, such
437	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
438	C<Latin Extended-B>. On the other hand, the Latin script does not
439	contain all the characters of the C<Basic Latin> block (also known as
440	the ASCII): it includes only the letters, and not, for example, the digits
441	or the punctuation.
442
443	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
444
445	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
446
447	=head2 Matching Scripts and Blocks
448
449	Scripts are matched with the regular-expression construct
450	C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
451	while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
452	any of the 256 code points in the Tibetan block).
453
454	=head2 Code Point Arguments
455
456	A I<code point argument> is either a decimal or a hexadecimal scalar
457	designating a Unicode character, or C<U+> followed by hexadecimals
458	designating a Unicode character. Note that Unicode is B<not> limited
459	to 16 bits (the number of Unicode characters is open-ended, in theory
460	unlimited): you may have more than 4 hexdigits.
461
462	=head2 charinrange
463
464	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
465	can also test whether a code point is in the I<range> as returned by
466	L</charblock> and L</charscript> or as the values of the hash returned
467	by L</charblocks> and L</charscripts> by using charinrange():
468
469	use Unicode::UCD qw(charscript charinrange);
470
471	$range = charscript('Hiragana');
472	print "looks like hiragana\n" if charinrange($range, $codepoint);
473
474	=cut
475
476	=head2 compexcl
477
478	use Unicode::UCD 'compexcl';
479
480	my $compexcl = compexcl("09dc");
481
482	The compexcl() returns the composition exclusion (that is, if the
483	character should not be produced during a precomposition) of the
484	character specified by a B<code point argument>.
485
486	If there is a composition exclusion for the character, true is
487	returned. Otherwise, false is returned.
488
489	=cut
490
491	my %COMPEXCL;
492
493	sub _compexcl {
494	unless (%COMPEXCL) {
495	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
496	while (<$COMPEXCLFH>) {
497	if (/^([0-9A-F]+)\s+\#\s+/) {
498	my $code = hex($1);
499	$COMPEXCL{$code} = undef;
500	}
501	}
502	close($COMPEXCLFH);
503	}
504	}
505	}
506
507	sub compexcl {
508	my $arg = shift;
509	my $code = _getcode($arg);
510	croak __PACKAGE__, "::compexcl: unknown code '$arg'"
511	unless defined $code;
512
513	_compexcl() unless %COMPEXCL;
514
515	return exists $COMPEXCL{$code};
516	}
517
518	=head2 casefold
519
520	use Unicode::UCD 'casefold';
521
522	my %casefold = casefold("09dc");
523
524	The casefold() returns the locale-independent case folding of the
525	character specified by a B<code point argument>.
526
527	If there is a case folding for that character, a reference to a hash
528	with the following fields is returned:
529
530	key
531
532	code code point with at least four hexdigits
533	status "C", "F", "S", or "I"
534	mapping one or more codes separated by spaces
535
536	The meaning of the I<status> is as follows:
537
538	C common case folding, common mappings shared
539	by both simple and full mappings
540	F full case folding, mappings that cause strings
541	to grow in length. Multiple characters are separated
542	by spaces
543	S simple case folding, mappings to single characters
544	where different from F
545	I special case for dotted uppercase I and
546	dotless lowercase i
547	- If this mapping is included, the result is
548	case-insensitive, but dotless and dotted I's
549	are not distinguished
550	- If this mapping is excluded, the result is not
551	fully case-insensitive, but dotless and dotted
552	I's are distinguished
553
554	If there is no case folding for that character, C<undef> is returned.
555
556	For more information about case mappings see
557	http://www.unicode.org/unicode/reports/tr21/
558
559	=cut
560
561	my %CASEFOLD;
562
563	sub _casefold {
564	unless (%CASEFOLD) {
565	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
566	while (<$CASEFOLDFH>) {
567	if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
568	my $code = hex($1);
569	$CASEFOLD{$code} = { code => $1,
570	status => $2,
571	mapping => $3 };
572	}
573	}
574	close($CASEFOLDFH);
575	}
576	}
577	}
578
579	sub casefold {
580	my $arg = shift;
581	my $code = _getcode($arg);
582	croak __PACKAGE__, "::casefold: unknown code '$arg'"
583	unless defined $code;
584
585	_casefold() unless %CASEFOLD;
586
587	return $CASEFOLD{$code};
588	}
589
590	=head2 casespec
591
592	use Unicode::UCD 'casespec';
593
594	my %casespec = casespec("09dc");
595
596	The casespec() returns the potentially locale-dependent case mapping
597	of the character specified by a B<code point argument>. The mapping
598	may change the length of the string (which the basic Unicode case
599	mappings as returned by charinfo() never do).
600
601	If there is a case folding for that character, a reference to a hash
602	with the following fields is returned:
603
604	key
605
606	code code point with at least four hexdigits
607	lower lowercase
608	title titlecase
609	upper uppercase
610	condition condition list (may be undef)
611
612	The C<condition> is optional. Where present, it consists of one or
613	more I<locales> or I<contexts>, separated by spaces (other than as
614	used to separate elements, spaces are to be ignored). A condition
615	list overrides the normal behavior if all of the listed conditions are
616	true. Case distinctions in the condition list are not significant.
617	Conditions preceded by "NON_" represent the negation of the condition
618
619	Note that when there are multiple case folding definitions for a
620	single code point because of different locales, the value returned by
621	casespec() is a hash reference which has the locales as the keys and
622	hash references as described above as the values.
623
624	A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
625	followed by a "_" and a 2-letter ISO language code (possibly followed
626	by a "_" and a variant code). You can find the lists of those codes,
627	see L<Locale::Country> and L<Locale::Language>.
628
629	A I<context> is one of the following choices:
630
631	FINAL The letter is not followed by a letter of
632	general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
633	MODERN The mapping is only used for modern text
634	AFTER_i The last base character was "i" (U+0069)
635
636	For more information about case mappings see
637	http://www.unicode.org/unicode/reports/tr21/
638
639	=cut
640
641	my %CASESPEC;
642
643	sub _casespec {
644	unless (%CASESPEC) {
645	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
646	while (<$CASESPECFH>) {
647	if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; (\w+(?: \w+))?/) {
648	my ($hexcode, $lower, $title, $upper, $condition) =
649	($1, $2, $3, $4, $5);
650	my $code = hex($hexcode);
651	if (exists $CASESPEC{$code}) {
652	if (exists $CASESPEC{$code}->{code}) {
653	my ($oldlower,
654	$oldtitle,
655	$oldupper,
656	$oldcondition) =
657	@{$CASESPEC{$code}}{qw(lower
658	title
659	upper
660	condition)};
661	if (defined $oldcondition) {
662	my ($oldlocale) =
663	($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
664	delete $CASESPEC{$code};
665	$CASESPEC{$code}->{$oldlocale} =
666	{ code => $hexcode,
667	lower => $oldlower,
668	title => $oldtitle,
669	upper => $oldupper,
670	condition => $oldcondition };
671	}
672	}
673	my ($locale) =
674	($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675	$CASESPEC{$code}->{$locale} =
676	{ code => $hexcode,
677	lower => $lower,
678	title => $title,
679	upper => $upper,
680	condition => $condition };
681	} else {
682	$CASESPEC{$code} =
683	{ code => $hexcode,
684	lower => $lower,
685	title => $title,
686	upper => $upper,
687	condition => $condition };
688	}
689	}
690	}
691	close($CASESPECFH);
692	}
693	}
694	}
695
696	sub casespec {
697	my $arg = shift;
698	my $code = _getcode($arg);
699	croak __PACKAGE__, "::casespec: unknown code '$arg'"
700	unless defined $code;
701
702	_casespec() unless %CASESPEC;
703
704	return $CASESPEC{$code};
705	}
706
707	=head2 Unicode::UCD::UnicodeVersion
708
709	Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710	Character Database, in other words, the version of the Unicode
711	standard the database implements. The version is a string
712	of numbers delimited by dots (C<'.'>).
713
714	=cut
715
716	my $UNICODEVERSION;
717
718	sub UnicodeVersion {
719	unless (defined $UNICODEVERSION) {
720	openunicode(\$VERSIONFH, "version");
721	chomp($UNICODEVERSION = <$VERSIONFH>);
722	close($VERSIONFH);
723	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725	}
726	return $UNICODEVERSION;
727	}
728
729	=head2 Implementation Note
730
731	The first use of charinfo() opens a read-only filehandle to the Unicode
732	Character Database (the database is included in the Perl distribution).
733	The filehandle is then kept open for further queries. In other words,
734	if you are wondering where one of your filehandles went, that's where.
735
736	=head1 BUGS
737
738	Does not yet support EBCDIC platforms.
739
740	=head1 AUTHOR
741
742	Jarkko Hietaniemi
743
744	=cut
745
746	1;