Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / 5.8.0 / Unicode / UCD.pm
CommitLineData
86530b38
AT
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.2';
7
8require Exporter;
9
10our @ISA = qw(Exporter);
11
12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
15 charinrange
16 compexcl
17 casefold casespec);
18
19use Carp;
20
21=head1 NAME
22
23Unicode::UCD - Unicode character database
24
25=head1 SYNOPSIS
26
27 use Unicode::UCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
29
30 use Unicode::UCD 'charblock';
31 my $charblock = charblock($codepoint);
32
33 use Unicode::UCD 'charscript';
34 my $charscript = charblock($codepoint);
35
36 use Unicode::UCD 'charblocks';
37 my $charblocks = charblocks();
38
39 use Unicode::UCD 'charscripts';
40 my %charscripts = charscripts();
41
42 use Unicode::UCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
46 use Unicode::UCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
48
49 my $unicode_version = Unicode::UCD::UnicodeVersion();
50
51=head1 DESCRIPTION
52
53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
55
56=cut
57
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
72 $f = File::Spec->catfile($d, "unicore", @path);
73 last if open($$rfh, $f);
74 undef $f;
75 }
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
79 }
80 return $f;
81}
82
83=head2 charinfo
84
85 use Unicode::UCD 'charinfo';
86
87 my $charinfo = charinfo(0x41);
88
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
109
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
112
113If no match is found, a reference to an empty hash is returned.
114
115The C<block> property is the same as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
122you will need also the compexcl(), casefold(), and casespec() functions.
123
124=cut
125
126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
138# Lingua::KO::Hangul::Util not part of the standard distribution
139# but it will be used if available.
140
141eval { require Lingua::KO::Hangul::Util };
142my $hasHangulUtil = ! $@;
143if ($hasHangulUtil) {
144 Lingua::KO::Hangul::Util->import();
145}
146
147sub hangul_decomp { # internal: called from charinfo
148 if ($hasHangulUtil) {
149 my @tmp = decomposeHangul(shift);
150 return sprintf("%04X %04X", @tmp) if @tmp == 2;
151 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
152 }
153 return;
154}
155
156sub hangul_charname { # internal: called from charinfo
157 return sprintf("HANGUL SYLLABLE-%04X", shift);
158}
159
160sub han_charname { # internal: called from charinfo
161 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
162}
163
164my @CharinfoRanges = (
165# block name
166# [ first, last, coderef to name, coderef to decompose ],
167# CJK Ideographs Extension A
168 [ 0x3400, 0x4DB5, \&han_charname, undef ],
169# CJK Ideographs
170 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
171# Hangul Syllables
172 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
173# Non-Private Use High Surrogates
174 [ 0xD800, 0xDB7F, undef, undef ],
175# Private Use High Surrogates
176 [ 0xDB80, 0xDBFF, undef, undef ],
177# Low Surrogates
178 [ 0xDC00, 0xDFFF, undef, undef ],
179# The Private Use Area
180 [ 0xE000, 0xF8FF, undef, undef ],
181# CJK Ideographs Extension B
182 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
183# Plane 15 Private Use Area
184 [ 0xF0000, 0xFFFFD, undef, undef ],
185# Plane 16 Private Use Area
186 [ 0x100000, 0x10FFFD, undef, undef ],
187);
188
189sub charinfo {
190 my $arg = shift;
191 my $code = _getcode($arg);
192 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
193 unless defined $code;
194 my $hexk = sprintf("%06X", $code);
195 my($rcode,$rname,$rdec);
196 foreach my $range (@CharinfoRanges){
197 if ($range->[0] <= $code && $code <= $range->[1]) {
198 $rcode = $hexk;
199 $rcode =~ s/^0+//;
200 $rcode = sprintf("%04X", hex($rcode));
201 $rname = $range->[2] ? $range->[2]->($code) : '';
202 $rdec = $range->[3] ? $range->[3]->($code) : '';
203 $hexk = sprintf("%06X", $range->[0]); # replace by the first
204 last;
205 }
206 }
207 openunicode(\$UNICODEFH, "UnicodeData.txt");
208 if (defined $UNICODEFH) {
209 use Search::Dict 1.02;
210 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
211 my $line = <$UNICODEFH>;
212 chomp $line;
213 my %prop;
214 @prop{qw(
215 code name category
216 combining bidi decomposition
217 decimal digit numeric
218 mirrored unicode10 comment
219 upper lower title
220 )} = split(/;/, $line, -1);
221 $hexk =~ s/^0+//;
222 $hexk = sprintf("%04X", hex($hexk));
223 if ($prop{code} eq $hexk) {
224 $prop{block} = charblock($code);
225 $prop{script} = charscript($code);
226 if(defined $rname){
227 $prop{code} = $rcode;
228 $prop{name} = $rname;
229 $prop{decomposition} = $rdec;
230 }
231 return \%prop;
232 }
233 }
234 }
235 return;
236}
237
238sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
239 my ($table, $lo, $hi, $code) = @_;
240
241 return if $lo > $hi;
242
243 my $mid = int(($lo+$hi) / 2);
244
245 if ($table->[$mid]->[0] < $code) {
246 if ($table->[$mid]->[1] >= $code) {
247 return $table->[$mid]->[2];
248 } else {
249 _search($table, $mid + 1, $hi, $code);
250 }
251 } elsif ($table->[$mid]->[0] > $code) {
252 _search($table, $lo, $mid - 1, $code);
253 } else {
254 return $table->[$mid]->[2];
255 }
256}
257
258sub charinrange {
259 my ($range, $arg) = @_;
260 my $code = _getcode($arg);
261 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
262 unless defined $code;
263 _search($range, 0, $#$range, $code);
264}
265
266=head2 charblock
267
268 use Unicode::UCD 'charblock';
269
270 my $charblock = charblock(0x41);
271 my $charblock = charblock(1234);
272 my $charblock = charblock("0x263a");
273 my $charblock = charblock("U+263a");
274
275 my $range = charblock('Armenian');
276
277With a B<code point argument> charblock() returns the I<block> the character
278belongs to, e.g. C<Basic Latin>. Note that not all the character
279positions within all blocks are defined.
280
281See also L</Blocks versus Scripts>.
282
283If supplied with an argument that can't be a code point, charblock() tries
284to do the opposite and interpret the argument as a character block. The
285return value is a I<range>: an anonymous list of lists that contain
286I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
287code point is in a range using the L</charinrange> function. If the
288argument is not a known charater block, C<undef> is returned.
289
290=cut
291
292my @BLOCKS;
293my %BLOCKS;
294
295sub _charblocks {
296 unless (@BLOCKS) {
297 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
298 while (<$BLOCKSFH>) {
299 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
300 my ($lo, $hi) = (hex($1), hex($2));
301 my $subrange = [ $lo, $hi, $3 ];
302 push @BLOCKS, $subrange;
303 push @{$BLOCKS{$3}}, $subrange;
304 }
305 }
306 close($BLOCKSFH);
307 }
308 }
309}
310
311sub charblock {
312 my $arg = shift;
313
314 _charblocks() unless @BLOCKS;
315
316 my $code = _getcode($arg);
317
318 if (defined $code) {
319 _search(\@BLOCKS, 0, $#BLOCKS, $code);
320 } else {
321 if (exists $BLOCKS{$arg}) {
322 return $BLOCKS{$arg};
323 } else {
324 return;
325 }
326 }
327}
328
329=head2 charscript
330
331 use Unicode::UCD 'charscript';
332
333 my $charscript = charscript(0x41);
334 my $charscript = charscript(1234);
335 my $charscript = charscript("U+263a");
336
337 my $range = charscript('Thai');
338
339With a B<code point argument> charscript() returns the I<script> the
340character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
341
342See also L</Blocks versus Scripts>.
343
344If supplied with an argument that can't be a code point, charscript() tries
345to do the opposite and interpret the argument as a character script. The
346return value is a I<range>: an anonymous list of lists that contain
347I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
348code point is in a range using the L</charinrange> function. If the
349argument is not a known charater script, C<undef> is returned.
350
351=cut
352
353my @SCRIPTS;
354my %SCRIPTS;
355
356sub _charscripts {
357 unless (@SCRIPTS) {
358 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
359 while (<$SCRIPTSFH>) {
360 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
361 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
362 my $script = lc($3);
363 $script =~ s/\b(\w)/uc($1)/ge;
364 my $subrange = [ $lo, $hi, $script ];
365 push @SCRIPTS, $subrange;
366 push @{$SCRIPTS{$script}}, $subrange;
367 }
368 }
369 close($SCRIPTSFH);
370 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
371 }
372 }
373}
374
375sub charscript {
376 my $arg = shift;
377
378 _charscripts() unless @SCRIPTS;
379
380 my $code = _getcode($arg);
381
382 if (defined $code) {
383 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
384 } else {
385 if (exists $SCRIPTS{$arg}) {
386 return $SCRIPTS{$arg};
387 } else {
388 return;
389 }
390 }
391}
392
393=head2 charblocks
394
395 use Unicode::UCD 'charblocks';
396
397 my $charblocks = charblocks();
398
399charblocks() returns a reference to a hash with the known block names
400as the keys, and the code point ranges (see L</charblock>) as the values.
401
402See also L</Blocks versus Scripts>.
403
404=cut
405
406sub charblocks {
407 _charblocks() unless %BLOCKS;
408 return \%BLOCKS;
409}
410
411=head2 charscripts
412
413 use Unicode::UCD 'charscripts';
414
415 my %charscripts = charscripts();
416
417charscripts() returns a hash with the known script names as the keys,
418and the code point ranges (see L</charscript>) as the values.
419
420See also L</Blocks versus Scripts>.
421
422=cut
423
424sub charscripts {
425 _charscripts() unless %SCRIPTS;
426 return \%SCRIPTS;
427}
428
429=head2 Blocks versus Scripts
430
431The difference between a block and a script is that scripts are closer
432to the linguistic notion of a set of characters required to present
433languages, while block is more of an artifact of the Unicode character
434numbering and separation into blocks of (mostly) 256 characters.
435
436For example the Latin B<script> is spread over several B<blocks>, such
437as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
438C<Latin Extended-B>. On the other hand, the Latin script does not
439contain all the characters of the C<Basic Latin> block (also known as
440the ASCII): it includes only the letters, and not, for example, the digits
441or the punctuation.
442
443For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
444
445For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
446
447=head2 Matching Scripts and Blocks
448
449Scripts are matched with the regular-expression construct
450C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
451while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
452any of the 256 code points in the Tibetan block).
453
454=head2 Code Point Arguments
455
456A I<code point argument> is either a decimal or a hexadecimal scalar
457designating a Unicode character, or C<U+> followed by hexadecimals
458designating a Unicode character. Note that Unicode is B<not> limited
459to 16 bits (the number of Unicode characters is open-ended, in theory
460unlimited): you may have more than 4 hexdigits.
461
462=head2 charinrange
463
464In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
465can also test whether a code point is in the I<range> as returned by
466L</charblock> and L</charscript> or as the values of the hash returned
467by L</charblocks> and L</charscripts> by using charinrange():
468
469 use Unicode::UCD qw(charscript charinrange);
470
471 $range = charscript('Hiragana');
472 print "looks like hiragana\n" if charinrange($range, $codepoint);
473
474=cut
475
476=head2 compexcl
477
478 use Unicode::UCD 'compexcl';
479
480 my $compexcl = compexcl("09dc");
481
482The compexcl() returns the composition exclusion (that is, if the
483character should not be produced during a precomposition) of the
484character specified by a B<code point argument>.
485
486If there is a composition exclusion for the character, true is
487returned. Otherwise, false is returned.
488
489=cut
490
491my %COMPEXCL;
492
493sub _compexcl {
494 unless (%COMPEXCL) {
495 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
496 while (<$COMPEXCLFH>) {
497 if (/^([0-9A-F]+)\s+\#\s+/) {
498 my $code = hex($1);
499 $COMPEXCL{$code} = undef;
500 }
501 }
502 close($COMPEXCLFH);
503 }
504 }
505}
506
507sub compexcl {
508 my $arg = shift;
509 my $code = _getcode($arg);
510 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
511 unless defined $code;
512
513 _compexcl() unless %COMPEXCL;
514
515 return exists $COMPEXCL{$code};
516}
517
518=head2 casefold
519
520 use Unicode::UCD 'casefold';
521
522 my %casefold = casefold("09dc");
523
524The casefold() returns the locale-independent case folding of the
525character specified by a B<code point argument>.
526
527If there is a case folding for that character, a reference to a hash
528with the following fields is returned:
529
530 key
531
532 code code point with at least four hexdigits
533 status "C", "F", "S", or "I"
534 mapping one or more codes separated by spaces
535
536The meaning of the I<status> is as follows:
537
538 C common case folding, common mappings shared
539 by both simple and full mappings
540 F full case folding, mappings that cause strings
541 to grow in length. Multiple characters are separated
542 by spaces
543 S simple case folding, mappings to single characters
544 where different from F
545 I special case for dotted uppercase I and
546 dotless lowercase i
547 - If this mapping is included, the result is
548 case-insensitive, but dotless and dotted I's
549 are not distinguished
550 - If this mapping is excluded, the result is not
551 fully case-insensitive, but dotless and dotted
552 I's are distinguished
553
554If there is no case folding for that character, C<undef> is returned.
555
556For more information about case mappings see
557http://www.unicode.org/unicode/reports/tr21/
558
559=cut
560
561my %CASEFOLD;
562
563sub _casefold {
564 unless (%CASEFOLD) {
565 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
566 while (<$CASEFOLDFH>) {
567 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
568 my $code = hex($1);
569 $CASEFOLD{$code} = { code => $1,
570 status => $2,
571 mapping => $3 };
572 }
573 }
574 close($CASEFOLDFH);
575 }
576 }
577}
578
579sub casefold {
580 my $arg = shift;
581 my $code = _getcode($arg);
582 croak __PACKAGE__, "::casefold: unknown code '$arg'"
583 unless defined $code;
584
585 _casefold() unless %CASEFOLD;
586
587 return $CASEFOLD{$code};
588}
589
590=head2 casespec
591
592 use Unicode::UCD 'casespec';
593
594 my %casespec = casespec("09dc");
595
596The casespec() returns the potentially locale-dependent case mapping
597of the character specified by a B<code point argument>. The mapping
598may change the length of the string (which the basic Unicode case
599mappings as returned by charinfo() never do).
600
601If there is a case folding for that character, a reference to a hash
602with the following fields is returned:
603
604 key
605
606 code code point with at least four hexdigits
607 lower lowercase
608 title titlecase
609 upper uppercase
610 condition condition list (may be undef)
611
612The C<condition> is optional. Where present, it consists of one or
613more I<locales> or I<contexts>, separated by spaces (other than as
614used to separate elements, spaces are to be ignored). A condition
615list overrides the normal behavior if all of the listed conditions are
616true. Case distinctions in the condition list are not significant.
617Conditions preceded by "NON_" represent the negation of the condition
618
619Note that when there are multiple case folding definitions for a
620single code point because of different locales, the value returned by
621casespec() is a hash reference which has the locales as the keys and
622hash references as described above as the values.
623
624A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
625followed by a "_" and a 2-letter ISO language code (possibly followed
626by a "_" and a variant code). You can find the lists of those codes,
627see L<Locale::Country> and L<Locale::Language>.
628
629A I<context> is one of the following choices:
630
631 FINAL The letter is not followed by a letter of
632 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
633 MODERN The mapping is only used for modern text
634 AFTER_i The last base character was "i" (U+0069)
635
636For more information about case mappings see
637http://www.unicode.org/unicode/reports/tr21/
638
639=cut
640
641my %CASESPEC;
642
643sub _casespec {
644 unless (%CASESPEC) {
645 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
646 while (<$CASESPECFH>) {
647 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
648 my ($hexcode, $lower, $title, $upper, $condition) =
649 ($1, $2, $3, $4, $5);
650 my $code = hex($hexcode);
651 if (exists $CASESPEC{$code}) {
652 if (exists $CASESPEC{$code}->{code}) {
653 my ($oldlower,
654 $oldtitle,
655 $oldupper,
656 $oldcondition) =
657 @{$CASESPEC{$code}}{qw(lower
658 title
659 upper
660 condition)};
661 if (defined $oldcondition) {
662 my ($oldlocale) =
663 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
664 delete $CASESPEC{$code};
665 $CASESPEC{$code}->{$oldlocale} =
666 { code => $hexcode,
667 lower => $oldlower,
668 title => $oldtitle,
669 upper => $oldupper,
670 condition => $oldcondition };
671 }
672 }
673 my ($locale) =
674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675 $CASESPEC{$code}->{$locale} =
676 { code => $hexcode,
677 lower => $lower,
678 title => $title,
679 upper => $upper,
680 condition => $condition };
681 } else {
682 $CASESPEC{$code} =
683 { code => $hexcode,
684 lower => $lower,
685 title => $title,
686 upper => $upper,
687 condition => $condition };
688 }
689 }
690 }
691 close($CASESPECFH);
692 }
693 }
694}
695
696sub casespec {
697 my $arg = shift;
698 my $code = _getcode($arg);
699 croak __PACKAGE__, "::casespec: unknown code '$arg'"
700 unless defined $code;
701
702 _casespec() unless %CASESPEC;
703
704 return $CASESPEC{$code};
705}
706
707=head2 Unicode::UCD::UnicodeVersion
708
709Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710Character Database, in other words, the version of the Unicode
711standard the database implements. The version is a string
712of numbers delimited by dots (C<'.'>).
713
714=cut
715
716my $UNICODEVERSION;
717
718sub UnicodeVersion {
719 unless (defined $UNICODEVERSION) {
720 openunicode(\$VERSIONFH, "version");
721 chomp($UNICODEVERSION = <$VERSIONFH>);
722 close($VERSIONFH);
723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725 }
726 return $UNICODEVERSION;
727}
728
729=head2 Implementation Note
730
731The first use of charinfo() opens a read-only filehandle to the Unicode
732Character Database (the database is included in the Perl distribution).
733The filehandle is then kept open for further queries. In other words,
734if you are wondering where one of your filehandles went, that's where.
735
736=head1 BUGS
737
738Does not yet support EBCDIC platforms.
739
740=head1 AUTHOR
741
742Jarkko Hietaniemi
743
744=cut
745
7461;