| 1 | package Unicode::UCD; |
| 2 | |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| 6 | our $VERSION = '0.24'; |
| 7 | |
| 8 | use Storable qw(dclone); |
| 9 | |
| 10 | require Exporter; |
| 11 | |
| 12 | our @ISA = qw(Exporter); |
| 13 | |
| 14 | our @EXPORT_OK = qw(charinfo |
| 15 | charblock charscript |
| 16 | charblocks charscripts |
| 17 | charinrange |
| 18 | compexcl |
| 19 | casefold casespec |
| 20 | namedseq); |
| 21 | |
| 22 | use Carp; |
| 23 | |
| 24 | =head1 NAME |
| 25 | |
| 26 | Unicode::UCD - Unicode character database |
| 27 | |
| 28 | =head1 SYNOPSIS |
| 29 | |
| 30 | use Unicode::UCD 'charinfo'; |
| 31 | my $charinfo = charinfo($codepoint); |
| 32 | |
| 33 | use Unicode::UCD 'charblock'; |
| 34 | my $charblock = charblock($codepoint); |
| 35 | |
| 36 | use Unicode::UCD 'charscript'; |
| 37 | my $charscript = charscript($codepoint); |
| 38 | |
| 39 | use Unicode::UCD 'charblocks'; |
| 40 | my $charblocks = charblocks(); |
| 41 | |
| 42 | use Unicode::UCD 'charscripts'; |
| 43 | my %charscripts = charscripts(); |
| 44 | |
| 45 | use Unicode::UCD qw(charscript charinrange); |
| 46 | my $range = charscript($script); |
| 47 | print "looks like $script\n" if charinrange($range, $codepoint); |
| 48 | |
| 49 | use Unicode::UCD 'compexcl'; |
| 50 | my $compexcl = compexcl($codepoint); |
| 51 | |
| 52 | use Unicode::UCD 'namedseq'; |
| 53 | my $namedseq = namedseq($named_sequence_name); |
| 54 | |
| 55 | my $unicode_version = Unicode::UCD::UnicodeVersion(); |
| 56 | |
| 57 | =head1 DESCRIPTION |
| 58 | |
| 59 | The Unicode::UCD module offers a simple interface to the Unicode |
| 60 | Character Database. |
| 61 | |
| 62 | =cut |
| 63 | |
| 64 | my $UNICODEFH; |
| 65 | my $BLOCKSFH; |
| 66 | my $SCRIPTSFH; |
| 67 | my $VERSIONFH; |
| 68 | my $COMPEXCLFH; |
| 69 | my $CASEFOLDFH; |
| 70 | my $CASESPECFH; |
| 71 | my $NAMEDSEQFH; |
| 72 | |
| 73 | sub openunicode { |
| 74 | my ($rfh, @path) = @_; |
| 75 | my $f; |
| 76 | unless (defined $$rfh) { |
| 77 | for my $d (@INC) { |
| 78 | use File::Spec; |
| 79 | $f = File::Spec->catfile($d, "unicore", @path); |
| 80 | last if open($$rfh, $f); |
| 81 | undef $f; |
| 82 | } |
| 83 | croak __PACKAGE__, ": failed to find ", |
| 84 | File::Spec->catfile(@path), " in @INC" |
| 85 | unless defined $f; |
| 86 | } |
| 87 | return $f; |
| 88 | } |
| 89 | |
| 90 | =head2 charinfo |
| 91 | |
| 92 | use Unicode::UCD 'charinfo'; |
| 93 | |
| 94 | my $charinfo = charinfo(0x41); |
| 95 | |
| 96 | charinfo() returns a reference to a hash that has the following fields |
| 97 | as defined by the Unicode standard: |
| 98 | |
| 99 | key |
| 100 | |
| 101 | code code point with at least four hexdigits |
| 102 | name name of the character IN UPPER CASE |
| 103 | category general category of the character |
| 104 | combining classes used in the Canonical Ordering Algorithm |
| 105 | bidi bidirectional category |
| 106 | decomposition character decomposition mapping |
| 107 | decimal if decimal digit this is the integer numeric value |
| 108 | digit if digit this is the numeric value |
| 109 | numeric if numeric is the integer or rational numeric value |
| 110 | mirrored if mirrored in bidirectional text |
| 111 | unicode10 Unicode 1.0 name if existed and different |
| 112 | comment ISO 10646 comment field |
| 113 | upper uppercase equivalent mapping |
| 114 | lower lowercase equivalent mapping |
| 115 | title titlecase equivalent mapping |
| 116 | |
| 117 | block block the character belongs to (used in \p{In...}) |
| 118 | script script the character belongs to |
| 119 | |
| 120 | If no match is found, a reference to an empty hash is returned. |
| 121 | |
| 122 | The C<block> property is the same as returned by charinfo(). It is |
| 123 | not defined in the Unicode Character Database proper (Chapter 4 of the |
| 124 | Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database |
| 125 | (Chapter 14 of TUS3). Similarly for the C<script> property. |
| 126 | |
| 127 | Note that you cannot do (de)composition and casing based solely on the |
| 128 | above C<decomposition> and C<lower>, C<upper>, C<title>, properties, |
| 129 | you will need also the compexcl(), casefold(), and casespec() functions. |
| 130 | |
| 131 | =cut |
| 132 | |
| 133 | # NB: This function is duplicated in charnames.pm |
| 134 | sub _getcode { |
| 135 | my $arg = shift; |
| 136 | |
| 137 | if ($arg =~ /^[1-9]\d*$/) { |
| 138 | return $arg; |
| 139 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { |
| 140 | return hex($1); |
| 141 | } |
| 142 | |
| 143 | return; |
| 144 | } |
| 145 | |
| 146 | # Lingua::KO::Hangul::Util not part of the standard distribution |
| 147 | # but it will be used if available. |
| 148 | |
| 149 | eval { require Lingua::KO::Hangul::Util }; |
| 150 | my $hasHangulUtil = ! $@; |
| 151 | if ($hasHangulUtil) { |
| 152 | Lingua::KO::Hangul::Util->import(); |
| 153 | } |
| 154 | |
| 155 | sub hangul_decomp { # internal: called from charinfo |
| 156 | if ($hasHangulUtil) { |
| 157 | my @tmp = decomposeHangul(shift); |
| 158 | return sprintf("%04X %04X", @tmp) if @tmp == 2; |
| 159 | return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; |
| 160 | } |
| 161 | return; |
| 162 | } |
| 163 | |
| 164 | sub hangul_charname { # internal: called from charinfo |
| 165 | return sprintf("HANGUL SYLLABLE-%04X", shift); |
| 166 | } |
| 167 | |
| 168 | sub han_charname { # internal: called from charinfo |
| 169 | return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); |
| 170 | } |
| 171 | |
| 172 | my @CharinfoRanges = ( |
| 173 | # block name |
| 174 | # [ first, last, coderef to name, coderef to decompose ], |
| 175 | # CJK Ideographs Extension A |
| 176 | [ 0x3400, 0x4DB5, \&han_charname, undef ], |
| 177 | # CJK Ideographs |
| 178 | [ 0x4E00, 0x9FA5, \&han_charname, undef ], |
| 179 | # Hangul Syllables |
| 180 | [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], |
| 181 | # Non-Private Use High Surrogates |
| 182 | [ 0xD800, 0xDB7F, undef, undef ], |
| 183 | # Private Use High Surrogates |
| 184 | [ 0xDB80, 0xDBFF, undef, undef ], |
| 185 | # Low Surrogates |
| 186 | [ 0xDC00, 0xDFFF, undef, undef ], |
| 187 | # The Private Use Area |
| 188 | [ 0xE000, 0xF8FF, undef, undef ], |
| 189 | # CJK Ideographs Extension B |
| 190 | [ 0x20000, 0x2A6D6, \&han_charname, undef ], |
| 191 | # Plane 15 Private Use Area |
| 192 | [ 0xF0000, 0xFFFFD, undef, undef ], |
| 193 | # Plane 16 Private Use Area |
| 194 | [ 0x100000, 0x10FFFD, undef, undef ], |
| 195 | ); |
| 196 | |
| 197 | sub charinfo { |
| 198 | my $arg = shift; |
| 199 | my $code = _getcode($arg); |
| 200 | croak __PACKAGE__, "::charinfo: unknown code '$arg'" |
| 201 | unless defined $code; |
| 202 | my $hexk = sprintf("%06X", $code); |
| 203 | my($rcode,$rname,$rdec); |
| 204 | foreach my $range (@CharinfoRanges){ |
| 205 | if ($range->[0] <= $code && $code <= $range->[1]) { |
| 206 | $rcode = $hexk; |
| 207 | $rcode =~ s/^0+//; |
| 208 | $rcode = sprintf("%04X", hex($rcode)); |
| 209 | $rname = $range->[2] ? $range->[2]->($code) : ''; |
| 210 | $rdec = $range->[3] ? $range->[3]->($code) : ''; |
| 211 | $hexk = sprintf("%06X", $range->[0]); # replace by the first |
| 212 | last; |
| 213 | } |
| 214 | } |
| 215 | openunicode(\$UNICODEFH, "UnicodeData.txt"); |
| 216 | if (defined $UNICODEFH) { |
| 217 | use Search::Dict 1.02; |
| 218 | if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { |
| 219 | my $line = <$UNICODEFH>; |
| 220 | return unless defined $line; |
| 221 | chomp $line; |
| 222 | my %prop; |
| 223 | @prop{qw( |
| 224 | code name category |
| 225 | combining bidi decomposition |
| 226 | decimal digit numeric |
| 227 | mirrored unicode10 comment |
| 228 | upper lower title |
| 229 | )} = split(/;/, $line, -1); |
| 230 | $hexk =~ s/^0+//; |
| 231 | $hexk = sprintf("%04X", hex($hexk)); |
| 232 | if ($prop{code} eq $hexk) { |
| 233 | $prop{block} = charblock($code); |
| 234 | $prop{script} = charscript($code); |
| 235 | if(defined $rname){ |
| 236 | $prop{code} = $rcode; |
| 237 | $prop{name} = $rname; |
| 238 | $prop{decomposition} = $rdec; |
| 239 | } |
| 240 | return \%prop; |
| 241 | } |
| 242 | } |
| 243 | } |
| 244 | return; |
| 245 | } |
| 246 | |
| 247 | sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. |
| 248 | my ($table, $lo, $hi, $code) = @_; |
| 249 | |
| 250 | return if $lo > $hi; |
| 251 | |
| 252 | my $mid = int(($lo+$hi) / 2); |
| 253 | |
| 254 | if ($table->[$mid]->[0] < $code) { |
| 255 | if ($table->[$mid]->[1] >= $code) { |
| 256 | return $table->[$mid]->[2]; |
| 257 | } else { |
| 258 | _search($table, $mid + 1, $hi, $code); |
| 259 | } |
| 260 | } elsif ($table->[$mid]->[0] > $code) { |
| 261 | _search($table, $lo, $mid - 1, $code); |
| 262 | } else { |
| 263 | return $table->[$mid]->[2]; |
| 264 | } |
| 265 | } |
| 266 | |
| 267 | sub charinrange { |
| 268 | my ($range, $arg) = @_; |
| 269 | my $code = _getcode($arg); |
| 270 | croak __PACKAGE__, "::charinrange: unknown code '$arg'" |
| 271 | unless defined $code; |
| 272 | _search($range, 0, $#$range, $code); |
| 273 | } |
| 274 | |
| 275 | =head2 charblock |
| 276 | |
| 277 | use Unicode::UCD 'charblock'; |
| 278 | |
| 279 | my $charblock = charblock(0x41); |
| 280 | my $charblock = charblock(1234); |
| 281 | my $charblock = charblock("0x263a"); |
| 282 | my $charblock = charblock("U+263a"); |
| 283 | |
| 284 | my $range = charblock('Armenian'); |
| 285 | |
| 286 | With a B<code point argument> charblock() returns the I<block> the character |
| 287 | belongs to, e.g. C<Basic Latin>. Note that not all the character |
| 288 | positions within all blocks are defined. |
| 289 | |
| 290 | See also L</Blocks versus Scripts>. |
| 291 | |
| 292 | If supplied with an argument that can't be a code point, charblock() tries |
| 293 | to do the opposite and interpret the argument as a character block. The |
| 294 | return value is a I<range>: an anonymous list of lists that contain |
| 295 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether |
| 296 | a code point is in a range using the L</charinrange> function. If the |
| 297 | argument is not a known character block, C<undef> is returned. |
| 298 | |
| 299 | =cut |
| 300 | |
| 301 | my @BLOCKS; |
| 302 | my %BLOCKS; |
| 303 | |
| 304 | sub _charblocks { |
| 305 | unless (@BLOCKS) { |
| 306 | if (openunicode(\$BLOCKSFH, "Blocks.txt")) { |
| 307 | local $_; |
| 308 | while (<$BLOCKSFH>) { |
| 309 | if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { |
| 310 | my ($lo, $hi) = (hex($1), hex($2)); |
| 311 | my $subrange = [ $lo, $hi, $3 ]; |
| 312 | push @BLOCKS, $subrange; |
| 313 | push @{$BLOCKS{$3}}, $subrange; |
| 314 | } |
| 315 | } |
| 316 | close($BLOCKSFH); |
| 317 | } |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | sub charblock { |
| 322 | my $arg = shift; |
| 323 | |
| 324 | _charblocks() unless @BLOCKS; |
| 325 | |
| 326 | my $code = _getcode($arg); |
| 327 | |
| 328 | if (defined $code) { |
| 329 | _search(\@BLOCKS, 0, $#BLOCKS, $code); |
| 330 | } else { |
| 331 | if (exists $BLOCKS{$arg}) { |
| 332 | return dclone $BLOCKS{$arg}; |
| 333 | } else { |
| 334 | return; |
| 335 | } |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | =head2 charscript |
| 340 | |
| 341 | use Unicode::UCD 'charscript'; |
| 342 | |
| 343 | my $charscript = charscript(0x41); |
| 344 | my $charscript = charscript(1234); |
| 345 | my $charscript = charscript("U+263a"); |
| 346 | |
| 347 | my $range = charscript('Thai'); |
| 348 | |
| 349 | With a B<code point argument> charscript() returns the I<script> the |
| 350 | character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. |
| 351 | |
| 352 | See also L</Blocks versus Scripts>. |
| 353 | |
| 354 | If supplied with an argument that can't be a code point, charscript() tries |
| 355 | to do the opposite and interpret the argument as a character script. The |
| 356 | return value is a I<range>: an anonymous list of lists that contain |
| 357 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether a |
| 358 | code point is in a range using the L</charinrange> function. If the |
| 359 | argument is not a known character script, C<undef> is returned. |
| 360 | |
| 361 | =cut |
| 362 | |
| 363 | my @SCRIPTS; |
| 364 | my %SCRIPTS; |
| 365 | |
| 366 | sub _charscripts { |
| 367 | unless (@SCRIPTS) { |
| 368 | if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { |
| 369 | local $_; |
| 370 | while (<$SCRIPTSFH>) { |
| 371 | if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { |
| 372 | my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); |
| 373 | my $script = lc($3); |
| 374 | $script =~ s/\b(\w)/uc($1)/ge; |
| 375 | my $subrange = [ $lo, $hi, $script ]; |
| 376 | push @SCRIPTS, $subrange; |
| 377 | push @{$SCRIPTS{$script}}, $subrange; |
| 378 | } |
| 379 | } |
| 380 | close($SCRIPTSFH); |
| 381 | @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; |
| 382 | } |
| 383 | } |
| 384 | } |
| 385 | |
| 386 | sub charscript { |
| 387 | my $arg = shift; |
| 388 | |
| 389 | _charscripts() unless @SCRIPTS; |
| 390 | |
| 391 | my $code = _getcode($arg); |
| 392 | |
| 393 | if (defined $code) { |
| 394 | _search(\@SCRIPTS, 0, $#SCRIPTS, $code); |
| 395 | } else { |
| 396 | if (exists $SCRIPTS{$arg}) { |
| 397 | return dclone $SCRIPTS{$arg}; |
| 398 | } else { |
| 399 | return; |
| 400 | } |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | =head2 charblocks |
| 405 | |
| 406 | use Unicode::UCD 'charblocks'; |
| 407 | |
| 408 | my $charblocks = charblocks(); |
| 409 | |
| 410 | charblocks() returns a reference to a hash with the known block names |
| 411 | as the keys, and the code point ranges (see L</charblock>) as the values. |
| 412 | |
| 413 | See also L</Blocks versus Scripts>. |
| 414 | |
| 415 | =cut |
| 416 | |
| 417 | sub charblocks { |
| 418 | _charblocks() unless %BLOCKS; |
| 419 | return dclone \%BLOCKS; |
| 420 | } |
| 421 | |
| 422 | =head2 charscripts |
| 423 | |
| 424 | use Unicode::UCD 'charscripts'; |
| 425 | |
| 426 | my %charscripts = charscripts(); |
| 427 | |
| 428 | charscripts() returns a hash with the known script names as the keys, |
| 429 | and the code point ranges (see L</charscript>) as the values. |
| 430 | |
| 431 | See also L</Blocks versus Scripts>. |
| 432 | |
| 433 | =cut |
| 434 | |
| 435 | sub charscripts { |
| 436 | _charscripts() unless %SCRIPTS; |
| 437 | return dclone \%SCRIPTS; |
| 438 | } |
| 439 | |
| 440 | =head2 Blocks versus Scripts |
| 441 | |
| 442 | The difference between a block and a script is that scripts are closer |
| 443 | to the linguistic notion of a set of characters required to present |
| 444 | languages, while block is more of an artifact of the Unicode character |
| 445 | numbering and separation into blocks of (mostly) 256 characters. |
| 446 | |
| 447 | For example the Latin B<script> is spread over several B<blocks>, such |
| 448 | as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and |
| 449 | C<Latin Extended-B>. On the other hand, the Latin script does not |
| 450 | contain all the characters of the C<Basic Latin> block (also known as |
| 451 | the ASCII): it includes only the letters, and not, for example, the digits |
| 452 | or the punctuation. |
| 453 | |
| 454 | For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt |
| 455 | |
| 456 | For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ |
| 457 | |
| 458 | =head2 Matching Scripts and Blocks |
| 459 | |
| 460 | Scripts are matched with the regular-expression construct |
| 461 | C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), |
| 462 | while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches |
| 463 | any of the 256 code points in the Tibetan block). |
| 464 | |
| 465 | =head2 Code Point Arguments |
| 466 | |
| 467 | A I<code point argument> is either a decimal or a hexadecimal scalar |
| 468 | designating a Unicode character, or C<U+> followed by hexadecimals |
| 469 | designating a Unicode character. In other words, if you want a code |
| 470 | point to be interpreted as a hexadecimal number, you must prefix it |
| 471 | with either C<0x> or C<U+>, because a string like e.g. C<123> will |
| 472 | be interpreted as a decimal code point. Also note that Unicode is |
| 473 | B<not> limited to 16 bits (the number of Unicode characters is |
| 474 | open-ended, in theory unlimited): you may have more than 4 hexdigits. |
| 475 | |
| 476 | =head2 charinrange |
| 477 | |
| 478 | In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you |
| 479 | can also test whether a code point is in the I<range> as returned by |
| 480 | L</charblock> and L</charscript> or as the values of the hash returned |
| 481 | by L</charblocks> and L</charscripts> by using charinrange(): |
| 482 | |
| 483 | use Unicode::UCD qw(charscript charinrange); |
| 484 | |
| 485 | $range = charscript('Hiragana'); |
| 486 | print "looks like hiragana\n" if charinrange($range, $codepoint); |
| 487 | |
| 488 | =cut |
| 489 | |
| 490 | =head2 compexcl |
| 491 | |
| 492 | use Unicode::UCD 'compexcl'; |
| 493 | |
| 494 | my $compexcl = compexcl("09dc"); |
| 495 | |
| 496 | The compexcl() returns the composition exclusion (that is, if the |
| 497 | character should not be produced during a precomposition) of the |
| 498 | character specified by a B<code point argument>. |
| 499 | |
| 500 | If there is a composition exclusion for the character, true is |
| 501 | returned. Otherwise, false is returned. |
| 502 | |
| 503 | =cut |
| 504 | |
| 505 | my %COMPEXCL; |
| 506 | |
| 507 | sub _compexcl { |
| 508 | unless (%COMPEXCL) { |
| 509 | if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { |
| 510 | local $_; |
| 511 | while (<$COMPEXCLFH>) { |
| 512 | if (/^([0-9A-F]+)\s+\#\s+/) { |
| 513 | my $code = hex($1); |
| 514 | $COMPEXCL{$code} = undef; |
| 515 | } |
| 516 | } |
| 517 | close($COMPEXCLFH); |
| 518 | } |
| 519 | } |
| 520 | } |
| 521 | |
| 522 | sub compexcl { |
| 523 | my $arg = shift; |
| 524 | my $code = _getcode($arg); |
| 525 | croak __PACKAGE__, "::compexcl: unknown code '$arg'" |
| 526 | unless defined $code; |
| 527 | |
| 528 | _compexcl() unless %COMPEXCL; |
| 529 | |
| 530 | return exists $COMPEXCL{$code}; |
| 531 | } |
| 532 | |
| 533 | =head2 casefold |
| 534 | |
| 535 | use Unicode::UCD 'casefold'; |
| 536 | |
| 537 | my $casefold = casefold("00DF"); |
| 538 | |
| 539 | The casefold() returns the locale-independent case folding of the |
| 540 | character specified by a B<code point argument>. |
| 541 | |
| 542 | If there is a case folding for that character, a reference to a hash |
| 543 | with the following fields is returned: |
| 544 | |
| 545 | key |
| 546 | |
| 547 | code code point with at least four hexdigits |
| 548 | status "C", "F", "S", or "I" |
| 549 | mapping one or more codes separated by spaces |
| 550 | |
| 551 | The meaning of the I<status> is as follows: |
| 552 | |
| 553 | C common case folding, common mappings shared |
| 554 | by both simple and full mappings |
| 555 | F full case folding, mappings that cause strings |
| 556 | to grow in length. Multiple characters are separated |
| 557 | by spaces |
| 558 | S simple case folding, mappings to single characters |
| 559 | where different from F |
| 560 | I special case for dotted uppercase I and |
| 561 | dotless lowercase i |
| 562 | - If this mapping is included, the result is |
| 563 | case-insensitive, but dotless and dotted I's |
| 564 | are not distinguished |
| 565 | - If this mapping is excluded, the result is not |
| 566 | fully case-insensitive, but dotless and dotted |
| 567 | I's are distinguished |
| 568 | |
| 569 | If there is no case folding for that character, C<undef> is returned. |
| 570 | |
| 571 | For more information about case mappings see |
| 572 | http://www.unicode.org/unicode/reports/tr21/ |
| 573 | |
| 574 | =cut |
| 575 | |
| 576 | my %CASEFOLD; |
| 577 | |
| 578 | sub _casefold { |
| 579 | unless (%CASEFOLD) { |
| 580 | if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { |
| 581 | local $_; |
| 582 | while (<$CASEFOLDFH>) { |
| 583 | if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { |
| 584 | my $code = hex($1); |
| 585 | $CASEFOLD{$code} = { code => $1, |
| 586 | status => $2, |
| 587 | mapping => $3 }; |
| 588 | } |
| 589 | } |
| 590 | close($CASEFOLDFH); |
| 591 | } |
| 592 | } |
| 593 | } |
| 594 | |
| 595 | sub casefold { |
| 596 | my $arg = shift; |
| 597 | my $code = _getcode($arg); |
| 598 | croak __PACKAGE__, "::casefold: unknown code '$arg'" |
| 599 | unless defined $code; |
| 600 | |
| 601 | _casefold() unless %CASEFOLD; |
| 602 | |
| 603 | return $CASEFOLD{$code}; |
| 604 | } |
| 605 | |
| 606 | =head2 casespec |
| 607 | |
| 608 | use Unicode::UCD 'casespec'; |
| 609 | |
| 610 | my $casespec = casespec("FB00"); |
| 611 | |
| 612 | The casespec() returns the potentially locale-dependent case mapping |
| 613 | of the character specified by a B<code point argument>. The mapping |
| 614 | may change the length of the string (which the basic Unicode case |
| 615 | mappings as returned by charinfo() never do). |
| 616 | |
| 617 | If there is a case folding for that character, a reference to a hash |
| 618 | with the following fields is returned: |
| 619 | |
| 620 | key |
| 621 | |
| 622 | code code point with at least four hexdigits |
| 623 | lower lowercase |
| 624 | title titlecase |
| 625 | upper uppercase |
| 626 | condition condition list (may be undef) |
| 627 | |
| 628 | The C<condition> is optional. Where present, it consists of one or |
| 629 | more I<locales> or I<contexts>, separated by spaces (other than as |
| 630 | used to separate elements, spaces are to be ignored). A condition |
| 631 | list overrides the normal behavior if all of the listed conditions are |
| 632 | true. Case distinctions in the condition list are not significant. |
| 633 | Conditions preceded by "NON_" represent the negation of the condition. |
| 634 | |
| 635 | Note that when there are multiple case folding definitions for a |
| 636 | single code point because of different locales, the value returned by |
| 637 | casespec() is a hash reference which has the locales as the keys and |
| 638 | hash references as described above as the values. |
| 639 | |
| 640 | A I<locale> is defined as a 2-letter ISO 3166 country code, possibly |
| 641 | followed by a "_" and a 2-letter ISO language code (possibly followed |
| 642 | by a "_" and a variant code). You can find the lists of those codes, |
| 643 | see L<Locale::Country> and L<Locale::Language>. |
| 644 | |
| 645 | A I<context> is one of the following choices: |
| 646 | |
| 647 | FINAL The letter is not followed by a letter of |
| 648 | general category L (e.g. Ll, Lt, Lu, Lm, or Lo) |
| 649 | MODERN The mapping is only used for modern text |
| 650 | AFTER_i The last base character was "i" (U+0069) |
| 651 | |
| 652 | For more information about case mappings see |
| 653 | http://www.unicode.org/unicode/reports/tr21/ |
| 654 | |
| 655 | =cut |
| 656 | |
| 657 | my %CASESPEC; |
| 658 | |
| 659 | sub _casespec { |
| 660 | unless (%CASESPEC) { |
| 661 | if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { |
| 662 | local $_; |
| 663 | while (<$CASESPECFH>) { |
| 664 | if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { |
| 665 | my ($hexcode, $lower, $title, $upper, $condition) = |
| 666 | ($1, $2, $3, $4, $5); |
| 667 | my $code = hex($hexcode); |
| 668 | if (exists $CASESPEC{$code}) { |
| 669 | if (exists $CASESPEC{$code}->{code}) { |
| 670 | my ($oldlower, |
| 671 | $oldtitle, |
| 672 | $oldupper, |
| 673 | $oldcondition) = |
| 674 | @{$CASESPEC{$code}}{qw(lower |
| 675 | title |
| 676 | upper |
| 677 | condition)}; |
| 678 | if (defined $oldcondition) { |
| 679 | my ($oldlocale) = |
| 680 | ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); |
| 681 | delete $CASESPEC{$code}; |
| 682 | $CASESPEC{$code}->{$oldlocale} = |
| 683 | { code => $hexcode, |
| 684 | lower => $oldlower, |
| 685 | title => $oldtitle, |
| 686 | upper => $oldupper, |
| 687 | condition => $oldcondition }; |
| 688 | } |
| 689 | } |
| 690 | my ($locale) = |
| 691 | ($condition =~ /^([a-z][a-z](?:_\S+)?)/); |
| 692 | $CASESPEC{$code}->{$locale} = |
| 693 | { code => $hexcode, |
| 694 | lower => $lower, |
| 695 | title => $title, |
| 696 | upper => $upper, |
| 697 | condition => $condition }; |
| 698 | } else { |
| 699 | $CASESPEC{$code} = |
| 700 | { code => $hexcode, |
| 701 | lower => $lower, |
| 702 | title => $title, |
| 703 | upper => $upper, |
| 704 | condition => $condition }; |
| 705 | } |
| 706 | } |
| 707 | } |
| 708 | close($CASESPECFH); |
| 709 | } |
| 710 | } |
| 711 | } |
| 712 | |
| 713 | sub casespec { |
| 714 | my $arg = shift; |
| 715 | my $code = _getcode($arg); |
| 716 | croak __PACKAGE__, "::casespec: unknown code '$arg'" |
| 717 | unless defined $code; |
| 718 | |
| 719 | _casespec() unless %CASESPEC; |
| 720 | |
| 721 | return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; |
| 722 | } |
| 723 | |
| 724 | =head2 namedseq() |
| 725 | |
| 726 | use Unicode::UCD 'namedseq'; |
| 727 | |
| 728 | my $namedseq = namedseq("KATAKANA LETTER AINU P"); |
| 729 | my @namedseq = namedseq("KATAKANA LETTER AINU P"); |
| 730 | my %namedseq = namedseq(); |
| 731 | |
| 732 | If used with a single argument in a scalar context, returns the string |
| 733 | consisting of the code points of the named sequence, or C<undef> if no |
| 734 | named sequence by that name exists. If used with a single argument in |
| 735 | a list context, returns list of the code points. If used with no |
| 736 | arguments in a list context, returns a hash with the names of the |
| 737 | named sequences as the keys and the named sequences as strings as |
| 738 | the values. Otherwise, returns C<undef> or empty list depending |
| 739 | on the context. |
| 740 | |
| 741 | (New from Unicode 4.1.0) |
| 742 | |
| 743 | =cut |
| 744 | |
| 745 | my %NAMEDSEQ; |
| 746 | |
| 747 | sub _namedseq { |
| 748 | unless (%NAMEDSEQ) { |
| 749 | if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { |
| 750 | local $_; |
| 751 | while (<$NAMEDSEQFH>) { |
| 752 | if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { |
| 753 | my ($n, $s) = ($1, $2); |
| 754 | my @s = map { chr(hex($_)) } split(' ', $s); |
| 755 | $NAMEDSEQ{$n} = join("", @s); |
| 756 | } |
| 757 | } |
| 758 | close($NAMEDSEQFH); |
| 759 | } |
| 760 | } |
| 761 | } |
| 762 | |
| 763 | sub namedseq { |
| 764 | _namedseq() unless %NAMEDSEQ; |
| 765 | my $wantarray = wantarray(); |
| 766 | if (defined $wantarray) { |
| 767 | if ($wantarray) { |
| 768 | if (@_ == 0) { |
| 769 | return %NAMEDSEQ; |
| 770 | } elsif (@_ == 1) { |
| 771 | my $s = $NAMEDSEQ{ $_[0] }; |
| 772 | return defined $s ? map { ord($_) } split('', $s) : (); |
| 773 | } |
| 774 | } elsif (@_ == 1) { |
| 775 | return $NAMEDSEQ{ $_[0] }; |
| 776 | } |
| 777 | } |
| 778 | return; |
| 779 | } |
| 780 | |
| 781 | =head2 Unicode::UCD::UnicodeVersion |
| 782 | |
| 783 | Unicode::UCD::UnicodeVersion() returns the version of the Unicode |
| 784 | Character Database, in other words, the version of the Unicode |
| 785 | standard the database implements. The version is a string |
| 786 | of numbers delimited by dots (C<'.'>). |
| 787 | |
| 788 | =cut |
| 789 | |
| 790 | my $UNICODEVERSION; |
| 791 | |
| 792 | sub UnicodeVersion { |
| 793 | unless (defined $UNICODEVERSION) { |
| 794 | openunicode(\$VERSIONFH, "version"); |
| 795 | chomp($UNICODEVERSION = <$VERSIONFH>); |
| 796 | close($VERSIONFH); |
| 797 | croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" |
| 798 | unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; |
| 799 | } |
| 800 | return $UNICODEVERSION; |
| 801 | } |
| 802 | |
| 803 | =head2 Implementation Note |
| 804 | |
| 805 | The first use of charinfo() opens a read-only filehandle to the Unicode |
| 806 | Character Database (the database is included in the Perl distribution). |
| 807 | The filehandle is then kept open for further queries. In other words, |
| 808 | if you are wondering where one of your filehandles went, that's where. |
| 809 | |
| 810 | =head1 BUGS |
| 811 | |
| 812 | Does not yet support EBCDIC platforms. |
| 813 | |
| 814 | =head1 AUTHOR |
| 815 | |
| 816 | Jarkko Hietaniemi |
| 817 | |
| 818 | =cut |
| 819 | |
| 820 | 1; |