| 1 | package Unicode::Normalize; |
| 2 | |
BEGIN {
    # Compile-time sanity check: pack('U', 0x41) must produce the string "A".
    # If it does not, refuse to load the module at all.
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if pack('U', 0x41) ne "A";
}
| 8 | |
| 9 | use 5.006; |
| 10 | use strict; |
| 11 | use warnings; |
| 12 | use Carp; |
| 13 | |
| 14 | no warnings 'utf8'; |
| 15 | |
# Module version (also passed to the bootstrap call further down) and the
# package name used as a prefix in error messages.
our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization forms.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported on request only.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    qw(FCD checkFCD FCC checkFCC composeContiguous),
    qw(splitOnLastStarter),
);

# Import tags, e.g. "use Unicode::Normalize qw(:check);".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
| 39 | |
| 40 | ###### |
| 41 | |
# Invoke the bootstrap method inherited through @ISA (DynaLoader), passing the
# module version. Written as a direct method call rather than with
# indirect-object syntax.
Unicode::Normalize->bootstrap($VERSION);
| 43 | |
| 44 | ###### |
| 45 | |
# Build a string from a list of code points using pack's 'U*' template.
# Arguments: zero or more code points (integers).
# Returns:   the packed string (empty when no code points are given).
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}
| 49 | |
# Split a string into its code points using unpack's 'U*' template.
# Argument: one string (the first element of @_).
# Returns:  whatever unpack('U*', ...) yields in the caller's context.
sub unpack_U {
    my $string = shift;

    # pack('U*') with no values yields an empty string; it is prepended to
    # the argument before unpacking.
    # NOTE(review): presumably this forces the argument to be handled as a
    # character (UTF-8) string -- confirm against the pack/unpack docs.
    my $empty_prefix = pack('U*');

    return unpack('U*', $empty_prefix . $string);
}
| 53 | |
| 54 | |
| 55 | ## |
| 56 | ## normalization forms |
| 57 | ## |
| 58 | |
# True value passed as decompose()'s second argument by the NFK* forms below.
use constant COMPAT => 1;

# Normalization Form D: decompose, then reorder.
sub NFD ($) {
    return reorder(
        decompose($_[0])
    );
}

# Normalization Form KD: decompose with the COMPAT flag, then reorder.
sub NFKD ($) {
    return reorder(
        decompose($_[0], COMPAT)
    );
}

# Normalization Form C: decompose, reorder, then compose.
sub NFC ($) {
    return compose(
        reorder(
            decompose($_[0])
        )
    );
}

# Normalization Form KC: decompose with the COMPAT flag, reorder, then compose.
sub NFKC ($) {
    return compose(
        reorder(
            decompose($_[0], COMPAT)
        )
    );
}
| 65 | |
# Return the string unchanged when checkFCD() reports true for it;
# otherwise return NFD() of the string.
sub FCD ($) {
    my ($str) = @_;

    return $str if checkFCD($str);
    return NFD($str);
}
# FCC form: decompose, reorder, then composeContiguous().
sub FCC ($) {
    return composeContiguous(
        reorder(
            decompose($_[0])
        )
    );
}
| 71 | |
# Dispatch table used by normalize(): maps every accepted form name
# (long name and, where one exists, short alias) to the sub implementing it.
our %formNorm = (
    # composed forms
    NFC  => \&NFC,
    C    => \&NFC,
    NFKC => \&NFKC,
    KC   => \&NFKC,

    # decomposed forms
    NFD  => \&NFD,
    D    => \&NFD,
    NFKD => \&NFKD,
    KD   => \&NFKD,

    # "fast" forms (no short aliases)
    FCD  => \&FCD,
    FCC  => \&FCC,
);
| 79 | |
# Normalize a string into the form selected by name.
# Arguments: $form - a key of %formNorm (e.g. 'NFC' or 'C')
#            $str  - the string to normalize
# Returns:   the result of the sub registered for $form in %formNorm.
# Croaks:    when $form is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Reject unknown names first so the dispatch below is unconditional.
    unless (exists $formNorm{$form}) {
        croak $PACKAGE."::normalize: invalid form name: $form";
    }

    return $formNorm{$form}->($str);
}
| 88 | |
| 89 | |
| 90 | ## |
| 91 | ## quick check |
| 92 | ## |
| 93 | |
# Dispatch table used by check(): maps every accepted form name
# (long name and, where one exists, short alias) to its quick-check sub.
our %formCheck = (
    # composed forms
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,

    # decomposed forms
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,

    # "fast" forms (no short aliases)
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
| 101 | |
# Run the quick check for the form selected by name.
# Arguments: $form - a key of %formCheck (e.g. 'NFC' or 'C')
#            $str  - the string to examine
# Returns:   the result of the sub registered for $form in %formCheck.
# Croaks:    when $form is not a key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    # Reject unknown names first so the dispatch below is unconditional.
    unless (exists $formCheck{$form}) {
        croak $PACKAGE."::check: invalid form name: $form";
    }

    return $formCheck{$form}->($str);
}
| 110 | |
| 111 | 1; |
| 112 | __END__ |
| 113 | |
| 114 | =head1 NAME |
| 115 | |
| 116 | Unicode::Normalize - Unicode Normalization Forms |
| 117 | |
| 118 | =head1 SYNOPSIS |
| 119 | |
| 120 | (1) using function names exported by default: |
| 121 | |
| 122 | use Unicode::Normalize; |
| 123 | |
| 124 | $NFD_string = NFD($string); # Normalization Form D |
| 125 | $NFC_string = NFC($string); # Normalization Form C |
| 126 | $NFKD_string = NFKD($string); # Normalization Form KD |
| 127 | $NFKC_string = NFKC($string); # Normalization Form KC |
| 128 | |
| 129 | (2) using function names exported on request: |
| 130 | |
| 131 | use Unicode::Normalize 'normalize'; |
| 132 | |
| 133 | $NFD_string = normalize('D', $string); # Normalization Form D |
| 134 | $NFC_string = normalize('C', $string); # Normalization Form C |
| 135 | $NFKD_string = normalize('KD', $string); # Normalization Form KD |
| 136 | $NFKC_string = normalize('KC', $string); # Normalization Form KC |
| 137 | |
| 138 | =head1 DESCRIPTION |
| 139 | |
| 140 | Parameters: |
| 141 | |
| 142 | C<$string> is used as a string under character semantics |
| 143 | (see F<perlunicode>). |
| 144 | |
| 145 | C<$codepoint> should be an unsigned integer |
| 146 | representing a Unicode code point. |
| 147 | |
| 148 | Note: Between XSUB and pure Perl, there is an incompatibility |
| 149 | about the interpretation of C<$codepoint> as a decimal number. |
| 150 | XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not. |
Do not use a floating-point number or a negative sign in C<$codepoint>.
| 152 | |
| 153 | =head2 Normalization Forms |
| 154 | |
| 155 | =over 4 |
| 156 | |
| 157 | =item C<$NFD_string = NFD($string)> |
| 158 | |
| 159 | returns the Normalization Form D (formed by canonical decomposition). |
| 160 | |
| 161 | =item C<$NFC_string = NFC($string)> |
| 162 | |
| 163 | returns the Normalization Form C (formed by canonical decomposition |
| 164 | followed by canonical composition). |
| 165 | |
| 166 | =item C<$NFKD_string = NFKD($string)> |
| 167 | |
| 168 | returns the Normalization Form KD (formed by compatibility decomposition). |
| 169 | |
| 170 | =item C<$NFKC_string = NFKC($string)> |
| 171 | |
| 172 | returns the Normalization Form KC (formed by compatibility decomposition |
| 173 | followed by B<canonical> composition). |
| 174 | |
| 175 | =item C<$FCD_string = FCD($string)> |
| 176 | |
| 177 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5), |
| 178 | returns it without modification; otherwise returns an FCD string. |
| 179 | |
Note: FCD is not always unique; several distinct forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
| 182 | |
| 183 | =item C<$FCC_string = FCC($string)> |
| 184 | |
| 185 | returns the FCC form ("Fast C Contiguous"; cf. UTN #5). |
| 186 | |
Note: FCC is unique, as are the four normalization forms (NF*).
| 188 | |
| 189 | =item C<$normalized_string = normalize($form_name, $string)> |
| 190 | |
| 191 | As C<$form_name>, one of the following names must be given. |
| 192 | |
| 193 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
| 194 | 'D' or 'NFD' for Normalization Form D (UAX #15) |
| 195 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) |
| 196 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) |
| 197 | |
| 198 | 'FCD' for "Fast C or D" Form (UTN #5) |
| 199 | 'FCC' for "Fast C Contiguous" (UTN #5) |
| 200 | |
| 201 | =back |
| 202 | |
| 203 | =head2 Decomposition and Composition |
| 204 | |
| 205 | =over 4 |
| 206 | |
| 207 | =item C<$decomposed_string = decompose($string)> |
| 208 | |
| 209 | =item C<$decomposed_string = decompose($string, $useCompatMapping)> |
| 210 | |
| 211 | Decomposes the specified string and returns the result. |
| 212 | |
| 213 | If the second parameter (a boolean) is omitted or false, decomposes it |
| 214 | using the Canonical Decomposition Mapping. |
| 215 | If true, decomposes it using the Compatibility Decomposition Mapping. |
| 216 | |
| 217 | The string returned is not always in NFD/NFKD. |
| 218 | Reordering may be required. |
| 219 | |
| 220 | $NFD_string = reorder(decompose($string)); # eq. to NFD() |
| 221 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() |
| 222 | |
| 223 | =item C<$reordered_string = reorder($string)> |
| 224 | |
| 225 | Reorders the combining characters and the like in the canonical ordering |
| 226 | and returns the result. |
| 227 | |
| 228 | E.g., when you have a list of NFD/NFKD strings, |
| 229 | you can get the concatenated NFD/NFKD string from them, saying |
| 230 | |
| 231 | $concat_NFD = reorder(join '', @NFD_strings); |
| 232 | $concat_NFKD = reorder(join '', @NFKD_strings); |
| 233 | |
| 234 | =item C<$composed_string = compose($string)> |
| 235 | |
| 236 | Returns the string where composable pairs are composed. |
| 237 | |
| 238 | E.g., when you have a NFD/NFKD string, |
| 239 | you can get its NFC/NFKC string, saying |
| 240 | |
| 241 | $NFC_string = compose($NFD_string); |
| 242 | $NFKC_string = compose($NFKD_string); |
| 243 | |
| 244 | =back |
| 245 | |
| 246 | =head2 Quick Check |
| 247 | |
| 248 | (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) |
| 249 | |
| 250 | The following functions check whether the string is in that normalization form. |
| 251 | |
| 252 | The result returned will be: |
| 253 | |
| 254 | YES The string is in that normalization form. |
| 255 | NO The string is not in that normalization form. |
| 256 | MAYBE Dubious. Maybe yes, maybe no. |
| 257 | |
| 258 | =over 4 |
| 259 | |
| 260 | =item C<$result = checkNFD($string)> |
| 261 | |
| 262 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
| 263 | |
| 264 | =item C<$result = checkNFC($string)> |
| 265 | |
| 266 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
| 267 | C<undef> if C<MAYBE>. |
| 268 | |
| 269 | =item C<$result = checkNFKD($string)> |
| 270 | |
| 271 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
| 272 | |
| 273 | =item C<$result = checkNFKC($string)> |
| 274 | |
| 275 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
| 276 | C<undef> if C<MAYBE>. |
| 277 | |
| 278 | =item C<$result = checkFCD($string)> |
| 279 | |
| 280 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
| 281 | |
| 282 | =item C<$result = checkFCC($string)> |
| 283 | |
| 284 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
| 285 | C<undef> if C<MAYBE>. |
| 286 | |
| 287 | If a string is not in FCD, it must not be in FCC. |
| 288 | So C<checkFCC($not_FCD_string)> should return C<NO>. |
| 289 | |
| 290 | =item C<$result = check($form_name, $string)> |
| 291 | |
| 292 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
| 293 | C<undef> if C<MAYBE>. |
| 294 | |
| 295 | As C<$form_name>, one of the following names must be given. |
| 296 | |
| 297 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
| 298 | 'D' or 'NFD' for Normalization Form D (UAX #15) |
| 299 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) |
| 300 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) |
| 301 | |
| 302 | 'FCD' for "Fast C or D" Form (UTN #5) |
| 303 | 'FCC' for "Fast C Contiguous" (UTN #5) |
| 304 | |
| 305 | =back |
| 306 | |
| 307 | B<Note> |
| 308 | |
| 309 | In the cases of NFD, NFKD, and FCD, the answer must be |
| 310 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned |
| 311 | in the cases of NFC, NFKC, and FCC. |
| 312 | |
| 313 | A C<MAYBE> string should contain at least one combining character |
| 314 | or the like. For example, C<COMBINING ACUTE ACCENT> has |
| 315 | the MAYBE_NFC/MAYBE_NFKC property. |
| 316 | |
| 317 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> |
| 318 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. |
| 319 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC |
| 320 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), |
| 321 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. |
| 322 | |
| 323 | If you want to check exactly, compare the string with its NFC/NFKC/FCC. |
| 324 | |
| 325 | if ($string eq NFC($string)) { |
| 326 | # $string is exactly normalized in NFC; |
| 327 | } else { |
| 328 | # $string is not normalized in NFC; |
| 329 | } |
| 330 | |
| 331 | if ($string eq NFKC($string)) { |
| 332 | # $string is exactly normalized in NFKC; |
| 333 | } else { |
| 334 | # $string is not normalized in NFKC; |
| 335 | } |
| 336 | |
| 337 | =head2 Character Data |
| 338 | |
These functions are interfaces to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
| 342 | |
| 343 | =over 4 |
| 344 | |
| 345 | =item C<$canonical_decomposed = getCanon($codepoint)> |
| 346 | |
| 347 | If the character of the specified codepoint is canonically |
| 348 | decomposable (including Hangul Syllables), |
| 349 | returns the B<completely decomposed> string canonically equivalent to it. |
| 350 | |
| 351 | If it is not decomposable, returns C<undef>. |
| 352 | |
| 353 | =item C<$compatibility_decomposed = getCompat($codepoint)> |
| 354 | |
| 355 | If the character of the specified codepoint is compatibility |
| 356 | decomposable (including Hangul Syllables), |
| 357 | returns the B<completely decomposed> string compatibility equivalent to it. |
| 358 | |
| 359 | If it is not decomposable, returns C<undef>. |
| 360 | |
| 361 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> |
| 362 | |
| 363 | If two characters here and next (as codepoints) are composable |
| 364 | (including Hangul Jamo/Syllables and Composition Exclusions), |
| 365 | returns the codepoint of the composite. |
| 366 | |
| 367 | If they are not composable, returns C<undef>. |
| 368 | |
| 369 | =item C<$combining_class = getCombinClass($codepoint)> |
| 370 | |
| 371 | Returns the combining class of the character as an integer. |
| 372 | |
| 373 | =item C<$is_exclusion = isExclusion($codepoint)> |
| 374 | |
| 375 | Returns a boolean whether the character of the specified codepoint |
| 376 | is a composition exclusion. |
| 377 | |
| 378 | =item C<$is_singleton = isSingleton($codepoint)> |
| 379 | |
| 380 | Returns a boolean whether the character of the specified codepoint is |
| 381 | a singleton. |
| 382 | |
| 383 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> |
| 384 | |
| 385 | Returns a boolean whether the canonical decomposition |
| 386 | of the character of the specified codepoint |
| 387 | is a Non-Starter Decomposition. |
| 388 | |
| 389 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> |
| 390 | |
| 391 | Returns a boolean whether the character of the specified codepoint |
| 392 | may be composed with the previous one in a certain composition |
| 393 | (including Hangul Compositions, but excluding |
| 394 | Composition Exclusions and Non-Starter Decompositions). |
| 395 | |
| 396 | =back |
| 397 | |
| 398 | =head1 EXPORT |
| 399 | |
| 400 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. |
| 401 | |
C<normalize> and some other functions: on request.
| 403 | |
| 404 | =head1 CAVEATS |
| 405 | |
| 406 | =over 4 |
| 407 | |
| 408 | =item Perl's version vs. Unicode version |
| 409 | |
| 410 | Since this module refers to perl core's Unicode database in the directory |
| 411 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of |
| 412 | normalization implemented by this module depends on your perl's version. |
| 413 | |
| 414 | perl's version implemented Unicode version |
| 415 | 5.6.1 3.0.1 |
| 416 | 5.7.2 3.1.0 |
| 417 | 5.7.3 3.1.1 (same normalized form as that of 3.1.0) |
| 418 | 5.8.0 3.2.0 |
| 419 | 5.8.1-5.8.3 4.0.0 |
| 420 | 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0) |
| 421 | |
| 422 | =item Correction of decomposition mapping |
| 423 | |
| 424 | In older Unicode versions, a small number of characters (all of which are |
| 425 | CJK compatibility ideographs as far as they have been found) may have |
| 426 | an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). |
| 427 | Anyhow, this module will neither refer to F<NormalizationCorrections.txt> |
| 428 | nor provide any specific version of normalization. Therefore this module |
| 429 | running on an older perl with an older Unicode database may use |
| 430 | the erroneous decomposition mapping blindly conforming to the Unicode database. |
| 431 | |
| 432 | =item Revised definition of canonical composition |
| 433 | |
| 434 | In Unicode 4.1.0, the definition D2 of canonical composition (which |
| 435 | affects NFC and NFKC) has been changed (see Public Review Issue #29 |
| 436 | and recent UAX #15). This module has used the newer definition |
| 437 | since the version 0.07 (Oct 31, 2001). |
| 438 | This module does not support normalization according to the older |
| 439 | definition, even if the Unicode version implemented by perl is |
| 440 | lower than 4.1.0. |
| 441 | |
| 442 | =back |
| 443 | |
| 444 | =head1 AUTHOR |
| 445 | |
| 446 | SADAHIRO Tomoyuki <SADAHIRO@cpan.org> |
| 447 | |
| 448 | Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. |
| 449 | |
| 450 | This module is free software; you can redistribute it |
| 451 | and/or modify it under the same terms as Perl itself. |
| 452 | |
| 453 | =head1 SEE ALSO |
| 454 | |
| 455 | =over 4 |
| 456 | |
| 457 | =item http://www.unicode.org/reports/tr15/ |
| 458 | |
| 459 | Unicode Normalization Forms - UAX #15 |
| 460 | |
| 461 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt |
| 462 | |
| 463 | Derived Normalization Properties |
| 464 | |
| 465 | =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt |
| 466 | |
| 467 | Normalization Corrections |
| 468 | |
| 469 | =item http://www.unicode.org/review/pr-29.html |
| 470 | |
| 471 | Public Review Issue #29: Normalization Issue |
| 472 | |
| 473 | =item http://www.unicode.org/notes/tn5/ |
| 474 | |
| 475 | Canonical Equivalence in Applications - UTN #5 |
| 476 | |
| 477 | =back |
| 478 | |
| 479 | =cut |