Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Unicode::Normalize; |
2 | ||
# Compile-time sanity check: abort loading the module when packing the
# code point 0x41 with the 'U' template does not yield the string "A".
BEGIN {
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if "A" ne pack('U', 0x41);
}
8 | ||
9 | use 5.006; | |
10 | use strict; | |
11 | use warnings; | |
12 | use Carp; | |
13 | ||
14 | no warnings 'utf8'; | |
15 | ||
# Module version (also handed to the bootstrap call further down) and the
# package name used as a prefix in error messages.
our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

# Inherit import() from Exporter and bootstrap() from DynaLoader.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported on request only.
our @EXPORT_OK = (
    qw( normalize decompose reorder compose ),
    qw( checkNFD checkNFKD checkNFC checkNFKC check ),
    qw( getCanon getCompat getComposite getCombinClass ),
    qw( isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex ),
    qw( isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE ),
    qw( FCD checkFCD FCC checkFCC composeContiguous ),
    qw( splitOnLastStarter ),
);

# Named groups usable as "use Unicode::Normalize ':tag';".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw( normalize decompose reorder compose ) ],
    check     => [ qw( checkNFD checkNFKD checkNFC checkNFKC check ) ],
    fast      => [ qw( FCD checkFCD FCC checkFCC composeContiguous ) ],
);
39 | ||
40 | ###### | |
41 | ||
# Load the compiled (XS) part of the module. bootstrap() is inherited from
# DynaLoader through @ISA; written as an explicit class-method call rather
# than indirect-object syntax, which is ambiguous to parse and is disabled
# under newer feature bundles.
Unicode::Normalize->bootstrap($VERSION);
43 | ||
44 | ###### | |
45 | ||
# Builds a string from a list of code points.
#   @_      : unsigned integers, one per character
#   returns : the string produced by pack() with the 'U*' template
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}
49 | ||
# Splits a string into its list of code points.
#   $_[0]   : the string to unpack
#   returns : the values produced by unpack() with the 'U*' template
sub unpack_U {
    my $string = shift;
    # The empty pack('U*') result is prepended before unpacking; presumably
    # this makes the operand a character (UTF-8 flagged) string -- confirm
    # against the pack/unpack documentation before changing it.
    return unpack('U*', pack('U*') . $string);
}
53 | ||
54 | ||
55 | ## | |
56 | ## normalization forms | |
57 | ## | |
58 | ||
# True value passed as decompose()'s second argument by NFKD and NFKC.
use constant COMPAT => 1;

# NFD: decompose() followed by reorder().
sub NFD ($) {
    my ($str) = @_;
    return reorder(decompose($str));
}

# NFKD: decompose() with the COMPAT flag, followed by reorder().
sub NFKD ($) {
    my ($str) = @_;
    return reorder(decompose($str, COMPAT));
}

# NFC: the NFD pipeline followed by compose().
sub NFC ($) {
    my ($str) = @_;
    return compose(reorder(decompose($str)));
}

# NFKC: the NFKD pipeline followed by compose().
sub NFKC ($) {
    my ($str) = @_;
    return compose(reorder(decompose($str, COMPAT)));
}
65 | ||
# FCD: returns the argument untouched when checkFCD() accepts it,
# otherwise returns NFD() of the argument.
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}
# FCC: decompose() and reorder() followed by composeContiguous().
sub FCC ($) {
    my ($str) = @_;
    return composeContiguous(reorder(decompose($str)));
}
71 | ||
# Dispatch table consulted by normalize(): form name => normalizing sub.
our %formNorm = (
    # full names
    NFC  => \&NFC,
    NFD  => \&NFD,
    NFKC => \&NFKC,
    NFKD => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
    # short aliases for the four NF* forms
    C    => \&NFC,
    D    => \&NFD,
    KC   => \&NFKC,
    KD   => \&NFKD,
);
79 | ||
# Normalizes a string into the form selected by name.
#   $form   : a key of %formNorm ('C', 'NFC', 'D', ..., 'FCD', 'FCC')
#   $str    : the string to normalize
#   returns : whatever the selected normalizing sub returns
#   croaks  : when $form is not a key of %formNorm
sub normalize($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE . "::normalize: invalid form name: $form")
	unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
88 | ||
89 | ||
90 | ## | |
91 | ## quick check | |
92 | ## | |
93 | ||
# Dispatch table consulted by check(): form name => quick-check sub.
our %formCheck = (
    # full names
    NFC  => \&checkNFC,
    NFD  => \&checkNFD,
    NFKC => \&checkNFKC,
    NFKD => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
    # short aliases for the four NF* forms
    C    => \&checkNFC,
    D    => \&checkNFD,
    KC   => \&checkNFKC,
    KD   => \&checkNFKD,
);
101 | ||
# Runs the quick check for the form selected by name.
#   $form   : a key of %formCheck ('C', 'NFC', 'D', ..., 'FCD', 'FCC')
#   $str    : the string to examine
#   returns : whatever the selected check sub returns
#   croaks  : when $form is not a key of %formCheck
sub check($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE . "::check: invalid form name: $form")
	unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
110 | ||
111 | 1; | |
112 | __END__ | |
113 | ||
114 | =head1 NAME | |
115 | ||
116 | Unicode::Normalize - Unicode Normalization Forms | |
117 | ||
118 | =head1 SYNOPSIS | |
119 | ||
120 | (1) using function names exported by default: | |
121 | ||
122 | use Unicode::Normalize; | |
123 | ||
124 | $NFD_string = NFD($string); # Normalization Form D | |
125 | $NFC_string = NFC($string); # Normalization Form C | |
126 | $NFKD_string = NFKD($string); # Normalization Form KD | |
127 | $NFKC_string = NFKC($string); # Normalization Form KC | |
128 | ||
129 | (2) using function names exported on request: | |
130 | ||
131 | use Unicode::Normalize 'normalize'; | |
132 | ||
133 | $NFD_string = normalize('D', $string); # Normalization Form D | |
134 | $NFC_string = normalize('C', $string); # Normalization Form C | |
135 | $NFKD_string = normalize('KD', $string); # Normalization Form KD | |
136 | $NFKC_string = normalize('KC', $string); # Normalization Form KC | |
137 | ||
138 | =head1 DESCRIPTION | |
139 | ||
140 | Parameters: | |
141 | ||
142 | C<$string> is used as a string under character semantics | |
143 | (see F<perlunicode>). | |
144 | ||
145 | C<$codepoint> should be an unsigned integer | |
146 | representing a Unicode code point. | |
147 | ||
148 | Note: Between XSUB and pure Perl, there is an incompatibility | |
149 | about the interpretation of C<$codepoint> as a decimal number. | |
150 | XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not. | |
151 | Do not use a floating-point value or a negative sign in C<$codepoint>. | |
152 | ||
153 | =head2 Normalization Forms | |
154 | ||
155 | =over 4 | |
156 | ||
157 | =item C<$NFD_string = NFD($string)> | |
158 | ||
159 | returns the Normalization Form D (formed by canonical decomposition). | |
160 | ||
161 | =item C<$NFC_string = NFC($string)> | |
162 | ||
163 | returns the Normalization Form C (formed by canonical decomposition | |
164 | followed by canonical composition). | |
165 | ||
166 | =item C<$NFKD_string = NFKD($string)> | |
167 | ||
168 | returns the Normalization Form KD (formed by compatibility decomposition). | |
169 | ||
170 | =item C<$NFKC_string = NFKC($string)> | |
171 | ||
172 | returns the Normalization Form KC (formed by compatibility decomposition | |
173 | followed by B<canonical> composition). | |
174 | ||
175 | =item C<$FCD_string = FCD($string)> | |
176 | ||
177 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5), | |
178 | returns it without modification; otherwise returns an FCD string. | |
179 | ||
180 | Note: FCD is not always unique; several distinct forms may be equivalent | |
181 | to each other. C<FCD()> will return one of these equivalent forms. | |
182 | ||
183 | =item C<$FCC_string = FCC($string)> | |
184 | ||
185 | returns the FCC form ("Fast C Contiguous"; cf. UTN #5). | |
186 | ||
187 | Note: FCC is unique, as are the four normalization forms (NF*). | |
188 | ||
189 | =item C<$normalized_string = normalize($form_name, $string)> | |
190 | ||
191 | As C<$form_name>, one of the following names must be given. | |
192 | ||
193 | 'C' or 'NFC' for Normalization Form C (UAX #15) | |
194 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
195 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
196 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
197 | ||
198 | 'FCD' for "Fast C or D" Form (UTN #5) | |
199 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
200 | ||
201 | =back | |
202 | ||
203 | =head2 Decomposition and Composition | |
204 | ||
205 | =over 4 | |
206 | ||
207 | =item C<$decomposed_string = decompose($string)> | |
208 | ||
209 | =item C<$decomposed_string = decompose($string, $useCompatMapping)> | |
210 | ||
211 | Decomposes the specified string and returns the result. | |
212 | ||
213 | If the second parameter (a boolean) is omitted or false, decomposes it | |
214 | using the Canonical Decomposition Mapping. | |
215 | If true, decomposes it using the Compatibility Decomposition Mapping. | |
216 | ||
217 | The string returned is not always in NFD/NFKD. | |
218 | Reordering may be required. | |
219 | ||
220 | $NFD_string = reorder(decompose($string)); # eq. to NFD() | |
221 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() | |
222 | ||
223 | =item C<$reordered_string = reorder($string)> | |
224 | ||
225 | Reorders the combining characters and the like in the canonical ordering | |
226 | and returns the result. | |
227 | ||
228 | E.g., when you have a list of NFD/NFKD strings, | |
229 | you can get the concatenated NFD/NFKD string from them, saying | |
230 | ||
231 | $concat_NFD = reorder(join '', @NFD_strings); | |
232 | $concat_NFKD = reorder(join '', @NFKD_strings); | |
233 | ||
234 | =item C<$composed_string = compose($string)> | |
235 | ||
236 | Returns the string where composable pairs are composed. | |
237 | ||
238 | E.g., when you have a NFD/NFKD string, | |
239 | you can get its NFC/NFKC string, saying | |
240 | ||
241 | $NFC_string = compose($NFD_string); | |
242 | $NFKC_string = compose($NFKD_string); | |
243 | ||
244 | =back | |
245 | ||
246 | =head2 Quick Check | |
247 | ||
248 | (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) | |
249 | ||
250 | The following functions check whether the string is in that normalization form. | |
251 | ||
252 | The result returned will be: | |
253 | ||
254 | YES The string is in that normalization form. | |
255 | NO The string is not in that normalization form. | |
256 | MAYBE Dubious. Maybe yes, maybe no. | |
257 | ||
258 | =over 4 | |
259 | ||
260 | =item C<$result = checkNFD($string)> | |
261 | ||
262 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. | |
263 | ||
264 | =item C<$result = checkNFC($string)> | |
265 | ||
266 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; | |
267 | C<undef> if C<MAYBE>. | |
268 | ||
269 | =item C<$result = checkNFKD($string)> | |
270 | ||
271 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. | |
272 | ||
273 | =item C<$result = checkNFKC($string)> | |
274 | ||
275 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; | |
276 | C<undef> if C<MAYBE>. | |
277 | ||
278 | =item C<$result = checkFCD($string)> | |
279 | ||
280 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. | |
281 | ||
282 | =item C<$result = checkFCC($string)> | |
283 | ||
284 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; | |
285 | C<undef> if C<MAYBE>. | |
286 | ||
287 | If a string is not in FCD, it must not be in FCC. | |
288 | So C<checkFCC($not_FCD_string)> should return C<NO>. | |
289 | ||
290 | =item C<$result = check($form_name, $string)> | |
291 | ||
292 | returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; | |
293 | C<undef> if C<MAYBE>. | |
294 | ||
295 | As C<$form_name>, one of the following names must be given. | |
296 | ||
297 | 'C' or 'NFC' for Normalization Form C (UAX #15) | |
298 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
299 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
300 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
301 | ||
302 | 'FCD' for "Fast C or D" Form (UTN #5) | |
303 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
304 | ||
305 | =back | |
306 | ||
307 | B<Note> | |
308 | ||
309 | In the cases of NFD, NFKD, and FCD, the answer must be | |
310 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned | |
311 | in the cases of NFC, NFKC, and FCC. | |
312 | ||
313 | A C<MAYBE> string should contain at least one combining character | |
314 | or the like. For example, C<COMBINING ACUTE ACCENT> has | |
315 | the MAYBE_NFC/MAYBE_NFKC property. | |
316 | ||
317 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> | |
318 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. | |
319 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC | |
320 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), | |
321 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. | |
322 | ||
323 | If you want to check exactly, compare the string with its NFC/NFKC/FCC. | |
324 | ||
325 | if ($string eq NFC($string)) { | |
326 | # $string is exactly normalized in NFC; | |
327 | } else { | |
328 | # $string is not normalized in NFC; | |
329 | } | |
330 | ||
331 | if ($string eq NFKC($string)) { | |
332 | # $string is exactly normalized in NFKC; | |
333 | } else { | |
334 | # $string is not normalized in NFKC; | |
335 | } | |
336 | ||
337 | =head2 Character Data | |
338 | ||
339 | These functions are an interface to the character data used internally. | |
340 | If you only want to get Unicode normalization forms, you don't need | |
341 | to call them yourself. | |
342 | ||
343 | =over 4 | |
344 | ||
345 | =item C<$canonical_decomposed = getCanon($codepoint)> | |
346 | ||
347 | If the character of the specified codepoint is canonically | |
348 | decomposable (including Hangul Syllables), | |
349 | returns the B<completely decomposed> string canonically equivalent to it. | |
350 | ||
351 | If it is not decomposable, returns C<undef>. | |
352 | ||
353 | =item C<$compatibility_decomposed = getCompat($codepoint)> | |
354 | ||
355 | If the character of the specified codepoint is compatibility | |
356 | decomposable (including Hangul Syllables), | |
357 | returns the B<completely decomposed> string compatibility equivalent to it. | |
358 | ||
359 | If it is not decomposable, returns C<undef>. | |
360 | ||
361 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> | |
362 | ||
363 | If two characters here and next (as codepoints) are composable | |
364 | (including Hangul Jamo/Syllables and Composition Exclusions), | |
365 | returns the codepoint of the composite. | |
366 | ||
367 | If they are not composable, returns C<undef>. | |
368 | ||
369 | =item C<$combining_class = getCombinClass($codepoint)> | |
370 | ||
371 | Returns the combining class of the character as an integer. | |
372 | ||
373 | =item C<$is_exclusion = isExclusion($codepoint)> | |
374 | ||
375 | Returns a boolean indicating whether the character of the specified | |
376 | codepoint is a composition exclusion. | |
377 | ||
378 | =item C<$is_singleton = isSingleton($codepoint)> | |
379 | ||
380 | Returns a boolean indicating whether the character of the specified | |
381 | codepoint is a singleton. | |
382 | ||
383 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> | |
384 | ||
385 | Returns a boolean indicating whether the canonical decomposition | |
386 | of the character of the specified codepoint | |
387 | is a Non-Starter Decomposition. | |
388 | ||
389 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> | |
390 | ||
391 | Returns a boolean indicating whether the character of the specified | |
392 | codepoint may be composed with the previous one in a certain composition | |
393 | (including Hangul Compositions, but excluding | |
394 | Composition Exclusions and Non-Starter Decompositions). | |
395 | ||
396 | =back | |
397 | ||
398 | =head1 EXPORT | |
399 | ||
400 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. | |
401 | ||
402 | C<normalize> and some other functions: on request. | |
403 | ||
404 | =head1 CAVEATS | |
405 | ||
406 | =over 4 | |
407 | ||
408 | =item Perl's version vs. Unicode version | |
409 | ||
410 | Since this module refers to perl core's Unicode database in the directory | |
411 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of | |
412 | normalization implemented by this module depends on your perl's version. | |
413 | ||
414 | perl's version implemented Unicode version | |
415 | 5.6.1 3.0.1 | |
416 | 5.7.2 3.1.0 | |
417 | 5.7.3 3.1.1 (same normalized form as that of 3.1.0) | |
418 | 5.8.0 3.2.0 | |
419 | 5.8.1-5.8.3 4.0.0 | |
420 | 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0) | |
421 | ||
422 | =item Correction of decomposition mapping | |
423 | ||
424 | In older Unicode versions, a small number of characters (all of which are | |
425 | CJK compatibility ideographs as far as they have been found) may have | |
426 | an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). | |
427 | Anyhow, this module will neither refer to F<NormalizationCorrections.txt> | |
428 | nor provide any specific version of normalization. Therefore this module | |
429 | running on an older perl with an older Unicode database may use | |
430 | the erroneous decomposition mapping blindly conforming to the Unicode database. | |
431 | ||
432 | =item Revised definition of canonical composition | |
433 | ||
434 | In Unicode 4.1.0, the definition D2 of canonical composition (which | |
435 | affects NFC and NFKC) has been changed (see Public Review Issue #29 | |
436 | and recent UAX #15). This module has used the newer definition | |
437 | since the version 0.07 (Oct 31, 2001). | |
438 | This module does not support normalization according to the older | |
439 | definition, even if the Unicode version implemented by perl is | |
440 | lower than 4.1.0. | |
441 | ||
442 | =back | |
443 | ||
444 | =head1 AUTHOR | |
445 | ||
446 | SADAHIRO Tomoyuki <SADAHIRO@cpan.org> | |
447 | ||
448 | Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved. | |
449 | ||
450 | This module is free software; you can redistribute it | |
451 | and/or modify it under the same terms as Perl itself. | |
452 | ||
453 | =head1 SEE ALSO | |
454 | ||
455 | =over 4 | |
456 | ||
457 | =item http://www.unicode.org/reports/tr15/ | |
458 | ||
459 | Unicode Normalization Forms - UAX #15 | |
460 | ||
461 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt | |
462 | ||
463 | Derived Normalization Properties | |
464 | ||
465 | =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt | |
466 | ||
467 | Normalization Corrections | |
468 | ||
469 | =item http://www.unicode.org/review/pr-29.html | |
470 | ||
471 | Public Review Issue #29: Normalization Issue | |
472 | ||
473 | =item http://www.unicode.org/notes/tn5/ | |
474 | ||
475 | Canonical Equivalence in Applications - UTN #5 | |
476 | ||
477 | =back | |
478 | ||
479 | =cut |