Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package Unicode::Normalize; |
2 | ||
# Compile-time platform guard: abort loading where "A" has ordinal 193,
# which the module treats as the sign of an EBCDIC platform it has not
# been ported to.
BEGIN {
    die "Unicode::Normalize not ported to EBCDIC\n" if ord("A") == 193;
}
8 | ||
9 | use 5.006; | |
10 | use strict; | |
11 | use warnings; | |
12 | use Carp; | |
13 | ||
14 | our $VERSION = '0.17'; | |
15 | our $PACKAGE = __PACKAGE__; | |
16 | ||
17 | require Exporter; | |
18 | require DynaLoader; | |
19 | require AutoLoader; | |
20 | ||
# Parent classes: Exporter and DynaLoader.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization-form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported only on request, by name or through one of the tags below.
our @EXPORT_OK = (
    # building blocks of the normalization forms
    'normalize', 'decompose', 'reorder', 'compose',
    # quick checks
    'checkNFD', 'checkNFKD', 'checkNFC', 'checkNFKC', 'check',
    # character data accessors
    'getCanon', 'getCompat', 'getComposite', 'getCombinClass',
    # character property predicates
    'isExclusion', 'isSingleton', 'isNonStDecomp', 'isComp2nd', 'isComp_Ex',
    'isNFD_NO', 'isNFC_NO', 'isNFC_MAYBE',
    'isNFKD_NO', 'isNFKC_NO', 'isNFKC_MAYBE',
);

# Export tags:
#   :all        every exportable name
#   :normalize  the default exports plus the step-by-step functions
#   :check      the quick-check functions
our %EXPORT_TAGS = (
    'all'       => [ @EXPORT, @EXPORT_OK ],
    'normalize' => [ @EXPORT, 'normalize', 'decompose', 'reorder', 'compose' ],
    'check'     => [ 'checkNFD', 'checkNFKD', 'checkNFC', 'checkNFKC', 'check' ],
);
35 | ||
36 | bootstrap Unicode::Normalize $VERSION; | |
37 | ||
38 | use constant COMPAT => 1; | |
39 | ||
# The four normalization forms.  Each one decomposes its argument and then
# reorders the result; the composed forms (NFC, NFKC) additionally run
# compose() on the reordered string.  Passing COMPAT as the second argument
# of decompose() selects the compatibility mapping instead of the canonical
# one.

# Normalization Form D.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}

# Normalization Form KD.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}

# Normalization Form C.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

# Normalization Form KC.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
44 | ||
# normalize($form_name, $string)
#
# Returns $string converted to the normalization form named by $form_name.
# Accepted names are 'C', 'D', 'KC' and 'KD', each optionally prefixed with
# 'NF' (so 'NFC' and 'C' select the same form).
#
# Croaks with "invalid form name" for any other name.  The message quotes the
# name exactly as the caller passed it (an undefined name is reported as an
# empty string) instead of the copy that had its 'NF' prefix stripped.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Keep the caller's spelling for the diagnostic; match on a stripped copy.
    my $given = defined $form ? $form : '';
    (my $key = $given) =~ s/^NF//;

    return NFD($str)  if $key eq 'D';
    return NFC($str)  if $key eq 'C';
    return NFKD($str) if $key eq 'KD';
    return NFKC($str) if $key eq 'KC';

    croak $PACKAGE."::normalize: invalid form name: $given";
}
57 | ||
# check($form_name, $string)
#
# Runs the quick check for the normalization form named by $form_name on
# $string and returns whatever the matching checkNFD / checkNFC / checkNFKD /
# checkNFKC function returns.  Accepted names are 'C', 'D', 'KC' and 'KD',
# each optionally prefixed with 'NF'.
#
# Croaks with "invalid form name" for any other name.  The message quotes the
# name exactly as the caller passed it (an undefined name is reported as an
# empty string) instead of the copy that had its 'NF' prefix stripped.
sub check($$)
{
    my ($form, $str) = @_;

    # Keep the caller's spelling for the diagnostic; match on a stripped copy.
    my $given = defined $form ? $form : '';
    (my $key = $given) =~ s/^NF//;

    return checkNFD($str)  if $key eq 'D';
    return checkNFC($str)  if $key eq 'C';
    return checkNFKD($str) if $key eq 'KD';
    return checkNFKC($str) if $key eq 'KC';

    croak $PACKAGE."::check: invalid form name: $given";
}
70 | ||
71 | 1; | |
72 | __END__ | |
73 | ||
74 | =head1 NAME | |
75 | ||
76 | Unicode::Normalize - Unicode Normalization Forms | |
77 | ||
78 | =head1 SYNOPSIS | |
79 | ||
80 | use Unicode::Normalize; | |
81 | ||
82 | $NFD_string = NFD($string); # Normalization Form D | |
83 | $NFC_string = NFC($string); # Normalization Form C | |
84 | $NFKD_string = NFKD($string); # Normalization Form KD | |
85 | $NFKC_string = NFKC($string); # Normalization Form KC | |
86 | ||
87 | or | |
88 | ||
89 | use Unicode::Normalize 'normalize'; | |
90 | ||
91 | $NFD_string = normalize('D', $string); # Normalization Form D | |
92 | $NFC_string = normalize('C', $string); # Normalization Form C | |
93 | $NFKD_string = normalize('KD', $string); # Normalization Form KD | |
94 | $NFKC_string = normalize('KC', $string); # Normalization Form KC | |
95 | ||
96 | =head1 DESCRIPTION | |
97 | ||
98 | =head2 Normalization Forms | |
99 | ||
100 | =over 4 | |
101 | ||
102 | =item C<$NFD_string = NFD($string)> | |
103 | ||
104 | returns the Normalization Form D (formed by canonical decomposition). | |
105 | ||
106 | =item C<$NFC_string = NFC($string)> | |
107 | ||
108 | returns the Normalization Form C (formed by canonical decomposition | |
109 | followed by canonical composition). | |
110 | ||
111 | =item C<$NFKD_string = NFKD($string)> | |
112 | ||
113 | returns the Normalization Form KD (formed by compatibility decomposition). | |
114 | ||
115 | =item C<$NFKC_string = NFKC($string)> | |
116 | ||
117 | returns the Normalization Form KC (formed by compatibility decomposition | |
118 | followed by B<canonical> composition). | |
119 | ||
120 | =item C<$normalized_string = normalize($form_name, $string)> | |
121 | ||
122 | As C<$form_name>, one of the following names must be given. | |
123 | ||
124 | 'C' or 'NFC' for Normalization Form C | |
125 | 'D' or 'NFD' for Normalization Form D | |
126 | 'KC' or 'NFKC' for Normalization Form KC | |
127 | 'KD' or 'NFKD' for Normalization Form KD | |
128 | ||
129 | =back | |
130 | ||
131 | =head2 Decomposition and Composition | |
132 | ||
133 | =over 4 | |
134 | ||
135 | =item C<$decomposed_string = decompose($string)> | |
136 | ||
137 | =item C<$decomposed_string = decompose($string, $useCompatMapping)> | |
138 | ||
139 | Decomposes the specified string and returns the result. | |
140 | ||
141 | If the second parameter (a boolean) is omitted or false, decomposes it | |
142 | using the Canonical Decomposition Mapping. | |
143 | If true, decomposes it using the Compatibility Decomposition Mapping. | |
144 | ||
145 | The string returned is not always in NFD/NFKD. | |
146 | Reordering may be required. | |
147 | ||
148 | $NFD_string = reorder(decompose($string)); # eq. to NFD() | |
149 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() | |
150 | ||
151 | =item C<$reordered_string = reorder($string)> | |
152 | ||
153 | Reorders the combining characters and the like into the canonical ordering | |
154 | and returns the result. | |
155 | ||
156 | E.g., when you have a list of NFD/NFKD strings, | |
157 | you can get the concatenated NFD/NFKD string from them, saying | |
158 | ||
159 | $concat_NFD = reorder(join '', @NFD_strings); | |
160 | $concat_NFKD = reorder(join '', @NFKD_strings); | |
161 | ||
162 | =item C<$composed_string = compose($string)> | |
163 | ||
164 | Returns the string where composable pairs are composed. | |
165 | ||
166 | E.g., when you have a NFD/NFKD string, | |
167 | you can get its NFC/NFKC string, saying | |
168 | ||
169 | $NFC_string = compose($NFD_string); | |
170 | $NFKC_string = compose($NFKD_string); | |
171 | ||
172 | =back | |
173 | ||
174 | =head2 Quick Check | |
175 | ||
176 | (see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>) | |
177 | ||
178 | The following functions check whether the string is in that normalization form. | |
179 | ||
180 | The result returned will be: | |
181 | ||
182 | YES The string is in that normalization form. | |
183 | NO The string is not in that normalization form. | |
184 | MAYBE Dubious. Maybe yes, maybe no. | |
185 | ||
186 | =over 4 | |
187 | ||
188 | =item C<$result = checkNFD($string)> | |
189 | ||
190 | returns C<YES> (C<1>) or C<NO> (C<empty string>). | |
191 | ||
192 | =item C<$result = checkNFC($string)> | |
193 | ||
194 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). | |
195 | ||
196 | =item C<$result = checkNFKD($string)> | |
197 | ||
198 | returns C<YES> (C<1>) or C<NO> (C<empty string>). | |
199 | ||
200 | =item C<$result = checkNFKC($string)> | |
201 | ||
202 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). | |
203 | ||
204 | =item C<$result = check($form_name, $string)> | |
205 | ||
206 | returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>). | |
207 | ||
208 | C<$form_name> takes the same values as for C<normalize()>. | |
209 | ||
210 | =back | |
211 | ||
212 | B<Note> | |
213 | ||
214 | In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>. | |
215 | The answer C<MAYBE> may be returned in the cases of NFC and NFKC. | |
216 | ||
217 | A MAYBE-NFC/NFKC string should contain at least | |
218 | one combining character or the like. | |
219 | For example, C<COMBINING ACUTE ACCENT> has | |
220 | the MAYBE_NFC/MAYBE_NFKC property. | |
221 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> | |
222 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. | |
223 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC | |
224 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), | |
225 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. | |
226 | ||
227 | If you want to check exactly, compare the string with its NFC/NFKC; i.e., | |
228 | ||
229 | $string eq NFC($string) # more thorough than checkNFC($string) | |
230 | $string eq NFKC($string) # more thorough than checkNFKC($string) | |
231 | ||
232 | =head2 Character Data | |
233 | ||
234 | These functions are an interface to the character data used internally. | |
235 | If you only want to get Unicode normalization forms, you don't need | |
236 | to call them yourself. | |
237 | ||
238 | =over 4 | |
239 | ||
240 | =item C<$canonical_decomposed = getCanon($codepoint)> | |
241 | ||
242 | If the character of the specified codepoint is canonically | |
243 | decomposable (including Hangul Syllables), | |
244 | returns the B<completely decomposed> string canonically equivalent to it. | |
245 | ||
246 | If it is not decomposable, returns C<undef>. | |
247 | ||
248 | =item C<$compatibility_decomposed = getCompat($codepoint)> | |
249 | ||
250 | If the character of the specified codepoint is compatibility | |
251 | decomposable (including Hangul Syllables), | |
252 | returns the B<completely decomposed> string compatibility equivalent to it. | |
253 | ||
254 | If it is not decomposable, returns C<undef>. | |
255 | ||
256 | =item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)> | |
257 | ||
258 | If two characters here and next (as codepoints) are composable | |
259 | (including Hangul Jamo/Syllables and Composition Exclusions), | |
260 | returns the codepoint of the composite. | |
261 | ||
262 | If they are not composable, returns C<undef>. | |
263 | ||
264 | =item C<$combining_class = getCombinClass($codepoint)> | |
265 | ||
266 | Returns the combining class of the character as an integer. | |
267 | ||
268 | =item C<$is_exclusion = isExclusion($codepoint)> | |
269 | ||
270 | Returns a boolean whether the character of the specified codepoint | |
271 | is a composition exclusion. | |
272 | ||
273 | =item C<$is_singleton = isSingleton($codepoint)> | |
274 | ||
275 | Returns a boolean whether the character of the specified codepoint is | |
276 | a singleton. | |
277 | ||
278 | =item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)> | |
279 | ||
280 | Returns a boolean whether the canonical decomposition | |
281 | of the character of the specified codepoint | |
282 | is a Non-Starter Decomposition. | |
283 | ||
284 | =item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)> | |
285 | ||
286 | Returns a boolean whether the character of the specified codepoint | |
287 | may be composed with the previous one in a certain composition | |
288 | (including Hangul Compositions, but excluding | |
289 | Composition Exclusions and Non-Starter Decompositions). | |
290 | ||
291 | =back | |
292 | ||
293 | =head2 EXPORT | |
294 | ||
295 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. | |
296 | ||
297 | C<normalize> and some other functions: on request. | |
298 | ||
299 | =head1 AUTHOR | |
300 | ||
301 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> | |
302 | ||
303 | http://homepage1.nifty.com/nomenclator/perl/ | |
304 | ||
305 | Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved. | |
306 | ||
307 | This program is free software; you can redistribute it and/or | |
308 | modify it under the same terms as Perl itself. | |
309 | ||
310 | =head1 SEE ALSO | |
311 | ||
312 | =over 4 | |
313 | ||
314 | =item http://www.unicode.org/unicode/reports/tr15/ | |
315 | ||
316 | Unicode Normalization Forms - UAX #15 | |
317 | ||
318 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt | |
319 | ||
320 | Derived Normalization Properties | |
321 | ||
322 | =back | |
323 | ||
324 | =cut | |
325 |