Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / 5.8.0 / sun4-solaris / Unicode / Normalize.pm
CommitLineData
86530b38
AT
1package Unicode::Normalize;
2
BEGIN {
    # Refuse to load at compile time on platforms where "A" has code 193,
    # which this module treats as the sign of an EBCDIC character set;
    # the module has not been ported to EBCDIC.
    die "Unicode::Normalize not ported to EBCDIC\n" if ord("A") == 193;
}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
14our $VERSION = '0.17';
15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
19require AutoLoader;
20
# Exporter supplies import(); DynaLoader supplies the bootstrap() method
# used to load the compiled (XS) part of the module.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization-form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported only on request, grouped by purpose:
# generic/normalization steps, quick checks, character data accessors,
# character property predicates, and quick-check property predicates.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
);

# Import tags:
#   :all        everything in @EXPORT and @EXPORT_OK
#   :normalize  the default exports plus the individual normalization steps
#   :check      the quick-check functions
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, 'normalize', 'decompose', 'reorder', 'compose' ],
    check     => [ 'checkNFD', 'checkNFKD', 'checkNFC', 'checkNFKC', 'check' ],
);
35
# Load the compiled (XS) half of the module; bootstrap() is inherited
# from DynaLoader through @ISA and receives the module version.
Unicode::Normalize->bootstrap($VERSION);
37
# True value passed as decompose()'s second argument to request the
# compatibility decomposition mapping instead of the canonical one.
use constant COMPAT => 1;

# Normalization Form D: canonical decomposition, then canonical reordering.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}

# Normalization Form KD: compatibility decomposition, then canonical
# reordering.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}

# Normalization Form C: canonical decomposition and reordering, followed
# by composition.
sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

# Normalization Form KC: compatibility decomposition and reordering,
# followed by composition.
sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}
44
# normalize($form_name, $string)
#
# Returns $string converted to the normalization form selected by
# $form_name, which must be one of 'C', 'D', 'KC' or 'KD', optionally
# prefixed with 'NF' ('NFC', 'NFD', 'NFKC', 'NFKD').  The work is delegated
# to NFC(), NFD(), NFKC() or NFKD() respectively.
#
# Croaks (reporting from the caller's point of view) when $form_name is
# undefined or is not one of those names.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Validate a stripped copy so that the diagnostic below can quote the
    # name exactly as the caller supplied it.  An undefined name is mapped
    # to '' and rejected without "uninitialized value" warnings.
    my $given = defined $form ? $form : '';
    (my $key = $given) =~ s/^NF//;

    return
        $key eq 'D'  ? NFD($str)  :
        $key eq 'C'  ? NFC($str)  :
        $key eq 'KD' ? NFKD($str) :
        $key eq 'KC' ? NFKC($str) :
        croak($PACKAGE . "::normalize: invalid form name: $given");
}
57
# check($form_name, $string)
#
# Quick-checks whether $string is already in the normalization form
# selected by $form_name ('C', 'D', 'KC' or 'KD', optionally prefixed with
# 'NF') by delegating to checkNFC(), checkNFD(), checkNFKC() or
# checkNFKD(), and returns that function's result unchanged.
#
# Croaks (reporting from the caller's point of view) when $form_name is
# undefined or is not one of those names.
sub check($$)
{
    my ($form, $str) = @_;

    # Validate a stripped copy so that the diagnostic below can quote the
    # name exactly as the caller supplied it.  An undefined name is mapped
    # to '' and rejected without "uninitialized value" warnings.
    my $given = defined $form ? $form : '';
    (my $key = $given) =~ s/^NF//;

    return
        $key eq 'D'  ? checkNFD($str)  :
        $key eq 'C'  ? checkNFC($str)  :
        $key eq 'KD' ? checkNFKD($str) :
        $key eq 'KC' ? checkNFKC($str) :
        croak($PACKAGE . "::check: invalid form name: $given");
}
70
711;
72__END__
73
74=head1 NAME
75
76Unicode::Normalize - Unicode Normalization Forms
77
78=head1 SYNOPSIS
79
80 use Unicode::Normalize;
81
82 $NFD_string = NFD($string); # Normalization Form D
83 $NFC_string = NFC($string); # Normalization Form C
84 $NFKD_string = NFKD($string); # Normalization Form KD
85 $NFKC_string = NFKC($string); # Normalization Form KC
86
87 or
88
89 use Unicode::Normalize 'normalize';
90
91 $NFD_string = normalize('D', $string); # Normalization Form D
92 $NFC_string = normalize('C', $string); # Normalization Form C
93 $NFKD_string = normalize('KD', $string); # Normalization Form KD
94 $NFKC_string = normalize('KC', $string); # Normalization Form KC
95
96=head1 DESCRIPTION
97
98=head2 Normalization Forms
99
100=over 4
101
102=item C<$NFD_string = NFD($string)>
103
104returns the Normalization Form D (formed by canonical decomposition).
105
106=item C<$NFC_string = NFC($string)>
107
108returns the Normalization Form C (formed by canonical decomposition
109followed by canonical composition).
110
111=item C<$NFKD_string = NFKD($string)>
112
113returns the Normalization Form KD (formed by compatibility decomposition).
114
115=item C<$NFKC_string = NFKC($string)>
116
117returns the Normalization Form KC (formed by compatibility decomposition
118followed by B<canonical> composition).
119
120=item C<$normalized_string = normalize($form_name, $string)>
121
122As C<$form_name>, one of the following names must be given.
123
124 'C' or 'NFC' for Normalization Form C
125 'D' or 'NFD' for Normalization Form D
126 'KC' or 'NFKC' for Normalization Form KC
127 'KD' or 'NFKD' for Normalization Form KD
128
129=back
130
131=head2 Decomposition and Composition
132
133=over 4
134
135=item C<$decomposed_string = decompose($string)>
136
137=item C<$decomposed_string = decompose($string, $useCompatMapping)>
138
Decomposes the specified string and returns the result.
140
141If the second parameter (a boolean) is omitted or false, decomposes it
142using the Canonical Decomposition Mapping.
143If true, decomposes it using the Compatibility Decomposition Mapping.
144
145The string returned is not always in NFD/NFKD.
146Reordering may be required.
147
148 $NFD_string = reorder(decompose($string)); # eq. to NFD()
 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
150
151=item C<$reordered_string = reorder($string)>
152
Reorders the combining characters and the like into canonical order
and returns the result.
155
156E.g., when you have a list of NFD/NFKD strings,
157you can get the concatenated NFD/NFKD string from them, saying
158
159 $concat_NFD = reorder(join '', @NFD_strings);
160 $concat_NFKD = reorder(join '', @NFKD_strings);
161
162=item C<$composed_string = compose($string)>
163
164Returns the string where composable pairs are composed.
165
166E.g., when you have a NFD/NFKD string,
167you can get its NFC/NFKC string, saying
168
169 $NFC_string = compose($NFD_string);
170 $NFKC_string = compose($NFKD_string);
171
172=back
173
174=head2 Quick Check
175
176(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
177
178The following functions check whether the string is in that normalization form.
179
180The result returned will be:
181
182 YES The string is in that normalization form.
183 NO The string is not in that normalization form.
184 MAYBE Dubious. Maybe yes, maybe no.
185
186=over 4
187
188=item C<$result = checkNFD($string)>
189
190returns C<YES> (C<1>) or C<NO> (C<empty string>).
191
192=item C<$result = checkNFC($string)>
193
194returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
195
196=item C<$result = checkNFKD($string)>
197
198returns C<YES> (C<1>) or C<NO> (C<empty string>).
199
200=item C<$result = checkNFKC($string)>
201
202returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
203
204=item C<$result = check($form_name, $string)>
205
206returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
207
C<$form_name> accepts the same names as C<normalize()>.
209
210=back
211
212B<Note>
213
214In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
215The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
216
217A MAYBE-NFC/NFKC string should contain at least
218one combining character or the like.
219For example, C<COMBINING ACUTE ACCENT> has
220the MAYBE_NFC/MAYBE_NFKC property.
221Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
222and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
223C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
224(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
225while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
226
227If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
228
229 $string eq NFC($string) # more thorough than checkNFC($string)
230 $string eq NFKC($string) # more thorough than checkNFKC($string)
231
232=head2 Character Data
233
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
237
238=over 4
239
240=item C<$canonical_decomposed = getCanon($codepoint)>
241
242If the character of the specified codepoint is canonically
243decomposable (including Hangul Syllables),
244returns the B<completely decomposed> string canonically equivalent to it.
245
246If it is not decomposable, returns C<undef>.
247
248=item C<$compatibility_decomposed = getCompat($codepoint)>
249
250If the character of the specified codepoint is compatibility
251decomposable (including Hangul Syllables),
252returns the B<completely decomposed> string compatibility equivalent to it.
253
254If it is not decomposable, returns C<undef>.
255
256=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
257
258If two characters here and next (as codepoints) are composable
259(including Hangul Jamo/Syllables and Composition Exclusions),
260returns the codepoint of the composite.
261
262If they are not composable, returns C<undef>.
263
264=item C<$combining_class = getCombinClass($codepoint)>
265
266Returns the combining class of the character as an integer.
267
268=item C<$is_exclusion = isExclusion($codepoint)>
269
270Returns a boolean whether the character of the specified codepoint
271is a composition exclusion.
272
273=item C<$is_singleton = isSingleton($codepoint)>
274
275Returns a boolean whether the character of the specified codepoint is
276a singleton.
277
=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
279
280Returns a boolean whether the canonical decomposition
281of the character of the specified codepoint
282is a Non-Starter Decomposition.
283
284=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
285
286Returns a boolean whether the character of the specified codepoint
287may be composed with the previous one in a certain composition
288(including Hangul Compositions, but excluding
289Composition Exclusions and Non-Starter Decompositions).
290
291=back
292
293=head2 EXPORT
294
295C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
296
C<normalize> and some other functions: on request.
298
299=head1 AUTHOR
300
301SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
302
303 http://homepage1.nifty.com/nomenclator/perl/
304
305 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
306
307 This program is free software; you can redistribute it and/or
308 modify it under the same terms as Perl itself.
309
310=head1 SEE ALSO
311
312=over 4
313
314=item http://www.unicode.org/unicode/reports/tr15/
315
316Unicode Normalization Forms - UAX #15
317
318=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
319
320Derived Normalization Properties
321
322=back
323
324=cut
325