# [OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / 5.8.0 / sun4-solaris / Unicode / Normalize.pm

package Unicode::Normalize;

# Compile-time guard: refuse to load on EBCDIC platforms, recognised by
# ord("A") being 193, because this module has not been ported to them.
BEGIN {
    die "Unicode::Normalize not ported to EBCDIC\n"
        if ord("A") == 193;
}

use 5.006;
use strict;
use warnings;
use Carp;

# Module version; also handed to bootstrap() below.
our $VERSION = '0.17';
# Package name, used as the prefix of the croak() messages raised by
# normalize() and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;
# NOTE(review): AutoLoader is loaded here but nothing in the visible code
# refers to it -- confirm whether it is still needed.
require AutoLoader;

# Exporter supplies import(); DynaLoader supplies bootstrap().
our @ISA = qw(Exporter DynaLoader);
# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Exported on request only.  Apart from normalize() and check(), none of
# these names is defined in the Perl code of this file; presumably they are
# provided by the library loaded by bootstrap() below -- TODO confirm.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
);
# Import tags:
#   :all        every exportable name
#   :normalize  the default exports plus the normalization building blocks
#   :check      the quick-check functions
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);

# Invoke the bootstrap method inherited from DynaLoader, passing the module
# version.  This must come after @ISA has been set up above.
bootstrap Unicode::Normalize $VERSION;

# True value passed as the second argument of decompose() by NFKD() and
# NFKC() to select compatibility (rather than canonical) decomposition.
use constant COMPAT => 1;

# Normalization Form D: decompose the argument with the canonical mapping,
# then put the result into canonical order.
sub NFD ($) {
    return reorder( decompose( $_[0] ) );
}
# Normalization Form KD: decompose the argument with the compatibility
# mapping (COMPAT flag set), then put the result into canonical order.
sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}
# Normalization Form C: canonical decomposition and reordering, followed
# by composition of the composable pairs.
sub NFC ($) {
    return compose( reorder( decompose( $_[0] ) ) );
}
# Normalization Form KC: compatibility decomposition (COMPAT flag set) and
# reordering, followed by composition of the composable pairs.
sub NFKC ($) {
    return compose( reorder( decompose( $_[0], COMPAT ) ) );
}

# normalize($form_name, $string)
#
# Returns $string converted to the normalization form selected by
# $form_name.  Accepted names are 'D', 'C', 'KD' and 'KC', each optionally
# prefixed with 'NF' ('NFD', 'NFC', 'NFKD', 'NFKC').
#
# Croaks with "<package>::normalize: invalid form name: <name>" for any
# other name.  <name> is the form name exactly as the caller passed it,
# or "(undef)" when it was not defined.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Strip the optional "NF" prefix from a copy, so that the caller's own
    # spelling is still available for the error message below.  An undefined
    # name is mapped to '' here to avoid uninitialized-value warnings; it
    # matches no form and falls through to the croak.
    my $key = defined $form ? $form : '';
    $key =~ s/^NF//;

    return NFD ($str) if $key eq 'D';
    return NFC ($str) if $key eq 'C';
    return NFKD($str) if $key eq 'KD';
    return NFKC($str) if $key eq 'KC';

    my $shown = defined $form ? $form : '(undef)';
    croak($PACKAGE . "::normalize: invalid form name: $shown");
}

# check($form_name, $string)
#
# Quick check: dispatches to checkNFD, checkNFC, checkNFKD or checkNFKC
# according to $form_name and returns that function's result.  Accepted
# names are 'D', 'C', 'KD' and 'KC', each optionally prefixed with 'NF'
# ('NFD', 'NFC', 'NFKD', 'NFKC').
#
# Croaks with "<package>::check: invalid form name: <name>" for any other
# name.  <name> is the form name exactly as the caller passed it, or
# "(undef)" when it was not defined.
sub check($$)
{
    my ($form, $str) = @_;

    # Strip the optional "NF" prefix from a copy, so that the caller's own
    # spelling is still available for the error message below.  An undefined
    # name is mapped to '' here to avoid uninitialized-value warnings; it
    # matches no form and falls through to the croak.
    my $key = defined $form ? $form : '';
    $key =~ s/^NF//;

    return checkNFD ($str) if $key eq 'D';
    return checkNFC ($str) if $key eq 'C';
    return checkNFKD($str) if $key eq 'KD';
    return checkNFKC($str) if $key eq 'KC';

    my $shown = defined $form ? $form : '(undef)';
    croak($PACKAGE . "::check: invalid form name: $shown");
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()

=item C<$reordered_string  = reorder($string)>

Reorders the combining characters and the like into the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string   = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have a NFD/NFKD string,
you can get its NFC/NFKC string, saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = checkNFKD($string)>

returns C<YES> (C<1>) or C<NO> (C<empty string>).

=item C<$result = checkNFKC($string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

=item C<$result = check($form_name, $string)>

returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).

C<$form_name> takes the same values as for C<normalize()>.

=back

B<Note>

In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
The answer C<MAYBE> may be returned in the cases of NFC and NFKC.

A MAYBE-NFC/NFKC string should contain at least
one combining character or the like.
For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.
Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC; i.e.,

    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)

=head2 Character Data

These functions are interfaces to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean whether the character of the specified codepoint
is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean whether the character of the specified codepoint is
a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=back

=cut
Commit	Line	Data
86530b38 AT	1	package Unicode::Normalize;
	2
	3	BEGIN {
	4	if (ord("A") == 193) {
	5	die "Unicode::Normalize not ported to EBCDIC\n";
	6	}
	7	}
	8
	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
	14	our $VERSION = '0.17';
	15	our $PACKAGE = __PACKAGE__;
	16
	17	require Exporter;
	18	require DynaLoader;
	19	require AutoLoader;
	20
	21	our @ISA = qw(Exporter DynaLoader);
	22	our @EXPORT = qw( NFC NFD NFKC NFKD );
	23	our @EXPORT_OK = qw(
	24	normalize decompose reorder compose
	25	checkNFD checkNFKD checkNFC checkNFKC check
	26	getCanon getCompat getComposite getCombinClass
	27	isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
	28	isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
	29	);
	30	our %EXPORT_TAGS = (
	31	all => [ @EXPORT, @EXPORT_OK ],
	32	normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
	33	check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
	34	);
	35
	36	bootstrap Unicode::Normalize $VERSION;
	37
	38	use constant COMPAT => 1;
	39
	40	sub NFD ($) { reorder(decompose($_[0])) }
	41	sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
	42	sub NFC ($) { compose(reorder(decompose($_[0]))) }
	43	sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
	44
	45	sub normalize($$)
	46	{
	47	my $form = shift;
	48	my $str = shift;
	49	$form =~ s/^NF//;
	50	return
	51	$form eq 'D' ? NFD ($str) :
	52	$form eq 'C' ? NFC ($str) :
	53	$form eq 'KD' ? NFKD($str) :
	54	$form eq 'KC' ? NFKC($str) :
	55	croak $PACKAGE."::normalize: invalid form name: $form";
	56	}
	57
	58	sub check($$)
	59	{
	60	my $form = shift;
	61	my $str = shift;
	62	$form =~ s/^NF//;
	63	return
	64	$form eq 'D' ? checkNFD ($str) :
65	$form eq 'C' ? checkNFC ($str) :
66	$form eq 'KD' ? checkNFKD($str) :
67	$form eq 'KC' ? checkNFKC($str) :
68	croak $PACKAGE."::check: invalid form name: $form";
69	}
70
71	1;
72	__END__
73
74	=head1 NAME
75
76	Unicode::Normalize - Unicode Normalization Forms
77
78	=head1 SYNOPSIS
79
80	use Unicode::Normalize;
81
82	$NFD_string = NFD($string); # Normalization Form D
83	$NFC_string = NFC($string); # Normalization Form C
84	$NFKD_string = NFKD($string); # Normalization Form KD
85	$NFKC_string = NFKC($string); # Normalization Form KC
86
87	or
88
89	use Unicode::Normalize 'normalize';
90
91	$NFD_string = normalize('D', $string); # Normalization Form D
92	$NFC_string = normalize('C', $string); # Normalization Form C
93	$NFKD_string = normalize('KD', $string); # Normalization Form KD
94	$NFKC_string = normalize('KC', $string); # Normalization Form KC
95
96	=head1 DESCRIPTION
97
98	=head2 Normalization Forms
99
100	=over 4
101
102	=item C<$NFD_string = NFD($string)>
103
104	returns the Normalization Form D (formed by canonical decomposition).
105
106	=item C<$NFC_string = NFC($string)>
107
108	returns the Normalization Form C (formed by canonical decomposition
109	followed by canonical composition).
110
111	=item C<$NFKD_string = NFKD($string)>
112
113	returns the Normalization Form KD (formed by compatibility decomposition).
114
115	=item C<$NFKC_string = NFKC($string)>
116
117	returns the Normalization Form KC (formed by compatibility decomposition
118	followed by B<canonical> composition).
119
120	=item C<$normalized_string = normalize($form_name, $string)>
121
122	As C<$form_name>, one of the following names must be given.
123
124	'C' or 'NFC' for Normalization Form C
125	'D' or 'NFD' for Normalization Form D
126	'KC' or 'NFKC' for Normalization Form KC
127	'KD' or 'NFKD' for Normalization Form KD
128
129	=back
130
131	=head2 Decomposition and Composition
132
133	=over 4
134
135	=item C<$decomposed_string = decompose($string)>
136
137	=item C<$decomposed_string = decompose($string, $useCompatMapping)>
138
139	Decompose the specified string and returns the result.
140
141	If the second parameter (a boolean) is omitted or false, decomposes it
142	using the Canonical Decomposition Mapping.
143	If true, decomposes it using the Compatibility Decomposition Mapping.
144
145	The string returned is not always in NFD/NFKD.
146	Reordering may be required.
147
148	$NFD_string = reorder(decompose($string)); # eq. to NFD()
149	$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
150
151	=item C<$reordered_string = reorder($string)>
152
153	Reorder the combining characters and the like in the canonical ordering
154	and returns the result.
155
156	E.g., when you have a list of NFD/NFKD strings,
157	you can get the concatenated NFD/NFKD string from them, saying
158
159	$concat_NFD = reorder(join '', @NFD_strings);
160	$concat_NFKD = reorder(join '', @NFKD_strings);
161
162	=item C<$composed_string = compose($string)>
163
164	Returns the string where composable pairs are composed.
165
166	E.g., when you have a NFD/NFKD string,
167	you can get its NFC/NFKC string, saying
168
169	$NFC_string = compose($NFD_string);
170	$NFKC_string = compose($NFKD_string);
171
172	=back
173
174	=head2 Quick Check
175
176	(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
177
178	The following functions check whether the string is in that normalization form.
179
180	The result returned will be:
181
182	YES The string is in that normalization form.
183	NO The string is not in that normalization form.
184	MAYBE Dubious. Maybe yes, maybe no.
185
186	=over 4
187
188	=item C<$result = checkNFD($string)>
189
190	returns C<YES> (C<1>) or C<NO> (C<empty string>).
191
192	=item C<$result = checkNFC($string)>
193
194	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
195
196	=item C<$result = checkNFKD($string)>
197
198	returns C<YES> (C<1>) or C<NO> (C<empty string>).
199
200	=item C<$result = checkNFKC($string)>
201
202	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
203
204	=item C<$result = check($form_name, $string)>
205
206	returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
207
208	C<$form_name> is alike to that for C<normalize()>.
209
210	=back
211
212	B<Note>
213
214	In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
215	The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
216
217	A MAYBE-NFC/NFKC string should contain at least
218	one combining character or the like.
219	For example, C<COMBINING ACUTE ACCENT> has
220	the MAYBE_NFC/MAYBE_NFKC property.
221	Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
222	and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
223	C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
224	(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
225	while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
226
227	If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
228
229	$string eq NFC($string) # more thorough than checkNFC($string)
230	$string eq NFKC($string) # more thorough than checkNFKC($string)
231
232	=head2 Character Data
233
234	These functions are interface of character data used internally.
235	If you want only to get Unicode normalization forms, you don't need
236	call them yourself.
237
238	=over 4
239
240	=item C<$canonical_decomposed = getCanon($codepoint)>
241
242	If the character of the specified codepoint is canonically
243	decomposable (including Hangul Syllables),
244	returns the B<completely decomposed> string canonically equivalent to it.
245
246	If it is not decomposable, returns C<undef>.
247
248	=item C<$compatibility_decomposed = getCompat($codepoint)>
249
250	If the character of the specified codepoint is compatibility
251	decomposable (including Hangul Syllables),
252	returns the B<completely decomposed> string compatibility equivalent to it.
253
254	If it is not decomposable, returns C<undef>.
255
256	=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
257
258	If two characters here and next (as codepoints) are composable
259	(including Hangul Jamo/Syllables and Composition Exclusions),
260	returns the codepoint of the composite.
261
262	If they are not composable, returns C<undef>.
263
264	=item C<$combining_class = getCombinClass($codepoint)>
265
266	Returns the combining class of the character as an integer.
267
268	=item C<$is_exclusion = isExclusion($codepoint)>
269
270	Returns a boolean whether the character of the specified codepoint
271	is a composition exclusion.
272
273	=item C<$is_singleton = isSingleton($codepoint)>
274
275	Returns a boolean whether the character of the specified codepoint is
276	a singleton.
277
278	=item C<$is_non_startar_decomposition = isNonStDecomp($codepoint)>
279
280	Returns a boolean whether the canonical decomposition
281	of the character of the specified codepoint
282	is a Non-Starter Decomposition.
283
284	=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
285
286	Returns a boolean whether the character of the specified codepoint
287	may be composed with the previous one in a certain composition
288	(including Hangul Compositions, but excluding
289	Composition Exclusions and Non-Starter Decompositions).
290
291	=back
292
293	=head2 EXPORT
294
295	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
296
297	C<normalize> and other some functions: on request.
298
299	=head1 AUTHOR
300
301	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
302
303	http://homepage1.nifty.com/nomenclator/perl/
304
305	Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
306
307	This program is free software; you can redistribute it and/or
308	modify it under the same terms as Perl itself.
309
310	=head1 SEE ALSO
311
312	=over 4
313
314	=item http://www.unicode.org/unicode/reports/tr15/
315
316	Unicode Normalization Forms - UAX #15
317
318	=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
319
320	Derived Normalization Properties
321
322	=back
323
324	=cut
325