Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / 5.8.0 / Unicode / Collate.pm
CommitLineData
86530b38
AT
1package Unicode::Collate;
2
3BEGIN {
4 if (ord("A") == 193) {
5 die "Unicode::Collate not ported to EBCDIC\n";
6 }
7}
8
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13use File::Spec;
14
15require Exporter;
16
17our $VERSION = '0.12';
18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
29our $UNICODE_VERSION;
30
31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
36else { # XXX, Perl 5.6.1
37 my($f, $fh);
38 foreach my $d (@INC) {
39 use File::Spec;
40 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
41 if (open($fh, $f)) {
42 $UNICODE_VERSION = '3.0.1';
43 close $fh;
44 last;
45 }
46 }
47}
48
49our $getCombinClass; # coderef for combining class from Unicode::Normalize
50
51use constant Min2 => 0x20; # minimum weight at level 2
52use constant Min3 => 0x02; # minimum weight at level 3
53use constant UNDEFINED => 0xFF80; # special value for undefined CE's
54
55our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
56
57sub UCA_Version { "8.0" }
58
59sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
60
61##
62## constructor
63##
64sub new
65{
66 my $class = shift;
67 my $self = bless { @_ }, $class;
68
69 # alternate lowercased
70 $self->{alternate} =
71 ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});
72
73 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
74 unless $self->{alternate} eq 'blanked'
75 || $self->{alternate} eq 'non-ignorable'
76 || $self->{alternate} eq 'shifted'
77 || $self->{alternate} eq 'shift-trimmed';
78
79 # collation level
80 $self->{level} ||= 4;
81
82 croak "Illegal level lower than 1 (passed $self->{level})."
83 if $self->{level} < 1;
84 croak "A level higher than 4 (passed $self->{level}) is not supported."
85 if 4 < $self->{level};
86
87 # overrideHangul and -CJK
88 # If true: CODEREF used; '': default; undef: derived elements
89 $self->{overrideHangul} = ''
90 if ! exists $self->{overrideHangul};
91 $self->{overrideCJK} = ''
92 if ! exists $self->{overrideCJK};
93
94 # normalization form
95 $self->{normalization} = 'D'
96 if ! exists $self->{normalization};
97 $self->{UNF} = undef;
98
99 if (defined $self->{normalization}) {
100 eval { require Unicode::Normalize };
101 croak "Unicode/Normalize.pm is required to normalize strings: $@"
102 if $@;
103
104 Unicode::Normalize->import();
105 $getCombinClass = \&Unicode::Normalize::getCombinClass
106 if ! $getCombinClass;
107
108 $self->{UNF} =
109 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
110 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
111 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
112 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
113 croak "$PACKAGE unknown normalization form name: "
114 . $self->{normalization};
115 }
116
117 # Open a table file.
118 # If undef is passed explicitly, no file is read.
119 $self->{table} = $KeyFile
120 if ! exists $self->{table};
121 $self->read_table
122 if defined $self->{table};
123
124 if ($self->{entry}) {
125 $self->parseEntry($_) foreach split /\n/, $self->{entry};
126 }
127
128 # backwards
129 $self->{backwards} ||= [ ];
130 $self->{backwards} = [ $self->{backwards} ]
131 if ! ref $self->{backwards};
132
133 # rearrange
134 $self->{rearrange} = $DefaultRearrange
135 if ! exists $self->{rearrange};
136 $self->{rearrange} = []
137 if ! defined $self->{rearrange};
138 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
139 if ! ref $self->{rearrange};
140
141 # keys of $self->{rearrangeHash} are $self->{rearrange}.
142 $self->{rearrangeHash} = undef;
143
144 if (@{ $self->{rearrange} }) {
145 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
146 }
147
148 return $self;
149}
150
151sub read_table {
152 my $self = shift;
153 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
154
155 my $filepath = File::Spec->catfile($Path, $file);
156 open my $fk, "<$filepath"
157 or croak "File does not exist at $filepath";
158
159 while (<$fk>) {
160 next if /^\s*#/;
161 if (/^\s*\@/) {
162 if (/^\@version\s*(\S*)/) {
163 $self->{version} ||= $1;
164 }
165 elsif (/^\@alternate\s+(.*)/) {
166 $self->{alternate} ||= $1;
167 }
168 elsif (/^\@backwards\s+(.*)/) {
169 push @{ $self->{backwards} }, $1;
170 }
171 elsif (/^\@rearrange\s+(.*)/) {
172 push @{ $self->{rearrange} }, _getHexArray($1);
173 }
174 next;
175 }
176 $self->parseEntry($_);
177 }
178 close $fk;
179}
180
181
182##
183## get $line, parse it, and write an entry in $self
184##
185sub parseEntry
186{
187 my $self = shift;
188 my $line = shift;
189 my($name, $ele, @key);
190
191 return if $line !~ /^\s*[0-9A-Fa-f]/;
192
193 # removes comment and gets name
194 $name = $1
195 if $line =~ s/[#%]\s*(.*)//;
196 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
197
198 # gets element
199 my($e, $k) = split /;/, $line;
200 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
201 if ! $k;
202
203 my @e = _getHexArray($e);
204 $ele = pack('U*', @e);
205 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
206
207 # get sort key
208 if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
209 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/)
210 {
211 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
212 }
213 else {
214 my $combining = 1; # primary = 0, secondary != 0;
215
216 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
217 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
218 push @key, $self->altCE($var, _getHexArray($arr));
219 $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
220 }
221 $self->{entries}{$ele} = \@key;
222 $self->{combining}{$ele} = 1 if $combining;
223 }
224 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
225}
226
227
228##
229## arrayref CE = altCE(bool variable?, list[num] weights)
230##
231sub altCE
232{
233 my $self = shift;
234 my $var = shift;
235 my @c = @_;
236
237 $self->{alternate} eq 'blanked' ?
238 $var ? [0,0,0,$c[3]] : \@c :
239 $self->{alternate} eq 'non-ignorable' ?
240 \@c :
241 $self->{alternate} eq 'shifted' ?
242 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
243 $self->{alternate} eq 'shift-trimmed' ?
244 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
245 croak "$PACKAGE unknown alternate name: $self->{alternate}";
246}
247
248##
249## string hex_sortkey = splitCE(string arg)
250##
251sub viewSortKey
252{
253 my $self = shift;
254 my $key = $self->getSortKey(@_);
255 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
256 $view =~ s/ ?0000 ?/|/g;
257 return "[$view]";
258}
259
260
261##
262## list[strings] elements = splitCE(string arg)
263##
264sub splitCE
265{
266 my $self = shift;
267 my $code = $self->{preprocess};
268 my $norm = $self->{UNF};
269 my $ent = $self->{entries};
270 my $max = $self->{maxlength};
271 my $reH = $self->{rearrangeHash};
272
273 my $str = ref $code ? &$code(shift) : shift;
274 $str = &$norm($str) if ref $norm;
275
276 my @src = unpack('U*', $str);
277 my @buf;
278
279 # rearrangement
280 if ($reH) {
281 for (my $i = 0; $i < @src; $i++) {
282 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
283 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
284 $i++;
285 }
286 }
287 }
288
289 for (my $i = 0; $i < @src; $i++) {
290 my $ch;
291 my $u = $src[$i];
292
293 # non-characters
294 next unless defined $u;
295 next if $u < 0 || 0x10FFFF < $u # out of range
296 || (0xD800 <= $u && $u <= 0xDFFF); # unpaired surrogates
297 my $four = $u & 0xFFFF;
298 next if $four == 0xFFFE || $four == 0xFFFF;
299
300 if ($max->{$u}) { # contract
301 for (my $j = $max->{$u}; $j >= 1; $j--) {
302 next unless $i+$j-1 < @src;
303 $ch = pack 'U*', @src[$i .. $i+$j-1];
304 $i += $j-1, last if $ent->{$ch};
305 }
306 } else {
307 $ch = pack('U', $u);
308 }
309
310 # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
311 if ($getCombinClass && defined $ch) {
312 for (my $j = $i+1; $j < @src; $j++) {
313 next unless defined $src[$j];
314 last unless $getCombinClass->( $src[$j] );
315 my $comb = pack 'U', $src[$j];
316 next if ! $ent->{ $ch.$comb };
317 $ch .= $comb;
318 $src[$j] = undef;
319 }
320 }
321 push @buf, $ch;
322 }
323 wantarray ? @buf : \@buf;
324}
325
326
327##
328## list[arrayrefs] weight = getWt(string element)
329##
330sub getWt
331{
332 my $self = shift;
333 my $ch = shift;
334 my $ent = $self->{entries};
335 my $ign = $self->{ignored};
336 my $cjk = $self->{overrideCJK};
337 my $hang = $self->{overrideHangul};
338
339 return if !defined $ch || $ign->{$ch}; # ignored
340 return @{ $ent->{$ch} } if $ent->{$ch};
341 my $u = unpack('U', $ch);
342
343 if (0xAC00 <= $u && $u <= 0xD7A3) { # is_Hangul
344 return $hang
345 ? &$hang($u)
346 : defined $hang
347 ? map({
348 my $v = $_;
349 my $ar = $ent->{pack('U', $v)};
350 $ar ? @$ar : map($self->altCE(0,@$_), _derivCE($v));
351 } _decompHangul($u))
352 : map($self->altCE(0,@$_), _derivCE($u));
353 }
354 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
355 0x4E00 <= $u && $u <= 0x9FA5 ||
356 0x20000 <= $u && $u <= 0x2A6D6) { # is_CJK
357 return $cjk
358 ? &$cjk($u)
359 : defined $cjk && $u <= 0xFFFF
360 ? $self->altCE(0, ($u, 0x20, 0x02, $u))
361 : map($self->altCE(0,@$_), _derivCE($u));
362 }
363 else {
364 return map($self->altCE(0,@$_), _derivCE($u));
365 }
366}
367
368##
369## int = index(string, substring)
370##
371sub index
372{
373 my $self = shift;
374 my $lev = $self->{level};
375 my $comb = $self->{combining};
376 my $str = $self->splitCE(shift);
377 my $sub = $self->splitCE(shift);
378
379 return wantarray ? (0,0) : 0 if ! @$sub;
380 return wantarray ? () : -1 if ! @$str;
381
382 my @subWt = grep _ignorableAtLevel($_,$lev),
383 map $self->getWt($_), @$sub;
384
385 my(@strWt,@strPt);
386 my $count = 0;
387 for (my $i = 0; $i < @$str; $i++) {
388 my $go_ahead = 0;
389
390 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
391 $go_ahead += length $str->[$i];
392
393 # /*XXX*/ still broken.
394 # index("e\x{300}", "e") should be 'no match' at level 2 or higher
395 # as "e\x{300}" is a *single* grapheme cluster and not equal to "e".
396
397 # go ahead as far as we find a combining character;
398 while ($i + 1 < @$str &&
399 (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
400 $i++;
401 $go_ahead += length $str->[$i];
402 next if ! defined $str->[$i];
403 push @tmp,
404 grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
405 }
406
407 push @strWt, @tmp;
408 push @strPt, ($count) x @tmp;
409 $count += $go_ahead;
410
411 while (@strWt >= @subWt) {
412 if (_eqArray(\@strWt, \@subWt, $lev)) {
413 my $pos = $strPt[0];
414 return wantarray ? ($pos, $count-$pos) : $pos;
415 }
416 shift @strWt;
417 shift @strPt;
418 }
419 }
420 return wantarray ? () : -1;
421}
422
423##
424## bool _eqArray(arrayref, arrayref, level)
425##
426sub _eqArray($$$)
427{
428 my $a = shift; # length $a >= length $b;
429 my $b = shift;
430 my $lev = shift;
431 for my $v (0..$lev-1) {
432 for my $c (0..@$b-1){
433 return if $a->[$c][$v] != $b->[$c][$v];
434 }
435 }
436 return 1;
437}
438
439
440##
441## bool _ignorableAtLevel(CE, level)
442##
443sub _ignorableAtLevel($$)
444{
445 my $ce = shift;
446 return unless defined $ce;
447 my $lv = shift;
448 return ! grep { ! $ce->[$_] } 0..$lv-1;
449}
450
451
452##
453## string sortkey = getSortKey(string arg)
454##
455sub getSortKey
456{
457 my $self = shift;
458 my $lev = $self->{level};
459 my $rCE = $self->splitCE(shift); # get an arrayref
460
461 # weight arrays
462 my @buf = grep defined(), map $self->getWt($_), @$rCE;
463
464 # make sort key
465 my @ret = ([],[],[],[]);
466 foreach my $v (0..$lev-1) {
467 foreach my $b (@buf) {
468 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
469 }
470 }
471 foreach (@{ $self->{backwards} }) {
472 my $v = $_ - 1;
473 @{ $ret[$v] } = reverse @{ $ret[$v] };
474 }
475
476 # modification of tertiary weights
477 if ($self->{upper_before_lower}) {
478 foreach (@{ $ret[2] }) {
479 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
480 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
481 elsif ($_ == 0x1C) { $_ += 1 } # square upper
482 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
483 }
484 }
485 if ($self->{katakana_before_hiragana}) {
486 foreach (@{ $ret[2] }) {
487 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
488 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
489 }
490 }
491 join "\0\0", map pack('n*', @$_), @ret;
492}
493
494
495##
496## int compare = cmp(string a, string b)
497##
498sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
499sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
500sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
501sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
502sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
503sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
504sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
505
506##
507## list[strings] sorted = sort(list[strings] arg)
508##
509sub sort {
510 my $obj = shift;
511 return
512 map { $_->[1] }
513 sort{ $a->[0] cmp $b->[0] }
514 map [ $obj->getSortKey($_), $_ ], @_;
515}
516
517##
518## list[arrayrefs] CE = _derivCE(int codepoint)
519##
520sub _derivCE {
521 my $code = shift;
522 my $a = UNDEFINED + ($code >> 15); # ok
523 my $b = ($code & 0x7FFF) | 0x8000; # ok
524# my $a = 0xFFC2 + ($code >> 15); # ng
525# my $b = $code & 0x7FFF | 0x1000; # ng
526 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
527}
528
529##
530## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
531##
532sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
533
534#
535# $code must be in Hangul syllable.
536# Check it before you enter here.
537#
538sub _decompHangul {
539 my $code = shift;
540 my $SIndex = $code - 0xAC00;
541 my $LIndex = int( $SIndex / 588);
542 my $VIndex = int(($SIndex % 588) / 28);
543 my $TIndex = $SIndex % 28;
544 return (
545 0x1100 + $LIndex,
546 0x1161 + $VIndex,
547 $TIndex ? (0x11A7 + $TIndex) : (),
548 );
549}
550
5511;
552__END__
553
554=head1 NAME
555
556Unicode::Collate - Unicode Collation Algorithm
557
558=head1 SYNOPSIS
559
560 use Unicode::Collate;
561
562 #construct
563 $Collator = Unicode::Collate->new(%tailoring);
564
565 #sort
566 @sorted = $Collator->sort(@not_sorted);
567
568 #compare
569 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
570
571=head1 DESCRIPTION
572
573=head2 Constructor and Tailoring
574
575The C<new> method returns a collator object.
576
577 $Collator = Unicode::Collate->new(
578 alternate => $alternate,
579 backwards => $levelNumber, # or \@levelNumbers
580 entry => $element,
581 normalization => $normalization_form,
582 ignoreName => qr/$ignoreName/,
583 ignoreChar => qr/$ignoreChar/,
584 katakana_before_hiragana => $bool,
585 level => $collationLevel,
586 overrideCJK => \&overrideCJK,
587 overrideHangul => \&overrideHangul,
588 preprocess => \&preprocess,
589 rearrange => \@charList,
590 table => $filename,
591 undefName => qr/$undefName/,
592 undefChar => qr/$undefChar/,
593 upper_before_lower => $bool,
594 );
595 # if %tailoring is false (i.e. empty),
596 # $Collator should do the default collation.
597
598=over 4
599
600=item alternate
601
602-- see 3.2.2 Alternate Weighting, UTR #10.
603
604This key allows to alternate weighting for variable collation elements,
605which are marked with an ASTERISK in the table
606(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
607
608 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
609
610These names are case-insensitive.
611By default (if specification is omitted), 'shifted' is adopted.
612
613 'Blanked' Variable elements are ignorable at levels 1 through 3;
614 considered at the 4th level.
615
616 'Non-ignorable' Variable elements are not reset to ignorable.
617
618 'Shifted' Variable elements are ignorable at levels 1 through 3
619 their level 4 weight is replaced by the old level 1 weight.
620 Level 4 weight for Non-Variable elements is 0xFFFF.
621
622 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
623 are trimmed.
624
625=item backwards
626
627-- see 3.1.2 French Accents, UTR #10.
628
629 backwards => $levelNumber or \@levelNumbers
630
631Weights in reverse order; ex. level 2 (diacritic ordering) in French.
632If omitted, forwards at all the levels.
633
634=item entry
635
636-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
637
638Overrides a default order or defines additional collation elements
639
640 entry => <<'ENTRIES', # use the UCA file format
64100E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
6420063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
6430043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
644ENTRIES
645
646=item ignoreName
647
648=item ignoreChar
649
650-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
651
652Makes the entry in the table ignorable.
653If a collation element is ignorable,
654it is ignored as if the element had been deleted from there.
655
656E.g. when 'a' and 'e' are ignorable,
657'element' is equal to 'lament' (or 'lmnt').
658
659=item level
660
661-- see 4.3 Form a sort key for each string, UTR #10.
662
663Set the maximum level.
664Any higher levels than the specified one are ignored.
665
666 Level 1: alphabetic ordering
667 Level 2: diacritic ordering
668 Level 3: case ordering
669 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
670
671 ex.level => 2,
672
673If omitted, the maximum is the 4th.
674
675=item normalization
676
677-- see 4.1 Normalize each input string, UTR #10.
678
679If specified, strings are normalized before preparation of sort keys
680(the normalization is executed after preprocess).
681
682As a form name, one of the following names must be used.
683
684 'C' or 'NFC' for Normalization Form C
685 'D' or 'NFD' for Normalization Form D
686 'KC' or 'NFKC' for Normalization Form KC
687 'KD' or 'NFKD' for Normalization Form KD
688
689If omitted, the string is put into Normalization Form D.
690
691If C<undef> is passed explicitly as the value for this key,
692any normalization is not carried out (this may make tailoring easier
693if any normalization is not desired).
694
695see B<CAVEAT>.
696
697=item overrideCJK
698
699-- see 7.1 Derived Collation Elements, UTR #10.
700
701By default, mapping of CJK Unified Ideographs
702uses the Unicode codepoint order.
703But the mapping of CJK Unified Ideographs may be overrided.
704
705ex. CJK Unified Ideographs in the JIS code point order.
706
707 overrideCJK => sub {
708 my $u = shift; # get a Unicode codepoint
709 my $b = pack('n', $u); # to UTF-16BE
710 my $s = your_unicode_to_sjis_converter($b); # convert
711 my $n = unpack('n', $s); # convert sjis to short
712 [ $n, 0x20, 0x2, $u ]; # return the collation element
713 },
714
715ex. ignores all CJK Unified Ideographs.
716
717 overrideCJK => sub {()}, # CODEREF returning empty list
718
719 # where ->eq("Pe\x{4E00}rl", "Perl") is true
720 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
721
722If C<undef> is passed explicitly as the value for this key,
723weights for CJK Unified Ideographs are treated as undefined.
724But assignment of weight for CJK Unified Ideographs
725in table or L<entry> is still valid.
726
727=item overrideHangul
728
729-- see 7.1 Derived Collation Elements, UTR #10.
730
731By default, Hangul Syllables are decomposed into Hangul Jamo.
732But the mapping of Hangul Syllables may be overrided.
733
734This tag works like L<overrideCJK>, so see there for examples.
735
736If you want to override the mapping of Hangul Syllables,
737the Normalization Forms D and KD are not appropriate
738(they will be decomposed before overriding).
739
740If C<undef> is passed explicitly as the value for this key,
741weight for Hangul Syllables is treated as undefined
742without decomposition into Hangul Jamo.
743But definition of weight for Hangul Syllables
744in table or L<entry> is still valid.
745
746=item preprocess
747
748-- see 5.1 Preprocessing, UTR #10.
749
750If specified, the coderef is used to preprocess
751before the formation of sort keys.
752
753ex. dropping English articles, such as "a" or "the".
754Then, "the pen" is before "a pencil".
755
756 preprocess => sub {
757 my $str = shift;
758 $str =~ s/\b(?:an?|the)\s+//gi;
759 $str;
760 },
761
762=item rearrange
763
764-- see 3.1.3 Rearrangement, UTR #10.
765
766Characters that are not coded in logical order and to be rearranged.
767By default,
768
769 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
770
771If you want to disallow any rearrangement,
772pass C<undef> or C<[]> (a reference to an empty list)
773as the value for this key.
774
775=item table
776
777-- see 3.2 Default Unicode Collation Element Table, UTR #10.
778
779You can use another element table if desired.
780The table file must be in your C<lib/Unicode/Collate> directory.
781
782By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
783
784If C<undef> is passed explicitly as the value for this key,
785no file is read (but you can define collation elements via L<entry>).
786
787A typical way to define a collation element table
788without any file of table:
789
790 $onlyABC = Unicode::Collate->new(
791 table => undef,
792 entry => << 'ENTRIES',
7930061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
7940041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
7950062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
7960042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
7970063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
7980043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
799ENTRIES
800 );
801
802=item undefName
803
804=item undefChar
805
806-- see 6.3.4 Reducing the Repertoire, UTR #10.
807
808Undefines the collation element as if it were unassigned in the table.
809This reduces the size of the table.
810If an unassigned character appears in the string to be collated,
811the sort key is made from its codepoint
812as a single-character collation element,
813as it is greater than any other assigned collation elements
814(in the codepoint order among the unassigned characters).
815But, it'd be better to ignore characters
816unfamiliar to you and maybe never used.
817
818=item katakana_before_hiragana
819
820=item upper_before_lower
821
822-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
823
824By default, lowercase is before uppercase
825and hiragana is before katakana.
826
827If the tag is made true, this is reversed.
828
829B<NOTE>: These tags simplemindedly assume
830any lowercase/uppercase or hiragana/katakana distinctions
831should occur in level 3, and their weights at level 3
832should be same as those mentioned in 7.3.1, UTR #10.
833If you define your collation elements which violates this,
834these tags doesn't work validly.
835
836=back
837
838=head2 Methods for Collation
839
840=over 4
841
842=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
843
844Sorts a list of strings.
845
846=item C<$result = $Collator-E<gt>cmp($a, $b)>
847
848Returns 1 (when C<$a> is greater than C<$b>)
849or 0 (when C<$a> is equal to C<$b>)
850or -1 (when C<$a> is lesser than C<$b>).
851
852=item C<$result = $Collator-E<gt>eq($a, $b)>
853
854=item C<$result = $Collator-E<gt>ne($a, $b)>
855
856=item C<$result = $Collator-E<gt>lt($a, $b)>
857
858=item C<$result = $Collator-E<gt>le($a, $b)>
859
860=item C<$result = $Collator-E<gt>gt($a, $b)>
861
862=item C<$result = $Collator-E<gt>ge($a, $b)>
863
864They works like the same name operators as theirs.
865
866 eq : whether $a is equal to $b.
867 ne : whether $a is not equal to $b.
868 lt : whether $a is lesser than $b.
869 le : whether $a is lesser than $b or equal to $b.
870 gt : whether $a is greater than $b.
871 ge : whether $a is greater than $b or equal to $b.
872
873=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
874
875-- see 4.3 Form a sort key for each string, UTR #10.
876
877Returns a sort key.
878
879You compare the sort keys using a binary comparison
880and get the result of the comparison of the strings using UCA.
881
882 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
883
884 is equivalent to
885
886 $Collator->cmp($a, $b)
887
888=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
889
890Returns a string formalized to display a sort key.
891Weights are enclosed with C<'['> and C<']'>
892and level boundaries are denoted by C<'|'>.
893
894 use Unicode::Collate;
895 my $c = Unicode::Collate->new();
896 print $c->viewSortKey("Perl"),"\n";
897
898 # output:
899 # [09B3 08B1 09CB 094F|0020 0020 0020 0020|0008 0002 0002 0002|FFFF FFFF FFFF FFFF]
900 # Level 1 Level 2 Level 3 Level 4
901
902=item C<$position = $Collator-E<gt>index($string, $substring)>
903
904=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
905
906-- see 6.8 Searching, UTR #10.
907
908If C<$substring> matches a part of C<$string>, returns
909the position of the first occurrence of the matching part in scalar context;
910in list context, returns a two-element list of
911the position and the length of the matching part.
912
913B<Notice> that the length of the matching part may differ from
914the length of C<$substring>.
915
916B<Note> that the position and the length are counted on the string
917after the process of preprocess, normalization, and rearrangement.
918Therefore, in case the specified string is not binary equal to
919the preprocessed/normalized/rearranged string, the position and the length
920may differ form those on the specified string. But it is guaranteed
921that, if matched, it returns a non-negative value as C<$position>.
922
923If C<$substring> does not match any part of C<$string>,
924returns C<-1> in scalar context and
925an empty list in list context.
926
927e.g. you say
928
929 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
930 my $str = "Ich mu\x{00DF} studieren.";
931 my $sub = "m\x{00FC}ss";
932 my $match;
933 if (my($pos,$len) = $Collator->index($str, $sub)) {
934 $match = substr($str, $pos, $len);
935 }
936
937and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
938is primary equal to C<"m>E<252>C<ss">.
939
940=back
941
942=head2 Other Methods
943
944=over 4
945
946=item UCA_Version
947
948Returns the version number of Unicode Technical Standard 10
949this module consults.
950
951=item Base_Unicode_Version
952
953Returns the version number of the Unicode Standard
954this module is based on.
955
956=back
957
958=head2 EXPORT
959
960None by default.
961
962=head2 TODO
963
964Unicode::Collate has not been ported to EBCDIC. The code mostly would
965work just fine but a decision needs to be made: how the module should
966work in EBCDIC? Should the low 256 characters be understood as
967Unicode or as EBCDIC code points? Should one be chosen or should
968there be a way to do either? Or should such translation be left
969outside the module for the user to do, for example by using
970Encode::from_to()?
971(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
972
973=head2 CAVEAT
974
975Use of the C<normalization> parameter requires
976the B<Unicode::Normalize> module.
977
978If you need not it (say, in the case when you need not
979handle any combining characters),
980assign C<normalization =E<gt> undef> explicitly.
981
982-- see 6.5 Avoiding Normalization, UTR #10.
983
984=head2 BUGS
985
986C<index()> is an experimental method and
987its return value may be unreliable.
988The correct implementation for C<index()> must be based
989on Locale-Sensitive Support: Level 3 in UTR #18,
990F<Unicode Regular Expression Guidelines>.
991
992See also 4.2 Locale-Dependent Graphemes in UTR #18.
993
994=head1 AUTHOR
995
996SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
997
998 http://homepage1.nifty.com/nomenclator/perl/
999
1000 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
1001
1002 This library is free software; you can redistribute it
1003 and/or modify it under the same terms as Perl itself.
1004
1005=head1 SEE ALSO
1006
1007=over 4
1008
1009=item http://www.unicode.org/unicode/reports/tr10/
1010
1011Unicode Collation Algorithm - UTR #10
1012
1013=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
1014
1015The Default Unicode Collation Element Table
1016
1017=item http://www.unicode.org/unicode/reports/tr15/
1018
1019Unicode Normalization Forms - UAX #15
1020
1021=item http://www.unicode.org/unicode/reports/tr18
1022
1023Unicode Regular Expression Guidelines - UTR #18
1024
1025=item L<Unicode::Normalize>
1026
1027=back
1028
1029=cut