git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/lib/5.8.0/Unicode/Collate.pm

... / ...

Commit	Line	Data
	1	package Unicode::Collate;
	2
	# Refuse to load on EBCDIC platforms (where "A" is code point 193):
	# the module has not been ported there.
	BEGIN {
	    die "Unicode::Collate not ported to EBCDIC\n"
	        if ord("A") == 193;
	}
	8
	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13	use File::Spec;
	14
	15	require Exporter;
	16
	17	our $VERSION = '0.12';
	18	our $PACKAGE = __PACKAGE__;
	19
	20	our @ISA = qw(Exporter);
	21
	22	our %EXPORT_TAGS = ();
	23	our @EXPORT_OK = ();
	24	our @EXPORT = ();
	25
	26	(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
	27	our $KeyFile = "allkeys.txt";
	28
	# Version of the Unicode Standard supported by the running perl.
	# Stays undefined when neither detection method below succeeds.
	our $UNICODE_VERSION;

	if (eval { require Unicode::UCD; 1 }) {
	    $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
	}
	else { # XXX, Perl 5.6.1
	    # Without Unicode::UCD, look for the marker file "unicode/Unicode.301"
	    # in each @INC directory; finding it means Unicode 3.0.1.
	    foreach my $dir (@INC) {
	        next if ref $dir;    # an @INC hook, not a directory
	        my $marker = File::Spec->catfile($dir, "unicode", "Unicode.301");
	        if (open(my $fh, '<', $marker)) {
	            close $fh;
	            $UNICODE_VERSION = '3.0.1';
	            last;
	        }
	    }
	}
	48
	# Coderef for looking up a combining class; taken from Unicode::Normalize.
	our $getCombinClass;

	# Weight constants.
	use constant Min2      => 0x20;   # minimum weight at level 2
	use constant Min3      => 0x02;   # minimum weight at level 3
	use constant UNDEFINED => 0xFF80; # special value for undefined CE's

	# Code points that are rearranged when the caller gives no 'rearrange' list.
	our $DefaultRearrange = [ (0x0E40 .. 0x0E44), (0x0EC0 .. 0x0EC4) ];

	# Version string reported for the collation algorithm document.
	sub UCA_Version { return "8.0" }
	58
	# Returns $UNICODE_VERSION, or the string 'unknown' when it is false.
	sub Base_Unicode_Version {
	    return $UNICODE_VERSION ? $UNICODE_VERSION : 'unknown';
	}
	60
	##
	## constructor
	##
	## $collator = Unicode::Collate->new(%tailoring)
	##
	## Blesses the key/value pairs of %tailoring into a collator object,
	## fills in defaults for omitted keys, reads the collation element table
	## and any 'entry' lines, and croaks when a setting is not acceptable.
	##
	sub new
	{
	    my $class = shift;
	    my $self  = bless { @_ }, $class;

	    # alternate lowercased; 'shifted' when the key is omitted
	    $self->{alternate} =
	        ! exists $self->{alternate} ? 'shifted' : lc($self->{alternate});

	    croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
	        unless $self->{alternate} eq 'blanked'
	            || $self->{alternate} eq 'non-ignorable'
	            || $self->{alternate} eq 'shifted'
	            || $self->{alternate} eq 'shift-trimmed';

	    # collation level (a false value selects the maximum, 4)
	    $self->{level} ||= 4;

	    croak "Illegal level lower than 1 (passed $self->{level})."
	        if $self->{level} < 1;
	    croak "A level higher than 4 (passed $self->{level}) is not supported."
	        if 4 < $self->{level};

	    # overrideHangul and -CJK
	    # If true: CODEREF used; '': default; undef: derived elements
	    $self->{overrideHangul} = ''
	        if ! exists $self->{overrideHangul};
	    $self->{overrideCJK} = ''
	        if ! exists $self->{overrideCJK};

	    # normalization form ('D' when omitted; an explicit undef disables it)
	    $self->{normalization} = 'D'
	        if ! exists $self->{normalization};
	    $self->{UNF} = undef;

	    if (defined $self->{normalization}) {
	        eval { require Unicode::Normalize };
	        croak "Unicode/Normalize.pm is required to normalize strings: $@"
	            if $@;

	        # Brings NFC/NFD/NFKC/NFKD into this package for the \&NAME
	        # references taken below.
	        Unicode::Normalize->import();
	        $getCombinClass = \&Unicode::Normalize::getCombinClass
	            if ! $getCombinClass;

	        $self->{UNF} =
	            $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
	            $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
	            $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
	            $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
	            croak "$PACKAGE unknown normalization form name: "
	                . $self->{normalization};
	    }

	    # backwards
	    # Turned into an ARRAYREF before the table is read: read_table() pushes
	    # '@backwards' directives onto this list, which dies under strict refs
	    # if the value is still a plain level number.
	    $self->{backwards} ||= [ ];
	    $self->{backwards} = [ $self->{backwards} ]
	        if ! ref $self->{backwards};

	    # Open a table file.
	    # If undef is passed explicitly, no file is read.
	    $self->{table} = $KeyFile
	        if ! exists $self->{table};
	    $self->read_table
	        if defined $self->{table};

	    # extra collation elements, one per line, in the table file format
	    if ($self->{entry}) {
	        $self->parseEntry($_) foreach split /\n/, $self->{entry};
	    }

	    # rearrange
	    $self->{rearrange} = $DefaultRearrange
	        if ! exists $self->{rearrange};
	    $self->{rearrange} = []
	        if ! defined $self->{rearrange};
	    croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
	        if ! ref $self->{rearrange};

	    # keys of $self->{rearrangeHash} are $self->{rearrange}.
	    # It stays undef when there is nothing to rearrange.
	    $self->{rearrangeHash} = undef;

	    if (@{ $self->{rearrange} }) {
	        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
	    }

	    return $self;
	}
	150
	##
	## read_table()
	##
	## Reads the table file named by $self->{table} ($KeyFile when that is
	## undefined or empty) from the directory $Path.  '@version',
	## '@alternate', '@backwards' and '@rearrange' directives are stored in
	## $self; every other non-comment line is handed to parseEntry().
	## Croaks if the file cannot be opened.
	##
	sub read_table {
	    my $self = shift;
	    my $file = defined $self->{table} && $self->{table} ne ''
	        ? $self->{table}
	        : $KeyFile;

	    my $filepath = File::Spec->catfile($Path, $file);
	    open my $fk, '<', $filepath
	        or croak "File does not exist at $filepath: $!";

	    # A lexical line variable keeps the caller's $_ intact.
	    while (my $line = <$fk>) {
	        next if $line =~ /^\s*#/;
	        if ($line =~ /^\s*\@/) {
	            if ($line =~ /^\@version\s*(\S*)/) {
	                $self->{version} ||= $1;
	            }
	            elsif ($line =~ /^\@alternate\s+(.*)/) {
	                $self->{alternate} ||= $1;
	            }
	            elsif ($line =~ /^\@backwards\s+(.*)/) {
	                push @{ $self->{backwards} }, $1;
	            }
	            elsif ($line =~ /^\@rearrange\s+(.*)/) {
	                push @{ $self->{rearrange} }, _getHexArray($1);
	            }
	            next;
	        }
	        $self->parseEntry($line);
	    }
	    close $fk;
	}
	180
	181
	##
	## get $line, parse it, and write an entry in $self
	##
	## A line has the form "<charList> ; <collElement>... # name".
	## Lines that do not start with a hexadecimal digit are skipped.
	## Writes to $self->{entries}, {ignored}, {combining} and {maxlength}.
	## Croaks when the ';' separator or the collation elements are missing.
	##
	sub parseEntry
	{
	    my $self = shift;
	    my $line = shift;
	    my $name = '';    # stays empty when the line carries no comment
	    my @key;

	    return if $line !~ /^\s*[0-9A-Fa-f]/;

	    # removes comment and gets name
	    $name = $1
	        if $line =~ s/[#%]\s*(.*)//;
	    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

	    # gets element
	    my($e, $k) = split /;/, $line;
	    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
	        if ! $k;

	    my @e   = _getHexArray($e);
	    my $ele = pack('U*', @e);
	    return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

	    # get sort key
	    if (defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
	        defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/)
	    {
	        $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
	    }
	    else {
	        my $combining = 1; # primary = 0, secondary != 0;

	        foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
	            my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
	            push @key, $self->altCE($var, _getHexArray($arr));
	            $combining = 0 unless $key[-1][0] == 0 && $key[-1][1] != 0;
	        }
	        $self->{entries}{$ele} = \@key;
	        $self->{combining}{$ele} = 1 if $combining;
	    }

	    # Remember the LONGEST contraction starting with this code point, so
	    # that a shorter one parsed later cannot hide a longer one.
	    if (@e > 1) {
	        my $first   = ord $ele;
	        my $longest = $self->{maxlength}{$first};
	        $self->{maxlength}{$first} = scalar @e
	            if ! $longest || $longest < @e;
	    }
	    return;
	}
	226
	227
	##
	## arrayref CE = altCE(bool variable?, list[num] weights)
	##
	## Applies the alternate weighting selected by $self->{alternate} to one
	## collation element and returns it as an array reference.
	## Croaks on an alternate name it does not know.
	##
	sub altCE
	{
	    my ($self, $var, @weights) = @_;
	    my $mode = $self->{alternate};

	    if ($mode eq 'blanked') {
	        return $var ? [ 0, 0, 0, $weights[3] ] : \@weights;
	    }

	    if ($mode eq 'non-ignorable') {
	        return \@weights;
	    }

	    if ($mode eq 'shifted') {
	        return [ 0, 0, 0, $weights[0] ] if $var;
	        # 4th weight is 0xFFFF unless the first three sum to zero
	        my $fourth = $weights[0] + $weights[1] + $weights[2] ? 0xFFFF : 0;
	        return [ @weights[0..2], $fourth ];
	    }

	    if ($mode eq 'shift-trimmed') {
	        return $var ? [ 0, 0, 0, $weights[0] ] : [ @weights[0..2], 0 ];
	    }

	    croak "$PACKAGE unknown alternate name: $self->{alternate}";
	}
	247
	##
	## string hex_sortkey = viewSortKey(string arg)
	##
	## Returns the sort key of the argument in a displayable form: each
	## 16-bit weight as four hex digits, weights separated by a space,
	## "0000" separators shown as '|', the whole enclosed in brackets.
	##
	sub viewSortKey
	{
	    my $self = shift;
	    my $key  = $self->getSortKey(@_);

	    my @weights = unpack 'n*', $key;
	    my $view    = join " ", map { sprintf "%04X", $_ } @weights;
	    $view =~ s/ ?0000 ?/|/g;

	    return "[$view]";
	}
	259
	260
	##
	## list[strings] elements = splitCE(string arg)
	##
	## Preprocesses and normalizes the argument (when the object has the
	## corresponding coderefs), applies rearrangement, and cuts the result
	## into collation elements: contractions known to the table and base
	## characters joined with following combining characters.
	## Returns the list in list context, an array reference otherwise.
	##
	sub splitCE
	{
	    my $self      = shift;
	    my $prep      = $self->{preprocess};
	    my $normalize = $self->{UNF};
	    my $entries   = $self->{entries};
	    my $maxlen    = $self->{maxlength};
	    my $swap      = $self->{rearrangeHash};

	    my $str = ref $prep ? $prep->(shift) : shift;
	    $str = $normalize->($str) if ref $normalize;

	    my @src = unpack('U*', $str);
	    my @elements;

	    # rearrangement: exchange a listed code point with its successor
	    if ($swap) {
	        my $pos = 0;
	        while ($pos < @src) {
	            if (exists $swap->{ $src[$pos] } && $pos + 1 < @src) {
	                @src[$pos, $pos + 1] = @src[$pos + 1, $pos];
	                $pos++;    # do not look at the swapped-in code point again
	            }
	            $pos++;
	        }
	    }

	    for (my $pos = 0; $pos < @src; $pos++) {
	        my $cp = $src[$pos];

	        # skip slots consumed as combining characters, and non-characters
	        next unless defined $cp;
	        next if $cp < 0 || 0x10FFFF < $cp;           # out of range
	        next if 0xD800 <= $cp && $cp <= 0xDFFF;      # unpaired surrogates
	        my $low16 = $cp & 0xFFFF;
	        next if $low16 == 0xFFFE || $low16 == 0xFFFF;

	        my $elem;
	        if ($maxlen->{$cp}) { # contract: try the longest candidate first
	            for (my $len = $maxlen->{$cp}; $len >= 1; $len--) {
	                next unless $pos + $len - 1 < @src;
	                $elem = pack 'U*', @src[$pos .. $pos + $len - 1];
	                if ($entries->{$elem}) {
	                    $pos += $len - 1;
	                    last;
	                }
	            }
	        }
	        else {
	            $elem = pack('U', $cp);
	        }

	        # with Combining Char (UTS#10, 4.2.1), here requires Unicode::Normalize.
	        if ($getCombinClass && defined $elem) {
	            for (my $next = $pos + 1; $next < @src; $next++) {
	                next unless defined $src[$next];
	                last unless $getCombinClass->( $src[$next] );
	                my $mark = pack 'U', $src[$next];
	                next if ! $entries->{ $elem . $mark };
	                $elem .= $mark;
	                $src[$next] = undef;    # consumed; the outer loop skips it
	            }
	        }
	        push @elements, $elem;
	    }
	    return wantarray ? @elements : \@elements;
	}
	325
	326
	##
	## list[arrayrefs] weight = getWt(string element)
	##
	## Returns the collation elements for one element produced by splitCE():
	## nothing for an undefined or ignored element, the table entry when
	## there is one, otherwise the result of the Hangul/CJK override or the
	## derived collation elements.
	##
	sub getWt
	{
	    my $self    = shift;
	    my $elem    = shift;
	    my $entries = $self->{entries};
	    my $ignored = $self->{ignored};
	    my $cjk     = $self->{overrideCJK};
	    my $hangul  = $self->{overrideHangul};

	    return if !defined $elem || $ignored->{$elem}; # ignored
	    return @{ $entries->{$elem} } if $entries->{$elem};

	    my $cp = unpack('U', $elem);

	    # derived collation elements for a code point, alternate-weighted
	    my $derive = sub { map $self->altCE(0, @$_), _derivCE($_[0]) };

	    if (0xAC00 <= $cp && $cp <= 0xD7A3) { # is_Hangul
	        return $hangul->($cp) if $hangul;           # CODEREF override
	        return $derive->($cp) if ! defined $hangul; # undef: no decomposition

	        # default: weights of the jamo the syllable decomposes into
	        return map {
	            my $jamo = $entries->{ pack('U', $_) };
	            $jamo ? @$jamo : $derive->($_);
	        } _decompHangul($cp);
	    }

	    if (   0x3400  <= $cp && $cp <= 0x4DB5
	        || 0x4E00  <= $cp && $cp <= 0x9FA5
	        || 0x20000 <= $cp && $cp <= 0x2A6D6) { # is_CJK
	        return $cjk->($cp) if $cjk;                 # CODEREF override
	        return $self->altCE(0, ($cp, 0x20, 0x02, $cp))
	            if defined $cjk && $cp <= 0xFFFF;
	        return $derive->($cp);
	    }

	    return $derive->($cp);
	}
	367
	##
	## int = index(string, substring)
	##
	## Searches string for the first part that is collation-equal to
	## substring at the object's level.
	## Scalar context: the position, or -1 when there is no match.
	## List context: (position, length), or the empty list when no match.
	## Positions and lengths count characters of the string as returned by
	## splitCE().
	##
	sub index
	{
	    my $self = shift;
	    my $lev  = $self->{level};
	    my $comb = $self->{combining};
	    my $str  = $self->splitCE(shift);
	    my $sub  = $self->splitCE(shift);

	    return wantarray ? (0,0) : 0 if ! @$sub;
	    return wantarray ?  ()   : -1 if ! @$str;

	    # NB: _ignorableAtLevel() is true for the elements to KEEP.
	    my @subWt = grep _ignorableAtLevel($_,$lev),
	                map $self->getWt($_), @$sub;

	    # A substring made only of ignorable elements has nothing to compare.
	    # Treat it like the empty substring instead of letting the vacuous
	    # comparison below report an undefined or arbitrary position.
	    return wantarray ? (0,0) : 0 if ! @subWt;

	    my(@strWt,@strPt);
	    my $count = 0;
	    for (my $i = 0; $i < @$str; $i++) {
	        my $go_ahead = 0;

	        my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
	        $go_ahead += length $str->[$i];

	        # /*XXX*/ still broken.
	        # index("e\x{300}", "e") should be 'no match' at level 2 or higher
	        # as "e\x{300}" is a single grapheme cluster and not equal to "e".

	        # go ahead as far as we find a combining character;
	        while ($i + 1 < @$str &&
	               (! defined $str->[$i+1] || $comb->{ $str->[$i+1] }) ) {
	            $i++;
	            $go_ahead += length $str->[$i];
	            next if ! defined $str->[$i];
	            push @tmp,
	                grep _ignorableAtLevel($_,$lev), $self->getWt($str->[$i]);
	        }

	        # every weight remembers the position where its cluster starts
	        push @strWt, @tmp;
	        push @strPt, ($count) x @tmp;
	        $count += $go_ahead;

	        # slide the window: compare, then drop the oldest weight
	        while (@strWt >= @subWt) {
	            if (_eqArray(\@strWt, \@subWt, $lev)) {
	                my $pos = $strPt[0];
	                return wantarray ? ($pos, $count-$pos) : $pos;
	            }
	            shift @strWt;
	            shift @strPt;
	        }
	    }
	    return wantarray ? () : -1;
	}
	422
	##
	## bool _eqArray(arrayref, arrayref, level)
	##
	## Compares two lists of collation elements on the first $lev weights.
	## Only as many elements as the second list holds are compared, so the
	## first list must be at least as long.
	## Returns 1 when all compared weights are numerically equal, otherwise
	## an empty list (undef in scalar context).
	##
	## No prototype: the sub is called before this definition is seen, so a
	## prototype could not be applied and only caused a warning.
	##
	sub _eqArray
	{
	    my ($source, $target, $lev) = @_;

	    for my $level (0 .. $lev - 1) {
	        for my $pos (0 .. $#$target) {
	            return if $source->[$pos][$level] != $target->[$pos][$level];
	        }
	    }
	    return 1;
	}
	438
	439
	##
	## bool _ignorableAtLevel(CE, level)
	##
	## NOTE: despite the name, the result is TRUE when none of the first
	## $lv weights of the CE is zero (false), and false as soon as one is.
	## index() uses it as a grep filter, i.e. to keep such elements.
	## Returns an empty list (undef in scalar context) for an undefined CE.
	##
	## No prototype: the sub is called before this definition is seen, so a
	## prototype could not be applied and only caused a warning.
	##
	sub _ignorableAtLevel
	{
	    my ($ce, $lv) = @_;
	    return unless defined $ce;

	    my $zero_weights = grep { ! $ce->[$_] } 0 .. $lv - 1;
	    return ! $zero_weights;
	}
	450
	451
	##
	## string sortkey = getSortKey(string arg)
	##
	## Builds the binary sort key of the argument: for each of the four
	## levels the non-zero weights of all collation elements, packed as
	## 16-bit big-endian numbers, with the levels joined by "\0\0".
	## Levels above $self->{level} stay empty.
	##
	sub getSortKey
	{
	    my $self = shift;
	    my $lev  = $self->{level};
	    my $rCE  = $self->splitCE(shift); # get an arrayref

	    # weight arrays
	    my @ces = grep { defined $_ } map( $self->getWt($_), @$rCE );

	    # make sort key: one list of weights per level
	    my @levels = ([], [], [], []);
	    foreach my $ce (@ces) {
	        foreach my $v (0 .. $lev - 1) {
	            push @{ $levels[$v] }, $ce->[$v] if $ce->[$v];
	        }
	    }

	    # levels listed in 'backwards' are compared from the end
	    foreach my $level_no (@{ $self->{backwards} }) {
	        my $v = $level_no - 1;
	        @{ $levels[$v] } = reverse @{ $levels[$v] };
	    }

	    # modification of tertiary weights
	    if ($self->{upper_before_lower}) {
	        foreach my $w (@{ $levels[2] }) {
	            if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
	            elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
	            elsif ($w == 0x1C)             { $w += 1 } # square upper
	            elsif ($w == 0x1D)             { $w -= 1 } # square lower
	        }
	    }
	    if ($self->{katakana_before_hiragana}) {
	        foreach my $w (@{ $levels[2] }) {
	            if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
	            elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
	        }
	    }

	    return join "\0\0", map { pack 'n*', @$_ } @levels;
	}
	493
	494
	##
	## int compare = cmp(string a, string b)
	##
	## cmp and the boolean variants eq, ne, lt, le, gt, ge apply the Perl
	## string operator of the same name to the sort keys of both arguments.
	##
	sub cmp {
	    my $self = shift;
	    return $self->getSortKey($_[0]) cmp $self->getSortKey($_[1]);
	}
	sub eq {
	    my $self = shift;
	    return $self->getSortKey($_[0]) eq $self->getSortKey($_[1]);
	}
	sub ne {
	    my $self = shift;
	    return $self->getSortKey($_[0]) ne $self->getSortKey($_[1]);
	}
	sub lt {
	    my $self = shift;
	    return $self->getSortKey($_[0]) lt $self->getSortKey($_[1]);
	}
	sub le {
	    my $self = shift;
	    return $self->getSortKey($_[0]) le $self->getSortKey($_[1]);
	}
	sub gt {
	    my $self = shift;
	    return $self->getSortKey($_[0]) gt $self->getSortKey($_[1]);
	}
	sub ge {
	    my $self = shift;
	    return $self->getSortKey($_[0]) ge $self->getSortKey($_[1]);
	}
	505
	##
	## list[strings] sorted = sort(list[strings] arg)
	##
	## Sorts the strings by their sort keys.  Each key is computed once and
	## carried along with its string while sorting.
	##
	sub sort {
	    my $obj = shift;

	    my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @_;
	    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;

	    return map { $_->[1] } @ordered;
	}
	516
	##
	## list[arrayrefs] CE = _derivCE(int codepoint)
	##
	## Returns the two derived collation elements for a code point that has
	## no table entry: [AAAA, 2, 1, code] and [BBBB, 0, 0, code], where
	## AAAA is UNDEFINED plus the bits above bit 14 and BBBB is the low
	## 15 bits with 0x8000 set.
	##
	sub _derivCE {
	    my $code = shift;
	    my $high = UNDEFINED + ($code >> 15);     # ok
	    my $low  = ($code & 0x7FFF) | 0x8000;     # ok
	    # Rejected variants ("ng"): 0xFFC2 + ($code >> 15) for the first
	    # weight and $code & 0x7FFF | 0x1000 for the second.

	    # $low always has bit 0x8000 set, so it can never be false and
	    # both elements are always returned.
	    return ([$high, 2, 1, $code], [$low, 0, 0, $code]);
	}
	528
	##
	## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
	##
	## Returns the numeric value of every run of hexadecimal digits found
	## in the argument, in order of appearance.
	##
	sub _getHexArray {
	    my @digit_runs = $_[0] =~ /([0-9a-fA-F]+)/g;
	    return map { hex $_ } @digit_runs;
	}
	533
	#
	# list[int] jamo = _decompHangul(int codepoint)
	#
	# Splits a Hangul syllable into its leading consonant, vowel and, when
	# there is one, trailing consonant.
	# $code must be in Hangul syllable.
	# Check it before you enter here.
	#
	sub _decompHangul {
	    my $code = shift;

	    my $per_lead  = 588;    # syllables sharing one leading consonant
	    my $per_vowel = 28;     # syllables sharing leading consonant and vowel

	    my $offset   = $code - 0xAC00;
	    my $lead     = int($offset / $per_lead);
	    my $vowel    = int(($offset % $per_lead) / $per_vowel);
	    my $trailing = $offset % $per_vowel;

	    return (
	        0x1100 + $lead,
	        0x1161 + $vowel,
	        ($trailing ? (0x11A7 + $trailing) : ()),
	    );
	}
	550
	551	1;
	552	__END__
	553
	554	=head1 NAME
	555
	556	Unicode::Collate - Unicode Collation Algorithm
	557
	558	=head1 SYNOPSIS
	559
	560	use Unicode::Collate;
	561
	562	#construct
	563	$Collator = Unicode::Collate->new(%tailoring);
	564
	565	#sort
	566	@sorted = $Collator->sort(@not_sorted);
	567
	568	#compare
	569	$result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
	570
	571	=head1 DESCRIPTION
	572
	573	=head2 Constructor and Tailoring
	574
	575	The C<new> method returns a collator object.
	576
	577	$Collator = Unicode::Collate->new(
	578	alternate => $alternate,
	579	backwards => $levelNumber, # or \@levelNumbers
	580	entry => $element,
	581	normalization => $normalization_form,
	582	ignoreName => qr/$ignoreName/,
	583	ignoreChar => qr/$ignoreChar/,
	584	katakana_before_hiragana => $bool,
	585	level => $collationLevel,
	586	overrideCJK => \&overrideCJK,
	587	overrideHangul => \&overrideHangul,
	588	preprocess => \&preprocess,
	589	rearrange => \@charList,
	590	table => $filename,
	591	undefName => qr/$undefName/,
	592	undefChar => qr/$undefChar/,
	593	upper_before_lower => $bool,
	594	);
	595	# if %tailoring is false (i.e. empty),
	596	# $Collator should do the default collation.
	597
	598	=over 4
	599
	600	=item alternate
	601
	602	-- see 3.2.2 Alternate Weighting, UTR #10.
	603
	604	This key selects the alternate weighting for variable collation elements,
	605	which are marked with an ASTERISK in the table
	606	(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
	607
	608	alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
	609
	610	These names are case-insensitive.
	611	By default (if specification is omitted), 'shifted' is adopted.
	612
	613	'Blanked' Variable elements are ignorable at levels 1 through 3;
	614	considered at the 4th level.
	615
	616	'Non-ignorable' Variable elements are not reset to ignorable.
	617
	618	'Shifted' Variable elements are ignorable at levels 1 through 3
	619	their level 4 weight is replaced by the old level 1 weight.
	620	Level 4 weight for Non-Variable elements is 0xFFFF.
	621
	622	'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
	623	are trimmed.
	624
	625	=item backwards
	626
	627	-- see 3.1.2 French Accents, UTR #10.
	628
	629	backwards => $levelNumber or \@levelNumbers
	630
	631	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
	632	If omitted, forwards at all the levels.
	633
	634	=item entry
	635
	636	-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
	637
	638	Overrides a default order or defines additional collation elements
	639
	640	entry => <<'ENTRIES', # use the UCA file format
	641	00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
	642	0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
	643	0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
	644	ENTRIES
	645
	646	=item ignoreName
	647
	648	=item ignoreChar
	649
	650	-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
	651
	652	Makes the entry in the table ignorable.
	653	If a collation element is ignorable,
	654	it is ignored as if the element had been deleted from there.
	655
	656	E.g. when 'a' and 'e' are ignorable,
	657	'element' is equal to 'lament' (or 'lmnt').
	658
	659	=item level
	660
	661	-- see 4.3 Form a sort key for each string, UTR #10.
	662
	663	Set the maximum level.
	664	Any higher levels than the specified one are ignored.
	665
	666	Level 1: alphabetic ordering
	667	Level 2: diacritic ordering
	668	Level 3: case ordering
	669	Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
	670
	671	ex.level => 2,
	672
	673	If omitted, the maximum is the 4th.
	674
	675	=item normalization
	676
	677	-- see 4.1 Normalize each input string, UTR #10.
	678
	679	If specified, strings are normalized before preparation of sort keys
	680	(the normalization is executed after preprocess).
	681
	682	As a form name, one of the following names must be used.
	683
	684	'C' or 'NFC' for Normalization Form C
	685	'D' or 'NFD' for Normalization Form D
	686	'KC' or 'NFKC' for Normalization Form KC
	687	'KD' or 'NFKD' for Normalization Form KD
	688
	689	If omitted, the string is put into Normalization Form D.
	690
	691	If C<undef> is passed explicitly as the value for this key,
	692	no normalization is carried out (this may make tailoring easier
	693	if normalization is not desired).
	694
	695	see B<CAVEAT>.
	696
	697	=item overrideCJK
	698
	699	-- see 7.1 Derived Collation Elements, UTR #10.
	700
	701	By default, mapping of CJK Unified Ideographs
	702	uses the Unicode codepoint order.
	703	But the mapping of CJK Unified Ideographs may be overridden.
	704
	705	ex. CJK Unified Ideographs in the JIS code point order.
	706
	707	overrideCJK => sub {
	708	my $u = shift; # get a Unicode codepoint
	709	my $b = pack('n', $u); # to UTF-16BE
	710	my $s = your_unicode_to_sjis_converter($b); # convert
	711	my $n = unpack('n', $s); # convert sjis to short
	712	[ $n, 0x20, 0x2, $u ]; # return the collation element
	713	},
	714
	715	ex. ignores all CJK Unified Ideographs.
	716
	717	overrideCJK => sub {()}, # CODEREF returning empty list
	718
	719	# where ->eq("Pe\x{4E00}rl", "Perl") is true
	720	# as U+4E00 is a CJK Unified Ideograph and to be ignorable.
	721
	722	If C<undef> is passed explicitly as the value for this key,
	723	weights for CJK Unified Ideographs are treated as undefined.
	724	But assignment of weight for CJK Unified Ideographs
	725	in table or L<entry> is still valid.
	726
	727	=item overrideHangul
	728
	729	-- see 7.1 Derived Collation Elements, UTR #10.
	730
	731	By default, Hangul Syllables are decomposed into Hangul Jamo.
	732	But the mapping of Hangul Syllables may be overridden.
	733
	734	This tag works like L<overrideCJK>, so see there for examples.
	735
	736	If you want to override the mapping of Hangul Syllables,
	737	the Normalization Forms D and KD are not appropriate
	738	(they will be decomposed before overriding).
	739
	740	If C<undef> is passed explicitly as the value for this key,
	741	weight for Hangul Syllables is treated as undefined
	742	without decomposition into Hangul Jamo.
	743	But definition of weight for Hangul Syllables
	744	in table or L<entry> is still valid.
	745
	746	=item preprocess
	747
	748	-- see 5.1 Preprocessing, UTR #10.
	749
	750	If specified, the coderef is used to preprocess
	751	before the formation of sort keys.
	752
	753	ex. dropping English articles, such as "a" or "the".
	754	Then, "the pen" is before "a pencil".
	755
	756	preprocess => sub {
	757	my $str = shift;
	758	$str =~ s/\b(?:an?\|the)\s+//gi;
	759	$str;
	760	},
	761
	762	=item rearrange
	763
	764	-- see 3.1.3 Rearrangement, UTR #10.
	765
	766	Characters that are not coded in logical order and to be rearranged.
	767	By default,
	768
	769	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
	770
	771	If you want to disallow any rearrangement,
	772	pass C<undef> or C<[]> (a reference to an empty list)
	773	as the value for this key.
	774
	775	=item table
	776
	777	-- see 3.2 Default Unicode Collation Element Table, UTR #10.
	778
	779	You can use another element table if desired.
	780	The table file must be in your C<lib/Unicode/Collate> directory.
	781
	782	By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
	783
	784	If C<undef> is passed explicitly as the value for this key,
	785	no file is read (but you can define collation elements via L<entry>).
	786
	787	A typical way to define a collation element table
	788	without any file of table:
	789
	790	$onlyABC = Unicode::Collate->new(
	791	table => undef,
	792	entry => << 'ENTRIES',
	793	0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
	794	0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
	795	0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
	796	0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
	797	0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
	798	0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
	799	ENTRIES
	800	);
	801
	802	=item undefName
	803
	804	=item undefChar
	805
	806	-- see 6.3.4 Reducing the Repertoire, UTR #10.
	807
	808	Undefines the collation element as if it were unassigned in the table.
	809	This reduces the size of the table.
	810	If an unassigned character appears in the string to be collated,
	811	the sort key is made from its codepoint
	812	as a single-character collation element,
	813	as it is greater than any other assigned collation elements
	814	(in the codepoint order among the unassigned characters).
	815	But, it'd be better to ignore characters
	816	unfamiliar to you and maybe never used.
	817
	818	=item katakana_before_hiragana
	819
	820	=item upper_before_lower
	821
	822	-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
	823
	824	By default, lowercase is before uppercase
	825	and hiragana is before katakana.
	826
	827	If the tag is made true, this is reversed.
	828
	829	B<NOTE>: These tags simplemindedly assume
	830	any lowercase/uppercase or hiragana/katakana distinctions
	831	should occur in level 3, and their weights at level 3
	832	should be same as those mentioned in 7.3.1, UTR #10.
	833	If you define collation elements that violate this,
	834	these tags do not work correctly.
	835
	836	=back
	837
	838	=head2 Methods for Collation
	839
	840	=over 4
	841
	842	=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
	843
	844	Sorts a list of strings.
	845
	846	=item C<$result = $Collator-E<gt>cmp($a, $b)>
	847
	848	Returns 1 (when C<$a> is greater than C<$b>)
	849	or 0 (when C<$a> is equal to C<$b>)
	850	or -1 (when C<$a> is less than C<$b>).
	851
	852	=item C<$result = $Collator-E<gt>eq($a, $b)>
	853
	854	=item C<$result = $Collator-E<gt>ne($a, $b)>
	855
	856	=item C<$result = $Collator-E<gt>lt($a, $b)>
	857
	858	=item C<$result = $Collator-E<gt>le($a, $b)>
	859
	860	=item C<$result = $Collator-E<gt>gt($a, $b)>
	861
	862	=item C<$result = $Collator-E<gt>ge($a, $b)>
	863
	864	They work like the Perl operators of the same names.
	865
	866	eq : whether $a is equal to $b.
	867	ne : whether $a is not equal to $b.
	868	lt : whether $a is less than $b.
	869	le : whether $a is less than $b or equal to $b.
	870	gt : whether $a is greater than $b.
	871	ge : whether $a is greater than $b or equal to $b.
	872
	873	=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
	874
	875	-- see 4.3 Form a sort key for each string, UTR #10.
	876
	877	Returns a sort key.
	878
	879	You compare the sort keys using a binary comparison
	880	and get the result of the comparison of the strings using UCA.
	881
	882	$Collator->getSortKey($a) cmp $Collator->getSortKey($b)
	883
	884	is equivalent to
	885
	886	$Collator->cmp($a, $b)
	887
	888	=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
	889
	890	Returns a string formalized to display a sort key.
	891	Weights are enclosed with C<'['> and C<']'>
	892	and level boundaries are denoted by C<'\|'>.
	893
	894	use Unicode::Collate;
	895	my $c = Unicode::Collate->new();
	896	print $c->viewSortKey("Perl"),"\n";
	897
	898	# output:
	899	# [09B3 08B1 09CB 094F\|0020 0020 0020 0020\|0008 0002 0002 0002\|FFFF FFFF FFFF FFFF]
	900	# Level 1 Level 2 Level 3 Level 4
	901
	902	=item C<$position = $Collator-E<gt>index($string, $substring)>
	903
	904	=item C<($position, $length) = $Collator-E<gt>index($string, $substring)>
	905
	906	-- see 6.8 Searching, UTR #10.
	907
	908	If C<$substring> matches a part of C<$string>, returns
	909	the position of the first occurrence of the matching part in scalar context;
	910	in list context, returns a two-element list of
	911	the position and the length of the matching part.
	912
	913	B<Notice> that the length of the matching part may differ from
	914	the length of C<$substring>.
	915
	916	B<Note> that the position and the length are counted on the string
	917	after the process of preprocess, normalization, and rearrangement.
	918	Therefore, in case the specified string is not binary equal to
	919	the preprocessed/normalized/rearranged string, the position and the length
	920	may differ from those on the specified string. But it is guaranteed
	921	that, if matched, it returns a non-negative value as C<$position>.
	922
	923	If C<$substring> does not match any part of C<$string>,
	924	returns C<-1> in scalar context and
	925	an empty list in list context.
	926
	927	e.g. you say
	928
	929	my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
	930	my $str = "Ich mu\x{00DF} studieren.";
	931	my $sub = "m\x{00FC}ss";
	932	my $match;
	933	if (my($pos,$len) = $Collator->index($str, $sub)) {
	934	$match = substr($str, $pos, $len);
	935	}
	936
	937	and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
	938	is primary equal to C<"m>E<252>C<ss">.
	939
	940	=back
	941
	942	=head2 Other Methods
	943
	944	=over 4
	945
	946	=item UCA_Version
	947
	948	Returns the version number of Unicode Technical Standard 10
	949	this module consults.
	950
	951	=item Base_Unicode_Version
	952
	953	Returns the version number of the Unicode Standard
	954	this module is based on.
	955
	956	=back
	957
	958	=head2 EXPORT
	959
	960	None by default.
	961
	962	=head2 TODO
	963
	964	Unicode::Collate has not been ported to EBCDIC. The code mostly would
	965	work just fine but a decision needs to be made: how the module should
	966	work in EBCDIC? Should the low 256 characters be understood as
	967	Unicode or as EBCDIC code points? Should one be chosen or should
	968	there be a way to do either? Or should such translation be left
	969	outside the module for the user to do, for example by using
	970	Encode::from_to()?
	971	(or utf8::unicode_to_native()/utf8::native_to_unicode()?)
	972
	973	=head2 CAVEAT
	974
	975	Use of the C<normalization> parameter requires
	976	the B<Unicode::Normalize> module.
	977
	978	If you do not need it (say, in the case when you need not
	979	handle any combining characters),
	980	assign C<normalization =E<gt> undef> explicitly.
	981
	982	-- see 6.5 Avoiding Normalization, UTR #10.
	983
	984	=head2 BUGS
	985
	986	C<index()> is an experimental method and
	987	its return value may be unreliable.
	988	The correct implementation for C<index()> must be based
	989	on Locale-Sensitive Support: Level 3 in UTR #18,
	990	F<Unicode Regular Expression Guidelines>.
	991
	992	See also 4.2 Locale-Dependent Graphemes in UTR #18.
	993
	994	=head1 AUTHOR
	995
	996	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
	997
	998	http://homepage1.nifty.com/nomenclator/perl/
	999
	1000	Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
	1001
	1002	This library is free software; you can redistribute it
	1003	and/or modify it under the same terms as Perl itself.
	1004
	1005	=head1 SEE ALSO
	1006
	1007	=over 4
	1008
	1009	=item http://www.unicode.org/unicode/reports/tr10/
	1010
	1011	Unicode Collation Algorithm - UTR #10
	1012
	1013	=item http://www.unicode.org/unicode/reports/tr10/allkeys.txt
	1014
	1015	The Default Unicode Collation Element Table
	1016
	1017	=item http://www.unicode.org/unicode/reports/tr15/
	1018
	1019	Unicode Normalization Forms - UAX #15
	1020
	1021	=item http://www.unicode.org/unicode/reports/tr18
	1022
	1023	Unicode Regular Expression Guidelines - UTR #18
	1024
	1025	=item L<Unicode::Normalize>
	1026
	1027	=back
	1028
	1029	=cut