| 1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
| 2 | .\" |
| 3 | .\" Standard preamble: |
| 4 | .\" ======================================================================== |
| 5 | .de Sh \" Subsection heading |
| 6 | .br |
| 7 | .if t .Sp |
| 8 | .ne 5 |
| 9 | .PP |
| 10 | \fB\\$1\fR |
| 11 | .PP |
| 12 | .. |
| 13 | .de Sp \" Vertical space (when we can't use .PP) |
| 14 | .if t .sp .5v |
| 15 | .if n .sp |
| 16 | .. |
| 17 | .de Vb \" Begin verbatim text |
| 18 | .ft CW |
| 19 | .nf |
| 20 | .ne \\$1 |
| 21 | .. |
| 22 | .de Ve \" End verbatim text |
| 23 | .ft R |
| 24 | .fi |
| 25 | .. |
| 26 | .\" Set up some character translations and predefined strings. \*(-- will |
| 27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
| 28 | .\" double quote, and \*(R" will give a right double quote. | will give a |
| 29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to |
| 30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' |
| 31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. |
| 32 | .tr \(*W-|\(bv\*(Tr |
| 33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
| 34 | .ie n \{\ |
| 35 | . ds -- \(*W- |
| 36 | . ds PI pi |
| 37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
| 38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
| 39 | . ds L" "" |
| 40 | . ds R" "" |
| 41 | . ds C` "" |
| 42 | . ds C' "" |
| 43 | 'br\} |
| 44 | .el\{\ |
| 45 | . ds -- \|\(em\| |
| 46 | . ds PI \(*p |
| 47 | . ds L" `` |
| 48 | . ds R" '' |
| 49 | 'br\} |
| 50 | .\" |
| 51 | .\" If the F register is turned on, we'll generate index entries on stderr for |
| 52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index |
| 53 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
| 54 | .\" output yourself in some meaningful fashion. |
| 55 | .if \nF \{\ |
| 56 | . de IX |
| 57 | . tm Index:\\$1\t\\n%\t"\\$2" |
| 58 | .. |
| 59 | . nr % 0 |
| 60 | . rr F |
| 61 | .\} |
| 62 | .\" |
| 63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
| 64 | .\" way too many mistakes in technical documents. |
| 65 | .hy 0 |
| 66 | .if n .na |
| 67 | .\" |
| 68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
| 69 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
| 70 | . \" fudge factors for nroff and troff |
| 71 | .if n \{\ |
| 72 | . ds #H 0 |
| 73 | . ds #V .8m |
| 74 | . ds #F .3m |
| 75 | . ds #[ \f1 |
| 76 | . ds #] \fP |
| 77 | .\} |
| 78 | .if t \{\ |
| 79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
| 80 | . ds #V .6m |
| 81 | . ds #F 0 |
| 82 | . ds #[ \& |
| 83 | . ds #] \& |
| 84 | .\} |
| 85 | . \" simple accents for nroff and troff |
| 86 | .if n \{\ |
| 87 | . ds ' \& |
| 88 | . ds ` \& |
| 89 | . ds ^ \& |
| 90 | . ds , \& |
| 91 | . ds ~ ~ |
| 92 | . ds / |
| 93 | .\} |
| 94 | .if t \{\ |
| 95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
| 96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
| 97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
| 98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
| 99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
| 100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
| 101 | .\} |
| 102 | . \" troff and (daisy-wheel) nroff accents |
| 103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
| 104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
| 105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
| 106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
| 107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
| 108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
| 109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
| 110 | .ds ae a\h'-(\w'a'u*4/10)'e |
| 111 | .ds Ae A\h'-(\w'A'u*4/10)'E |
| 112 | . \" corrections for vroff |
| 113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
| 114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
| 115 | . \" for low resolution devices (crt and lpr) |
| 116 | .if \n(.H>23 .if \n(.V>19 \ |
| 117 | \{\ |
| 118 | . ds : e |
| 119 | . ds 8 ss |
| 120 | . ds o a |
| 121 | . ds d- d\h'-1'\(ga |
| 122 | . ds D- D\h'-1'\(hy |
| 123 | . ds th \o'bp' |
| 124 | . ds Th \o'LP' |
| 125 | . ds ae ae |
| 126 | . ds Ae AE |
| 127 | .\} |
| 128 | .rm #[ #] #H #V #F C |
| 129 | .\" ======================================================================== |
| 130 | .\" |
| 131 | .IX Title "Unicode::Collate 3" |
| 132 | .TH Unicode::Collate 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide" |
| 133 | .SH "NAME" |
| 134 | Unicode::Collate \- Unicode Collation Algorithm |
| 135 | .SH "SYNOPSIS" |
| 136 | .IX Header "SYNOPSIS" |
| 137 | .Vb 1 |
| 138 | \& use Unicode::Collate; |
| 139 | .Ve |
| 140 | .PP |
| 141 | .Vb 2 |
| 142 | \& #construct |
| 143 | \& $Collator = Unicode::Collate->new(%tailoring); |
| 144 | .Ve |
| 145 | .PP |
| 146 | .Vb 2 |
| 147 | \& #sort |
| 148 | \& @sorted = $Collator->sort(@not_sorted); |
| 149 | .Ve |
| 150 | .PP |
| 151 | .Vb 2 |
| 152 | \& #compare |
| 153 | \& $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. |
| 154 | .Ve |
| 155 | .PP |
| 156 | .Vb 2 |
| 157 | \& # If %tailoring is false (i.e. empty), |
| 158 | \& # $Collator should do the default collation. |
| 159 | .Ve |
| 160 | .SH "DESCRIPTION" |
| 161 | .IX Header "DESCRIPTION" |
| 162 | This module is an implementation of Unicode Technical Standard #10 |
| 163 | (a.k.a. \s-1UTS\s0 #10) \- Unicode Collation Algorithm (a.k.a. \s-1UCA\s0). |
| 164 | .Sh "Constructor and Tailoring" |
| 165 | .IX Subsection "Constructor and Tailoring" |
| 166 | The \f(CW\*(C`new\*(C'\fR method returns a collator object. |
| 167 | .PP |
| 168 | .Vb 21 |
| 169 | \& $Collator = Unicode::Collate->new( |
| 170 | \& UCA_Version => $UCA_Version, |
| 171 | \& alternate => $alternate, # deprecated: use of 'variable' is recommended. |
| 172 | \& backwards => $levelNumber, # or \e@levelNumbers |
| 173 | \& entry => $element, |
| 174 | \& hangul_terminator => $term_primary_weight, |
| 175 | \& ignoreName => qr/$ignoreName/, |
| 176 | \& ignoreChar => qr/$ignoreChar/, |
| 177 | \& katakana_before_hiragana => $bool, |
| 178 | \& level => $collationLevel, |
| 179 | \& normalization => $normalization_form, |
| 180 | \& overrideCJK => \e&overrideCJK, |
| 181 | \& overrideHangul => \e&overrideHangul, |
| 182 | \& preprocess => \e&preprocess, |
| 183 | \& rearrange => \e@charList, |
| 184 | \& table => $filename, |
| 185 | \& undefName => qr/$undefName/, |
| 186 | \& undefChar => qr/$undefChar/, |
| 187 | \& upper_before_lower => $bool, |
| 188 | \& variable => $variable, |
| 189 | \& ); |
| 190 | .Ve |
| 191 | .IP "UCA_Version" 4 |
| 192 | .IX Item "UCA_Version" |
| 193 | If the tracking version number of \s-1UCA\s0 is given, |
| 194 | behavior of that tracking version is emulated on collating. |
| 195 | If omitted, the return value of \f(CW\*(C`UCA_Version()\*(C'\fR is used. |
| 196 | \&\f(CW\*(C`UCA_Version()\*(C'\fR should return the latest tracking version supported. |
| 197 | .Sp |
| 198 | The supported tracking versions are 8, 9, 11, and 14. |
| 199 | .Sp |
| 200 | .Vb 6 |
| 201 | \& UCA Unicode Standard DUCET (@version) |
| 202 | \& --------------------------------------------------- |
| 203 | \& 8 3.1 3.0.1 (3.0.1d9) |
| 204 | \& 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) |
| 205 | \& 11 4.0 4.0.0 (4.0.0) |
| 206 | \& 14 4.1.0 4.1.0 (4.1.0) |
| 207 | .Ve |
| 208 | .Sp |
| 209 | Note: Recent \s-1UTS\s0 #10 renames \*(L"Tracking Version\*(R" to \*(L"Revision.\*(R" |
| 210 | .IP "alternate" 4 |
| 211 | .IX Item "alternate" |
| 212 | \&\-\- see 3.2.2 Alternate Weighting, version 8 of \s-1UTS\s0 #10 |
| 213 | .Sp |
| 214 | For backward compatibility, \f(CW\*(C`alternate\*(C'\fR (old name) can be used |
| 215 | as an alias for \f(CW\*(C`variable\*(C'\fR. |
| 216 | .IP "backwards" 4 |
| 217 | .IX Item "backwards" |
| 218 | \&\-\- see 3.1.2 French Accents, \s-1UTS\s0 #10. |
| 219 | .Sp |
| 220 | .Vb 1 |
| 221 | \& backwards => $levelNumber or \e@levelNumbers |
| 222 | .Ve |
| 223 | .Sp |
| 224 | Weights in reverse order; ex. level 2 (diacritic ordering) in French. |
| 225 | If omitted, forwards at all the levels. |
| 226 | .IP "entry" 4 |
| 227 | .IX Item "entry" |
| 228 | \&\-\- see 3.1 Linguistic Features; 3.2.1 File Format, \s-1UTS\s0 #10. |
| 229 | .Sp |
| 230 | If the same character (or a sequence of characters) exists |
| 231 | in the collation element table through \f(CW\*(C`table\*(C'\fR, |
| 232 | mapping to collation elements is overridden. |
| 233 | If it does not exist, the mapping is defined additionally. |
| 234 | .Sp |
| 235 | .Vb 12 |
| 236 | \& entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) |
| 237 | \&0063 0068 ; [.0E6A.0020.0002.0063] # ch |
| 238 | \&0043 0068 ; [.0E6A.0020.0007.0043] # Ch |
| 239 | \&0043 0048 ; [.0E6A.0020.0008.0043] # CH |
| 240 | \&006C 006C ; [.0F4C.0020.0002.006C] # ll |
| 241 | \&004C 006C ; [.0F4C.0020.0007.004C] # Ll |
| 242 | \&004C 004C ; [.0F4C.0020.0008.004C] # LL |
| 243 | \&00F1 ; [.0F7B.0020.0002.00F1] # n-tilde |
| 244 | \&006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde |
| 245 | \&00D1 ; [.0F7B.0020.0008.00D1] # N-tilde |
| 246 | \&004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde |
| 247 | \&ENTRY |
| 248 | .Ve |
| 249 | .Sp |
| 250 | .Vb 4 |
| 251 | \& entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) |
| 252 | \&00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> |
| 253 | \&00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> |
| 254 | \&ENTRY |
| 255 | .Ve |
| 256 | .Sp |
| 257 | \&\fB\s-1NOTE:\s0\fR The code point in the \s-1UCA\s0 file format (before \f(CW';'\fR) |
| 258 | \&\fBmust\fR be a Unicode code point (defined as hexadecimal), |
| 259 | but not a native code point. |
| 260 | So \f(CW0063\fR must always denote \f(CW\*(C`U+0063\*(C'\fR, |
| 261 | but not a character of \f(CW"\ex63"\fR. |
| 262 | .Sp |
| 263 | Weighting may vary depending on collation element table. |
| 264 | So ensure the weights defined in \f(CW\*(C`entry\*(C'\fR will be consistent with |
| 265 | those in the collation element table loaded via \f(CW\*(C`table\*(C'\fR. |
| 266 | .Sp |
| 267 | In \s-1DUCET\s0 v4.0.0, primary weight of \f(CW\*(C`C\*(C'\fR is \f(CW0E60\fR |
| 268 | and that of \f(CW\*(C`D\*(C'\fR is \f(CW\*(C`0E6D\*(C'\fR. So setting primary weight of \f(CW\*(C`CH\*(C'\fR to \f(CW\*(C`0E6A\*(C'\fR |
| 269 | (as a value between \f(CW0E60\fR and \f(CW\*(C`0E6D\*(C'\fR) |
| 270 | makes ordering as \f(CW\*(C`C < CH < D\*(C'\fR. |
| 271 | Exactly speaking \s-1DUCET\s0 already has some characters between \f(CW\*(C`C\*(C'\fR and \f(CW\*(C`D\*(C'\fR: |
| 272 | \&\f(CW\*(C`small capital C\*(C'\fR (\f(CW\*(C`U+1D04\*(C'\fR) with primary weight \f(CW0E64\fR, |
| 273 | \&\f(CW\*(C`c\-hook/C\-hook\*(C'\fR (\f(CW\*(C`U+0188/U+0187\*(C'\fR) with \f(CW0E65\fR, |
| 274 | and \f(CW\*(C`c\-curl\*(C'\fR (\f(CW\*(C`U+0255\*(C'\fR) with \f(CW0E69\fR. |
| 275 | Then primary weight \f(CW\*(C`0E6A\*(C'\fR for \f(CW\*(C`CH\*(C'\fR makes \f(CW\*(C`CH\*(C'\fR |
| 276 | ordered between \f(CW\*(C`c\-curl\*(C'\fR and \f(CW\*(C`D\*(C'\fR. |
| 277 | .IP "hangul_terminator" 4 |
| 278 | .IX Item "hangul_terminator" |
| 279 | \&\-\- see 7.1.4 Trailing Weights, \s-1UTS\s0 #10. |
| 280 | .Sp |
| 281 | If a true value is given (non\-zero but should be positive), |
| 282 | it will be added as a terminator primary weight to the end of |
| 283 | every standard Hangul syllable. Secondary and any higher weights |
| 284 | for terminator are set to zero. |
| 285 | If the value is false or \f(CW\*(C`hangul_terminator\*(C'\fR key does not exist, |
| 286 | insertion of terminator weights will not be performed. |
| 287 | .Sp |
| 288 | Boundaries of Hangul syllables are determined |
| 289 | according to conjoining Jamo behavior in \fIthe Unicode Standard\fR |
| 290 | and \fIHangulSyllableType.txt\fR. |
| 291 | .Sp |
| 292 | \&\fBImplementation Note:\fR |
| 293 | (1) For expansion mapping (Unicode character mapped |
| 294 | to a sequence of collation elements), a terminator will not be added |
| 295 | between collation elements, even if Hangul syllable boundary exists there. |
| 296 | Addition of terminator is restricted to the next position |
| 297 | to the last collation element. |
| 298 | .Sp |
| 299 | (2) Non-conjoining Hangul letters |
| 300 | (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not |
| 301 | automatically terminated with a terminator primary weight. |
| 302 | These characters may need terminator included in a collation element |
| 303 | table beforehand. |
| 304 | .IP "ignoreChar" 4 |
| 305 | .IX Item "ignoreChar" |
| 306 | .PD 0 |
| 307 | .IP "ignoreName" 4 |
| 308 | .IX Item "ignoreName" |
| 309 | .PD |
| 310 | \&\-\- see 3.2.2 Variable Weighting, \s-1UTS\s0 #10. |
| 311 | .Sp |
| 312 | Makes the entry in the table completely ignorable; |
| 313 | i.e. as if the weights were zero at all levels. |
| 314 | .Sp |
| 315 | Through \f(CW\*(C`ignoreChar\*(C'\fR, any character matching \f(CW\*(C`qr/$ignoreChar/\*(C'\fR |
| 316 | will be ignored. Through \f(CW\*(C`ignoreName\*(C'\fR, any character whose name |
| 317 | (given in the \f(CW\*(C`table\*(C'\fR file as a comment) matches \f(CW\*(C`qr/$ignoreName/\*(C'\fR |
| 318 | will be ignored. |
| 319 | .Sp |
| 320 | E.g. when 'a' and 'e' are ignorable, |
| 321 | \&'element' is equal to 'lament' (or 'lmnt'). |
| 322 | .IP "katakana_before_hiragana" 4 |
| 323 | .IX Item "katakana_before_hiragana" |
| 324 | \&\-\- see 7.3.1 Tertiary Weight Table, \s-1UTS\s0 #10. |
| 325 | .Sp |
| 326 | By default, hiragana is before katakana. |
| 327 | If the parameter is made true, this is reversed. |
| 328 | .Sp |
| 329 | \&\fB\s-1NOTE\s0\fR: This parameter simplemindedly assumes that any hiragana/katakana |
| 330 | distinctions must occur in level 3, and their weights at level 3 must be |
| 331 | same as those mentioned in 7.3.1, \s-1UTS\s0 #10. |
| 332 | If you define your collation elements which violate this requirement, |
| 333 | this parameter does not work validly. |
| 334 | .IP "level" 4 |
| 335 | .IX Item "level" |
| 336 | \&\-\- see 4.3 Form Sort Key, \s-1UTS\s0 #10. |
| 337 | .Sp |
| 338 | Set the maximum level. |
| 339 | Any higher levels than the specified one are ignored. |
| 340 | .Sp |
| 341 | .Vb 4 |
| 342 | \& Level 1: alphabetic ordering |
| 343 | \& Level 2: diacritic ordering |
| 344 | \& Level 3: case ordering |
| 345 | \& Level 4: tie-breaking (e.g. in the case when variable is 'shifted') |
| 346 | .Ve |
| 347 | .Sp |
| 348 | .Vb 1 |
| 349 | \& ex.level => 2, |
| 350 | .Ve |
| 351 | .Sp |
| 352 | If omitted, the maximum is the 4th. |
| 353 | .IP "normalization" 4 |
| 354 | .IX Item "normalization" |
| 355 | \&\-\- see 4.1 Normalize, \s-1UTS\s0 #10. |
| 356 | .Sp |
| 357 | If specified, strings are normalized before preparation of sort keys |
| 358 | (the normalization is executed after preprocess). |
| 359 | .Sp |
| 360 | A form name \f(CW\*(C`Unicode::Normalize::normalize()\*(C'\fR accepts will be applied |
| 361 | as \f(CW$normalization_form\fR. |
| 362 | Acceptable names include \f(CW'NFD'\fR, \f(CW'NFC'\fR, \f(CW'NFKD'\fR, and \f(CW'NFKC'\fR. |
| 363 | See \f(CW\*(C`Unicode::Normalize::normalize()\*(C'\fR for detail. |
| 364 | If omitted, \f(CW'NFD'\fR is used. |
| 365 | .Sp |
| 366 | \&\f(CW\*(C`normalization\*(C'\fR is performed after \f(CW\*(C`preprocess\*(C'\fR (if defined). |
| 367 | .Sp |
| 368 | Furthermore, special values, \f(CW\*(C`undef\*(C'\fR and \f(CW"prenormalized"\fR, can be used, |
| 369 | though they are not concerned with \f(CW\*(C`Unicode::Normalize::normalize()\*(C'\fR. |
| 370 | .Sp |
| 371 | If \f(CW\*(C`undef\*(C'\fR (not a string \f(CW"undef"\fR) is passed explicitly |
| 372 | as the value for this key, |
| 373 | no normalization is carried out (this may make tailoring easier |
| 374 | if normalization is not desired). Under \f(CW\*(C`(normalization => undef)\*(C'\fR, |
| 375 | only contiguous contractions are resolved; |
| 376 | e.g. even if \f(CW\*(C`A\-ring\*(C'\fR (and \f(CW\*(C`A\-ring\-cedilla\*(C'\fR) is ordered after \f(CW\*(C`Z\*(C'\fR, |
| 377 | \&\f(CW\*(C`A\-cedilla\-ring\*(C'\fR would be primary equal to \f(CW\*(C`A\*(C'\fR. |
| 378 | In this point, |
| 379 | \&\f(CW\*(C`(normalization => undef, preprocess => sub { NFD(shift) })\*(C'\fR |
| 380 | \&\fBis not\fR equivalent to \f(CW\*(C`(normalization => 'NFD')\*(C'\fR. |
| 381 | .Sp |
| 382 | In the case of \f(CW\*(C`(normalization => "prenormalized")\*(C'\fR, |
| 383 | no normalization is performed, but |
| 384 | non-contiguous contractions with combining characters are performed. |
| 385 | Therefore |
| 386 | \&\f(CW\*(C`(normalization => 'prenormalized', preprocess => sub { NFD(shift) })\*(C'\fR |
| 387 | \&\fBis\fR equivalent to \f(CW\*(C`(normalization => 'NFD')\*(C'\fR. |
| 388 | If source strings are finely prenormalized, |
| 389 | \&\f(CW\*(C`(normalization => 'prenormalized')\*(C'\fR may save time for normalization. |
| 390 | .Sp |
| 391 | Except \f(CW\*(C`(normalization => undef)\*(C'\fR, |
| 392 | \&\fBUnicode::Normalize\fR is required (see also \fB\s-1CAVEAT\s0\fR). |
| 393 | .IP "overrideCJK" 4 |
| 394 | .IX Item "overrideCJK" |
| 395 | \&\-\- see 7.1 Derived Collation Elements, \s-1UTS\s0 #10. |
| 396 | .Sp |
| 397 | By default, \s-1CJK\s0 Unified Ideographs are ordered in Unicode codepoint order |
| 398 | but \f(CW\*(C`CJK Unified Ideographs\*(C'\fR (if \f(CW\*(C`UCA_Version\*(C'\fR is 8 to 11, its range is |
| 399 | \&\f(CW\*(C`U+4E00..U+9FA5\*(C'\fR; if \f(CW\*(C`UCA_Version\*(C'\fR is 14, its range is \f(CW\*(C`U+4E00..U+9FBB\*(C'\fR) |
| 400 | are less than \f(CW\*(C`CJK Unified Ideographs Extension\*(C'\fR (its range is |
| 401 | \&\f(CW\*(C`U+3400..U+4DB5\*(C'\fR and \f(CW\*(C`U+20000..U+2A6D6\*(C'\fR). |
| 402 | .Sp |
| 403 | Through \f(CW\*(C`overrideCJK\*(C'\fR, ordering of \s-1CJK\s0 Unified Ideographs can be overridden. |
| 404 | .Sp |
| 405 | ex. \s-1CJK\s0 Unified Ideographs in the \s-1JIS\s0 code point order. |
| 406 | .Sp |
| 407 | .Vb 7 |
| 408 | \& overrideCJK => sub { |
| 409 | \& my $u = shift; # get a Unicode codepoint |
| 410 | \& my $b = pack('n', $u); # to UTF-16BE |
| 411 | \& my $s = your_unicode_to_sjis_converter($b); # convert |
| 412 | \& my $n = unpack('n', $s); # convert sjis to short |
| 413 | \& [ $n, 0x20, 0x2, $u ]; # return the collation element |
| 414 | \& }, |
| 415 | .Ve |
| 416 | .Sp |
| 417 | ex. ignores all \s-1CJK\s0 Unified Ideographs. |
| 418 | .Sp |
| 419 | .Vb 1 |
| 420 | \& overrideCJK => sub {()}, # CODEREF returning empty list |
| 421 | .Ve |
| 422 | .Sp |
| 423 | .Vb 2 |
| 424 | \& # where ->eq("Pe\ex{4E00}rl", "Perl") is true |
| 425 | \& # as U+4E00 is a CJK Unified Ideograph and to be ignorable. |
| 426 | .Ve |
| 427 | .Sp |
| 428 | If \f(CW\*(C`undef\*(C'\fR is passed explicitly as the value for this key, |
| 429 | weights for \s-1CJK\s0 Unified Ideographs are treated as undefined. |
| 430 | But assignment of weight for \s-1CJK\s0 Unified Ideographs |
| 431 | in table or \f(CW\*(C`entry\*(C'\fR is still valid. |
| 432 | .IP "overrideHangul" 4 |
| 433 | .IX Item "overrideHangul" |
| 434 | \&\-\- see 7.1 Derived Collation Elements, \s-1UTS\s0 #10. |
| 435 | .Sp |
| 436 | By default, Hangul Syllables are decomposed into Hangul Jamo, |
| 437 | even if \f(CW\*(C`(normalization => undef)\*(C'\fR. |
| 438 | But the mapping of Hangul Syllables may be overridden. |
| 439 | .Sp |
| 440 | This parameter works like \f(CW\*(C`overrideCJK\*(C'\fR, so see there for examples. |
| 441 | .Sp |
| 442 | If you want to override the mapping of Hangul Syllables, |
| 443 | \&\s-1NFD\s0, \s-1NFKD\s0, and \s-1FCD\s0 are not appropriate, |
| 444 | since they will decompose Hangul Syllables before overriding. |
| 445 | .Sp |
| 446 | If \f(CW\*(C`undef\*(C'\fR is passed explicitly as the value for this key, |
| 447 | weight for Hangul Syllables is treated as undefined |
| 448 | without decomposition into Hangul Jamo. |
| 449 | But definition of weight for Hangul Syllables |
| 450 | in table or \f(CW\*(C`entry\*(C'\fR is still valid. |
| 451 | .IP "preprocess" 4 |
| 452 | .IX Item "preprocess" |
| 453 | \&\-\- see 5.1 Preprocessing, \s-1UTS\s0 #10. |
| 454 | .Sp |
| 455 | If specified, the coderef is used to preprocess |
| 456 | before the formation of sort keys. |
| 457 | .Sp |
| 458 | ex. dropping English articles, such as \*(L"a\*(R" or \*(L"the\*(R". |
| 459 | Then, \*(L"the pen\*(R" is before \*(L"a pencil\*(R". |
| 460 | .Sp |
| 461 | .Vb 5 |
| 462 | \& preprocess => sub { |
| 463 | \& my $str = shift; |
| 464 | \& $str =~ s/\eb(?:an?|the)\es+//gi; |
| 465 | \& return $str; |
| 466 | \& }, |
| 467 | .Ve |
| 468 | .Sp |
| 469 | \&\f(CW\*(C`preprocess\*(C'\fR is performed before \f(CW\*(C`normalization\*(C'\fR (if defined). |
| 470 | .IP "rearrange" 4 |
| 471 | .IX Item "rearrange" |
| 472 | \&\-\- see 3.1.3 Rearrangement, \s-1UTS\s0 #10. |
| 473 | .Sp |
| 474 | Characters that are not coded in logical order and are to be rearranged. |
| 475 | If \f(CW\*(C`UCA_Version\*(C'\fR is less than or equal to 11, default is: |
| 476 | .Sp |
| 477 | .Vb 1 |
| 478 | \& rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], |
| 479 | .Ve |
| 480 | .Sp |
| 481 | If you want to disallow any rearrangement, pass \f(CW\*(C`undef\*(C'\fR or \f(CW\*(C`[]\*(C'\fR |
| 482 | (a reference to empty list) as the value for this key. |
| 483 | .Sp |
| 484 | If \f(CW\*(C`UCA_Version\*(C'\fR is equal to 14, default is \f(CW\*(C`[]\*(C'\fR (i.e. no rearrangement). |
| 485 | .Sp |
| 486 | \&\fBAccording to the version 9 of \s-1UCA\s0, this parameter shall not be used; |
| 487 | but it is not warned at present.\fR |
| 488 | .IP "table" 4 |
| 489 | .IX Item "table" |
| 490 | \&\-\- see 3.2 Default Unicode Collation Element Table, \s-1UTS\s0 #10. |
| 491 | .Sp |
| 492 | You can use another collation element table if desired. |
| 493 | .Sp |
| 494 | The table file should be located in the \fIUnicode/Collate\fR directory |
| 495 | on \f(CW@INC\fR. Say, if the filename is \fIFoo.txt\fR, |
| 496 | the table file is searched as \fIUnicode/Collate/Foo.txt\fR in \f(CW@INC\fR. |
| 497 | .Sp |
| 498 | By default, \fIallkeys.txt\fR (as the filename of \s-1DUCET\s0) is used. |
| 499 | If you will prepare your own table file, any name other than \fIallkeys.txt\fR |
| 500 | may be better to avoid namespace conflict. |
| 501 | .Sp |
| 502 | If \f(CW\*(C`undef\*(C'\fR is passed explicitly as the value for this key, |
| 503 | no file is read (but you can define collation elements via \f(CW\*(C`entry\*(C'\fR). |
| 504 | .Sp |
| 505 | A typical way to define a collation element table |
| 506 | without any file of table: |
| 507 | .Sp |
| 508 | .Vb 11 |
| 509 | \& $onlyABC = Unicode::Collate->new( |
| 510 | \& table => undef, |
| 511 | \& entry => << 'ENTRIES', |
| 512 | \&0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A |
| 513 | \&0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A |
| 514 | \&0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B |
| 515 | \&0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B |
| 516 | \&0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C |
| 517 | \&0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C |
| 518 | \&ENTRIES |
| 519 | \& ); |
| 520 | .Ve |
| 521 | .Sp |
| 522 | If \f(CW\*(C`ignoreName\*(C'\fR or \f(CW\*(C`undefName\*(C'\fR is used, character names should be |
| 523 | specified as a comment (following \f(CW\*(C`#\*(C'\fR) on each line. |
| 524 | .IP "undefChar" 4 |
| 525 | .IX Item "undefChar" |
| 526 | .PD 0 |
| 527 | .IP "undefName" 4 |
| 528 | .IX Item "undefName" |
| 529 | .PD |
| 530 | \&\-\- see 6.3.4 Reducing the Repertoire, \s-1UTS\s0 #10. |
| 531 | .Sp |
| 532 | Undefines the collation element as if it were unassigned in the table. |
| 533 | This reduces the size of the table. |
| 534 | If an unassigned character appears in the string to be collated, |
| 535 | the sort key is made from its codepoint |
| 536 | as a single-character collation element, |
| 537 | as it is greater than any other assigned collation elements |
| 538 | (in the codepoint order among the unassigned characters). |
| 539 | But, it'd be better to ignore characters |
| 540 | unfamiliar to you and maybe never used. |
| 541 | .Sp |
| 542 | Through \f(CW\*(C`undefChar\*(C'\fR, any character matching \f(CW\*(C`qr/$undefChar/\*(C'\fR |
| 543 | will be undefined. Through \f(CW\*(C`undefName\*(C'\fR, any character whose name |
| 544 | (given in the \f(CW\*(C`table\*(C'\fR file as a comment) matches \f(CW\*(C`qr/$undefName/\*(C'\fR |
| 545 | will be undefined. |
| 546 | .Sp |
| 547 | ex. Collation weights for beyond-BMP characters are not stored in object: |
| 548 | .Sp |
| 549 | .Vb 1 |
| 550 | \& undefChar => qr/[^\e0-\ex{fffd}]/, |
| 551 | .Ve |
| 552 | .IP "upper_before_lower" 4 |
| 553 | .IX Item "upper_before_lower" |
| 554 | \&\-\- see 6.6 Case Comparisons, \s-1UTS\s0 #10. |
| 555 | .Sp |
| 556 | By default, lowercase is before uppercase. |
| 557 | If the parameter is made true, this is reversed. |
| 558 | .Sp |
| 559 | \&\fB\s-1NOTE\s0\fR: This parameter simplemindedly assumes that any lowercase/uppercase |
| 560 | distinctions must occur in level 3, and their weights at level 3 must be |
| 561 | same as those mentioned in 7.3.1, \s-1UTS\s0 #10. |
| 562 | If you define your collation elements which differ from this requirement, |
| 563 | this parameter doesn't work validly. |
| 564 | .IP "variable" 4 |
| 565 | .IX Item "variable" |
| 566 | \&\-\- see 3.2.2 Variable Weighting, \s-1UTS\s0 #10. |
| 567 | .Sp |
| 568 | This key allows for variable weighting of variable collation elements, |
| 569 | which are marked with an \s-1ASTERISK\s0 in the table |
| 570 | (\s-1NOTE:\s0 Many punctuation marks and symbols are variable in \fIallkeys.txt\fR). |
| 571 | .Sp |
| 572 | .Vb 1 |
| 573 | \& variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. |
| 574 | .Ve |
| 575 | .Sp |
| 576 | These names are case\-insensitive. |
| 577 | By default (if specification is omitted), 'shifted' is adopted. |
| 578 | .Sp |
| 579 | .Vb 2 |
| 580 | \& 'Blanked' Variable elements are made ignorable at levels 1 through 3; |
| 581 | \& considered at the 4th level. |
| 582 | .Ve |
| 583 | .Sp |
| 584 | .Vb 1 |
| 585 | \& 'Non-Ignorable' Variable elements are not reset to ignorable. |
| 586 | .Ve |
| 587 | .Sp |
| 588 | .Vb 3 |
| 589 | \& 'Shifted' Variable elements are made ignorable at levels 1 through 3; |
| 590 | \& their level 4 weight is replaced by the old level 1 weight. |
| 591 | \& Level 4 weight for Non-Variable elements is 0xFFFF. |
| 592 | .Ve |
| 593 | .Sp |
| 594 | .Vb 2 |
| 595 | \& 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level |
| 596 | \& are trimmed. |
| 597 | .Ve |
| 598 | .Sh "Methods for Collation" |
| 599 | .IX Subsection "Methods for Collation" |
| 600 | .ie n .IP """@sorted = $Collator\->sort(@not_sorted)""" 4 |
| 601 | .el .IP "\f(CW@sorted = $Collator\->sort(@not_sorted)\fR" 4 |
| 602 | .IX Item "@sorted = $Collator->sort(@not_sorted)" |
| 603 | Sorts a list of strings. |
| 604 | .ie n .IP """$result = $Collator\->cmp($a, $b)""" 4 |
| 605 | .el .IP "\f(CW$result = $Collator\->cmp($a, $b)\fR" 4 |
| 606 | .IX Item "$result = $Collator->cmp($a, $b)" |
| 607 | Returns 1 (when \f(CW$a\fR is greater than \f(CW$b\fR) |
| 608 | or 0 (when \f(CW$a\fR is equal to \f(CW$b\fR) |
| 609 | or \-1 (when \f(CW$a\fR is less than \f(CW$b\fR). |
| 610 | .ie n .IP """$result = $Collator\->eq($a, $b)""" 4 |
| 611 | .el .IP "\f(CW$result = $Collator\->eq($a, $b)\fR" 4 |
| 612 | .IX Item "$result = $Collator->eq($a, $b)" |
| 613 | .PD 0 |
| 614 | .ie n .IP """$result = $Collator\->ne($a, $b)""" 4 |
| 615 | .el .IP "\f(CW$result = $Collator\->ne($a, $b)\fR" 4 |
| 616 | .IX Item "$result = $Collator->ne($a, $b)" |
| 617 | .ie n .IP """$result = $Collator\->lt($a, $b)""" 4 |
| 618 | .el .IP "\f(CW$result = $Collator\->lt($a, $b)\fR" 4 |
| 619 | .IX Item "$result = $Collator->lt($a, $b)" |
| 620 | .ie n .IP """$result = $Collator\->le($a, $b)""" 4 |
| 621 | .el .IP "\f(CW$result = $Collator\->le($a, $b)\fR" 4 |
| 622 | .IX Item "$result = $Collator->le($a, $b)" |
| 623 | .ie n .IP """$result = $Collator\->gt($a, $b)""" 4 |
| 624 | .el .IP "\f(CW$result = $Collator\->gt($a, $b)\fR" 4 |
| 625 | .IX Item "$result = $Collator->gt($a, $b)" |
| 626 | .ie n .IP """$result = $Collator\->ge($a, $b)""" 4 |
| 627 | .el .IP "\f(CW$result = $Collator\->ge($a, $b)\fR" 4 |
| 628 | .IX Item "$result = $Collator->ge($a, $b)" |
| 629 | .PD |
| 630 | They work like the operators of the same name. |
| 631 | .Sp |
| 632 | .Vb 6 |
| 633 | \& eq : whether $a is equal to $b. |
| 634 | \& ne : whether $a is not equal to $b. |
| 635 | \& lt : whether $a is less than $b. |
| 636 | \& le : whether $a is less than $b or equal to $b. |
| 637 | \& gt : whether $a is greater than $b. |
| 638 | \& ge : whether $a is greater than $b or equal to $b. |
| 639 | .Ve |
| 640 | .ie n .IP """$sortKey = $Collator\->getSortKey($string)""" 4 |
| 641 | .el .IP "\f(CW$sortKey = $Collator\->getSortKey($string)\fR" 4 |
| 642 | .IX Item "$sortKey = $Collator->getSortKey($string)" |
| 643 | \&\-\- see 4.3 Form Sort Key, \s-1UTS\s0 #10. |
| 644 | .Sp |
| 645 | Returns a sort key. |
| 646 | .Sp |
| 647 | You compare the sort keys using a binary comparison |
| 648 | and get the result of the comparison of the strings using \s-1UCA\s0. |
| 649 | .Sp |
| 650 | .Vb 1 |
| 651 | \& $Collator->getSortKey($a) cmp $Collator->getSortKey($b) |
| 652 | .Ve |
| 653 | .Sp |
| 654 | .Vb 1 |
| 655 | \& is equivalent to |
| 656 | .Ve |
| 657 | .Sp |
| 658 | .Vb 1 |
| 659 | \& $Collator->cmp($a, $b) |
| 660 | .Ve |
| 661 | .ie n .IP """$sortKeyForm = $Collator\->viewSortKey($string)""" 4 |
| 662 | .el .IP "\f(CW$sortKeyForm = $Collator\->viewSortKey($string)\fR" 4 |
| 663 | .IX Item "$sortKeyForm = $Collator->viewSortKey($string)" |
| 664 | Converts a sorting key into its representation form. |
| 665 | If \f(CW\*(C`UCA_Version\*(C'\fR is 8, the output is slightly different. |
| 666 | .Sp |
| 667 | .Vb 3 |
| 668 | \& use Unicode::Collate; |
| 669 | \& my $c = Unicode::Collate->new(); |
| 670 | \& print $c->viewSortKey("Perl"),"\en"; |
| 671 | .Ve |
| 672 | .Sp |
| 673 | .Vb 3 |
| 674 | \& # output: |
| 675 | \& # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] |
| 676 | \& # Level 1 Level 2 Level 3 Level 4 |
| 677 | .Ve |
| 678 | .Sh "Methods for Searching" |
| 679 | .IX Subsection "Methods for Searching" |
| 680 | \&\fB\s-1DISCLAIMER:\s0\fR If \f(CW\*(C`preprocess\*(C'\fR or \f(CW\*(C`normalization\*(C'\fR parameter is true |
| 681 | for \f(CW$Collator\fR, calling these methods (\f(CW\*(C`index\*(C'\fR, \f(CW\*(C`match\*(C'\fR, \f(CW\*(C`gmatch\*(C'\fR, |
| 682 | \&\f(CW\*(C`subst\*(C'\fR, \f(CW\*(C`gsubst\*(C'\fR) will croak, |
| 683 | as the position and the length might differ |
| 684 | from those on the specified string. |
| 685 | (And \f(CW\*(C`rearrange\*(C'\fR and \f(CW\*(C`hangul_terminator\*(C'\fR parameters are neglected.) |
| 686 | .PP |
| 687 | The \f(CW\*(C`match\*(C'\fR, \f(CW\*(C`gmatch\*(C'\fR, \f(CW\*(C`subst\*(C'\fR, \f(CW\*(C`gsubst\*(C'\fR methods work |
| 688 | like \f(CW\*(C`m//\*(C'\fR, \f(CW\*(C`m//g\*(C'\fR, \f(CW\*(C`s///\*(C'\fR, \f(CW\*(C`s///g\*(C'\fR, respectively, |
| 689 | but they do not interpret patterns; they match only a literal substring. |
| 690 | .ie n .IP """$position = $Collator\->index($string, $substring[, $position])""" 4 |
| 691 | .el .IP "\f(CW$position = $Collator\->index($string, $substring[, $position])\fR" 4 |
| 692 | .IX Item "$position = $Collator->index($string, $substring[, $position])" |
| 693 | .PD 0 |
| 694 | .ie n .IP """($position, $length) = $Collator\->index($string, $substring[, $position])""" 4 |
| 695 | .el .IP "\f(CW($position, $length) = $Collator\->index($string, $substring[, $position])\fR" 4 |
| 696 | .IX Item "($position, $length) = $Collator->index($string, $substring[, $position])" |
| 697 | .PD |
| 698 | If \f(CW$substring\fR matches a part of \f(CW$string\fR, returns |
| 699 | the position of the first occurrence of the matching part in scalar context; |
| 700 | in list context, returns a two-element list of |
| 701 | the position and the length of the matching part. |
| 702 | .Sp |
| 703 | If \f(CW$substring\fR does not match any part of \f(CW$string\fR, |
| 704 | returns \f(CW\*(C`\-1\*(C'\fR in scalar context and |
| 705 | an empty list in list context. |
| 706 | .Sp |
| 707 | e.g. you say |
| 708 | .Sp |
| 709 | .Vb 8 |
| 710 | \& my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); |
| 711 | \& # (normalization => undef) is REQUIRED. |
| 712 |