| 1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
| 2 | .\" |
| 3 | .\" Standard preamble: |
| 4 | .\" ======================================================================== |
| 5 | .de Sh \" Subsection heading |
| 6 | .br |
| 7 | .if t .Sp |
| 8 | .ne 5 |
| 9 | .PP |
| 10 | \fB\\$1\fR |
| 11 | .PP |
| 12 | .. |
| 13 | .de Sp \" Vertical space (when we can't use .PP) |
| 14 | .if t .sp .5v |
| 15 | .if n .sp |
| 16 | .. |
| 17 | .de Vb \" Begin verbatim text |
| 18 | .ft CW |
| 19 | .nf |
| 20 | .ne \\$1 |
| 21 | .. |
| 22 | .de Ve \" End verbatim text |
| 23 | .ft R |
| 24 | .fi |
| 25 | .. |
| 26 | .\" Set up some character translations and predefined strings. \*(-- will |
| 27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
| 28 | .\" double quote, and \*(R" will give a right double quote. | will give a |
| 29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to |
| 30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' |
| 31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. |
| 32 | .tr \(*W-|\(bv\*(Tr |
| 33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
| 34 | .ie n \{\ |
| 35 | . ds -- \(*W- |
| 36 | . ds PI pi |
| 37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
| 38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
| 39 | . ds L" "" |
| 40 | . ds R" "" |
| 41 | . ds C` "" |
| 42 | . ds C' "" |
| 43 | 'br\} |
| 44 | .el\{\ |
| 45 | . ds -- \|\(em\| |
| 46 | . ds PI \(*p |
| 47 | . ds L" `` |
| 48 | . ds R" '' |
| 49 | 'br\} |
| 50 | .\" |
| 51 | .\" If the F register is turned on, we'll generate index entries on stderr for |
| 52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index |
| 53 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
| 54 | .\" output yourself in some meaningful fashion. |
| 55 | .if \nF \{\ |
| 56 | . de IX |
| 57 | . tm Index:\\$1\t\\n%\t"\\$2" |
| 58 | .. |
| 59 | . nr % 0 |
| 60 | . rr F |
| 61 | .\} |
| 62 | .\" |
| 63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
| 64 | .\" way too many mistakes in technical documents. |
| 65 | .hy 0 |
| 66 | .if n .na |
| 67 | .\" |
| 68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
| 69 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
| 70 | . \" fudge factors for nroff and troff |
| 71 | .if n \{\ |
| 72 | . ds #H 0 |
| 73 | . ds #V .8m |
| 74 | . ds #F .3m |
| 75 | . ds #[ \f1 |
| 76 | . ds #] \fP |
| 77 | .\} |
| 78 | .if t \{\ |
| 79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
| 80 | . ds #V .6m |
| 81 | . ds #F 0 |
| 82 | . ds #[ \& |
| 83 | . ds #] \& |
| 84 | .\} |
| 85 | . \" simple accents for nroff and troff |
| 86 | .if n \{\ |
| 87 | . ds ' \& |
| 88 | . ds ` \& |
| 89 | . ds ^ \& |
| 90 | . ds , \& |
| 91 | . ds ~ ~ |
| 92 | . ds / |
| 93 | .\} |
| 94 | .if t \{\ |
| 95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
| 96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
| 97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
| 98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
| 99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
| 100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
| 101 | .\} |
| 102 | . \" troff and (daisy-wheel) nroff accents |
| 103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
| 104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
| 105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
| 106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
| 107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
| 108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
| 109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
| 110 | .ds ae a\h'-(\w'a'u*4/10)'e |
| 111 | .ds Ae A\h'-(\w'A'u*4/10)'E |
| 112 | . \" corrections for vroff |
| 113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
| 114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
| 115 | . \" for low resolution devices (crt and lpr) |
| 116 | .if \n(.H>23 .if \n(.V>19 \ |
| 117 | \{\ |
| 118 | . ds : e |
| 119 | . ds 8 ss |
| 120 | . ds o a |
| 121 | . ds d- d\h'-1'\(ga |
| 122 | . ds D- D\h'-1'\(hy |
| 123 | . ds th \o'bp' |
| 124 | . ds Th \o'LP' |
| 125 | . ds ae ae |
| 126 | . ds Ae AE |
| 127 | .\} |
| 128 | .rm #[ #] #H #V #F C |
| 129 | .\" ======================================================================== |
| 130 | .\" |
| 131 | .IX Title "I18N::LangTags 3" |
| 132 | .TH I18N::LangTags 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide" |
| 133 | .SH "NAME" |
| 134 | I18N::LangTags \- functions for dealing with RFC3066\-style language tags |
| 135 | .SH "SYNOPSIS" |
| 136 | .IX Header "SYNOPSIS" |
| 137 | .Vb 1 |
| 138 | \& use I18N::LangTags(); |
| 139 | .Ve |
| 140 | .PP |
| 141 | \&...or specify whichever of those functions you want to import, like so: |
| 142 | .PP |
| 143 | .Vb 1 |
| 144 | \& use I18N::LangTags qw(implicate_supers similarity_language_tag); |
| 145 | .Ve |
| 146 | .PP |
| 147 | All the exportable functions are listed below \*(-- you're free to import |
| 148 | only some, or none at all. By default, none are imported. If you |
| 149 | say: |
| 150 | .PP |
| 151 | .Vb 1 |
| 152 | \& use I18N::LangTags qw(:ALL) |
| 153 | .Ve |
| 154 | .PP |
| 155 | \&...then all are exported. (This saves you from having to use |
| 156 | something less obvious like \f(CW\*(C`use I18N::LangTags qw(/./)\*(C'\fR.) |
| 157 | .PP |
| 158 | If you don't import any of these functions, assume a \f(CW&I18N::LangTags::\fR |
| 159 | in front of all the function names in the following examples. |
| 160 | .SH "DESCRIPTION" |
| 161 | .IX Header "DESCRIPTION" |
| 162 | Language tags are a formalism, described in \s-1RFC\s0 3066 (obsoleting |
| 163 | 1766), for declaring what language form (language and possibly |
| 164 | dialect) a given chunk of information is in. |
| 165 | .PP |
| 166 | This library provides functions for common tasks involving language |
| 167 | tags as they are needed in a variety of protocols and applications. |
| 168 | .PP |
| 169 | Please see the \*(L"See Also\*(R" references for a thorough explanation |
| 170 | of how to correctly use language tags. |
| 171 | .IP "* the function is_language_tag($lang1)" 4 |
| 172 | .IX Item "the function is_language_tag($lang1)" |
| 173 | Returns true iff \f(CW$lang1\fR is a formally valid language tag. |
| 174 | .Sp |
| 175 | .Vb 3 |
| 176 | \& is_language_tag("fr") is TRUE |
| 177 | \& is_language_tag("x-jicarilla") is FALSE |
| 178 | \& (Subtags can be 8 chars long at most -- 'jicarilla' is 9) |
| 179 | .Ve |
| 180 | .Sp |
| 181 | .Vb 2 |
| 182 | \& is_language_tag("sgn-US") is TRUE |
| 183 | \& (That's American Sign Language) |
| 184 | .Ve |
| 185 | .Sp |
| 186 | .Vb 3 |
| 187 | \& is_language_tag("i-Klikitat") is TRUE |
| 188 | \& (True without regard to the fact no one has actually |
| 189 | \& registered Klikitat -- it's a formally valid tag) |
| 190 | .Ve |
| 191 | .Sp |
| 192 | .Vb 2 |
| 193 | \& is_language_tag("fr-patois") is TRUE |
| 194 | \& (Formally valid -- altho descriptively weak!) |
| 195 | .Ve |
| 196 | .Sp |
| 197 | .Vb 4 |
| 198 | \& is_language_tag("Spanish") is FALSE |
| 199 | \& is_language_tag("french-patois") is FALSE |
| 200 | \& (No good -- first subtag has to match |
| 201 | \& /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) |
| 202 | .Ve |
| 203 | .Sp |
| 204 | .Vb 2 |
| 205 | \& is_language_tag("x-borg-prot2532") is TRUE |
| 206 | \& (Yes, subtags can contain digits, as of RFC3066) |
| 207 | .Ve |
| 208 | .IP "* the function extract_language_tags($whatever)" 4 |
| 209 | .IX Item "the function extract_language_tags($whatever)" |
| 210 | Returns a list of whatever looks like formally valid language tags |
| 211 | in \f(CW$whatever\fR. Not very smart, so don't get too creative with |
| 212 | what you want to feed it. |
| 213 | .Sp |
| 214 | .Vb 2 |
| 215 | \& extract_language_tags("fr, fr-ca, i-mingo") |
| 216 | \& returns: ('fr', 'fr-ca', 'i-mingo') |
| 217 | .Ve |
| 218 | .Sp |
| 219 | .Vb 3 |
| 220 | \& extract_language_tags("It's like this: I'm in fr -- French!") |
| 221 | \& returns: ('It', 'in', 'fr') |
| 222 | \& (So don't just feed it any old thing.) |
| 223 | .Ve |
| 224 | .Sp |
| 225 | The output is untainted. If you don't know what tainting is, |
| 226 | don't worry about it. |
| 227 | .ie n .IP "* the function same_language_tag($lang1, $lang2)" 4 |
| 228 | .el .IP "* the function same_language_tag($lang1, \f(CW$lang2\fR)" 4 |
| 229 | .IX Item "the function same_language_tag($lang1, $lang2)" |
| 230 | Returns true iff \f(CW$lang1\fR and \f(CW$lang2\fR are acceptable variant tags |
| 231 | representing the same language\-form. |
| 232 | .Sp |
| 233 | .Vb 10 |
| 234 | \& same_language_tag('x-kadara', 'i-kadara') is TRUE |
| 235 | \& (The x/i- alternation doesn't matter) |
| 236 | \& same_language_tag('X-KADARA', 'i-kadara') is TRUE |
| 237 | \& (...and neither does case) |
| 238 | \& same_language_tag('en', 'en-US') is FALSE |
| 239 | \& (all-English is not the SAME as US English) |
| 240 | \& same_language_tag('x-kadara', 'x-kadar') is FALSE |
| 241 | \& (these are totally unrelated tags) |
| 242 | \& same_language_tag('no-bok', 'nb') is TRUE |
| 243 | \& (no-bok is a legacy tag for nb (Norwegian Bokmal)) |
| 244 | .Ve |
| 245 | .Sp |
| 246 | \&\f(CW\*(C`same_language_tag\*(C'\fR works by just seeing whether |
| 247 | \&\f(CW\*(C`encode_language_tag($lang1)\*(C'\fR is the same as |
| 248 | \&\f(CW\*(C`encode_language_tag($lang2)\*(C'\fR. |
| 249 | .Sp |
| 250 | (Yes, I know this function is named a bit oddly. Call it historic |
| 251 | reasons.) |
| 252 | .ie n .IP "* the function similarity_language_tag($lang1, $lang2)" 4 |
| 253 | .el .IP "* the function similarity_language_tag($lang1, \f(CW$lang2\fR)" 4 |
| 254 | .IX Item "the function similarity_language_tag($lang1, $lang2)" |
| 255 | Returns an integer representing the degree of similarity between |
| 256 | tags \f(CW$lang1\fR and \f(CW$lang2\fR (the order of which does not matter), where |
| 257 | similarity is the number of common elements on the left, |
| 258 | without regard to case and to x/i\- alternation. |
| 259 | .Sp |
| 260 | .Vb 4 |
| 261 | \& similarity_language_tag('fr', 'fr-ca') is 1 |
| 262 | \& (one element in common) |
| 263 | \& similarity_language_tag('fr-ca', 'fr-FR') is 1 |
| 264 | \& (one element in common) |
| 265 | .Ve |
| 266 | .Sp |
| 267 | .Vb 4 |
| 268 | \& similarity_language_tag('fr-CA-joual', |
| 269 | \& 'fr-CA-PEI') is 2 |
| 270 | \& similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 |
| 271 | \& (two elements in common) |
| 272 | .Ve |
| 273 | .Sp |
| 274 | .Vb 2 |
| 275 | \& similarity_language_tag('x-kadara', 'i-kadara') is 1 |
| 276 | \& (x/i- doesn't matter) |
| 277 | .Ve |
| 278 | .Sp |
| 279 | .Vb 3 |
| 280 | \& similarity_language_tag('en', 'x-kadar') is 0 |
| 281 | \& similarity_language_tag('x-kadara', 'x-kadar') is 0 |
| 282 | \& (unrelated tags -- no similarity) |
| 283 | .Ve |
| 284 | .Sp |
| 285 | .Vb 3 |
| 286 | \& similarity_language_tag('i-cree-syllabic', |
| 287 | \& 'i-cherokee-syllabic') is 0 |
| 288 | \& (no LEFTMOST elements in common!) |
| 289 | .Ve |
| 290 | .ie n .IP "* the function is_dialect_of($lang1, $lang2)" 4 |
| 291 | .el .IP "* the function is_dialect_of($lang1, \f(CW$lang2\fR)" 4 |
| 292 | .IX Item "the function is_dialect_of($lang1, $lang2)" |
| 293 | Returns true iff language tag \f(CW$lang1\fR represents a subform of |
| 294 | language tag \f(CW$lang2\fR. |
| 295 | .Sp |
| 296 | \&\fBGet the order right! It doesn't work the other way around!\fR |
| 297 | .Sp |
| 298 | .Vb 2 |
| 299 | \& is_dialect_of('en-US', 'en') is TRUE |
| 300 | \& (American English IS a dialect of all-English) |
| 301 | .Ve |
| 302 | .Sp |
| 303 | .Vb 3 |
| 304 | \& is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE |
| 305 | \& is_dialect_of('fr-CA-joual', 'fr') is TRUE |
| 306 | \& (Joual is a dialect of (a dialect of) French) |
| 307 | .Ve |
| 308 | .Sp |
| 309 | .Vb 2 |
| 310 | \& is_dialect_of('en', 'en-US') is FALSE |
| 311 | \& (all-English is NOT a dialect of American English) |
| 312 | .Ve |
| 313 | .Sp |
| 314 | .Vb 1 |
| 315 | \& is_dialect_of('fr', 'en-CA') is FALSE |
| 316 | .Ve |
| 317 | .Sp |
| 318 | .Vb 3 |
| 319 | \& is_dialect_of('en', 'en' ) is TRUE |
| 320 | \& is_dialect_of('en-US', 'en-US') is TRUE |
| 321 | \& (Note: these are degenerate cases) |
| 322 | .Ve |
| 323 | .Sp |
| 324 | .Vb 2 |
| 325 | \& is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE |
| 326 | \& (the x/i thing doesn't matter, nor does case) |
| 327 | .Ve |
| 328 | .Sp |
| 329 | .Vb 4 |
| 330 | \& is_dialect_of('nn', 'no') is TRUE |
| 331 | \& (because 'nn' (New Norse) is aliased to 'no-nyn', |
| 332 | \& as a special legacy case, and 'no-nyn' is a |
| 333 | \& subform of 'no' (Norwegian)) |
| 334 | .Ve |
| 335 | .IP "* the function super_languages($lang1)" 4 |
| 336 | .IX Item "the function super_languages($lang1)" |
| 337 | Returns a list of language tags that are superordinate tags to \f(CW$lang1\fR |
| 338 | \&\*(-- it gets this by removing subtags from the end of \f(CW$lang1\fR until |
| 339 | nothing (or just \*(L"i\*(R" or \*(L"x\*(R") is left. |
| 340 | .Sp |
| 341 | .Vb 1 |
| 342 | \& super_languages("fr-CA-joual") is ("fr-CA", "fr") |
| 343 | .Ve |
| 344 | .Sp |
| 345 | .Vb 1 |
| 346 | \& super_languages("en-AU") is ("en") |
| 347 | .Ve |
| 348 | .Sp |
| 349 | .Vb 1 |
| 350 | \& super_languages("en") is empty-list, () |
| 351 | .Ve |
| 352 | .Sp |
| 353 | .Vb 2 |
| 354 | \& super_languages("i-cherokee") is empty-list, () |
| 355 | \& ...not ("i"), which would be illegal as well as pointless. |
| 356 | .Ve |
| 357 | .Sp |
| 358 | If \f(CW$lang1\fR is not a valid language tag, returns empty-list in |
| 359 | a list context, undef in a scalar context. |
| 360 | .Sp |
| 361 | A notable and rather unavoidable problem with this method: |
| 362 | \&\*(L"x\-mingo\-tom\*(R" has an \*(L"x\*(R" because the whole tag isn't an |
| 363 | IANA-registered tag \*(-- but super_languages('x\-mingo\-tom') is |
| 364 | ('x\-mingo') \*(-- which isn't really right, since 'i\-mingo' is |
| 365 | registered. But this module has no way of knowing that. (But note |
| 366 | that same_language_tag('x\-mingo', 'i\-mingo') is \s-1TRUE\s0.) |
| 367 | .Sp |
| 368 | More importantly, you assume \fIat your peril\fR that superordinates of |
| 369 | \&\f(CW$lang1\fR are mutually intelligible with \f(CW$lang1\fR. Consider this |
| 370 | carefully. |
| 371 | .IP "* the function locale2language_tag($locale_identifier)" 4 |
| 372 | .IX Item "the function locale2language_tag($locale_identifier)" |
| 373 | This takes a locale name (like \*(L"en\*(R", \*(L"en_US\*(R", or \*(L"en_US.ISO8859\-1\*(R") |
| 374 | and maps it to a language tag. If it's not mappable (as with, |
| 375 | notably, \*(L"C\*(R" and \*(L"\s-1POSIX\s0\*(R"), this returns empty-list in a list context, |
| 376 | or undef in a scalar context. |
| 377 | .Sp |
| 378 | .Vb 1 |
| 379 | \& locale2language_tag("en") is "en" |
| 380 | .Ve |
| 381 | .Sp |
| 382 | .Vb 1 |
| 383 | \& locale2language_tag("en_US") is "en-US" |
| 384 | .Ve |
| 385 | .Sp |
| 386 | .Vb 1 |
| 387 | \& locale2language_tag("en_US.ISO8859-1") is "en-US" |
| 388 | .Ve |
| 389 | .Sp |
| 390 | .Vb 1 |
| 391 | \& locale2language_tag("C") is undef or () |
| 392 | .Ve |
| 393 | .Sp |
| 394 | .Vb 1 |
| 395 | \& locale2language_tag("POSIX") is undef or () |
| 396 | .Ve |
| 397 | .Sp |
| 398 | .Vb 1 |
| 399 | \& locale2language_tag("POSIX") is undef or () |
| 400 | .Ve |
| 401 | .Sp |
| 402 | I'm not totally sure that locale names map satisfactorily to language |
| 403 | tags. Think \s-1REAL\s0 hard about how you use this. \s-1YOU\s0 \s-1HAVE\s0 \s-1BEEN\s0 \s-1WARNED\s0. |
| 404 | .Sp |
| 405 | The output is untainted. If you don't know what tainting is, |
| 406 | don't worry about it. |
| 407 | .IP "* the function encode_language_tag($lang1)" 4 |
| 408 | .IX Item "the function encode_language_tag($lang1)" |
| 409 | This function, if given a language tag, returns an encoding of it such |
| 410 | that: |
| 411 | .Sp |
| 412 | * tags representing different languages never get the same encoding. |
| 413 | .Sp |
| 414 | * tags representing the same language always get the same encoding. |
| 415 | .Sp |
| 416 | * an encoding of a formally valid language tag always is a string |
| 417 | value that is defined, has length, and is true if considered as a |
| 418 | boolean. |
| 419 | .Sp |
| 420 | Note that the encoding itself is \fBnot\fR a formally valid language tag. |
| 421 | Note also that you cannot, currently, go from an encoding back to a |
| 422 | language tag that it's an encoding of. |
| 423 | .Sp |
| 424 | Note also that you \fBmust\fR consider the encoded value as atomic; i.e., |
| 425 | you should not consider it as anything but an opaque, unanalysable |
| 426 | string value. (The internals of the encoding method may change in |
| 427 | future versions, as the language tagging standard changes over time.) |
| 428 | .Sp |
| 429 | \&\f(CW\*(C`encode_language_tag\*(C'\fR returns undef if given anything other than a |
| 430 | formally valid language tag. |
| 431 | .Sp |
| 432 | The reason \f(CW\*(C`encode_language_tag\*(C'\fR exists is because different language |
| 433 | tags may represent the same language; this is normally treatable with |
| 434 | \&\f(CW\*(C`same_language_tag\*(C'\fR, but consider this situation: |
| 435 | .Sp |
| 436 | You have a data file that expresses greetings in different languages. |
| 437 | Its format is \*(L"[language tag]=[how to say 'Hello']\*(R", like: |
| 438 | .Sp |
| 439 | .Vb 3 |
| 440 | \& en-US=Hiho |
| 441 | \& fr=Bonjour |
| 442 | \& i-mingo=Hau' |
| 443 | .Ve |
| 444 | .Sp |
| 445 | And suppose you write a program that reads that file and then runs as |
| 446 | a daemon, answering client requests that specify a language tag and |
| 447 | then expect the string that says how to greet in that language. So an |
| 448 | interaction looks like: |
| 449 | .Sp |
| 450 | .Vb 2 |
| 451 | \& greeting-client asks: fr |
| 452 | \& greeting-server answers: Bonjour |
| 453 | .Ve |
| 454 | .Sp |
| 455 | So far so good. But suppose the way you're implementing this is: |
| 456 | .Sp |
| 457 | .Vb 9 |
| 458 | \& my %greetings; |
| 459 | \& die unless open(IN, "<in.dat"); |
| 460 | \& while(<IN>) { |
| 461 | \& chomp; |
| 462 | \& next unless /^([^=]+)=(.+)/s; |
| 463 | \& my($lang, $expr) = ($1, $2); |
| 464 | \& $greetings{$lang} = $expr; |
| 465 | \& } |
| 466 | \& close(IN); |
| 467 | .Ve |
| 468 | .Sp |
| 469 | at which point \f(CW%greetings\fR has the contents: |
| 470 | .Sp |
| 471 | .Vb 3 |
| 472 | \& "en-US" => "Hiho" |
| 473 | \& "fr" => "Bonjour" |
| 474 | \& "i-mingo" => "Hau'" |
| 475 | .Ve |
| 476 | .Sp |
| 477 | And suppose then that you answer client requests for language \f(CW$wanted\fR |
| 478 | by just looking up \f(CW$greetings\fR{$wanted}. |
| 479 | .Sp |
| 480 | If the client asks for \*(L"fr\*(R", that will look up successfully in |
| 481 | \&\f(CW%greetings\fR, to the value \*(L"Bonjour\*(R". And if the client asks for |
| 482 | \&\*(L"i\-mingo\*(R", that will look up successfully in \f(CW%greetings\fR, to the value |
| 483 | \&\*(L"Hau'\*(R". |
| 484 | .Sp |
| 485 | But if the client asks for \*(L"i\-Mingo\*(R" or \*(L"x\-mingo\*(R", or \*(L"Fr\*(R", then the |
| 486 | lookup in \f(CW%greetings\fR fails. That's the Wrong Thing. |
| 487 | .Sp |
| 488 | You could instead do lookups on \f(CW$wanted\fR with: |
| 489 | .Sp |
| 490 | .Vb 8 |
| 491 | \& use I18N::LangTags qw(same_language_tag); |
| 492 | \& my $response = ''; |
| 493 | \& foreach my $l2 (keys %greetings) { |
| 494 | \& if(same_language_tag($wanted, $l2)) { |
| 495 | \& $response = $greetings{$l2}; |
| 496 | \& last; |
| 497 | \& } |
| 498 | \& } |
| 499 | .Ve |
| 500 | .Sp |
| 501 | But that's rather inefficient. A better way to do it is to start your |
| 502 | program with: |
| 503 | .Sp |
| 504 | .Vb 12 |
| 505 | \& use I18N::LangTags qw(encode_language_tag); |
| 506 | \& my %greetings; |
| 507 | \& die unless open(IN, "<in.dat"); |
| 508 | \& while(<IN>) { |
| 509 | \& chomp; |
| 510 | \& next unless /^([^=]+)=(.+)/s; |
| 511 | \& my($lang, $expr) = ($1, $2); |
| 512 | \& $greetings{ |
| 513 | \& encode_language_tag($lang) |
| 514 | \& } = $expr; |
| 515 | \& } |
| 516 | \& close(IN); |
| 517 | .Ve |
| 518 | .Sp |
| 519 | and then just answer client requests for language \f(CW$wanted\fR by just |
| 520 | looking up |
| 521 | .Sp |
| 522 | .Vb 1 |
| 523 | \& $greetings{encode_language_tag($wanted)} |
| 524 | .Ve |
| 525 | .Sp |
| 526 | And that does the Right Thing. |
| 527 | .IP "* the function alternate_language_tags($lang1)" 4 |
| 528 | .IX Item "the function alternate_language_tags($lang1)" |
| 529 | This function, if given a language tag, returns all language tags that |
| 530 | are alternate forms of this language tag. (I.e., tags which refer to |
| 531 | the same language.) This is meant to handle legacy tags caused by |
| 532 | the minor changes in language tag standards over the years; and |
| 533 | the x\-/i\- alternation is also dealt with. |
| 534 | .Sp |
| 535 | Note that this function does \fInot\fR try to equate new (and never\-used, |
| 536 | and unusable) |
| 537 | \&\s-1ISO639\-2\s0 three-letter tags to old (and still in use) \s-1ISO639\-1\s0 |
| 538 | two-letter equivalents \*(-- like \*(L"ara\*(R" \-> \*(L"ar\*(R" \*(-- because |
| 539 | \&\*(L"ara\*(R" has \fInever\fR been in use as an Internet language tag, |
| 540 | and \s-1RFC\s0 3066 stipulates that it never should be, since a shorter |
| 541 | tag (\*(L"ar\*(R") exists. |
| 542 | .Sp |
| 543 | Examples: |
| 544 | .Sp |
| 545 | .Vb 10 |
| 546 | \& alternate_language_tags('no-bok') is ('nb') |
| 547 | \& alternate_language_tags('nb') is ('no-bok') |
| 548 | \& alternate_language_tags('he') is ('iw') |
| 549 | \& alternate_language_tags('iw') is ('he') |
| 550 | \& alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') |
| 551 | \& alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') |
| 552 | \& alternate_language_tags('en') is () |
| 553 | \& alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') |
| 554 | \& alternate_language_tags('x-klikitat') is ('i-klikitat') |
| 555 | \& alternate_language_tags('i-klikitat') is ('x-klikitat') |
| 556 | .Ve |
| 557 | .Sp |
| 558 | This function returns empty-list if given anything other than a formally |
| 559 | valid language tag. |
| 560 | .ie n .IP "* the function @langs = panic_languages(@accept_languages)" 4 |
| 561 | .el .IP "* the function \f(CW@langs\fR = panic_languages(@accept_languages)" 4 |
| 562 | .IX Item "the function @langs = panic_languages(@accept_languages)" |
| 563 | This function takes a list of 0 or more language |
| 564 | tags that constitute a given user's Accept-Language list, and |
| 565 | returns a list of tags for \fIother\fR (non\-super) |
| 566 | languages that are probably acceptable to the user, to be |
| 567 | used \fIif all else fails\fR. |
| 568 | .Sp |
| 569 | For example, if a user accepts only 'ca' (Catalan) and |
| 570 | \&'es' (Spanish), and the documents/interfaces you have |
| 571 | available are just in German, Italian, and Chinese, then |
| 572 | the user will most likely want the Italian one (and not |
| 573 | the Chinese or German one!), instead of getting |
| 574 | nothing. So \f(CW\*(C`panic_languages('ca', 'es')\*(C'\fR returns |
| 575 | a list containing 'it' (Italian). |
| 576 | .Sp |
| 577 | English ('en') is \fIalways\fR in the return list, but |
| 578 | whether it's at the very end or not depends |
| 579 | on the input languages. This function works by consulting |
| 580 | an internal table that stipulates what common |
| 581 | languages are \*(L"close\*(R" to each other. |
| 582 | .Sp |
| 583 | A useful construct you might consider using is: |
| 584 | .Sp |
| 585 | .Vb 4 |
| 586 | \& @fallbacks = map { super_languages($_) } @accept_languages; |
| 587 | \& push @fallbacks, panic_languages( |
| 588 | \& @accept_languages, @fallbacks, |
| 589 | \& ); |
| 590 | .Ve |
| 591 | .IP "* the function implicate_supers( ...languages... )" 4 |
| 592 | .IX Item "the function implicate_supers( ...languages... )" |
| 593 | This takes a list of strings (which are presumed to be language\-tags; |
| 594 | strings that aren't, are ignored); and after each one, this function |
| 595 | inserts super-ordinate forms that don't already appear in the list. |
| 596 | The original list, plus these insertions, is returned. |
| 597 | .Sp |
| 598 | In other words, it takes this: |
| 599 | .Sp |
| 600 | .Vb 1 |
| 601 | \& pt-br de-DE en-US fr pt-br-janeiro |
| 602 | .Ve |
| 603 | .Sp |
| 604 | and returns this: |
| 605 | .Sp |
| 606 | .Vb 1 |
| 607 | \& pt-br pt de-DE de en-US en fr pt-br-janeiro |
| 608 | .Ve |
| 609 | .Sp |
| 610 | This function is most useful in the idiom |
| 611 | .Sp |
| 612 | .Vb 1 |
| 613 | \& implicate_supers( I18N::LangTags::Detect::detect() ); |
| 614 | .Ve |
| 615 | .Sp |
| 616 | (See I18N::LangTags::Detect.) |
| 617 | .IP "* the function implicate_supers_strictly( ...languages... )" 4 |
| 618 | .IX Item "the function implicate_supers_strictly( ...languages... )" |
| 619 | This works like \f(CW\*(C`implicate_supers\*(C'\fR except that the implicated |
| 620 | forms are added to the end of the return list. |
| 621 | .Sp |
| 622 | In other words, implicate_supers_strictly takes a list of strings |
| 623 | (which are presumed to be language\-tags; strings that aren't, are |
| 624 | ignored) and after the whole given list, it inserts the super-ordinate forms |
| 625 | of all given tags, minus any tags that already appear in the input list. |
| 626 | .Sp |
| 627 | In other words, it takes this: |
| 628 | .Sp |
| 629 | .Vb 1 |
| 630 | \& pt-br de-DE en-US fr pt-br-janeiro |
| 631 | .Ve |
| 632 | .Sp |
| 633 | and returns this: |
| 634 | .Sp |
| 635 | .Vb 1 |
| 636 | \& pt-br de-DE en-US fr pt-br-janeiro pt de en |
| 637 | .Ve |
| 638 | .Sp |
| 639 | The reason this function has \*(L"_strictly\*(R" in its name is that when |
| 640 | you're processing an Accept-Language list according to the RFCs, if |
| 641 | you interpret the RFCs quite strictly, then you would use |
| 642 | implicate_supers_strictly, but for normal use (i.e., common-sense use, |
| 643 | as far as I'm concerned) you'd use implicate_supers. |
| 644 | .SH "ABOUT LOWERCASING" |
| 645 | .IX Header "ABOUT LOWERCASING" |
| 646 | I've considered making all the above functions that output language |
| 647 | tags return all those tags strictly in lowercase. Having all your |
| 648 | language tags in lowercase does make some things easier. But you |
| 649 | might as well just lowercase as you like, or call |
| 650 | \&\f(CW\*(C`encode_language_tag($lang1)\*(C'\fR where appropriate. |
| 651 | .SH "ABOUT UNICODE PLAINTEXT LANGUAGE TAGS" |
| 652 | .IX Header "ABOUT UNICODE PLAINTEXT LANGUAGE TAGS" |
| 653 | In some future version of I18N::LangTags, I plan to include support |
| 654 | for RFC2482\-style language tags \*(-- which are basically just normal |
| 655 | language tags with their \s-1ASCII\s0 characters shifted into Plane 14. |
| 656 | .SH "SEE ALSO" |
| 657 | .IX Header "SEE ALSO" |
| 658 | * I18N::LangTags::List |
| 659 | .PP |
| 660 | * \s-1RFC\s0 3066, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc3066.txt\*(C'\fR, \*(L"Tags for the |
| 661 | Identification of Languages\*(R". (Obsoletes \s-1RFC\s0 1766) |
| 662 | .PP |
| 663 | * \s-1RFC\s0 2277, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2277.txt\*(C'\fR, \*(L"\s-1IETF\s0 Policy on |
| 664 | Character Sets and Languages\*(R". |
| 665 | .PP |
| 666 | * \s-1RFC\s0 2231, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2231.txt\*(C'\fR, \*(L"\s-1MIME\s0 Parameter |
| 667 | Value and Encoded Word Extensions: Character Sets, Languages, and |
| 668 | Continuations\*(R". |
| 669 | .PP |
| 670 | * \s-1RFC\s0 2482, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2482.txt\*(C'\fR, |
| 671 | \&\*(L"Language Tagging in Unicode Plain Text\*(R". |
| 672 | .PP |
| 673 | * Locale::Codes, in |
| 674 | \&\f(CW\*(C`http://www.perl.com/CPAN/modules/by\-module/Locale/\*(C'\fR |
| 675 | .PP |
| 676 | * \s-1ISO\s0 639\-2, \*(L"Codes for the representation of names of languages\*(R", |
| 677 | including two-letter and three-letter codes, |
| 678 | \&\f(CW\*(C`http://www.loc.gov/standards/iso639\-2/langcodes.html\*(C'\fR |
| 679 | .PP |
| 680 | * The \s-1IANA\s0 list of registered languages (hopefully up\-to\-date), |
| 681 | \&\f(CW\*(C`http://www.iana.org/assignments/language\-tags\*(C'\fR |
| 682 | .SH "COPYRIGHT" |
| 683 | .IX Header "COPYRIGHT" |
| 684 | Copyright (c) 1998+ Sean M. Burke. All rights reserved. |
| 685 | .PP |
| 686 | This library is free software; you can redistribute it and/or |
| 687 | modify it under the same terms as Perl itself. |
| 688 | .PP |
| 689 | The programs and documentation in this dist are distributed in |
| 690 | the hope that they will be useful, but without any warranty; without |
| 691 | even the implied warranty of merchantability or fitness for a |
| 692 | particular purpose. |
| 693 | .SH "AUTHOR" |
| 694 | .IX Header "AUTHOR" |
| 695 | Sean M. Burke \f(CW\*(C`sburke@cpan.org\*(C'\fR |