Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "I18N::LangTags 3" | |
132 | .TH I18N::LangTags 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | I18N::LangTags \- functions for dealing with RFC3066\-style language tags | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 6 | |
138 | \& use I18N::LangTags qw(is_language_tag same_language_tag | |
139 | \& extract_language_tags super_languages | |
140 | \& similarity_language_tag is_dialect_of | |
141 | \& locale2language_tag alternate_language_tags | |
142 | \& encode_language_tag panic_languages | |
143 | \& ); | |
144 | .Ve | |
145 | .PP | |
146 | \&...or whatever of those functions you want to import. Those are | |
147 | all the exportable functions \*(-- you're free to import only some, | |
148 | or none at all. By default, none are imported. If you say: | |
149 | .PP | |
150 | .Vb 1 | |
151 | \& use I18N::LangTags qw(:ALL) | |
152 | .Ve | |
153 | .PP | |
154 | \&...then all are exported. (This saves you from having to use | |
155 | something less obvious like \f(CW\*(C`use I18N::LangTags qw(/./)\*(C'\fR.) | |
156 | .PP | |
157 | If you don't import any of these functions, assume a \f(CW&I18N::LangTags::\fR | |
158 | in front of all the function names in the following examples. | |
159 | .SH "DESCRIPTION" | |
160 | .IX Header "DESCRIPTION" | |
161 | Language tags are a formalism, described in \s-1RFC\s0 3066 (obsoleting | |
162 | 1766), for declaring what language form (language and possibly | |
163 | dialect) a given chunk of information is in. | |
164 | .PP | |
165 | This library provides functions for common tasks involving language | |
166 | tags as they are needed in a variety of protocols and applications. | |
167 | .PP | |
168 | Please see the \*(L"See Also\*(R" references for a thorough explanation | |
169 | of how to correctly use language tags. | |
170 | .IP "\(bu the function is_language_tag($lang1)" 4 | |
171 | .IX Item "the function is_language_tag($lang1)" | |
172 | Returns true iff \f(CW$lang1\fR is a formally valid language tag. | |
173 | .Sp | |
174 | .Vb 3 | |
175 | \& is_language_tag("fr") is TRUE | |
176 | \& is_language_tag("x-jicarilla") is FALSE | |
177 | \& (Subtags can be 8 chars long at most -- 'jicarilla' is 9) | |
178 | .Ve | |
179 | .Sp | |
180 | .Vb 2 | |
181 | \& is_language_tag("sgn-US") is TRUE | |
182 | \& (That's American Sign Language) | |
183 | .Ve | |
184 | .Sp | |
185 | .Vb 3 | |
186 | \& is_language_tag("i-Klikitat") is TRUE | |
187 | \& (True without regard to the fact no one has actually | |
188 | \& registered Klikitat -- it's a formally valid tag) | |
189 | .Ve | |
190 | .Sp | |
191 | .Vb 2 | |
192 | \& is_language_tag("fr-patois") is TRUE | |
193 | \& (Formally valid -- altho descriptively weak!) | |
194 | .Ve | |
195 | .Sp | |
196 | .Vb 4 | |
197 | \& is_language_tag("Spanish") is FALSE | |
198 | \& is_language_tag("french-patois") is FALSE | |
199 | \& (No good -- first subtag has to match | |
200 | \& /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) | |
201 | .Ve | |
202 | .Sp | |
203 | .Vb 2 | |
204 | \& is_language_tag("x-borg-prot2532") is TRUE | |
205 | \& (Yes, subtags can contain digits, as of RFC3066) | |
206 | .Ve | |
207 | .IP "\(bu the function extract_language_tags($whatever)" 4 | |
208 | .IX Item "the function extract_language_tags($whatever)" | |
209 | Returns a list of whatever looks like formally valid language tags | |
210 | in \f(CW$whatever\fR. Not very smart, so don't get too creative with | |
211 | what you want to feed it. | |
212 | .Sp | |
213 | .Vb 2 | |
214 | \& extract_language_tags("fr, fr-ca, i-mingo") | |
215 | \& returns: ('fr', 'fr-ca', 'i-mingo') | |
216 | .Ve | |
217 | .Sp | |
218 | .Vb 3 | |
219 | \& extract_language_tags("It's like this: I'm in fr -- French!") | |
220 | \& returns: ('It', 'in', 'fr') | |
221 | \& (So don't just feed it any old thing.) | |
222 | .Ve | |
223 | .Sp | |
224 | The output is untainted. If you don't know what tainting is, | |
225 | don't worry about it. | |
226 | .ie n .IP "\(bu the function same_language_tag($lang1, $lang2)" 4 | |
227 | .el .IP "\(bu the function same_language_tag($lang1, \f(CW$lang2\fR)" 4 | |
228 | .IX Item "the function same_language_tag($lang1, $lang2)" | |
229 | Returns true iff \f(CW$lang1\fR and \f(CW$lang2\fR are acceptable variant tags | |
230 | representing the same language\-form. | |
231 | .Sp | |
232 | .Vb 10 | |
233 | \& same_language_tag('x-kadara', 'i-kadara') is TRUE | |
234 | \& (The x/i- alternation doesn't matter) | |
235 | \& same_language_tag('X-KADARA', 'i-kadara') is TRUE | |
236 | \& (...and neither does case) | |
237 | \& same_language_tag('en', 'en-US') is FALSE | |
238 | \& (all-English is not the SAME as US English) | |
239 | \& same_language_tag('x-kadara', 'x-kadar') is FALSE | |
240 | \& (these are totally unrelated tags) | |
241 | \& same_language_tag('no-bok', 'nb') is TRUE | |
242 | \& (no-bok is a legacy tag for nb (Norwegian Bokmal)) | |
243 | .Ve | |
244 | .Sp | |
245 | \&\f(CW\*(C`same_language_tag\*(C'\fR works by just seeing whether | |
246 | \&\f(CW\*(C`encode_language_tag($lang1)\*(C'\fR is the same as | |
247 | \&\f(CW\*(C`encode_language_tag($lang2)\*(C'\fR. | |
248 | .Sp | |
249 | (Yes, I know this function is named a bit oddly. Call it historic | |
250 | reasons.) | |
251 | .ie n .IP "\(bu the function similarity_language_tag($lang1, $lang2)" 4 | |
252 | .el .IP "\(bu the function similarity_language_tag($lang1, \f(CW$lang2\fR)" 4 | |
253 | .IX Item "the function similarity_language_tag($lang1, $lang2)" | |
254 | Returns an integer representing the degree of similarity between | |
255 | tags \f(CW$lang1\fR and \f(CW$lang2\fR (the order of which does not matter), where | |
256 | similarity is the number of common elements on the left, | |
257 | without regard to case and to x/i\- alternation. | |
258 | .Sp | |
259 | .Vb 4 | |
260 | \& similarity_language_tag('fr', 'fr-ca') is 1 | |
261 | \& (one element in common) | |
262 | \& similarity_language_tag('fr-ca', 'fr-FR') is 1 | |
263 | \& (one element in common) | |
264 | .Ve | |
265 | .Sp | |
266 | .Vb 4 | |
267 | \& similarity_language_tag('fr-CA-joual', | |
268 | \& 'fr-CA-PEI') is 2 | |
269 | \& similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 | |
270 | \& (two elements in common) | |
271 | .Ve | |
272 | .Sp | |
273 | .Vb 2 | |
274 | \& similarity_language_tag('x-kadara', 'i-kadara') is 1 | |
275 | \& (x/i- doesn't matter) | |
276 | .Ve | |
277 | .Sp | |
278 | .Vb 3 | |
279 | \& similarity_language_tag('en', 'x-kadar') is 0 | |
280 | \& similarity_language_tag('x-kadara', 'x-kadar') is 0 | |
281 | \& (unrelated tags -- no similarity) | |
282 | .Ve | |
283 | .Sp | |
284 | .Vb 3 | |
285 | \& similarity_language_tag('i-cree-syllabic', | |
286 | \& 'i-cherokee-syllabic') is 0 | |
287 | \& (no LEFTMOST elements in common!) | |
288 | .Ve | |
289 | .ie n .IP "\(bu the function is_dialect_of($lang1, $lang2)" 4 | |
290 | .el .IP "\(bu the function is_dialect_of($lang1, \f(CW$lang2\fR)" 4 | |
291 | .IX Item "the function is_dialect_of($lang1, $lang2)" | |
292 | Returns true iff language tag \f(CW$lang1\fR represents a subform of | |
293 | language tag \f(CW$lang2\fR. | |
294 | .Sp | |
295 | \&\fBGet the order right! It doesn't work the other way around!\fR | |
296 | .Sp | |
297 | .Vb 2 | |
298 | \& is_dialect_of('en-US', 'en') is TRUE | |
299 | \& (American English IS a dialect of all-English) | |
300 | .Ve | |
301 | .Sp | |
302 | .Vb 3 | |
303 | \& is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE | |
304 | \& is_dialect_of('fr-CA-joual', 'fr') is TRUE | |
305 | \& (Joual is a dialect of (a dialect of) French) | |
306 | .Ve | |
307 | .Sp | |
308 | .Vb 2 | |
309 | \& is_dialect_of('en', 'en-US') is FALSE | |
310 | \& (all-English is NOT a dialect of American English) | |
311 | .Ve | |
312 | .Sp | |
313 | .Vb 1 | |
314 | \& is_dialect_of('fr', 'en-CA') is FALSE | |
315 | .Ve | |
316 | .Sp | |
317 | .Vb 3 | |
318 | \& is_dialect_of('en', 'en' ) is TRUE | |
319 | \& is_dialect_of('en-US', 'en-US') is TRUE | |
320 | \& (Note: these are degenerate cases) | |
321 | .Ve | |
322 | .Sp | |
323 | .Vb 2 | |
324 | \& is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE | |
325 | \& (the x/i thing doesn't matter, nor does case) | |
326 | .Ve | |
327 | .Sp | |
328 | .Vb 4 | |
329 | \& is_dialect_of('nn', 'no') is TRUE | |
330 | \& (because 'nn' (New Norse) is aliased to 'no-nyn', | |
331 | \& as a special legacy case, and 'no-nyn' is a | |
332 | \& subform of 'no' (Norwegian)) | |
333 | .Ve | |
334 | .IP "\(bu the function super_languages($lang1)" 4 | |
335 | .IX Item "the function super_languages($lang1)" | |
336 | Returns a list of language tags that are superordinate tags to \f(CW$lang1\fR | |
337 | \&\*(-- it gets this by removing subtags from the end of \f(CW$lang1\fR until | |
338 | nothing (or just \*(L"i\*(R" or \*(L"x\*(R") is left. | |
339 | .Sp | |
340 | .Vb 1 | |
341 | \& super_languages("fr-CA-joual") is ("fr-CA", "fr") | |
342 | .Ve | |
343 | .Sp | |
344 | .Vb 1 | |
345 | \& super_languages("en-AU") is ("en") | |
346 | .Ve | |
347 | .Sp | |
348 | .Vb 1 | |
349 | \& super_languages("en") is empty-list, () | |
350 | .Ve | |
351 | .Sp | |
352 | .Vb 2 | |
353 | \& super_languages("i-cherokee") is empty-list, () | |
354 | \& ...not ("i"), which would be illegal as well as pointless. | |
355 | .Ve | |
356 | .Sp | |
357 | If \f(CW$lang1\fR is not a valid language tag, returns empty-list in | |
358 | a list context, undef in a scalar context. | |
359 | .Sp | |
360 | A notable and rather unavoidable problem with this method: | |
361 | \&\*(L"x\-mingo\-tom\*(R" has an \*(L"x\*(R" because the whole tag isn't an | |
362 | IANA-registered tag \*(-- but super_languages('x\-mingo\-tom') is | |
363 | ('x\-mingo') \*(-- which isn't really right, since 'i\-mingo' is | |
364 | registered. But this module has no way of knowing that. (But note | |
365 | that same_language_tag('x\-mingo', 'i\-mingo') is \s-1TRUE\s0.) | |
366 | .Sp | |
367 | More importantly, you assume \fIat your peril\fR that superordinates of | |
368 | \&\f(CW$lang1\fR are mutually intelligible with \f(CW$lang1\fR. Consider this | |
369 | carefully. | |
370 | .IP "\(bu the function locale2language_tag($locale_identifier)" 4 | |
371 | .IX Item "the function locale2language_tag($locale_identifier)" | |
372 | This takes a locale name (like \*(L"en\*(R", \*(L"en_US\*(R", or \*(L"en_US.ISO8859\-1\*(R") | |
373 | and maps it to a language tag. If it's not mappable (as with, | |
374 | notably, \*(L"C\*(R" and \*(L"\s-1POSIX\s0\*(R"), this returns empty-list in a list context, | |
375 | or undef in a scalar context. | |
376 | .Sp | |
377 | .Vb 1 | |
378 | \& locale2language_tag("en") is "en" | |
379 | .Ve | |
380 | .Sp | |
381 | .Vb 1 | |
382 | \& locale2language_tag("en_US") is "en-US" | |
383 | .Ve | |
384 | .Sp | |
385 | .Vb 1 | |
386 | \& locale2language_tag("en_US.ISO8859-1") is "en-US" | |
387 | .Ve | |
388 | .Sp | |
389 | .Vb 1 | |
390 | \& locale2language_tag("C") is undef or () | |
391 | .Ve | |
392 | .Sp | |
393 | .Vb 1 | |
394 | \& locale2language_tag("POSIX") is undef or () | |
395 | .Ve | |
396 | .Sp | |
397 | .Vb 1 | |
398 | \& locale2language_tag("POSIX") is undef or () | |
399 | .Ve | |
400 | .Sp | |
401 | I'm not totally sure that locale names map satisfactorily to language | |
402 | tags. Think \s-1REAL\s0 hard about how you use this. \s-1YOU\s0 \s-1HAVE\s0 \s-1BEEN\s0 \s-1WARNED\s0. | |
403 | .Sp | |
404 | The output is untainted. If you don't know what tainting is, | |
405 | don't worry about it. | |
406 | .IP "\(bu the function encode_language_tag($lang1)" 4 | |
407 | .IX Item "the function encode_language_tag($lang1)" | |
408 | This function, if given a language tag, returns an encoding of it such | |
409 | that: | |
410 | .Sp | |
411 | * tags representing different languages never get the same encoding. | |
412 | .Sp | |
413 | * tags representing the same language always get the same encoding. | |
414 | .Sp | |
415 | * an encoding of a formally valid language tag always is a string | |
416 | value that is defined, has length, and is true if considered as a | |
417 | boolean. | |
418 | .Sp | |
419 | Note that the encoding itself is \fBnot\fR a formally valid language tag. | |
420 | Note also that you cannot, currently, go from an encoding back to a | |
421 | language tag that it's an encoding of. | |
422 | .Sp | |
423 | Note also that you \fBmust\fR consider the encoded value as atomic; i.e., | |
424 | you should not consider it as anything but an opaque, unanalysable | |
425 | string value. (The internals of the encoding method may change in | |
426 | future versions, as the language tagging standard changes over time.) | |
427 | .Sp | |
428 | \&\f(CW\*(C`encode_language_tag\*(C'\fR returns undef if given anything other than a | |
429 | formally valid language tag. | |
430 | .Sp | |
431 | The reason \f(CW\*(C`encode_language_tag\*(C'\fR exists is that different language | |
432 | tags may represent the same language; this is normally treatable with | |
433 | \&\f(CW\*(C`same_language_tag\*(C'\fR, but consider this situation: | |
434 | .Sp | |
435 | You have a data file that expresses greetings in different languages. | |
436 | Its format is \*(L"[language tag]=[how to say 'Hello']\*(R", like: | |
437 | .Sp | |
438 | .Vb 3 | |
439 | \& en-US=Hiho | |
440 | \& fr=Bonjour | |
441 | \& i-mingo=Hau' | |
442 | .Ve | |
443 | .Sp | |
444 | And suppose you write a program that reads that file and then runs as | |
445 | a daemon, answering client requests that specify a language tag and | |
446 | then expect the string that says how to greet in that language. So an | |
447 | interaction looks like: | |
448 | .Sp | |
449 | .Vb 2 | |
450 | \& greeting-client asks: fr | |
451 | \& greeting-server answers: Bonjour | |
452 | .Ve | |
453 | .Sp | |
454 | So far so good. But suppose the way you're implementing this is: | |
455 | .Sp | |
456 | .Vb 9 | |
457 | \& my %greetings; | |
458 | \& die unless open(IN, "<in.dat"); | |
459 | \& while(<IN>) { | |
460 | \& chomp; | |
461 | \& next unless /^([^=]+)=(.+)/s; | |
462 | \& my($lang, $expr) = ($1, $2); | |
463 | \& $greetings{$lang} = $expr; | |
464 | \& } | |
465 | \& close(IN); | |
466 | .Ve | |
467 | .Sp | |
468 | at which point \f(CW%greetings\fR has the contents: | |
469 | .Sp | |
470 | .Vb 3 | |
471 | \& "en-US" => "Hiho" | |
472 | \& "fr" => "Bonjour" | |
473 | \& "i-mingo" => "Hau'" | |
474 | .Ve | |
475 | .Sp | |
476 | And suppose then that you answer client requests for language \f(CW$wanted\fR | |
477 | by just looking up \f(CW$greetings\fR{$wanted}. | |
478 | .Sp | |
479 | If the client asks for \*(L"fr\*(R", that will look up successfully in | |
480 | \&\f(CW%greetings\fR, to the value \*(L"Bonjour\*(R". And if the client asks for | |
481 | \&\*(L"i\-mingo\*(R", that will look up successfully in \f(CW%greetings\fR, to the value | |
482 | \&\*(L"Hau'\*(R". | |
483 | .Sp | |
484 | But if the client asks for \*(L"i\-Mingo\*(R" or \*(L"x\-mingo\*(R", or \*(L"Fr\*(R", then the | |
485 | lookup in \f(CW%greetings\fR fails. That's the Wrong Thing. | |
486 | .Sp | |
487 | You could instead do lookups on \f(CW$wanted\fR with: | |
488 | .Sp | |
489 | .Vb 8 | |
490 | \& use I18N::LangTags qw(same_language_tag); | |
491 | \& my $response = ''; | |
492 | \& foreach my $l2 (keys %greetings) { | |
493 | \& if(same_language_tag($wanted, $l2)) { | |
494 | \& $response = $greetings{$l2}; | |
495 | \& last; | |
496 | \& } | |
497 | \& } | |
498 | .Ve | |
499 | .Sp | |
500 | But that's rather inefficient. A better way to do it is to start your | |
501 | program with: | |
502 | .Sp | |
503 | .Vb 12 | |
504 | \& use I18N::LangTags qw(encode_language_tag); | |
505 | \& my %greetings; | |
506 | \& die unless open(IN, "<in.dat"); | |
507 | \& while(<IN>) { | |
508 | \& chomp; | |
509 | \& next unless /^([^=]+)=(.+)/s; | |
510 | \& my($lang, $expr) = ($1, $2); | |
511 | \& $greetings{ | |
512 | \& encode_language_tag($lang) | |
513 | \& } = $expr; | |
514 | \& } | |
515 | \& close(IN); | |
516 | .Ve | |
517 | .Sp | |
518 | and then just answer client requests for language \f(CW$wanted\fR by just | |
519 | looking up | |
520 | .Sp | |
521 | .Vb 1 | |
522 | \& $greetings{encode_language_tag($wanted)} | |
523 | .Ve | |
524 | .Sp | |
525 | And that does the Right Thing. | |
526 | .IP "\(bu the function alternate_language_tags($lang1)" 4 | |
527 | .IX Item "the function alternate_language_tags($lang1)" | |
528 | This function, if given a language tag, returns all language tags that | |
529 | are alternate forms of this language tag. (I.e., tags which refer to | |
530 | the same language.) This is meant to handle legacy tags caused by | |
531 | the minor changes in language tag standards over the years; and | |
532 | the x\-/i\- alternation is also dealt with. | |
533 | .Sp | |
534 | Note that this function does \fInot\fR try to equate new (and never\-used, | |
535 | and unusable) | |
536 | \&\s-1ISO639\-2\s0 three-letter tags to old (and still in use) \s-1ISO639\-1\s0 | |
537 | two-letter equivalents \*(-- like \*(L"ara\*(R" \-> \*(L"ar\*(R" \*(-- because | |
538 | \&\*(L"ara\*(R" has \fInever\fR been in use as an Internet language tag, | |
539 | and \s-1RFC\s0 3066 stipulates that it never should be, since a shorter | |
540 | tag (\*(L"ar\*(R") exists. | |
541 | .Sp | |
542 | Examples: | |
543 | .Sp | |
544 | .Vb 10 | |
545 | \& alternate_language_tags('no-bok') is ('nb') | |
546 | \& alternate_language_tags('nb') is ('no-bok') | |
547 | \& alternate_language_tags('he') is ('iw') | |
548 | \& alternate_language_tags('iw') is ('he') | |
549 | \& alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') | |
550 | \& alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') | |
551 | \& alternate_language_tags('en') is () | |
552 | \& alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') | |
553 | \& alternate_language_tags('x-klikitat') is ('i-klikitat') | |
554 | \& alternate_language_tags('i-klikitat') is ('x-klikitat') | |
555 | .Ve | |
556 | .Sp | |
557 | This function returns empty-list if given anything other than a formally | |
558 | valid language tag. | |
559 | .ie n .IP "\(bu the function @langs = panic_languages(@accept_languages)" 4 | |
560 | .el .IP "\(bu the function \f(CW@langs\fR = panic_languages(@accept_languages)" 4 | |
561 | .IX Item "the function @langs = panic_languages(@accept_languages)" | |
562 | This function takes a list of 0 or more language | |
563 | tags that constitute a given user's Accept-Language list, and | |
564 | returns a list of tags for \fIother\fR (non\-super) | |
565 | languages that are probably acceptable to the user, to be | |
566 | used \fIif all else fails\fR. | |
567 | .Sp | |
568 | For example, if a user accepts only 'ca' (Catalan) and | |
569 | \&'es' (Spanish), and the documents/interfaces you have | |
570 | available are just in German, Italian, and Chinese, then | |
571 | the user will most likely want the Italian one (and not | |
572 | the Chinese or German one!), instead of getting | |
573 | nothing. So \f(CW\*(C`panic_languages('ca', 'es')\*(C'\fR returns | |
574 | a list containing 'it' (Italian). | |
575 | .Sp | |
576 | English ('en') is \fIalways\fR in the return list, but | |
577 | whether it's at the very end or not depends | |
578 | on the input languages. This function works by consulting | |
579 | an internal table that stipulates what common | |
580 | languages are \*(L"close\*(R" to each other. | |
581 | .Sp | |
582 | A useful construct you might consider using is: | |
583 | .Sp | |
584 | .Vb 4 | |
585 | \& @fallbacks = super_languages(@accept_languages); | |
586 | \& push @fallbacks, panic_languages( | |
587 | \& @accept_languages, @fallbacks, | |
588 | \& ); | |
589 | .Ve | |
590 | .SH "ABOUT LOWERCASING" | |
591 | .IX Header "ABOUT LOWERCASING" | |
592 | I've considered making all the above functions that output language | |
593 | tags return all those tags strictly in lowercase. Having all your | |
594 | language tags in lowercase does make some things easier. But you | |
595 | might as well just lowercase as you like, or call | |
596 | \&\f(CW\*(C`encode_language_tag($lang1)\*(C'\fR where appropriate. | |
597 | .SH "ABOUT UNICODE PLAINTEXT LANGUAGE TAGS" | |
598 | .IX Header "ABOUT UNICODE PLAINTEXT LANGUAGE TAGS" | |
599 | In some future version of I18N::LangTags, I plan to include support | |
600 | for RFC2482\-style language tags \*(-- which are basically just normal | |
601 | language tags with their \s-1ASCII\s0 characters shifted into Plane 14. | |
602 | .SH "SEE ALSO" | |
603 | .IX Header "SEE ALSO" | |
604 | * I18N::LangTags::List | |
605 | .PP | |
606 | * \s-1RFC\s0 3066, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc3066.txt\*(C'\fR, \*(L"Tags for the | |
607 | Identification of Languages\*(R". (Obsoletes \s-1RFC\s0 1766) | |
608 | .PP | |
609 | * \s-1RFC\s0 2277, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2277.txt\*(C'\fR, \*(L"\s-1IETF\s0 Policy on | |
610 | Character Sets and Languages\*(R". | |
611 | .PP | |
612 | * \s-1RFC\s0 2231, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2231.txt\*(C'\fR, \*(L"\s-1MIME\s0 Parameter | |
613 | Value and Encoded Word Extensions: Character Sets, Languages, and | |
614 | Continuations\*(R". | |
615 | .PP | |
616 | * \s-1RFC\s0 2482, \f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc2482.txt\*(C'\fR, | |
617 | \&\*(L"Language Tagging in Unicode Plain Text\*(R". | |
618 | .PP | |
619 | * Locale::Codes, in | |
620 | \&\f(CW\*(C`http://www.perl.com/CPAN/modules/by\-module/Locale/\*(C'\fR | |
621 | .PP | |
622 | * \s-1ISO\s0 639, \*(L"Code for the representation of names of languages\*(R", | |
623 | \&\f(CW\*(C`http://www.indigo.ie/egt/standards/iso639/iso639\-1\-en.html\*(C'\fR | |
624 | .PP | |
625 | * \s-1ISO\s0 639\-2, \*(L"Codes for the representation of names of languages\*(R", | |
626 | including three-letter codes, | |
627 | \&\f(CW\*(C`http://lcweb.loc.gov/standards/iso639\-2/bibcodes.html\*(C'\fR | |
628 | .PP | |
629 | * The \s-1IANA\s0 list of registered languages (hopefully up\-to\-date), | |
630 | \&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/iana/assignments/languages/\*(C'\fR | |
631 | .SH "COPYRIGHT" | |
632 | .IX Header "COPYRIGHT" | |
633 | Copyright (c) 1998\-2001 Sean M. Burke. All rights reserved. | |
634 | .PP | |
635 | This library is free software; you can redistribute it and/or | |
636 | modify it under the same terms as Perl itself. | |
637 | .PP | |
638 | The programs and documentation in this dist are distributed in | |
639 | the hope that they will be useful, but without any warranty; without | |
640 | even the implied warranty of merchantability or fitness for a | |
641 | particular purpose. | |
642 | .SH "AUTHOR" | |
643 | .IX Header "AUTHOR" | |
644 | Sean M. Burke \f(CW\*(C`sburke@cpan.org\*(C'\fR |