Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "Locale::Maketext::TPJ13 3" | |
132 | .TH Locale::Maketext::TPJ13 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | Locale::Maketext::TPJ13 \-\- article about software localization | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 1 | |
138 | \& # This is an article, not a module. | |
139 | .Ve | |
140 | .SH "DESCRIPTION" | |
141 | .IX Header "DESCRIPTION" | |
142 | The following article by Sean M. Burke and Jordan Lachler | |
143 | first appeared in \fIThe Perl | |
144 | Journal\fR #13 and is copyright 1999 The Perl Journal. It appears | |
145 | courtesy of Jon Orwant and The Perl Journal. This document may be | |
146 | distributed under the same terms as Perl itself. | |
147 | .SH "Localization and Perl: gettext breaks, Maketext fixes" | |
148 | .IX Header "Localization and Perl: gettext breaks, Maketext fixes" | |
149 | by Sean M. Burke and Jordan Lachler | |
150 | .PP | |
151 | This article points out cases where gettext (a common system for | |
152 | localizing software interfaces \*(-- i.e., making them work in the user's | |
153 | language of choice) fails because of basic differences between human | |
154 | languages. This article then describes Maketext, a new system capable | |
155 | of correctly treating these differences. | |
156 | .Sh "A Localization Horror Story: It Could Happen To You" | |
157 | .IX Subsection "A Localization Horror Story: It Could Happen To You" | |
158 | .RS 4 | |
159 | \&\*(L"There are a number of languages spoken by human beings in this | |
160 | world.\*(R" | |
161 | .Sp | |
162 | \&\-\- Harald Tveit Alvestrand, in \s-1RFC\s0 1766, \*(L"Tags for the | |
163 | Identification of Languages\*(R" | |
164 | .RE | |
165 | .PP | |
166 | Imagine that your task for the day is to localize a piece of software | |
167 | \&\*(-- and luckily for you, the only output the program emits is two | |
168 | messages, like this: | |
169 | .PP | |
170 | .Vb 1 | |
171 | \& I scanned 12 directories. | |
172 | .Ve | |
173 | .PP | |
174 | .Vb 1 | |
175 | \& Your query matched 10 files in 4 directories. | |
176 | .Ve | |
177 | .PP | |
178 | So how hard could that be? You look at the code that | |
179 | produces the first item, and it reads: | |
180 | .PP | |
181 | .Vb 2 | |
182 | \& printf("I scanned %g directories.", | |
183 | \& $directory_count); | |
184 | .Ve | |
185 | .PP | |
186 | You think about that, and realize that it doesn't even work right for | |
187 | English, as it can produce this output: | |
188 | .PP | |
189 | .Vb 1 | |
190 | \& I scanned 1 directories. | |
191 | .Ve | |
192 | .PP | |
193 | So you rewrite it to read: | |
194 | .PP | |
195 | .Vb 5 | |
196 | \& printf("I scanned %g %s.", | |
197 | \& $directory_count, | |
198 | \& $directory_count == 1 ? | |
199 | \& "directory" : "directories", | |
200 | \& ); | |
201 | .Ve | |
202 | .PP | |
203 | \&...which does the Right Thing. (In case you don't recall, \*(L"%g\*(R" is for | |
204 | locale-specific number interpolation, and \*(L"%s\*(R" is for string | |
205 | interpolation.) | |
206 | .PP | |
207 | But you still have to localize it for all the languages you're | |
208 | producing this software for, so you pull Locale::gettext off of \s-1CPAN\s0 | |
209 | so you can access the \f(CW\*(C`gettext\*(C'\fR C functions you've heard are standard | |
210 | for localization tasks. | |
211 | .PP | |
212 | And you write: | |
213 | .PP | |
214 | .Vb 5 | |
215 | \& printf(gettext("I scanned %g %s."), | |
216 | \& $dir_scan_count, | |
217 | \& $dir_scan_count == 1 ? | |
218 | \& gettext("directory") : gettext("directories"), | |
219 | \& ); | |
220 | .Ve | |
221 | .PP | |
222 | But you then read in the gettext manual (Drepper, Miller, and Pinard 1995) | |
223 | that this is not a good idea, since how a single word like \*(L"directory\*(R" | |
224 | or \*(L"directories\*(R" is translated may depend on context \*(-- and this is | |
225 | true, since in a case language like German or Russian, you may need | |
226 | these words with a different case ending in the first instance (where the | |
227 | word is the object of a verb) than in the second instance, which you haven't even | |
228 | gotten to yet (where the word is the object of a preposition, \*(L"in \f(CW%g\fR | |
229 | directories\*(R") \*(-- assuming these keep the same syntax when translated | |
230 | into those languages. | |
231 | .PP | |
232 | So, on the advice of the gettext manual, you rewrite: | |
233 | .PP | |
234 | .Vb 4 | |
235 | \& printf( $dir_scan_count == 1 ? | |
236 | \& gettext("I scanned %g directory.") : | |
237 | \& gettext("I scanned %g directories."), | |
238 | \& $dir_scan_count ); | |
239 | .Ve | |
240 | .PP | |
241 | So, you email your various translators (the boss decides that the | |
242 | languages du jour are Chinese, Arabic, Russian, and Italian, so you | |
243 | have one translator for each), asking for translations for \*(L"I scanned | |
244 | \&\f(CW%g\fR directory.\*(R" and \*(L"I scanned \f(CW%g\fR directories.\*(R". When they reply, | |
245 | you'll put that in the lexicons for gettext to use when it localizes | |
246 | your software, so that when the user is running under the \*(L"zh\*(R" | |
247 | (Chinese) locale, gettext(\*(L"I scanned \f(CW%g\fR directory.\*(R") will return the | |
248 | appropriate Chinese text, with a \*(L"%g\*(R" in there where printf can then | |
249 | interpolate \f(CW$dir_scan_count\fR. | |
250 | .PP | |
251 | Your Chinese translator emails right back \*(-- he says both of these | |
252 | phrases translate to the same thing in Chinese, because, in linguistic | |
253 | jargon, Chinese \*(L"doesn't have number as a grammatical category\*(R" \*(-- | |
254 | whereas English does. That is, English has grammatical rules that | |
255 | refer to \*(L"number\*(R", i.e., whether something is grammatically singular | |
256 | or plural; and one of these rules is the one that forces nouns to take | |
257 | a plural suffix (generally \*(L"s\*(R") when in a plural context, as they are when | |
258 | they follow a number other than \*(L"one\*(R" (including, oddly enough, \*(L"zero\*(R"). | |
259 | Chinese has no such rules, and so has just the one phrase where English | |
260 | has two. But, no problem, you can have this one Chinese phrase appear | |
261 | as the translation for the two English phrases in the \*(L"zh\*(R" gettext | |
262 | lexicon for your program. | |
263 | .PP | |
264 | Emboldened by this, you dive into the second phrase that your software | |
265 | needs to output: \*(L"Your query matched 10 files in 4 directories.\*(R". You notice | |
266 | that if you want to treat phrases as indivisible, as the gettext | |
267 | manual wisely advises, you need four cases now, instead of two, to | |
268 | cover the permutations of singular and plural on the two items, | |
269 | \&\f(CW$dir_count\fR and \f(CW$file_count\fR. So you try this: | |
270 | .PP | |
271 | .Vb 9 | |
272 | \& printf( $file_count == 1 ? | |
273 | \& ( $directory_count == 1 ? | |
274 | \& gettext("Your query matched %g file in %g directory.") : | |
275 | \& gettext("Your query matched %g file in %g directories.") ) : | |
276 | \& ( $directory_count == 1 ? | |
277 | \& gettext("Your query matched %g files in %g directory.") : | |
278 | \& gettext("Your query matched %g files in %g directories.") ), | |
279 | \& $file_count, $directory_count, | |
280 | \& ); | |
281 | .Ve | |
282 | .PP | |
283 | (The case of \*(L"1 file in 2 [or more] directories\*(R" could, I suppose, | |
284 | occur in the case of symlinking or something of the sort.) | |
285 | .PP | |
286 | It occurs to you that this is not the prettiest code you've ever | |
287 | written, but this seems the way to go. You mail off to the | |
288 | translators asking for translations for these four cases. The | |
289 | Chinese guy replies with the one phrase that these all translate to in | |
290 | Chinese, and that phrase has two \*(L"%g\*(R"s in it, as it should \*(-- but | |
291 | there's a problem. He translates it word-for-word back: \*(L"In \f(CW%g\fR | |
292 | directories contains \f(CW%g\fR files match your query.\*(R" The \f(CW%g\fR | |
293 | slots are in an order reverse to what they are in English. You wonder | |
294 | how you'll get gettext to handle that. | |
295 | .PP | |
296 | But you put it aside for the moment, and optimistically hope that the | |
297 | other translators won't have this problem, and that their languages | |
298 | will be better behaved \*(-- i.e., that they will be just like English. | |
299 | .PP | |
300 | But the Arabic translator is the next to write back. First off, your | |
301 | code for \*(L"I scanned \f(CW%g\fR directory.\*(R" or \*(L"I scanned \f(CW%g\fR directories.\*(R" | |
302 | assumes there's only singular or plural. But, to use linguistic | |
303 | jargon again, Arabic has grammatical number, like English (but unlike | |
304 | Chinese), but it's a three-term category: singular, dual, and plural. | |
305 | In other words, the way you say \*(L"directory\*(R" depends on whether there's | |
306 | one directory, or \fItwo\fR of them, or \fImore than two\fR of them. Your | |
307 | test of \f(CW\*(C`($directory == 1)\*(C'\fR no longer does the job. And it means | |
308 | that where English's grammatical category of number necessitates | |
309 | only the two permutations of the first sentence based on \*(L"directory | |
310 | [singular]\*(R" and \*(L"directories [plural]\*(R", Arabic has three \*(-- and, | |
311 | worse, in the second sentence (\*(L"Your query matched \f(CW%g\fR file in \f(CW%g\fR | |
312 | directory.\*(R"), where English has four, Arabic has nine. You sense | |
313 | an unwelcome, exponential trend taking shape. | |
314 | .PP | |
315 | Your Italian translator emails you back and says that \*(L"I searched 0 | |
316 | directories\*(R" (a possible English output of your program) is stilted, | |
317 | and if you think that's fine English, that's your problem, but that | |
318 | \&\fIjust will not do\fR in the language of Dante. He insists that where | |
319 | \&\f(CW$directory_count\fR is 0, your program should produce the Italian text | |
320 | for \*(L"I \fIdidn't\fR scan \fIany\fR directories.\*(R". And ditto for \*(L"I didn't | |
321 | match any files in any directories\*(R", although he says the last part | |
322 | about \*(L"in any directories\*(R" should probably just be left off. | |
323 | .PP | |
324 | You wonder how you'll get gettext to handle this; to accommodate the | |
325 | ways Arabic, Chinese, and Italian deal with numbers in just these few | |
326 | very simple phrases, you need to write code that will ask gettext for | |
327 | different queries depending on whether the numerical values in | |
328 | question are 1, 2, more than 2, or in some cases 0, and you still haven't | |
329 | figured out the problem with the different word order in Chinese. | |
330 | .PP | |
331 | Then your Russian translator calls on the phone, to \fIpersonally\fR tell | |
332 | you the bad news about how really unpleasant your life is about to | |
333 | become: | |
334 | .PP | |
335 | Russian, like German or Latin, is an inflectional language; that is, nouns | |
336 | and adjectives have to take endings that depend on their case | |
337 | (i.e., nominative, accusative, genitive, etc...) \*(-- which is roughly a matter of | |
338 | what role they have in the syntax of the sentence \*(-- | |
339 | as well as on the grammatical gender (i.e., masculine, feminine, neuter) | |
340 | and number (i.e., singular or plural) of the noun, as well as on the | |
341 | declension class of the noun. But unlike with most other inflected languages, | |
342 | putting a number-phrase (like \*(L"ten\*(R" or \*(L"forty\-three\*(R", or their Arabic | |
343 | numeral equivalents) in front of a noun in Russian can change the case and | |
344 | number that noun is, and therefore the endings you have to put on it. | |
345 | .PP | |
346 | He elaborates: In \*(L"I scanned \f(CW%g\fR directories\*(R", you'd \fIexpect\fR | |
347 | \&\*(L"directories\*(R" to be in the accusative case (since it is the direct | |
348 | object in the sentence) and the plural number, | |
349 | except where \f(CW$directory_count\fR is 1, then you'd expect the singular, of | |
350 | course. Just like Latin or German. \fIBut!\fR Where \f(CW$directory_count\fR % | |
351 | 10 is 1 (\*(L"%\*(R" for modulo, remember), assuming \f(CW$directory_count\fR is an | |
352 | integer, and except where \f(CW$directory_count\fR % 100 is 11, \*(L"directories\*(R" | |
353 | is forced to become grammatically singular, which means it gets the | |
354 | ending for the accusative singular... You begin to visualize the code | |
355 | it'd take to test for the problem so far, \fIand still work for Chinese | |
356 | and Arabic and Italian\fR, and how many gettext items that'd take, but | |
357 | he keeps going... But where \f(CW$directory_count\fR % 10 is 2, 3, or 4 | |
358 | (except where \f(CW$directory_count\fR % 100 is 12, 13, or 14), the word for | |
359 | \&\*(L"directories\*(R" is forced to be genitive singular \*(-- which means another | |
360 | ending... The room begins to spin around you, slowly at first... But | |
361 | with \fIall other\fR integer values, since \*(L"directory\*(R" is an inanimate | |
362 | noun, when preceded by a number and in the nominative or accusative | |
363 | cases (as it is here, just your luck!), it does stay plural, but it is | |
364 | forced into the genitive case \*(-- yet another ending... And | |
365 | you never hear him get to the part about how you're going to run into | |
366 | similar (but maybe subtly different) problems with other Slavic | |
367 | languages like Polish, because the floor comes up to meet you, and you | |
368 | fade into unconsciousness. | |
369 | .PP | |
370 | The above cautionary tale relates how an attempt at localization can | |
371 | lead from programmer consternation, to program obfuscation, to a need | |
372 | for sedation. But careful evaluation shows that your choice of tools | |
373 | merely needed further consideration. | |
374 | .Sh "The Linguistic View" | |
375 | .IX Subsection "The Linguistic View" | |
376 | .RS 4 | |
377 | \&\*(L"It is more complicated than you think.\*(R" | |
378 | .Sp | |
379 | \&\-\- The Eighth Networking Truth, from \s-1RFC\s0 1925 | |
380 | .RE | |
381 | .PP | |
382 | The field of Linguistics has expended a great deal of effort over the | |
383 | past century trying to find grammatical patterns which hold across | |
384 | languages; it's been a constant process | |
385 | of people making generalizations that should apply to all languages, | |
386 | only to find out that, all too often, these generalizations fail \*(-- | |
387 | sometimes failing for just a few languages, sometimes whole classes of | |
388 | languages, and sometimes nearly every language in the world except | |
389 | English. Broad statistical trends are evident in what the \*(L"average | |
390 | language\*(R" is like as far as what its rules can look like, must look | |
391 | like, and cannot look like. But the \*(L"average language\*(R" is just as | |
392 | unreal a concept as the \*(L"average person\*(R" \*(-- it runs up against the | |
393 | fact no language (or person) is, in fact, average. The wisdom of past | |
394 | experience leads us to believe that any given language can do whatever | |
395 | it wants, in any order, with appeal to any kind of grammatical | |
396 | categories it wants \*(-- case, number, tense, real or metaphoric | |
397 | characteristics of the things that words refer to, arbitrary or | |
398 | predictable classifications of words based on what endings or prefixes | |
399 | they can take, degree or means of certainty about the truth of | |
400 | statements expressed, and so on, ad infinitum. | |
401 | .PP | |
402 | Mercifully, most localization tasks are a matter of finding ways to | |
403 | translate whole phrases, generally sentences, where the context is | |
404 | relatively set, and where the only variation in content is \fIusually\fR | |
405 | in a number being expressed \*(-- as in the example sentences above. | |
406 | Translating specific, fully-formed sentences is, in practice, fairly | |
407 | foolproof \*(-- which is good, because that's what's in the phrasebooks | |
408 | that so many tourists rely on. Now, a given phrase (whether in a | |
409 | phrasebook or in a gettext lexicon) in one language \fImight\fR have a | |
410 | greater or lesser applicability than that phrase's translation into | |
411 | another language \*(-- for example, strictly speaking, in Arabic, the | |
412 | \&\*(L"your\*(R" in \*(L"Your query matched...\*(R" would take a different form | |
413 | depending on whether the user is male or female; so the Arabic | |
414 | translation \*(L"your[feminine] query\*(R" is applicable in fewer cases than | |
415 | the corresponding English phrase, which doesn't distinguish the user's | |
416 | gender. (In practice, it's not feasible to have a program know the | |
417 | user's gender, so the masculine \*(L"you\*(R" in Arabic is usually used, by | |
418 | default.) | |
419 | .PP | |
420 | But in general, such surprises are rare when entire sentences are | |
421 | being translated, especially when the functional context is restricted | |
422 | to that of a computer interacting with a user either to convey a fact | |
423 | or to prompt for a piece of information. So, for purposes of | |
424 | localization, translation by phrase (generally by sentence) is both the | |
425 | simplest and the least problematic. | |
426 | .Sh "Breaking gettext" | |
427 | .IX Subsection "Breaking gettext" | |
428 | .RS 4 | |
429 | \&\*(L"It Has To Work.\*(R" | |
430 | .Sp | |
431 | \&\-\- First Networking Truth, \s-1RFC\s0 1925 | |
432 | .RE | |
433 | .PP | |
434 | Consider that sentences in a tourist phrasebook are of two types: ones | |
435 | like \*(L"How do I get to the marketplace?\*(R" that don't have any blanks to | |
436 | fill in, and ones like \*(L"How much do these _\|__ cost?\*(R", where there's | |
437 | one or more blanks to fill in (and these are usually linked to a | |
438 | list of words that you can put in that blank: \*(L"fish\*(R", \*(L"potatoes\*(R", | |
439 | \&\*(L"tomatoes\*(R", etc.) The ones with no blanks are no problem, but the | |
440 | fill-in-the-blank ones may not be really straightforward. If it's a | |
441 | Swahili phrasebook, for example, the authors probably didn't bother to | |
442 | tell you the complicated ways that the verb \*(L"cost\*(R" changes its | |
443 | inflectional prefix depending on the noun you're putting in the blank. | |
444 | The trader in the marketplace will still understand what you're saying if | |
445 | you say \*(L"how much do these potatoes cost?\*(R" with the wrong | |
446 | inflectional prefix on \*(L"cost\*(R". After all, \fIyou\fR can't speak proper Swahili, | |
447 | \&\fIyou're\fR just a tourist. But while tourists can be stupid, computers | |
448 | are supposed to be smart; the computer should be able to fill in the | |
449 | blank, and still have the results be grammatical. | |
450 | .PP | |
451 | In other words, a phrasebook entry takes some values as parameters | |
452 | (the things that you fill in the blank or blanks), and provides a value | |
453 | based on these parameters, where the way you get that final value from | |
454 | the given values can, properly speaking, involve an arbitrarily | |
455 | complex series of operations. (In the case of Chinese, it'd be not at | |
456 | all complex, at least in cases like the examples at the beginning of | |
457 | this article; whereas in the case of Russian it'd be a rather complex | |
458 | series of operations. And in some languages, the | |
459 | complexity could be spread around differently: while the act of | |
460 | putting a number-expression in front of a noun phrase might not be | |
461 | complex by itself, it may change how you have to, for example, inflect | |
462 | a verb elsewhere in the sentence. This is what in syntax is called | |
463 | \&\*(L"long\-distance dependencies\*(R".) | |
464 | .PP | |
465 | This talk of parameters and arbitrary complexity is just another way | |
466 | to say that an entry in a phrasebook is what in a programming language | |
467 | would be called a \*(L"function\*(R". Just so you don't miss it, this is the | |
468 | crux of this article: \fIA phrase is a function; a phrasebook is a | |
469 | bunch of functions.\fR | |
470 | .PP | |
471 | The reason that using gettext runs into walls (as in the above | |
472 | second-person horror story) is that you're trying to use a string (or | |
473 | worse, a choice among a bunch of strings) to do what you really need a | |
474 | function for \*(-- which is futile. Performing (s)printf interpolation | |
475 | on the strings which you get back from gettext does allow you to do \fIsome\fR | |
476 | common things passably well... sometimes... sort of; but, to paraphrase | |
477 | what some people say about \f(CW\*(C`csh\*(C'\fR script programming, \*(L"it fools you | |
478 | into thinking you can use it for real things, but you can't, and you | |
479 | don't discover this until you've already spent too much time trying, | |
480 | and by then it's too late.\*(R" | |
481 | .Sh "Replacing gettext" | |
482 | .IX Subsection "Replacing gettext" | |
483 | So, what needs to replace gettext is a system that supports lexicons | |
484 | of functions instead of lexicons of strings. An entry in a lexicon | |
485 | from such a system should \fInot\fR look like this: | |
486 | .PP | |
487 | .Vb 1 | |
488 | \& "J'ai trouv\exE9 %g fichiers dans %g r\exE9pertoires" | |
489 | .Ve | |
490 | .PP | |
491 | [\exE9 is e\-acute in Latin\-1. Some pod renderers would | |
492 | scream if I used the actual character here. \*(-- \s-1SB\s0] | |
493 | .PP | |
494 | but instead like this, bearing in mind that this is just a first stab: | |
495 | .PP | |
496 | .Vb 8 | |
497 | \& sub I_found_X1_files_in_X2_directories { | |
498 | \& my( $files, $dirs ) = @_[0,1]; | |
499 | \& $files = sprintf("%g %s", $files, | |
500 | \& $files == 1 ? 'fichier' : 'fichiers'); | |
501 | \& $dirs = sprintf("%g %s", $dirs, | |
502 | \& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires"); | |
503 | \& return "J'ai trouv\exE9 $files dans $dirs."; | |
504 | \& } | |
505 | .Ve | |
506 | .PP | |
507 | Now, there's no particularly obvious way to store anything but strings | |
508 | in a gettext lexicon; so it looks like we just have to start over and | |
509 | make something better, from scratch. I call my shot at a | |
510 | gettext-replacement system \*(L"Maketext\*(R", or, in \s-1CPAN\s0 terms, | |
511 | Locale::Maketext. | |
512 | .PP | |
513 | When designing Maketext, I chose to plan its main features in terms of | |
514 | \&\*(L"buzzword compliance\*(R". And here are the buzzwords: | |
515 | .Sh "Buzzwords: Abstraction and Encapsulation" | |
516 | .IX Subsection "Buzzwords: Abstraction and Encapsulation" | |
517 | The complexity of the language you're trying to output a phrase in is | |
518 | entirely abstracted inside (and encapsulated within) the Maketext module | |
519 | for that interface. When you call: | |
520 | .PP | |
521 | .Vb 2 | |
522 | \& print $lang->maketext("You have [quant,_1,piece] of new mail.", | |
523 | \& scalar(@messages)); | |
524 | .Ve | |
525 | .PP | |
526 | you don't know (and in fact can't easily find out) whether this will | |
527 | involve lots of figuring, as in Russian (if \f(CW$lang\fR is a handle to the | |
528 | Russian module), or relatively little, as in Chinese. That kind of | |
529 | abstraction and encapsulation may encourage other pleasant buzzwords | |
530 | like modularization and stratification, depending on what design | |
531 | decisions you make. | |
532 | .Sh "Buzzword: Isomorphism" | |
533 | .IX Subsection "Buzzword: Isomorphism" | |
534 | \&\*(L"Isomorphism\*(R" means \*(L"having the same structure or form\*(R"; in discussions | |
535 | of program design, the word takes on the special, specific meaning that | |
536 | your implementation of a solution to a problem \fIhas the same | |
537 | structure\fR as, say, an informal verbal description of the solution, or | |
538 | maybe of the problem itself. Isomorphism is, all things considered, | |
539 | a good thing \*(-- it's what problem-solving (and solution\-implementing) | |
540 | should look like. | |
541 | .PP | |
542 | What's wrong with the gettext-using code like this... | |
543 | .PP | |
544 | .Vb 9 | |
545 | \& printf( $file_count == 1 ? | |
546 | \& ( $directory_count == 1 ? | |
547 | \& "Your query matched %g file in %g directory." : | |
548 | \& "Your query matched %g file in %g directories." ) : | |
549 | \& ( $directory_count == 1 ? | |
550 | \& "Your query matched %g files in %g directory." : | |
551 | \& "Your query matched %g files in %g directories." ), | |
552 | \& $file_count, $directory_count, | |
553 | \& ); | |
554 | .Ve | |
555 | .PP | |
556 | is first off that it's not well abstracted \*(-- these ways of testing | |
557 | for grammatical number (as in the expressions like \f(CW\*(C`foo == 1 ? | |
558 | singular_form : plural_form\*(C'\fR) should be abstracted to each language | |
559 | module, since how you get grammatical number is language\-specific. | |
560 | .PP | |
561 | But second off, it's not isomorphic \*(-- the \*(L"solution\*(R" (i.e., the | |
562 | phrasebook entries) for Chinese maps from these four English phrases to | |
563 | the one Chinese phrase that fits for all of them. In other words, the | |
564 | informal solution would be \*(L"The way to say what you want in Chinese is | |
565 | with the one phrase 'For your question, in Y directories you would | |
566 | find X files'\*(R" \*(-- and so the implemented solution should be, | |
567 | isomorphically, just a straightforward way to spit out that one | |
568 | phrase, with numerals properly interpolated. It shouldn't have to map | |
569 | from the complexity of other languages to the simplicity of this one. | |
570 | .Sh "Buzzword: Inheritance" | |
571 | .IX Subsection "Buzzword: Inheritance" | |
572 | There's a great deal of reuse possible for sharing of phrases between | |
573 | modules for related dialects, or for sharing of auxiliary functions | |
574 | between related languages. (By \*(L"auxiliary functions\*(R", I mean | |
575 | functions that don't produce phrase\-text, but which, say, return an | |
576 | answer to \*(L"does this number require a plural noun after it?\*(R". Such | |
577 | auxiliary functions would be used in the internal logic of functions | |
578 | that actually do produce phrase\-text.) | |
579 | .PP | |
580 | In the case of sharing phrases, consider that you have an interface | |
581 | already localized for American English (probably by having been | |
582 | written with that as the native locale, but that's incidental). | |
583 | Localizing it for \s-1UK\s0 English should, in practical terms, be just a | |
584 | matter of running it past a British person with the instructions to | |
585 | indicate what few phrases would benefit from a change in spelling or | |
586 | possibly minor rewording. In that case, you should be able to put in | |
587 | the \s-1UK\s0 English localization module \fIonly\fR those phrases that are | |
588 | UK\-specific, and for all the rest, \fIinherit\fR from the American | |
589 | English module. (And I expect this same situation would apply with | |
590 | Brazilian and Continental Portuguese, possibly with some \fIvery\fR | |
591 | closely related languages like Czech and Slovak, and possibly with the | |
592 | slightly different \*(L"versions\*(R" of written Mandarin Chinese, as I hear exist in | |
593 | Taiwan and mainland China.) | |
594 | .PP | |
595 | As to sharing of auxiliary functions, consider the problem of Russian | |
596 | numbers from the beginning of this article; obviously, you'd want to | |
597 | write only once the hairy code that, given a numeric value, would | |
598 | return some specification of which case and number a given quantified | |
599 | noun should use. But suppose that you discover, while localizing an | |
600 | interface for, say, Ukrainian (a Slavic language related to Russian, | |
601 | spoken by several million people, many of whom would be relieved to | |
602 | find that your Web site's or software's interface is available in | |
603 | their language), that the rules in Ukrainian are the same as in Russian | |
604 | for quantification, and probably for many other grammatical functions. | |
605 | While there may well be no phrases in common between Russian and | |
606 | Ukrainian, you could still choose to have the Ukrainian module inherit | |
607 | from the Russian module, just for the sake of inheriting all the | |
608 | various grammatical methods. Or, probably better organizationally, | |
609 | you could move those functions to a module called \f(CW\*(C`_E_Slavic\*(C'\fR or | |
610 | something, which Russian and Ukrainian could inherit useful functions | |
611 | from, but which would (presumably) provide no lexicon. | |
612 | .Sh "Buzzword: Concision" | |
613 | .IX Subsection "Buzzword: Concision" | |
614 | Okay, concision isn't a buzzword. But it should be, so I decree that | |
615 | as a new buzzword, \*(L"concision\*(R" means that simple common things should | |
616 | be expressible in very few lines (or maybe even just a few characters) | |
617 | of code \*(-- call it a special case of \*(L"making simple things easy and | |
618 | hard things possible\*(R", and see also the role it played in the | |
619 | MIDI::Simple language, discussed elsewhere in this issue [TPJ#13]. | |
620 | .PP | |
621 | Consider our first stab at an entry in our \*(L"phrasebook of functions\*(R": | |
622 | .PP | |
623 | .Vb 8 | |
624 | \& sub I_found_X1_files_in_X2_directories { | |
625 | \& my( $files, $dirs ) = @_[0,1]; | |
626 | \& $files = sprintf("%g %s", $files, | |
627 | \& $files == 1 ? 'fichier' : 'fichiers'); | |
628 | \& $dirs = sprintf("%g %s", $dirs, | |
629 | \& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires"); | |
630 | \& return "J'ai trouv\exE9 $files dans $dirs."; | |
631 | \& } | |
632 | .Ve | |
633 | .PP | |
634 | You may sense that a lexicon (to use a non-committal catch-all term for a | |
635 | collection of things you know how to say, regardless of whether they're | |
636 | phrases or words) consisting of functions \fIexpressed\fR as above would | |
637 | make for rather long-winded and repetitive code \*(-- even if you wisely | |
638 | rewrote this to have quantification (as we call adding a number | |
639 | expression to a noun phrase) be a function called like: | |
640 | .PP | |
641 | .Vb 6 | |
642 | \& sub I_found_X1_files_in_X2_directories { | |
643 | \& my( $files, $dirs ) = @_[0,1]; | |
644 | \& $files = quant($files, "fichier"); | |
645 | \& $dirs = quant($dirs, "r\exE9pertoire"); | |
646 | \& return "J'ai trouv\exE9 $files dans $dirs."; | |
647 | \& } | |
648 | .Ve | |
649 | .PP | |
650 | And you may also sense that you do not want to bother your translators | |
651 | with having to write Perl code \*(-- you'd much rather that they spend | |
652 | their \fIvery costly time\fR on just translation. And this is to say | |
653 | nothing of the near impossibility of finding a commercial translator | |
654 | who would know even simple Perl. | |
655 | .PP | |
656 | In a first-hack implementation of Maketext, each language\-module's | |
657 | lexicon looked like this: | |
658 | .PP | |
659 | .Vb 10 | |
660 | \& %Lexicon = ( | |
661 | \& "I found %g files in %g directories" | |
662 | \& => sub { | |
663 | \& my( $files, $dirs ) = @_[0,1]; | |
664 | \& $files = quant($files, "fichier"); | |
665 | \& $dirs = quant($dirs, "r\exE9pertoire"); | |
666 | \& return "J'ai trouv\exE9 $files dans $dirs."; | |
667 | \& }, | |
668 | \& ... and so on with other phrase => sub mappings ... | |
669 | \& ); | |
670 | .Ve | |
671 | .PP | |
672 | but I immediately went looking for some more concise way to basically | |
673 | denote the same phrase-function \*(-- a way that would also serve to | |
674 | concisely denote \fImost\fR phrase-functions in the lexicon for \fImost\fR | |
675 | languages. After much time and even some actual thought, I decided on | |
676 | this system: | |
677 | .PP | |
678 | * Where a value in a \f(CW%Lexicon\fR hash is a contentful string instead of | |
679 | an anonymous sub (or, conceivably, a coderef), it would be interpreted | |
680 | as a sort of shorthand expression of what the sub does. When accessed | |
681 | for the first time in a session, it is parsed, turned into Perl code, | |
682 | and then eval'd into an anonymous sub; then that sub replaces the | |
683 | original string in that lexicon. (That way, the work of parsing and | |
684 | evaling the shorthand form for a given phrase is done no more than | |
685 | once per session.) | |
686 | .PP | |
687 | * Calls to \f(CW\*(C`maketext\*(C'\fR (as Maketext's main function is called) happen | |
688 | thru a \*(L"language session handle\*(R", notionally very much like an \s-1IO\s0 | |
689 | handle, in that you open one at the start of the session, and use it | |
690 | for \*(L"sending signals\*(R" to an object in order to have it return the text | |
691 | you want. | |
692 | .PP | |
693 | So, this: | |
694 | .PP | |
695 | .Vb 2 | |
696 | \& $lang->maketext("You have [quant,_1,piece] of new mail.", | |
697 | \& scalar(@messages)); | |
698 | .Ve | |
699 | .PP | |
700 | basically means this: look in the lexicon for \f(CW$lang\fR (which may inherit | |
701 | from any number of other lexicons), and find the function that we | |
702 | happen to associate with the string \*(L"You have [quant,_1,piece] of new | |
703 | mail\*(R" (which is, and should be, a functioning \*(L"shorthand\*(R" for this | |
704 | function in the native locale \*(-- English in this case). If you find | |
705 | such a function, call it with \f(CW$lang\fR as its first parameter (as if it | |
706 | were a method), and then a copy of scalar(@messages) as its second, | |
707 | and then return that value. If that function was found, but was in | |
708 | string shorthand instead of being a fully specified function, parse it | |
709 | and make it into a function before calling it the first time. | |
710 | .PP | |
711 | * The shorthand uses code in brackets to indicate method calls that | |
712 | should be performed. A full explanation is not in order here, but a | |
713 | few examples will suffice: | |
714 | .PP | |
715 | .Vb 1 | |
716 | \& "You have [quant,_1,piece] of new mail." | |
717 | .Ve | |
718 | .PP | |
719 | The above code is shorthand for, and will be interpreted as, | |
720 | this: | |
721 | .PP | |
722 | .Vb 8 | |
723 | \& sub { | |
724 | \& my $handle = $_[0]; | |
725 | \& my(@params) = @_; | |
726 | \& return join '', | |
727 | \& "You have ", | |
728 | \& $handle->quant($params[1], 'piece'), | |
729 | \& " of new mail."; | |
730 | \& } | |
731 | .Ve | |
732 | .PP | |
733 | where \*(L"quant\*(R" is the name of a method you're using to quantify the | |
734 | noun \*(L"piece\*(R" with the number \f(CW$params\fR[1]. | |
735 | .PP | |
736 | A string with no brackety calls, like this: | |
737 | .PP | |
738 | .Vb 1 | |
739 | \& "Your search expression was malformed." | |
740 | .Ve | |
741 | .PP | |
742 | is somewhat of a degenerate case, and just gets turned into: | |
743 | .PP | |
744 | .Vb 1 | |
745 | \& sub { return "Your search expression was malformed." } | |
746 | .Ve | |
747 | .PP | |
748 | However, not everything you can write in Perl code can be written in | |
749 | the above shorthand system \*(-- not by a long shot. For example, consider | |
750 | the Italian translator from the beginning of this article, who wanted | |
751 | the Italian for \*(L"I didn't find any files\*(R" as a special case, instead | |
752 | of \*(L"I found 0 files\*(R". That couldn't be specified (at least not easily | |
753 | or simply) in our shorthand system, and it would have to be written | |
754 | out in full, like this: | |
755 | .PP | |
756 | .Vb 10 | |
757 | \& sub { # pretend the English strings are in Italian | |
758 | \& my($handle, $files, $dirs) = @_[0,1,2]; | |
759 | \& return "I didn't find any files" unless $files; | |
760 | \& return join '', | |
761 | \& "I found ", | |
762 | \& $handle->quant($files, 'file'), | |
763 | \& " in ", | |
764 | \& $handle->quant($dirs, 'directory'), | |
765 | \& "."; | |
766 | \& } | |
767 | .Ve | |
768 | .PP | |
769 | Next to a lexicon full of shorthand code, that sort of sticks out like a | |
770 | sore thumb \*(-- but this \fIis\fR a special case, after all; and at least | |
771 | it's possible, if not as concise as usual. | |
772 | .PP | |
773 | As to how you'd implement the Russian example from the beginning of | |
774 | the article, well, There's More Than One Way To Do It, but it could be | |
775 | something like this (using English words for Russian, just so you know | |
776 | what's going on): | |
777 | .PP | |
778 | .Vb 1 | |
779 | \& "I [quant,_1,directory,accusative] scanned." | |
780 | .Ve | |
781 | .PP | |
782 | This shifts the burden of complexity off to the quant method. That | |
783 | method's parameters are: the numeric value it's going to use to | |
784 | quantify something; the Russian word it's going to quantify; and the | |
785 | parameter \*(L"accusative\*(R", which you're using to mean that this | |
786 | sentence's syntax wants a noun in the accusative case there, although | |
787 | that quantification method may have to overrule, for grammatical | |
788 | reasons you may recall from the beginning of this article. | |
789 | .PP | |
790 | Now, the Russian quant method here is responsible not only for | |
791 | implementing the strange logic necessary for figuring out how Russian | |
792 | number-phrases impose case and number on their noun\-phrases, but also | |
793 | for inflecting the Russian word for \*(L"directory\*(R". How that inflection | |
794 | is to be carried out is no small issue, and among the solutions I've | |
795 | seen, some (like variations on a simple lookup in a hash where all | |
796 | possible forms are provided for all necessary words) are | |
797 | straightforward but \fIcan\fR become cumbersome when you need to inflect | |
798 | more than a few dozen words; and other solutions (like using | |
799 | algorithms to model the inflections, storing only root forms and | |
800 | irregularities) \fIcan\fR involve more overhead than is justifiable for | |
801 | all but the largest lexicons. | |
802 | .PP | |
803 | Mercifully, this design decision becomes crucial only in the hairiest | |
804 | of inflected languages, of which Russian is by no means the \fIworst\fR case | |
805 | scenario, but is worse than most. Most languages have simpler | |
806 | inflection systems; for example, in English or Swahili, there are | |
807 | generally no more than two possible inflected forms for a given noun | |
808 | (\*(L"error/errors\*(R"; \*(L"kosa/makosa\*(R"), and the | |
809 | rules for producing these forms are fairly simple \*(-- or at least, | |
810 | simple rules can be formulated that work for most words, and you can | |
811 | then treat the exceptions as just \*(L"irregular\*(R", at least relative to | |
812 | your ad hoc rules. A simpler inflection system (simpler rules, fewer | |
813 | forms) means that design decisions are less crucial to maintaining | |
814 | sanity, whereas the same decisions could incur | |
815 | overhead-versus-scalability problems in languages like Russian. It | |
816 | may \fIalso\fR be likely that code (possibly in Perl, as with | |
817 | Lingua::EN::Inflect, for English nouns) has already | |
818 | been written for the language in question, whether simple or complex. | |
819 | .PP | |
820 | Moreover, a third possibility may even be simpler than anything | |
821 | discussed above: \*(L"Just require that all possible (or at least | |
822 | applicable) forms be provided in the call to the given language's quant | |
823 | method, as in:\*(R" | |
824 | .PP | |
825 | .Vb 1 | |
826 | \& "I found [quant,_1,file,files]." | |
827 | .Ve | |
828 | .PP | |
829 | That way, quant just has to choose which form it needs, without having | |
830 | to look up or generate anything. While possibly not optimal for | |
831 | Russian, this should work well for most other languages, where | |
832 | quantification is not as complicated an operation. | |
833 | .Sh "The Devil in the Details" | |
834 | .IX Subsection "The Devil in the Details" | |
835 | There's plenty more to Maketext than described above \*(-- for example, | |
836 | there's the details of how language tags (\*(L"en\-US\*(R", \*(L"i\-pwn\*(R", \*(L"fi\*(R", | |
837 | etc.) or locale IDs (\*(L"en_US\*(R") interact with actual module naming | |
838 | (\*(L"BogoQuery/Locale/en_us.pm\*(R"), and what magic can ensue; there's the | |
839 | details of how to record (and possibly negotiate) what character | |
840 | encoding Maketext will return text in (\s-1UTF8\s0? Latin\-1? \s-1KOI8\s0?). There's | |
841 | the interesting fact that Maketext is for localization, but nowhere | |
842 | actually has a "\f(CW\*(C`use locale;\*(C'\fR" anywhere in it. For the curious, | |
843 | there's the somewhat frightening details of how I actually | |
844 | implement something like data inheritance so that searches across | |
845 | modules' \f(CW%Lexicon\fR hashes can parallel how Perl implements method | |
846 | inheritance. | |
847 | .PP | |
848 | And, most importantly, there's all the practical details of how to | |
849 | actually go about deriving from Maketext so you can use it for your | |
850 | interfaces, and the various tools and conventions for starting out and | |
851 | maintaining individual language modules. | |
852 | .PP | |
853 | That is all covered in the documentation for Locale::Maketext and the | |
854 | modules that come with it, available in \s-1CPAN\s0. After having read this | |
855 | article, which covers the why's of Maketext, the documentation, | |
856 | which covers the how's of it, should be quite straightforward. | |
857 | .Sh "The Proof in the Pudding: Localizing Web Sites" | |
858 | .IX Subsection "The Proof in the Pudding: Localizing Web Sites" | |
859 | Maketext and gettext have a notable difference: gettext is in C, | |
860 | accessible thru C library calls, whereas Maketext is in Perl, and | |
861 | really can't work without a Perl interpreter (although I suppose | |
862 | something like it could be written for C). Accidents of history (and | |
863 | not necessarily lucky ones) have made \*(C+ the most common language for | |
864 | the implementation of applications like word processors, Web browsers, | |
865 | and even many in-house applications like custom query systems. Current | |
866 | conditions make it somewhat unlikely that the next one of any of these | |
867 | kinds of applications will be written in Perl, albeit clearly more for | |
868 | reasons of custom and inertia than out of consideration of what is the | |
869 | right tool for the job. | |
870 | .PP | |
871 | However, other accidents of history have made Perl a well-accepted | |
872 | language for design of server-side programs (generally in \s-1CGI\s0 form) | |
873 | for Web site interfaces. Localization of static pages in Web sites is | |
874 | trivial, feasible either with simple language-negotiation features in | |
875 | servers like Apache, or with some kind of server-side inclusions of | |
876 | language-appropriate text into layout templates. However, I think | |
877 | that the localization of Perl-based search systems (or other kinds of | |
878 | dynamic content) in Web sites, be they public or access\-restricted, | |
879 | is where Maketext will see the greatest use. | |
880 | .PP | |
881 | I presume that it would be only the exceptional Web site that gets | |
882 | localized for English \fIand\fR Chinese \fIand\fR Italian \fIand\fR Arabic | |
883 | \&\fIand\fR Russian, to recall the languages from the beginning of this | |
884 | article \*(-- to say nothing of German, Spanish, French, Japanese, | |
885 | Finnish, and Hindi, to name a few languages that benefit from large | |
886 | numbers of programmers or Web viewers or both. | |
887 | .PP | |
888 | However, the ever-increasing internationalization of the Web (whether | |
889 | measured in terms of amount of content, of numbers of content writers | |
890 | or programmers, or of size of content audiences) makes it increasingly | |
891 | likely that the interface to the average Web-based dynamic content | |
892 | service will be localized for two or maybe three languages. It is my | |
893 | hope that Maketext will make that task as simple as possible, and will | |
894 | remove previous barriers to localization for languages dissimilar to | |
895 | English. | |
896 | .PP | |
897 | .Vb 1 | |
898 | \& __END__ | |
899 | .Ve | |
900 | .PP | |
901 | Sean M. Burke (sburke@cpan.org) has a Master's in linguistics | |
902 | from Northwestern University; he specializes in language technology. | |
903 | Jordan Lachler (lachler@unm.edu) is a PhD student in the Department of | |
904 | Linguistics at the University of New Mexico; he specializes in | |
905 | morphology and pedagogy of North American native languages. | |
906 | .Sh "References" | |
907 | .IX Subsection "References" | |
908 | Alvestrand, Harald Tveit. 1995. \fI\s-1RFC\s0 1766: Tags for the | |
909 | Identification of Languages.\fR | |
910 | \&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1766.txt\*(C'\fR | |
911 | [Now see \s-1RFC\s0 3066.] | |
912 | .PP | |
913 | Callon, Ross, editor. 1996. \fI\s-1RFC\s0 1925: The Twelve | |
914 | Networking Truths.\fR | |
915 | \&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1925.txt\*(C'\fR | |
916 | .PP | |
917 | Drepper, Ulrich, Peter Miller, | |
918 | and Franc\*,ois Pinard. 1995\-2001. \s-1GNU\s0 | |
919 | \&\f(CW\*(C`gettext\*(C'\fR. Available in \f(CW\*(C`ftp://prep.ai.mit.edu/pub/gnu/\*(C'\fR, with | |
920 | extensive docs in the distribution tarball. [Since | |
921 | I wrote this article in 1998, I now see that the | |
922 | gettext docs are now trying more to come to terms with | |
923 | plurality. Whether useful conclusions have come from it | |
924 | is another question altogether. \*(-- \s-1SMB\s0, May 2001] | |
925 | .PP | |
926 | Forbes, Nevill. 1964. \fIRussian Grammar.\fR Third Edition, revised | |
927 | by J. C. Dumbreck. Oxford University Press. |