Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Locale::Maketext::TPJ13.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Locale::Maketext::TPJ13 3"
132.TH Locale::Maketext::TPJ13 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134Locale::Maketext::TPJ13 \-\- article about software localization
135.SH "SYNOPSIS"
136.IX Header "SYNOPSIS"
137.Vb 1
138\& # This is an article, not a module.
139.Ve
140.SH "DESCRIPTION"
141.IX Header "DESCRIPTION"
142The following article by Sean M. Burke and Jordan Lachler
143first appeared in \fIThe Perl
144Journal\fR #13 and is copyright 1999 The Perl Journal. It appears
145courtesy of Jon Orwant and The Perl Journal. This document may be
146distributed under the same terms as Perl itself.
147.SH "Localization and Perl: gettext breaks, Maketext fixes"
148.IX Header "Localization and Perl: gettext breaks, Maketext fixes"
149by Sean M. Burke and Jordan Lachler
150.PP
151This article points out cases where gettext (a common system for
152localizing software interfaces \*(-- i.e., making them work in the user's
153language of choice) fails because of basic differences between human
154languages. This article then describes Maketext, a new system capable
155of correctly treating these differences.
156.Sh "A Localization Horror Story: It Could Happen To You"
157.IX Subsection "A Localization Horror Story: It Could Happen To You"
158.RS 4
159\&\*(L"There are a number of languages spoken by human beings in this
160world.\*(R"
161.Sp
162\&\-\- Harald Tveit Alvestrand, in \s-1RFC\s0 1766, \*(L"Tags for the
163Identification of Languages\*(R"
164.RE
165.PP
166Imagine that your task for the day is to localize a piece of software
167\&\*(-- and luckily for you, the only output the program emits is two
168messages, like this:
169.PP
170.Vb 1
171\& I scanned 12 directories.
172.Ve
173.PP
174.Vb 1
175\& Your query matched 10 files in 4 directories.
176.Ve
177.PP
178So how hard could that be? You look at the code that
179produces the first item, and it reads:
180.PP
181.Vb 2
182\& printf("I scanned %g directories.",
183\& $directory_count);
184.Ve
185.PP
186You think about that, and realize that it doesn't even work right for
187English, as it can produce this output:
188.PP
189.Vb 1
190\& I scanned 1 directories.
191.Ve
192.PP
193So you rewrite it to read:
194.PP
195.Vb 5
196\& printf("I scanned %g %s.",
197\& $directory_count,
198\& $directory_count == 1 ?
199\& "directory" : "directories",
200\& );
201.Ve
202.PP
203\&...which does the Right Thing. (In case you don't recall, \*(L"%g\*(R" is for
204locale-specific number interpolation, and \*(L"%s\*(R" is for string
205interpolation.)
206.PP
207But you still have to localize it for all the languages you're
208producing this software for, so you pull Locale::gettext off of \s-1CPAN\s0
209so you can access the \f(CW\*(C`gettext\*(C'\fR C functions you've heard are standard
210for localization tasks.
211.PP
212And you write:
213.PP
214.Vb 5
215\& printf(gettext("I scanned %g %s."),
216\& $dir_scan_count,
217\& $dir_scan_count == 1 ?
218\& gettext("directory") : gettext("directories"),
219\& );
220.Ve
221.PP
222But you then read in the gettext manual (Drepper, Miller, and Pinard 1995)
223that this is not a good idea, since how a single word like \*(L"directory\*(R"
224or \*(L"directories\*(R" is translated may depend on context \*(-- and this is
225true, since in a case language like German or Russian, you may need
226these words with a different case ending in the first instance (where the
227word is the object of a verb) than in the second instance, which you haven't even
228gotten to yet (where the word is the object of a preposition, \*(L"in \f(CW%g\fR
229directories\*(R") \*(-- assuming these keep the same syntax when translated
230into those languages.
231.PP
232So, on the advice of the gettext manual, you rewrite:
233.PP
234.Vb 4
235\& printf( $dir_scan_count == 1 ?
236\& gettext("I scanned %g directory.") :
237\& gettext("I scanned %g directories."),
238\& $dir_scan_count );
239.Ve
240.PP
241So, you email your various translators (the boss decides that the
242languages du jour are Chinese, Arabic, Russian, and Italian, so you
243have one translator for each), asking for translations for \*(L"I scanned
244\&\f(CW%g\fR directory.\*(R" and \*(L"I scanned \f(CW%g\fR directories.\*(R". When they reply,
245you'll put that in the lexicons for gettext to use when it localizes
246your software, so that when the user is running under the \*(L"zh\*(R"
247(Chinese) locale, gettext(\*(L"I scanned \f(CW%g\fR directory.\*(R") will return the
248appropriate Chinese text, with a \*(L"%g\*(R" in there where printf can then
249interpolate \f(CW$dir_scan_count\fR.
250.PP
251Your Chinese translator emails right back \*(-- he says both of these
252phrases translate to the same thing in Chinese, because, in linguistic
253jargon, Chinese \*(L"doesn't have number as a grammatical category\*(R" \*(--
254whereas English does. That is, English has grammatical rules that
255refer to \*(L"number\*(R", i.e., whether something is grammatically singular
256or plural; and one of these rules is the one that forces nouns to take
257a plural suffix (generally \*(L"s\*(R") when in a plural context, as they are when
258they follow a number other than \*(L"one\*(R" (including, oddly enough, \*(L"zero\*(R").
259Chinese has no such rules, and so has just the one phrase where English
260has two. But, no problem, you can have this one Chinese phrase appear
261as the translation for the two English phrases in the \*(L"zh\*(R" gettext
262lexicon for your program.
263.PP
264Emboldened by this, you dive into the second phrase that your software
265needs to output: \*(L"Your query matched 10 files in 4 directories.\*(R". You notice
266that if you want to treat phrases as indivisible, as the gettext
267manual wisely advises, you need four cases now, instead of two, to
268cover the permutations of singular and plural on the two items,
269\&\f(CW$directory_count\fR and \f(CW$file_count\fR. So you try this:
270.PP
271.Vb 9
272\& printf( $file_count == 1 ?
273\& ( $directory_count == 1 ?
274\& gettext("Your query matched %g file in %g directory.") :
275\& gettext("Your query matched %g file in %g directories.") ) :
276\& ( $directory_count == 1 ?
277\& gettext("Your query matched %g files in %g directory.") :
278\& gettext("Your query matched %g files in %g directories.") ),
279\& $file_count, $directory_count,
280\& );
281.Ve
282.PP
283(The case of \*(L"1 file in 2 [or more] directories\*(R" could, I suppose,
284occur in the case of symlinking or something of the sort.)
285.PP
286It occurs to you that this is not the prettiest code you've ever
287written, but this seems the way to go. You mail off to the
288translators asking for translations for these four cases. The
289Chinese guy replies with the one phrase that these all translate to in
290Chinese, and that phrase has two \*(L"%g\*(R"s in it, as it should \*(-- but
291there's a problem. He translates it word-for-word back: \*(L"In \f(CW%g\fR
292directories contains \f(CW%g\fR files match your query.\*(R" The \f(CW%g\fR
293slots are in an order reverse to what they are in English. You wonder
294how you'll get gettext to handle that.
295.PP
296But you put it aside for the moment, and optimistically hope that the
297other translators won't have this problem, and that their languages
298will be better behaved \*(-- i.e., that they will be just like English.
299.PP
300But the Arabic translator is the next to write back. First off, your
301code for \*(L"I scanned \f(CW%g\fR directory.\*(R" or \*(L"I scanned \f(CW%g\fR directories.\*(R"
302assumes there's only singular or plural. But, to use linguistic
303jargon again, Arabic has grammatical number, like English (but unlike
304Chinese), but it's a three-term category: singular, dual, and plural.
305In other words, the way you say \*(L"directory\*(R" depends on whether there's
306one directory, or \fItwo\fR of them, or \fImore than two\fR of them. Your
307test of \f(CW\*(C`($directory_count == 1)\*(C'\fR no longer does the job. And it means
308that where English's grammatical category of number necessitates
309only the two permutations of the first sentence based on \*(L"directory
310[singular]\*(R" and \*(L"directories [plural]\*(R", Arabic has three \*(-- and,
311worse, in the second sentence (\*(L"Your query matched \f(CW%g\fR file in \f(CW%g\fR
312directory.\*(R"), where English has four, Arabic has nine. You sense
313an unwelcome, exponential trend taking shape.
314.PP
315Your Italian translator emails you back and says that \*(L"I searched 0
316directories\*(R" (a possible English output of your program) is stilted,
317and if you think that's fine English, that's your problem, but that
318\&\fIjust will not do\fR in the language of Dante. He insists that where
319\&\f(CW$directory_count\fR is 0, your program should produce the Italian text
320for \*(L"I \fIdidn't\fR scan \fIany\fR directories.\*(R". And ditto for \*(L"I didn't
321match any files in any directories\*(R", although he says the last part
322about \*(L"in any directories\*(R" should probably just be left off.
323.PP
324You wonder how you'll get gettext to handle this; to accommodate the
325ways Arabic, Chinese, and Italian deal with numbers in just these few
326very simple phrases, you need to write code that will ask gettext for
327different queries depending on whether the numerical values in
328question are 1, 2, more than 2, or in some cases 0, and you still haven't
329figured out the problem with the different word order in Chinese.
330.PP
331Then your Russian translator calls on the phone, to \fIpersonally\fR tell
332you the bad news about how really unpleasant your life is about to
333become:
334.PP
335Russian, like German or Latin, is an inflectional language; that is, nouns
336and adjectives have to take endings that depend on their case
337(i.e., nominative, accusative, genitive, etc...) \*(-- which is roughly a matter of
338what role they have in the syntax of the sentence \*(--
339as well as on the grammatical gender (i.e., masculine, feminine, neuter)
340and number (i.e., singular or plural) of the noun, as well as on the
341declension class of the noun. But unlike with most other inflected languages,
342putting a number-phrase (like \*(L"ten\*(R" or \*(L"forty\-three\*(R", or their Arabic
343numeral equivalents) in front of a noun in Russian can change the case and
344number that noun is, and therefore the endings you have to put on it.
345.PP
346He elaborates: In \*(L"I scanned \f(CW%g\fR directories\*(R", you'd \fIexpect\fR
347\&\*(L"directories\*(R" to be in the accusative case (since it is the direct
348object in the sentence) and the plural number,
349except where \f(CW$directory_count\fR is 1, then you'd expect the singular, of
350course. Just like Latin or German. \fIBut!\fR Where \f(CW$directory_count\fR %
35110 is 1 (\*(L"%\*(R" for modulo, remember), assuming \f(CW$directory_count\fR is an
352integer, and except where \f(CW$directory_count\fR % 100 is 11, \*(L"directories\*(R"
353is forced to become grammatically singular, which means it gets the
354ending for the accusative singular... You begin to visualize the code
355it'd take to test for the problem so far, \fIand still work for Chinese
356and Arabic and Italian\fR, and how many gettext items that'd take, but
357he keeps going... But where \f(CW$directory_count\fR % 10 is 2, 3, or 4
358(except where \f(CW$directory_count\fR % 100 is 12, 13, or 14), the word for
359\&\*(L"directories\*(R" is forced to be genitive singular \*(-- which means another
360ending... The room begins to spin around you, slowly at first... But
361with \fIall other\fR integer values, since \*(L"directory\*(R" is an inanimate
362noun, when preceded by a number and in the nominative or accusative
363cases (as it is here, just your luck!), it does stay plural, but it is
364forced into the genitive case \*(-- yet another ending... And
365you never hear him get to the part about how you're going to run into
366similar (but maybe subtly different) problems with other Slavic
367languages like Polish, because the floor comes up to meet you, and you
368fade into unconsciousness.
369.PP
370The above cautionary tale relates how an attempt at localization can
371lead from programmer consternation, to program obfuscation, to a need
372for sedation. But careful evaluation shows that your choice of tools
373merely needed further consideration.
374.Sh "The Linguistic View"
375.IX Subsection "The Linguistic View"
376.RS 4
377\&\*(L"It is more complicated than you think.\*(R"
378.Sp
379\&\-\- The Eighth Networking Truth, from \s-1RFC\s0 1925
380.RE
381.PP
382The field of Linguistics has expended a great deal of effort over the
383past century trying to find grammatical patterns which hold across
384languages; it's been a constant process
385of people making generalizations that should apply to all languages,
386only to find out that, all too often, these generalizations fail \*(--
387sometimes failing for just a few languages, sometimes whole classes of
388languages, and sometimes nearly every language in the world except
389English. Broad statistical trends are evident in what the \*(L"average
390language\*(R" is like as far as what its rules can look like, must look
391like, and cannot look like. But the \*(L"average language\*(R" is just as
392unreal a concept as the \*(L"average person\*(R" \*(-- it runs up against the
393fact no language (or person) is, in fact, average. The wisdom of past
394experience leads us to believe that any given language can do whatever
395it wants, in any order, with appeal to any kind of grammatical
396categories it wants \*(-- case, number, tense, real or metaphoric
397characteristics of the things that words refer to, arbitrary or
398predictable classifications of words based on what endings or prefixes
399they can take, degree or means of certainty about the truth of
400statements expressed, and so on, ad infinitum.
401.PP
402Mercifully, most localization tasks are a matter of finding ways to
403translate whole phrases, generally sentences, where the context is
404relatively set, and where the only variation in content is \fIusually\fR
405in a number being expressed \*(-- as in the example sentences above.
406Translating specific, fully-formed sentences is, in practice, fairly
407foolproof \*(-- which is good, because that's what's in the phrasebooks
408that so many tourists rely on. Now, a given phrase (whether in a
409phrasebook or in a gettext lexicon) in one language \fImight\fR have a
410greater or lesser applicability than that phrase's translation into
411another language \*(-- for example, strictly speaking, in Arabic, the
412\&\*(L"your\*(R" in \*(L"Your query matched...\*(R" would take a different form
413depending on whether the user is male or female; so the Arabic
414translation \*(L"your[feminine] query\*(R" is applicable in fewer cases than
415the corresponding English phrase, which doesn't distinguish the user's
416gender. (In practice, it's not feasible to have a program know the
417user's gender, so the masculine \*(L"you\*(R" in Arabic is usually used, by
418default.)
419.PP
420But in general, such surprises are rare when entire sentences are
421being translated, especially when the functional context is restricted
422to that of a computer interacting with a user either to convey a fact
423or to prompt for a piece of information. So, for purposes of
424localization, translation by phrase (generally by sentence) is both the
425simplest and the least problematic.
426.Sh "Breaking gettext"
427.IX Subsection "Breaking gettext"
428.RS 4
429\&\*(L"It Has To Work.\*(R"
430.Sp
431\&\-\- First Networking Truth, \s-1RFC\s0 1925
432.RE
433.PP
434Consider that sentences in a tourist phrasebook are of two types: ones
435like \*(L"How do I get to the marketplace?\*(R" that don't have any blanks to
436fill in, and ones like \*(L"How much do these _\|__ cost?\*(R", where there's
437one or more blanks to fill in (and these are usually linked to a
438list of words that you can put in that blank: \*(L"fish\*(R", \*(L"potatoes\*(R",
439\&\*(L"tomatoes\*(R", etc.) The ones with no blanks are no problem, but the
440fill-in-the-blank ones may not be really straightforward. If it's a
441Swahili phrasebook, for example, the authors probably didn't bother to
442tell you the complicated ways that the verb \*(L"cost\*(R" changes its
443inflectional prefix depending on the noun you're putting in the blank.
444The trader in the marketplace will still understand what you're saying if
445you say \*(L"how much do these potatoes cost?\*(R" with the wrong
446inflectional prefix on \*(L"cost\*(R". After all, \fIyou\fR can't speak proper Swahili,
447\&\fIyou're\fR just a tourist. But while tourists can be stupid, computers
448are supposed to be smart; the computer should be able to fill in the
449blank, and still have the results be grammatical.
450.PP
451In other words, a phrasebook entry takes some values as parameters
452(the things that you fill in the blank or blanks), and provides a value
453based on these parameters, where the way you get that final value from
454the given values can, properly speaking, involve an arbitrarily
455complex series of operations. (In the case of Chinese, it'd be not at
456all complex, at least in cases like the examples at the beginning of
457this article; whereas in the case of Russian it'd be a rather complex
458series of operations. And in some languages, the
459complexity could be spread around differently: while the act of
460putting a number-expression in front of a noun phrase might not be
461complex by itself, it may change how you have to, for example, inflect
462a verb elsewhere in the sentence. This is what in syntax is called
463\&\*(L"long\-distance dependencies\*(R".)
464.PP
465This talk of parameters and arbitrary complexity is just another way
466to say that an entry in a phrasebook is what in a programming language
467would be called a \*(L"function\*(R". Just so you don't miss it, this is the
468crux of this article: \fIA phrase is a function; a phrasebook is a
469bunch of functions.\fR
470.PP
471The reason that using gettext runs into walls (as in the above
472second-person horror story) is that you're trying to use a string (or
473worse, a choice among a bunch of strings) to do what you really need a
474function for \*(-- which is futile. Performing (s)printf interpolation
475on the strings which you get back from gettext does allow you to do \fIsome\fR
476common things passably well... sometimes... sort of; but, to paraphrase
477what some people say about \f(CW\*(C`csh\*(C'\fR script programming, \*(L"it fools you
478into thinking you can use it for real things, but you can't, and you
479don't discover this until you've already spent too much time trying,
480and by then it's too late.\*(R"
481.Sh "Replacing gettext"
482.IX Subsection "Replacing gettext"
483So, what needs to replace gettext is a system that supports lexicons
484of functions instead of lexicons of strings. An entry in a lexicon
485from such a system should \fInot\fR look like this:
486.PP
487.Vb 1
488\& "J'ai trouv\exE9 %g fichiers dans %g r\exE9pertoires"
489.Ve
490.PP
491[\exE9 is e\-acute in Latin\-1. Some pod renderers would
492scream if I used the actual character here. \*(-- \s-1SB\s0]
493.PP
494but instead like this, bearing in mind that this is just a first stab:
495.PP
496.Vb 8
497\& sub I_found_X1_files_in_X2_directories {
498\& my( $files, $dirs ) = @_[0,1];
499\& $files = sprintf("%g %s", $files,
500\& $files == 1 ? 'fichier' : 'fichiers');
501\& $dirs = sprintf("%g %s", $dirs,
502\& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires");
503\& return "J'ai trouv\exE9 $files dans $dirs.";
504\& }
505.Ve
506.PP
507Now, there's no particularly obvious way to store anything but strings
508in a gettext lexicon; so it looks like we just have to start over and
509make something better, from scratch. I call my shot at a
510gettext-replacement system \*(L"Maketext\*(R", or, in \s-1CPAN\s0 terms,
511Locale::Maketext.
512.PP
513When designing Maketext, I chose to plan its main features in terms of
514\&\*(L"buzzword compliance\*(R". And here are the buzzwords:
515.Sh "Buzzwords: Abstraction and Encapsulation"
516.IX Subsection "Buzzwords: Abstraction and Encapsulation"
517The complexity of the language you're trying to output a phrase in is
518entirely abstracted inside (and encapsulated within) the Maketext module
519for that interface. When you call:
520.PP
521.Vb 2
522\& print $lang->maketext("You have [quant,_1,piece] of new mail.",
523\& scalar(@messages));
524.Ve
525.PP
526you don't know (and in fact can't easily find out) whether this will
527involve lots of figuring, as in Russian (if \f(CW$lang\fR is a handle to the
528Russian module), or relatively little, as in Chinese. That kind of
529abstraction and encapsulation may encourage other pleasant buzzwords
530like modularization and stratification, depending on what design
531decisions you make.
532.Sh "Buzzword: Isomorphism"
533.IX Subsection "Buzzword: Isomorphism"
534\&\*(L"Isomorphism\*(R" means \*(L"having the same structure or form\*(R"; in discussions
535of program design, the word takes on the special, specific meaning that
536your implementation of a solution to a problem \fIhas the same
537structure\fR as, say, an informal verbal description of the solution, or
538maybe of the problem itself. Isomorphism is, all things considered,
539a good thing \*(-- it's what problem-solving (and solution\-implementing)
540should look like.
541.PP
542What's wrong with gettext-using code like this...
543.PP
544.Vb 9
545\& printf( $file_count == 1 ?
546\& ( $directory_count == 1 ?
547\& "Your query matched %g file in %g directory." :
548\& "Your query matched %g file in %g directories." ) :
549\& ( $directory_count == 1 ?
550\& "Your query matched %g files in %g directory." :
551\& "Your query matched %g files in %g directories." ),
552\& $file_count, $directory_count,
553\& );
554.Ve
555.PP
556is first off that it's not well abstracted \*(-- these ways of testing
557for grammatical number (as in the expressions like \f(CW\*(C`foo == 1 ?
558singular_form : plural_form\*(C'\fR) should be abstracted to each language
559module, since how you get grammatical number is language\-specific.
560.PP
561But second off, it's not isomorphic \*(-- the \*(L"solution\*(R" (i.e., the
562phrasebook entries) for Chinese maps from these four English phrases to
563the one Chinese phrase that fits for all of them. In other words, the
564informal solution would be \*(L"The way to say what you want in Chinese is
565with the one phrase 'For your question, in Y directories you would
566find X files'\*(R" \*(-- and so the implemented solution should be,
567isomorphically, just a straightforward way to spit out that one
568phrase, with numerals properly interpolated. It shouldn't have to map
569from the complexity of other languages to the simplicity of this one.
570.Sh "Buzzword: Inheritance"
571.IX Subsection "Buzzword: Inheritance"
572There's a great deal of reuse possible for sharing of phrases between
573modules for related dialects, or for sharing of auxiliary functions
574between related languages. (By \*(L"auxiliary functions\*(R", I mean
575functions that don't produce phrase\-text, but which, say, return an
576answer to \*(L"does this number require a plural noun after it?\*(R". Such
577auxiliary functions would be used in the internal logic of functions
578that actually do produce phrase\-text.)
579.PP
580In the case of sharing phrases, consider that you have an interface
581already localized for American English (probably by having been
582written with that as the native locale, but that's incidental).
583Localizing it for \s-1UK\s0 English should, in practical terms, be just a
584matter of running it past a British person with the instructions to
585indicate what few phrases would benefit from a change in spelling or
586possibly minor rewording. In that case, you should be able to put in
587the \s-1UK\s0 English localization module \fIonly\fR those phrases that are
588UK\-specific, and for all the rest, \fIinherit\fR from the American
589English module. (And I expect this same situation would apply with
590Brazilian and Continental Portuguese, possibly with some \fIvery\fR
591closely related languages like Czech and Slovak, and possibly with the
592slightly different \*(L"versions\*(R" of written Mandarin Chinese, as I hear exist in
593Taiwan and mainland China.)
594.PP
595As to sharing of auxiliary functions, consider the problem of Russian
596numbers from the beginning of this article; obviously, you'd want to
597write only once the hairy code that, given a numeric value, would
598return some specification of which case and number a given quantified
599noun should use. But suppose that you discover, while localizing an
600interface for, say, Ukrainian (a Slavic language related to Russian,
601spoken by several million people, many of whom would be relieved to
602find that your Web site's or software's interface is available in
603their language), that the rules in Ukrainian are the same as in Russian
604for quantification, and probably for many other grammatical functions.
605While there may well be no phrases in common between Russian and
606Ukrainian, you could still choose to have the Ukrainian module inherit
607from the Russian module, just for the sake of inheriting all the
608various grammatical methods. Or, probably better organizationally,
609you could move those functions to a module called \f(CW\*(C`_E_Slavic\*(C'\fR or
610something, which Russian and Ukrainian could inherit useful functions
611from, but which would (presumably) provide no lexicon.
612.Sh "Buzzword: Concision"
613.IX Subsection "Buzzword: Concision"
614Okay, concision isn't a buzzword. But it should be, so I decree that
615as a new buzzword, \*(L"concision\*(R" means that simple common things should
616be expressible in very few lines (or maybe even just a few characters)
617of code \*(-- call it a special case of \*(L"making simple things easy and
618hard things possible\*(R", and see also the role it played in the
619MIDI::Simple language, discussed elsewhere in this issue [TPJ#13].
620.PP
621Consider our first stab at an entry in our \*(L"phrasebook of functions\*(R":
622.PP
623.Vb 8
624\& sub I_found_X1_files_in_X2_directories {
625\& my( $files, $dirs ) = @_[0,1];
626\& $files = sprintf("%g %s", $files,
627\& $files == 1 ? 'fichier' : 'fichiers');
628\& $dirs = sprintf("%g %s", $dirs,
629\& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires");
630\& return "J'ai trouv\exE9 $files dans $dirs.";
631\& }
632.Ve
633.PP
634You may sense that a lexicon (to use a non-committal catch-all term for a
635collection of things you know how to say, regardless of whether they're
636phrases or words) consisting of functions \fIexpressed\fR as above would
637make for rather long-winded and repetitive code \*(-- even if you wisely
638rewrote this to have quantification (as we call adding a number
639expression to a noun phrase) be a function called like:
640.PP
641.Vb 6
642\& sub I_found_X1_files_in_X2_directories {
643\& my( $files, $dirs ) = @_[0,1];
644\& $files = quant($files, "fichier");
645\& $dirs = quant($dirs, "r\exE9pertoire");
646\& return "J'ai trouv\exE9 $files dans $dirs.";
647\& }
648.Ve
649.PP
650And you may also sense that you do not want to bother your translators
651with having to write Perl code \*(-- you'd much rather that they spend
652their \fIvery costly time\fR on just translation. And this is to say
653nothing of the near impossibility of finding a commercial translator
654who would know even simple Perl.
655.PP
656In a first-hack implementation of Maketext, each language\-module's
657lexicon looked like this:
658.PP
659.Vb 10
660\& %Lexicon = (
661\& "I found %g files in %g directories"
662\& => sub {
663\& my( $files, $dirs ) = @_[0,1];
664\& $files = quant($files, "fichier");
665\& $dirs = quant($dirs, "r\exE9pertoire");
666\& return "J'ai trouv\exE9 $files dans $dirs.";
667\& },
668\& ... and so on with other phrase => sub mappings ...
669\& );
670.Ve
671.PP
672but I immediately went looking for some more concise way to basically
673denote the same phrase-function \*(-- a way that would also serve to
674concisely denote \fImost\fR phrase-functions in the lexicon for \fImost\fR
675languages. After much time and even some actual thought, I decided on
676this system:
677.PP
678* Where a value in a \f(CW%Lexicon\fR hash is a contentful string instead of
679an anonymous sub (or, conceivably, a coderef), it would be interpreted
680as a sort of shorthand expression of what the sub does. When accessed
681for the first time in a session, it is parsed, turned into Perl code,
682and then eval'd into an anonymous sub; then that sub replaces the
683original string in that lexicon. (That way, the work of parsing and
684evaling the shorthand form for a given phrase is done no more than
685once per session.)
686.PP
687* Calls to \f(CW\*(C`maketext\*(C'\fR (as Maketext's main function is called) happen
688thru a \*(L"language session handle\*(R", notionally very much like an \s-1IO\s0
689handle, in that you open one at the start of the session, and use it
690for \*(L"sending signals\*(R" to an object in order to have it return the text
691you want.
692.PP
693So, this:
694.PP
695.Vb 2
696\& $lang->maketext("You have [quant,_1,piece] of new mail.",
697\& scalar(@messages));
698.Ve
699.PP
700basically means this: look in the lexicon for \f(CW$lang\fR (which may inherit
701from any number of other lexicons), and find the function that we
702happen to associate with the string \*(L"You have [quant,_1,piece] of new
703mail\*(R" (which is, and should be, a functioning \*(L"shorthand\*(R" for this
704function in the native locale \*(-- English in this case). If you find
705such a function, call it with \f(CW$lang\fR as its first parameter (as if it
706were a method), and then a copy of scalar(@messages) as its second,
707and then return that value. If that function was found, but was in
708string shorthand instead of being a fully specified function, parse it
709and make it into a function before calling it the first time.
710.PP
711* The shorthand uses code in brackets to indicate method calls that
712should be performed. A full explanation is not in order here, but a
713few examples will suffice:
714.PP
715.Vb 1
716\& "You have [quant,_1,piece] of new mail."
717.Ve
718.PP
719The above code is shorthand for, and will be interpreted as,
720this:
721.PP
722.Vb 8
723\& sub {
724\& my $handle = $_[0];
725\& my(@params) = @_;
726\& return join '',
727\& "You have ",
728\& $handle->quant($params[1], 'piece'),
729\& " of new mail.";
730\& }
731.Ve
732.PP
733where \*(L"quant\*(R" is the name of a method you're using to quantify the
734noun \*(L"piece\*(R" with the number \f(CW$params\fR[1].
735.PP
736A string with no brackety calls, like this:
737.PP
738.Vb 1
739\& "Your search expression was malformed."
740.Ve
741.PP
742is somewhat of a degenerate case, and just gets turned into:
743.PP
744.Vb 1
745\& sub { return "Your search expression was malformed." }
746.Ve
747.PP
748However, not everything you can write in Perl code can be written in
749the above shorthand system \*(-- not by a long shot. For example, consider
750the Italian translator from the beginning of this article, who wanted
751the Italian for \*(L"I didn't find any files\*(R" as a special case, instead
752of \*(L"I found 0 files\*(R". That couldn't be specified (at least not easily
753or simply) in our shorthand system, and it would have to be written
754out in full, like this:
755.PP
756.Vb 10
757\& sub { # pretend the English strings are in Italian
758\& my($handle, $files, $dirs) = @_[0,1,2];
759\& return "I didn't find any files" unless $files;
760\& return join '',
761\& "I found ",
762\& $handle->quant($files, 'file'),
763\& " in ",
764\& $handle->quant($dirs, 'directory'),
765\& ".";
766\& }
767.Ve
768.PP
769Next to a lexicon full of shorthand code, that sort of sticks out like a
770sore thumb \*(-- but this \fIis\fR a special case, after all; and at least
771it's possible, if not as concise as usual.
772.PP
773As to how you'd implement the Russian example from the beginning of
774the article, well, There's More Than One Way To Do It, but it could be
775something like this (using English words for Russian, just so you know
776what's going on):
777.PP
778.Vb 1
779\& "I [quant,_1,directory,accusative] scanned."
780.Ve
781.PP
782This shifts the burden of complexity off to the quant method. That
783method's parameters are: the numeric value it's going to use to
784quantify something; the Russian word it's going to quantify; and the
785parameter \*(L"accusative\*(R", which you're using to mean that this
786sentence's syntax wants a noun in the accusative case there, although
787that quantification method may have to overrule, for grammatical
788reasons you may recall from the beginning of this article.
789.PP
790Now, the Russian quant method here is responsible not only for
791implementing the strange logic necessary for figuring out how Russian
792number-phrases impose case and number on their noun\-phrases, but also
793for inflecting the Russian word for \*(L"directory\*(R". How that inflection
794is to be carried out is no small issue, and among the solutions I've
795seen, some (like variations on a simple lookup in a hash where all
796possible forms are provided for all necessary words) are
797straightforward but \fIcan\fR become cumbersome when you need to inflect
798more than a few dozen words; and other solutions (like using
799algorithms to model the inflections, storing only root forms and
800irregularities) \fIcan\fR involve more overhead than is justifiable for
801all but the largest lexicons.
802.PP
803Mercifully, this design decision becomes crucial only in the hairiest
804of inflected languages, of which Russian is by no means the \fIworst\fR case
805scenario, but is worse than most. Most languages have simpler
806inflection systems; for example, in English or Swahili, there are
807generally no more than two possible inflected forms for a given noun
808(\*(L"error/errors\*(R"; \*(L"kosa/makosa\*(R"), and the
809rules for producing these forms are fairly simple \*(-- or at least,
810simple rules can be formulated that work for most words, and you can
811then treat the exceptions as just \*(L"irregular\*(R", at least relative to
812your ad hoc rules. A simpler inflection system (simpler rules, fewer
813forms) means that design decisions are less crucial to maintaining
814sanity, whereas the same decisions could incur
815overhead-versus-scalability problems in languages like Russian. It
816may \fIalso\fR be likely that code (possibly in Perl, as with
817Lingua::EN::Inflect, for English nouns) has already
818been written for the language in question, whether simple or complex.
819.PP
820Moreover, a third possibility may even be simpler than anything
821discussed above: \*(L"Just require that all possible (or at least
822applicable) forms be provided in the call to the given language's quant
823method, as in:\*(R"
824.PP
825.Vb 1
826\& "I found [quant,_1,file,files]."
827.Ve
828.PP
829That way, quant just has to choose which form it needs, without having
830to look up or generate anything. While possibly not optimal for
831Russian, this should work well for most other languages, where
832quantification is not as complicated an operation.
833.Sh "The Devil in the Details"
834.IX Subsection "The Devil in the Details"
835There's plenty more to Maketext than described above \*(-- for example,
836there's the details of how language tags (\*(L"en\-US\*(R", \*(L"i\-pwn\*(R", \*(L"fi\*(R",
837etc.) or locale IDs (\*(L"en_US\*(R") interact with actual module naming
838(\*(L"BogoQuery/Locale/en_us.pm\*(R"), and what magic can ensue; there's the
839details of how to record (and possibly negotiate) what character
840encoding Maketext will return text in (\s-1UTF8\s0? Latin\-1? \s-1KOI8\s0?). There's
841the interesting fact that Maketext is for localization, but nowhere
842actually has a "\f(CW\*(C`use locale;\*(C'\fR" anywhere in it. For the curious,
843there's the somewhat frightening details of how I actually
844implement something like data inheritance so that searches across
845modules' \f(CW%Lexicon\fR hashes can parallel how Perl implements method
846inheritance.
847.PP
848And, most importantly, there's all the practical details of how to
849actually go about deriving from Maketext so you can use it for your
850interfaces, and the various tools and conventions for starting out and
851maintaining individual language modules.
852.PP
853That is all covered in the documentation for Locale::Maketext and the
854modules that come with it, available in \s-1CPAN\s0. After having read this
855article, which covers the why's of Maketext, the documentation,
856which covers the how's of it, should be quite straightforward.
857.Sh "The Proof in the Pudding: Localizing Web Sites"
858.IX Subsection "The Proof in the Pudding: Localizing Web Sites"
859Maketext and gettext have a notable difference: gettext is in C,
860accessible thru C library calls, whereas Maketext is in Perl, and
861really can't work without a Perl interpreter (although I suppose
862something like it could be written for C). Accidents of history (and
863not necessarily lucky ones) have made \*(C+ the most common language for
864the implementation of applications like word processors, Web browsers,
865and even many in-house applications like custom query systems. Current
866conditions make it somewhat unlikely that the next one of any of these
867kinds of applications will be written in Perl, albeit clearly more for
868reasons of custom and inertia than out of consideration of what is the
869right tool for the job.
870.PP
871However, other accidents of history have made Perl a well-accepted
872language for design of server-side programs (generally in \s-1CGI\s0 form)
873for Web site interfaces. Localization of static pages in Web sites is
874trivial, feasible either with simple language-negotiation features in
875servers like Apache, or with some kind of server-side inclusions of
876language-appropriate text into layout templates. However, I think
877that the localization of Perl-based search systems (or other kinds of
878dynamic content) in Web sites, be they public or access\-restricted,
879is where Maketext will see the greatest use.
880.PP
881I presume that it would be only the exceptional Web site that gets
882localized for English \fIand\fR Chinese \fIand\fR Italian \fIand\fR Arabic
883\&\fIand\fR Russian, to recall the languages from the beginning of this
884article \*(-- to say nothing of German, Spanish, French, Japanese,
885Finnish, and Hindi, to name a few languages that benefit from large
886numbers of programmers or Web viewers or both.
887.PP
888However, the ever-increasing internationalization of the Web (whether
889measured in terms of amount of content, of numbers of content writers
890or programmers, or of size of content audiences) makes it increasingly
891likely that the interface to the average Web-based dynamic content
892service will be localized for two or maybe three languages. It is my
893hope that Maketext will make that task as simple as possible, and will
894remove previous barriers to localization for languages dissimilar to
895English.
896.PP
897.Vb 1
898\& __END__
899.Ve
900.PP
901Sean M. Burke (sburke@cpan.org) has a Master's in linguistics
902from Northwestern University; he specializes in language technology.
903Jordan Lachler (lachler@unm.edu) is a PhD student in the Department of
904Linguistics at the University of New Mexico; he specializes in
905morphology and pedagogy of North American native languages.
906.Sh "References"
907.IX Subsection "References"
908Alvestrand, Harald Tveit. 1995. \fI\s-1RFC\s0 1766: Tags for the
909Identification of Languages.\fR
910\&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1766.txt\*(C'\fR
911[Now see \s-1RFC\s0 3066.]
912.PP
913Callon, Ross, editor. 1996. \fI\s-1RFC\s0 1925: The Twelve
914Networking Truths.\fR
915\&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1925.txt\*(C'\fR
916.PP
917Drepper, Ulrich, Peter Miller,
918and Franc\*,ois Pinard. 1995\-2001. \s-1GNU\s0
919\&\f(CW\*(C`gettext\*(C'\fR. Available in \f(CW\*(C`ftp://prep.ai.mit.edu/pub/gnu/\*(C'\fR, with
920extensive docs in the distribution tarball. [Since
921I wrote this article in 1998, I now see that the
922gettext docs are now trying more to come to terms with
923plurality. Whether useful conclusions have come from it
924is another question altogether. \*(-- \s-1SMB\s0, May 2001]
925.PP
926Forbes, Nevill. 1964. \fIRussian Grammar.\fR Third Edition, revised
927by J. C. Dumbreck. Oxford University Press.