git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/man/man3/Locale::Maketext::TPJ13.3

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "Locale::Maketext::TPJ13 3"
	132	.TH Locale::Maketext::TPJ13 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	Locale::Maketext::TPJ13 \-\- article about software localization
	135	.SH "SYNOPSIS"
	136	.IX Header "SYNOPSIS"
	137	.Vb 1
	138	\& # This is an article, not a module.
	139	.Ve
	140	.SH "DESCRIPTION"
	141	.IX Header "DESCRIPTION"
	142	The following article by Sean M. Burke and Jordan Lachler
	143	first appeared in \fIThe Perl
	144	Journal\fR #13 and is copyright 1999 The Perl Journal. It appears
	145	courtesy of Jon Orwant and The Perl Journal. This document may be
	146	distributed under the same terms as Perl itself.
	147	.SH "Localization and Perl: gettext breaks, Maketext fixes"
	148	.IX Header "Localization and Perl: gettext breaks, Maketext fixes"
	149	by Sean M. Burke and Jordan Lachler
	150	.PP
	151	This article points out cases where gettext (a common system for
	152	localizing software interfaces \*(-- i.e., making them work in the user's
	153	language of choice) fails because of basic differences between human
	154	languages. This article then describes Maketext, a new system capable
	155	of correctly treating these differences.
	156	.Sh "A Localization Horror Story: It Could Happen To You"
	157	.IX Subsection "A Localization Horror Story: It Could Happen To You"
	158	.RS 4
	159	\&\*(L"There are a number of languages spoken by human beings in this
	160	world.\*(R"
	161	.Sp
	162	\&\-\- Harald Tveit Alvestrand, in \s-1RFC\s0 1766, \*(L"Tags for the
	163	Identification of Languages\*(R"
	164	.RE
	165	.PP
	166	Imagine that your task for the day is to localize a piece of software
	167	\&\*(-- and luckily for you, the only output the program emits is two
	168	messages, like this:
	169	.PP
	170	.Vb 1
	171	\& I scanned 12 directories.
	172	.Ve
	173	.PP
	174	.Vb 1
	175	\& Your query matched 10 files in 4 directories.
	176	.Ve
	177	.PP
	178	So how hard could that be? You look at the code that
	179	produces the first item, and it reads:
	180	.PP
	181	.Vb 2
	182	\& printf("I scanned %g directories.",
	183	\& $directory_count);
	184	.Ve
	185	.PP
	186	You think about that, and realize that it doesn't even work right for
	187	English, as it can produce this output:
	188	.PP
	189	.Vb 1
	190	\& I scanned 1 directories.
	191	.Ve
	192	.PP
	193	So you rewrite it to read:
	194	.PP
	195	.Vb 5
	196	\& printf("I scanned %g %s.",
	197	\& $directory_count,
	198	\& $directory_count == 1 ?
	199	\& "directory" : "directories",
	200	\& );
	201	.Ve
	202	.PP
	203	\&...which does the Right Thing. (In case you don't recall, \*(L"%g\*(R" is for
	204	locale-specific number interpolation, and \*(L"%s\*(R" is for string
	205	interpolation.)
	206	.PP
	207	But you still have to localize it for all the languages you're
	208	producing this software for, so you pull Locale::gettext off of \s-1CPAN\s0
	209	so you can access the \f(CW\*(C`gettext\*(C'\fR C functions you've heard are standard
	210	for localization tasks.
	211	.PP
	212	And you write:
	213	.PP
	214	.Vb 5
	215	\& printf(gettext("I scanned %g %s."),
	216	\& $dir_scan_count,
	217	\& $dir_scan_count == 1 ?
	218	\& gettext("directory") : gettext("directories"),
	219	\& );
	220	.Ve
	221	.PP
	222	But you then read in the gettext manual (Drepper, Miller, and Pinard 1995)
	223	that this is not a good idea, since how a single word like \*(L"directory\*(R"
	224	or \*(L"directories\*(R" is translated may depend on context \*(-- and this is
	225	true, since in a case language like German or Russian, you may need
	226	these words with a different case ending in the first instance (where the
	227	word is the object of a verb) than in the second instance, which you haven't even
	228	gotten to yet (where the word is the object of a preposition, \*(L"in \f(CW%g\fR
	229	directories\*(R") \*(-- assuming these keep the same syntax when translated
	230	into those languages.
	231	.PP
	232	So, on the advice of the gettext manual, you rewrite:
	233	.PP
	234	.Vb 4
	235	\& printf( $dir_scan_count == 1 ?
	236	\& gettext("I scanned %g directory.") :
	237	\& gettext("I scanned %g directories."),
	238	\& $dir_scan_count );
	239	.Ve
	240	.PP
	241	So, you email your various translators (the boss decides that the
	242	languages du jour are Chinese, Arabic, Russian, and Italian, so you
	243	have one translator for each), asking for translations for \*(L"I scanned
	244	\&\f(CW%g\fR directory.\*(R" and \*(L"I scanned \f(CW%g\fR directories.\*(R". When they reply,
	245	you'll put that in the lexicons for gettext to use when it localizes
	246	your software, so that when the user is running under the \*(L"zh\*(R"
	247	(Chinese) locale, gettext(\*(L"I scanned \f(CW%g\fR directory.\*(R") will return the
	248	appropriate Chinese text, with a \*(L"%g\*(R" in there where printf can then
	249	interpolate \f(CW$dir_scan_count\fR.
	250	.PP
	251	Your Chinese translator emails right back \*(-- he says both of these
	252	phrases translate to the same thing in Chinese, because, in linguistic
	253	jargon, Chinese \*(L"doesn't have number as a grammatical category\*(R" \*(--
	254	whereas English does. That is, English has grammatical rules that
	255	refer to \*(L"number\*(R", i.e., whether something is grammatically singular
	256	or plural; and one of these rules is the one that forces nouns to take
	257	a plural suffix (generally \*(L"s\*(R") when in a plural context, as they are when
	258	they follow a number other than \*(L"one\*(R" (including, oddly enough, \*(L"zero\*(R").
	259	Chinese has no such rules, and so has just the one phrase where English
	260	has two. But, no problem, you can have this one Chinese phrase appear
	261	as the translation for the two English phrases in the \*(L"zh\*(R" gettext
	262	lexicon for your program.
	263	.PP
	264	Emboldened by this, you dive into the second phrase that your software
	265	needs to output: \*(L"Your query matched 10 files in 4 directories.\*(R". You notice
	266	that if you want to treat phrases as indivisible, as the gettext
	267	manual wisely advises, you need four cases now, instead of two, to
	268	cover the permutations of singular and plural on the two items,
	269	\&\f(CW$dir_count\fR and \f(CW$file_count\fR. So you try this:
	270	.PP
	271	.Vb 9
	272	\& printf( $file_count == 1 ?
	273	\& ( $directory_count == 1 ?
	274	\& gettext("Your query matched %g file in %g directory.") :
	275	\& gettext("Your query matched %g file in %g directories.") ) :
	276	\& ( $directory_count == 1 ?
	277	\& gettext("Your query matched %g files in %g directory.") :
	278	\& gettext("Your query matched %g files in %g directories.") ),
	279	\& $file_count, $directory_count,
	280	\& );
	281	.Ve
	282	.PP
	283	(The case of \*(L"1 file in 2 [or more] directories\*(R" could, I suppose,
	284	occur in the case of symlinking or something of the sort.)
	285	.PP
	286	It occurs to you that this is not the prettiest code you've ever
	287	written, but this seems the way to go. You mail off to the
	288	translators asking for translations for these four cases. The
	289	Chinese guy replies with the one phrase that these all translate to in
	290	Chinese, and that phrase has two \*(L"%g\*(R"s in it, as it should \*(-- but
	291	there's a problem. He translates it word-for-word back: \*(L"In \f(CW%g\fR
	292	directories contains \f(CW%g\fR files match your query.\*(R" The \f(CW%g\fR
	293	slots are in an order reverse to what they are in English. You wonder
	294	how you'll get gettext to handle that.
	295	.PP
	296	But you put it aside for the moment, and optimistically hope that the
	297	other translators won't have this problem, and that their languages
	298	will be better behaved \*(-- i.e., that they will be just like English.
	299	.PP
	300	But the Arabic translator is the next to write back. First off, your
	301	code for \*(L"I scanned \f(CW%g\fR directory.\*(R" or \*(L"I scanned \f(CW%g\fR directories.\*(R"
	302	assumes there's only singular or plural. But, to use linguistic
	303	jargon again, Arabic has grammatical number, like English (but unlike
	304	Chinese), but it's a three-term category: singular, dual, and plural.
	305	In other words, the way you say \*(L"directory\*(R" depends on whether there's
	306	one directory, or \fItwo\fR of them, or \fImore than two\fR of them. Your
	307	test of \f(CW\*(C`($directory_count == 1)\*(C'\fR no longer does the job. And it means
	308	that where English's grammatical category of number necessitates
	309	only the two permutations of the first sentence based on \*(L"directory
	310	[singular]\*(R" and \*(L"directories [plural]\*(R", Arabic has three \*(-- and,
	311	worse, in the second sentence (\*(L"Your query matched \f(CW%g\fR file in \f(CW%g\fR
	312	directory.\*(R"), where English has four, Arabic has nine. You sense
	313	an unwelcome, exponential trend taking shape.
	314	.PP
	315	Your Italian translator emails you back and says that \*(L"I searched 0
	316	directories\*(R" (a possible English output of your program) is stilted,
	317	and if you think that's fine English, that's your problem, but that
	318	\&\fIjust will not do\fR in the language of Dante. He insists that where
	319	\&\f(CW$directory_count\fR is 0, your program should produce the Italian text
	320	for \*(L"I \fIdidn't\fR scan \fIany\fR directories.\*(R". And ditto for \*(L"I didn't
	321	match any files in any directories\*(R", although he says the last part
	322	about \*(L"in any directories\*(R" should probably just be left off.
	323	.PP
	324	You wonder how you'll get gettext to handle this; to accommodate the
	325	ways Arabic, Chinese, and Italian deal with numbers in just these few
	326	very simple phrases, you need to write code that will ask gettext for
	327	different queries depending on whether the numerical values in
	328	question are 1, 2, more than 2, or in some cases 0, and you still haven't
	329	figured out the problem with the different word order in Chinese.
	330	.PP
	331	Then your Russian translator calls on the phone, to \fIpersonally\fR tell
	332	you the bad news about how really unpleasant your life is about to
	333	become:
	334	.PP
	335	Russian, like German or Latin, is an inflectional language; that is, nouns
	336	and adjectives have to take endings that depend on their case
	337	(i.e., nominative, accusative, genitive, etc...) \*(-- which is roughly a matter of
	338	what role they have in syntax of the sentence \*(--
	339	as well as on the grammatical gender (i.e., masculine, feminine, neuter)
	340	and number (i.e., singular or plural) of the noun, as well as on the
	341	declension class of the noun. But unlike with most other inflected languages,
	342	putting a number-phrase (like \*(L"ten\*(R" or \*(L"forty\-three\*(R", or their Arabic
	343	numeral equivalents) in front of a noun in Russian can change the case and
	344	number that noun is, and therefore the endings you have to put on it.
	345	.PP
	346	He elaborates: In \*(L"I scanned \f(CW%g\fR directories\*(R", you'd \fIexpect\fR
	347	\&\*(L"directories\*(R" to be in the accusative case (since it is the direct
	348	object in the sentence) and the plural number,
	349	except where \f(CW$directory_count\fR is 1, then you'd expect the singular, of
	350	course. Just like Latin or German. \fIBut!\fR Where \f(CW$directory_count\fR %
	351	10 is 1 (\*(L"%\*(R" for modulo, remember), assuming \f(CW$directory_count\fR is an
	352	integer, and except where \f(CW$directory_count\fR % 100 is 11, \*(L"directories\*(R"
	353	is forced to become grammatically singular, which means it gets the
	354	ending for the accusative singular... You begin to visualize the code
	355	it'd take to test for the problem so far, \fIand still work for Chinese
	356	and Arabic and Italian\fR, and how many gettext items that'd take, but
	357	he keeps going... But where \f(CW$directory_count\fR % 10 is 2, 3, or 4
	358	(except where \f(CW$directory_count\fR % 100 is 12, 13, or 14), the word for
	359	\&\*(L"directories\*(R" is forced to be genitive singular \*(-- which means another
	360	ending... The room begins to spin around you, slowly at first... But
	361	with \fIall other\fR integer values, since \*(L"directory\*(R" is an inanimate
	362	noun, when preceded by a number and in the nominative or accusative
	363	cases (as it is here, just your luck!), it does stay plural, but it is
	364	forced into the genitive case \*(-- yet another ending... And
	365	you never hear him get to the part about how you're going to run into
	366	similar (but maybe subtly different) problems with other Slavic
	367	languages like Polish, because the floor comes up to meet you, and you
	368	fade into unconsciousness.
	369	.PP
	370	The above cautionary tale relates how an attempt at localization can
	371	lead from programmer consternation, to program obfuscation, to a need
	372	for sedation. But careful evaluation shows that your choice of tools
	373	merely needed further consideration.
	374	.Sh "The Linguistic View"
	375	.IX Subsection "The Linguistic View"
	376	.RS 4
	377	\&\*(L"It is more complicated than you think.\*(R"
	378	.Sp
	379	\&\-\- The Eighth Networking Truth, from \s-1RFC\s0 1925
	380	.RE
	381	.PP
	382	The field of Linguistics has expended a great deal of effort over the
	383	past century trying to find grammatical patterns which hold across
	384	languages; it's been a constant process
	385	of people making generalizations that should apply to all languages,
	386	only to find out that, all too often, these generalizations fail \*(--
	387	sometimes failing for just a few languages, sometimes whole classes of
	388	languages, and sometimes nearly every language in the world except
	389	English. Broad statistical trends are evident in what the \*(L"average
	390	language\*(R" is like as far as what its rules can look like, must look
	391	like, and cannot look like. But the \*(L"average language\*(R" is just as
	392	unreal a concept as the \*(L"average person\*(R" \*(-- it runs up against the
	393	fact no language (or person) is, in fact, average. The wisdom of past
	394	experience leads us to believe that any given language can do whatever
	395	it wants, in any order, with appeal to any kind of grammatical
	396	categories it wants \*(-- case, number, tense, real or metaphoric
	397	characteristics of the things that words refer to, arbitrary or
	398	predictable classifications of words based on what endings or prefixes
	399	they can take, degree or means of certainty about the truth of
	400	statements expressed, and so on, ad infinitum.
	401	.PP
	402	Mercifully, most localization tasks are a matter of finding ways to
	403	translate whole phrases, generally sentences, where the context is
	404	relatively set, and where the only variation in content is \fIusually\fR
	405	in a number being expressed \*(-- as in the example sentences above.
	406	Translating specific, fully-formed sentences is, in practice, fairly
	407	foolproof \*(-- which is good, because that's what's in the phrasebooks
	408	that so many tourists rely on. Now, a given phrase (whether in a
	409	phrasebook or in a gettext lexicon) in one language \fImight\fR have a
	410	greater or lesser applicability than that phrase's translation into
	411	another language \*(-- for example, strictly speaking, in Arabic, the
	412	\&\*(L"your\*(R" in \*(L"Your query matched...\*(R" would take a different form
	413	depending on whether the user is male or female; so the Arabic
	414	translation \*(L"your[feminine] query\*(R" is applicable in fewer cases than
	415	the corresponding English phrase, which doesn't distinguish the user's
	416	gender. (In practice, it's not feasible to have a program know the
	417	user's gender, so the masculine \*(L"you\*(R" in Arabic is usually used, by
	418	default.)
	419	.PP
	420	But in general, such surprises are rare when entire sentences are
	421	being translated, especially when the functional context is restricted
	422	to that of a computer interacting with a user either to convey a fact
	423	or to prompt for a piece of information. So, for purposes of
	424	localization, translation by phrase (generally by sentence) is both the
	425	simplest and the least problematic.
	426	.Sh "Breaking gettext"
	427	.IX Subsection "Breaking gettext"
	428	.RS 4
	429	\&\*(L"It Has To Work.\*(R"
	430	.Sp
	431	\&\-\- First Networking Truth, \s-1RFC\s0 1925
	432	.RE
	433	.PP
	434	Consider that sentences in a tourist phrasebook are of two types: ones
	435	like \*(L"How do I get to the marketplace?\*(R" that don't have any blanks to
	436	fill in, and ones like \*(L"How much do these _\|__ cost?\*(R", where there's
	437	one or more blanks to fill in (and these are usually linked to a
	438	list of words that you can put in that blank: \*(L"fish\*(R", \*(L"potatoes\*(R",
	439	\&\*(L"tomatoes\*(R", etc.) The ones with no blanks are no problem, but the
	440	fill-in-the-blank ones may not be really straightforward. If it's a
	441	Swahili phrasebook, for example, the authors probably didn't bother to
	442	tell you the complicated ways that the verb \*(L"cost\*(R" changes its
	443	inflectional prefix depending on the noun you're putting in the blank.
	444	The trader in the marketplace will still understand what you're saying if
	445	you say \*(L"how much do these potatoes cost?\*(R" with the wrong
	446	inflectional prefix on \*(L"cost\*(R". After all, \fIyou\fR can't speak proper Swahili,
	447	\&\fIyou're\fR just a tourist. But while tourists can be stupid, computers
	448	are supposed to be smart; the computer should be able to fill in the
	449	blank, and still have the results be grammatical.
	450	.PP
	451	In other words, a phrasebook entry takes some values as parameters
	452	(the things that you fill in the blank or blanks), and provides a value
	453	based on these parameters, where the way you get that final value from
	454	the given values can, properly speaking, involve an arbitrarily
	455	complex series of operations. (In the case of Chinese, it'd be not at
	456	all complex, at least in cases like the examples at the beginning of
	457	this article; whereas in the case of Russian it'd be a rather complex
	458	series of operations. And in some languages, the
	459	complexity could be spread around differently: while the act of
	460	putting a number-expression in front of a noun phrase might not be
	461	complex by itself, it may change how you have to, for example, inflect
	462	a verb elsewhere in the sentence. This is what in syntax is called
	463	\&\*(L"long\-distance dependencies\*(R".)
	464	.PP
	465	This talk of parameters and arbitrary complexity is just another way
	466	to say that an entry in a phrasebook is what in a programming language
	467	would be called a \*(L"function\*(R". Just so you don't miss it, this is the
	468	crux of this article: \fIA phrase is a function; a phrasebook is a
	469	bunch of functions.\fR
	470	.PP
	471	The reason that using gettext runs into walls (as in the above
	472	second-person horror story) is that you're trying to use a string (or
	473	worse, a choice among a bunch of strings) to do what you really need a
	474	function for \*(-- which is futile. Performing (s)printf interpolation
	475	on the strings which you get back from gettext does allow you to do \fIsome\fR
	476	common things passably well... sometimes... sort of; but, to paraphrase
	477	what some people say about \f(CW\*(C`csh\*(C'\fR script programming, \*(L"it fools you
	478	into thinking you can use it for real things, but you can't, and you
	479	don't discover this until you've already spent too much time trying,
	480	and by then it's too late.\*(R"
	481	.Sh "Replacing gettext"
	482	.IX Subsection "Replacing gettext"
	483	So, what needs to replace gettext is a system that supports lexicons
	484	of functions instead of lexicons of strings. An entry in a lexicon
	485	from such a system should \fInot\fR look like this:
	486	.PP
	487	.Vb 1
	488	\& "J'ai trouv\exE9 %g fichiers dans %g r\exE9pertoires"
	489	.Ve
	490	.PP
	491	[\exE9 is e\-acute in Latin\-1. Some pod renderers would
	492	scream if I used the actual character here. \*(-- \s-1SB\s0]
	493	.PP
	494	but instead like this, bearing in mind that this is just a first stab:
	495	.PP
	496	.Vb 8
	497	\& sub I_found_X1_files_in_X2_directories {
	498	\& my( $files, $dirs ) = @_[0,1];
	499	\& $files = sprintf("%g %s", $files,
	500	\& $files == 1 ? 'fichier' : 'fichiers');
	501	\& $dirs = sprintf("%g %s", $dirs,
	502	\& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires");
	503	\& return "J'ai trouv\exE9 $files dans $dirs.";
	504	\& }
	505	.Ve
	506	.PP
	507	Now, there's no particularly obvious way to store anything but strings
	508	in a gettext lexicon; so it looks like we just have to start over and
	509	make something better, from scratch. I call my shot at a
	510	gettext-replacement system \*(L"Maketext\*(R", or, in \s-1CPAN\s0 terms,
	511	Locale::Maketext.
	512	.PP
	513	When designing Maketext, I chose to plan its main features in terms of
	514	\&\*(L"buzzword compliance\*(R". And here are the buzzwords:
	515	.Sh "Buzzwords: Abstraction and Encapsulation"
	516	.IX Subsection "Buzzwords: Abstraction and Encapsulation"
	517	The complexity of the language you're trying to output a phrase in is
	518	entirely abstracted inside (and encapsulated within) the Maketext module
	519	for that interface. When you call:
	520	.PP
	521	.Vb 2
	522	\& print $lang->maketext("You have [quant,_1,piece] of new mail.",
	523	\& scalar(@messages));
	524	.Ve
	525	.PP
	526	you don't know (and in fact can't easily find out) whether this will
	527	involve lots of figuring, as in Russian (if \f(CW$lang\fR is a handle to the
	528	Russian module), or relatively little, as in Chinese. That kind of
	529	abstraction and encapsulation may encourage other pleasant buzzwords
	530	like modularization and stratification, depending on what design
	531	decisions you make.
	532	.Sh "Buzzword: Isomorphism"
	533	.IX Subsection "Buzzword: Isomorphism"
	534	\&\*(L"Isomorphism\*(R" means \*(L"having the same structure or form\*(R"; in discussions
	535	of program design, the word takes on the special, specific meaning that
	536	your implementation of a solution to a problem \fIhas the same
	537	structure\fR as, say, an informal verbal description of the solution, or
	538	maybe of the problem itself. Isomorphism is, all things considered,
	539	a good thing \*(-- it's what problem-solving (and solution\-implementing)
	540	should look like.
	541	.PP
	542	What's wrong with gettext-using code like this...
	543	.PP
	544	.Vb 9
	545	\& printf( $file_count == 1 ?
	546	\& ( $directory_count == 1 ?
	547	\& "Your query matched %g file in %g directory." :
	548	\& "Your query matched %g file in %g directories." ) :
	549	\& ( $directory_count == 1 ?
	550	\& "Your query matched %g files in %g directory." :
	551	\& "Your query matched %g files in %g directories." ),
	552	\& $file_count, $directory_count,
	553	\& );
	554	.Ve
	555	.PP
	556	is first off that it's not well abstracted \*(-- these ways of testing
	557	for grammatical number (as in the expressions like \f(CW\*(C`foo == 1 ?
	558	singular_form : plural_form\*(C'\fR) should be abstracted to each language
	559	module, since how you get grammatical number is language\-specific.
	560	.PP
	561	But second off, it's not isomorphic \*(-- the \*(L"solution\*(R" (i.e., the
	562	phrasebook entries) for Chinese maps from these four English phrases to
	563	the one Chinese phrase that fits for all of them. In other words, the
	564	informal solution would be \*(L"The way to say what you want in Chinese is
	565	with the one phrase 'For your question, in Y directories you would
	566	find X files'\*(R" \*(-- and so the implemented solution should be,
	567	isomorphically, just a straightforward way to spit out that one
	568	phrase, with numerals properly interpolated. It shouldn't have to map
	569	from the complexity of other languages to the simplicity of this one.
	570	.Sh "Buzzword: Inheritance"
	571	.IX Subsection "Buzzword: Inheritance"
	572	There's a great deal of reuse possible for sharing of phrases between
	573	modules for related dialects, or for sharing of auxiliary functions
	574	between related languages. (By \*(L"auxiliary functions\*(R", I mean
	575	functions that don't produce phrase\-text, but which, say, return an
	576	answer to \*(L"does this number require a plural noun after it?\*(R". Such
	577	auxiliary functions would be used in the internal logic of functions
	578	that actually do produce phrase\-text.)
	579	.PP
	580	In the case of sharing phrases, consider that you have an interface
	581	already localized for American English (probably by having been
	582	written with that as the native locale, but that's incidental).
	583	Localizing it for \s-1UK\s0 English should, in practical terms, be just a
	584	matter of running it past a British person with the instructions to
	585	indicate what few phrases would benefit from a change in spelling or
	586	possibly minor rewording. In that case, you should be able to put in
	587	the \s-1UK\s0 English localization module \fIonly\fR those phrases that are
	588	UK\-specific, and for all the rest, \fIinherit\fR from the American
	589	English module. (And I expect this same situation would apply with
	590	Brazilian and Continental Portuguese, possibly with some \fIvery\fR
	591	closely related languages like Czech and Slovak, and possibly with the
	592	slightly different \*(L"versions\*(R" of written Mandarin Chinese, as I hear exist in
	593	Taiwan and mainland China.)
	594	.PP
	595	As to sharing of auxiliary functions, consider the problem of Russian
	596	numbers from the beginning of this article; obviously, you'd want to
	597	write only once the hairy code that, given a numeric value, would
	598	return some specification of which case and number a given quantified
	599	noun should use. But suppose that you discover, while localizing an
	600	interface for, say, Ukrainian (a Slavic language related to Russian,
	601	spoken by several million people, many of whom would be relieved to
	602	find that your Web site's or software's interface is available in
	603	their language), that the rules in Ukrainian are the same as in Russian
	604	for quantification, and probably for many other grammatical functions.
	605	While there may well be no phrases in common between Russian and
	606	Ukrainian, you could still choose to have the Ukrainian module inherit
	607	from the Russian module, just for the sake of inheriting all the
	608	various grammatical methods. Or, probably better organizationally,
	609	you could move those functions to a module called \f(CW\*(C`_E_Slavic\*(C'\fR or
	610	something, which Russian and Ukrainian could inherit useful functions
	611	from, but which would (presumably) provide no lexicon.
	612	.Sh "Buzzword: Concision"
	613	.IX Subsection "Buzzword: Concision"
	614	Okay, concision isn't a buzzword. But it should be, so I decree that
	615	as a new buzzword, \*(L"concision\*(R" means that simple common things should
	616	be expressible in very few lines (or maybe even just a few characters)
	617	of code \*(-- call it a special case of \*(L"making simple things easy and
	618	hard things possible\*(R", and see also the role it played in the
	619	MIDI::Simple language, discussed elsewhere in this issue [TPJ#13].
	620	.PP
	621	Consider our first stab at an entry in our \*(L"phrasebook of functions\*(R":
	622	.PP
	623	.Vb 8
	624	\& sub I_found_X1_files_in_X2_directories {
	625	\& my( $files, $dirs ) = @_[0,1];
	626	\& $files = sprintf("%g %s", $files,
	627	\& $files == 1 ? 'fichier' : 'fichiers');
	628	\& $dirs = sprintf("%g %s", $dirs,
	629	\& $dirs == 1 ? "r\exE9pertoire" : "r\exE9pertoires");
	630	\& return "J'ai trouv\exE9 $files dans $dirs.";
	631	\& }
	632	.Ve
	633	.PP
	634	You may sense that a lexicon (to use a non-committal catch-all term for a
	635	collection of things you know how to say, regardless of whether they're
	636	phrases or words) consisting of functions \fIexpressed\fR as above would
	637	make for rather long-winded and repetitive code \*(-- even if you wisely
	638	rewrote this to have quantification (as we call adding a number
	639	expression to a noun phrase) be a function called like:
	640	.PP
	641	.Vb 6
	642	\& sub I_found_X1_files_in_X2_directories {
	643	\& my( $files, $dirs ) = @_[0,1];
	644	\& $files = quant($files, "fichier");
	645	\& $dirs = quant($dirs, "r\exE9pertoire");
	646	\& return "J'ai trouv\exE9 $files dans $dirs.";
	647	\& }
	648	.Ve
	649	.PP
	650	And you may also sense that you do not want to bother your translators
	651	with having to write Perl code \*(-- you'd much rather that they spend
	652	their \fIvery costly time\fR on just translation. And this is to say
	653	nothing of the near impossibility of finding a commercial translator
	654	who would know even simple Perl.
	655	.PP
	656	In a first-hack implementation of Maketext, each language\-module's
	657	lexicon looked like this:
	658	.PP
	659	.Vb 10
	660	\& %Lexicon = (
	661	\& "I found %g files in %g directories"
	662	\& => sub {
	663	\& my( $files, $dirs ) = @_[0,1];
	664	\& $files = quant($files, "fichier");
	665	\& $dirs = quant($dirs, "r\exE9pertoire");
	666	\& return "J'ai trouv\exE9 $files dans $dirs.";
	667	\& },
	668	\& ... and so on with other phrase => sub mappings ...
	669	\& );
	670	.Ve
	671	.PP
	672	but I immediately went looking for some more concise way to basically
	673	denote the same phrase-function \*(-- a way that would also serve to
	674	concisely denote \fImost\fR phrase-functions in the lexicon for \fImost\fR
	675	languages. After much time and even some actual thought, I decided on
	676	this system:
	677	.PP
	678	* Where a value in a \f(CW%Lexicon\fR hash is a contentful string instead of
	679	an anonymous sub (or, conceivably, a coderef), it would be interpreted
	680	as a sort of shorthand expression of what the sub does. When accessed
	681	for the first time in a session, it is parsed, turned into Perl code,
	682	and then eval'd into an anonymous sub; then that sub replaces the
	683	original string in that lexicon. (That way, the work of parsing and
	684	evaling the shorthand form for a given phrase is done no more than
	685	once per session.)
	686	.PP
	687	* Calls to \f(CW\*(C`maketext\*(C'\fR (as Maketext's main function is called) happen
	688	thru a \*(L"language session handle\*(R", notionally very much like an \s-1IO\s0
	689	handle, in that you open one at the start of the session, and use it
	690	for \*(L"sending signals\*(R" to an object in order to have it return the text
	691	you want.
	692	.PP
	693	So, this:
	694	.PP
	695	.Vb 2
	696	\& $lang->maketext("You have [quant,_1,piece] of new mail.",
	697	\& scalar(@messages));
	698	.Ve
	699	.PP
	700	basically means this: look in the lexicon for \f(CW$lang\fR (which may inherit
	701	from any number of other lexicons), and find the function that we
	702	happen to associate with the string \*(L"You have [quant,_1,piece] of new
	703	mail\*(R" (which is, and should be, a functioning \*(L"shorthand\*(R" for this
	704	function in the native locale \*(-- English in this case). If you find
	705	such a function, call it with \f(CW$lang\fR as its first parameter (as if it
	706	were a method), and then a copy of scalar(@messages) as its second,
	707	and then return that value. If that function was found, but was in
	708	string shorthand instead of being a fully specified function, parse it
	709	and make it into a function before calling it the first time.
	710	.PP
	711	* The shorthand uses code in brackets to indicate method calls that
	712	should be performed. A full explanation is not in order here, but a
	713	few examples will suffice:
	714	.PP
	715	.Vb 1
	716	\& "You have [quant,_1,piece] of new mail."
	717	.Ve
	718	.PP
	719	The above code is shorthand for, and will be interpreted as,
	720	this:
	721	.PP
	722	.Vb 8
	723	\& sub {
	724	\& my $handle = $_[0];
	725	\& my(@params) = @_;
	726	\& return join '',
	727	\& "You have ",
	728	\& $handle->quant($params[1], 'piece'),
	729	\& " of new mail.";
	730	\& }
	731	.Ve
	732	.PP
	733	where \*(L"quant\*(R" is the name of a method you're using to quantify the
	734	noun \*(L"piece\*(R" with the number \f(CW$params\fR[1].
	735	.PP
	736	A string with no brackety calls, like this:
	737	.PP
	738	.Vb 1
	739	\& "Your search expression was malformed."
	740	.Ve
	741	.PP
	742	is somewhat of a degenerate case, and just gets turned into:
	743	.PP
	744	.Vb 1
	745	\& sub { return "Your search expression was malformed." }
	746	.Ve
	747	.PP
	748	However, not everything you can write in Perl code can be written in
	749	the above shorthand system \*(-- not by a long shot. For example, consider
	750	the Italian translator from the beginning of this article, who wanted
	751	the Italian for \*(L"I didn't find any files\*(R" as a special case, instead
	752	of \*(L"I found 0 files\*(R". That couldn't be specified (at least not easily
	753	or simply) in our shorthand system, and it would have to be written
	754	out in full, like this:
	755	.PP
	756	.Vb 10
	757	\& sub { # pretend the English strings are in Italian
	758	\& my($handle, $files, $dirs) = @_[0,1,2];
	759	\& return "I didn't find any files" unless $files;
	760	\& return join '',
	761	\& "I found ",
	762	\& $handle->quant($files, 'file'),
	763	\& " in ",
	764	\& $handle->quant($dirs, 'directory'),
	765	\& ".";
	766	\& }
	767	.Ve
	768	.PP
	769	Next to a lexicon full of shorthand code, that sort of sticks out like a
	770	sore thumb \*(-- but this \fIis\fR a special case, after all; and at least
	771	it's possible, if not as concise as usual.
	772	.PP
	773	As to how you'd implement the Russian example from the beginning of
	774	the article, well, There's More Than One Way To Do It, but it could be
	775	something like this (using English words for Russian, just so you know
	776	what's going on):
	777	.PP
	778	.Vb 1
	779	\& "I [quant,_1,directory,accusative] scanned."
	780	.Ve
	781	.PP
	782	This shifts the burden of complexity off to the quant method. That
	783	method's parameters are: the numeric value it's going to use to
	784	quantify something; the Russian word it's going to quantify; and the
	785	parameter \*(L"accusative\*(R", which you're using to mean that this
	786	sentence's syntax wants a noun in the accusative case there, although
	787	that quantification method may have to overrule, for grammatical
	788	reasons you may recall from the beginning of this article.
	789	.PP
	790	Now, the Russian quant method here is responsible not only for
	791	implementing the strange logic necessary for figuring out how Russian
	792	number-phrases impose case and number on their noun\-phrases, but also
	793	for inflecting the Russian word for \*(L"directory\*(R". How that inflection
	794	is to be carried out is no small issue, and among the solutions I've
	795	seen, some (like variations on a simple lookup in a hash where all
	796	possible forms are provided for all necessary words) are
	797	straightforward but \fIcan\fR become cumbersome when you need to inflect
	798	more than a few dozen words; and other solutions (like using
	799	algorithms to model the inflections, storing only root forms and
	800	irregularities) \fIcan\fR involve more overhead than is justifiable for
	801	all but the largest lexicons.
	802	.PP
	803	Mercifully, this design decision becomes crucial only in the hairiest
	804	of inflected languages, of which Russian is by no means the \fIworst\fR case
	805	scenario, but is worse than most. Most languages have simpler
	806	inflection systems; for example, in English or Swahili, there are
	807	generally no more than two possible inflected forms for a given noun
	808	(\*(L"error/errors\*(R"; \*(L"kosa/makosa\*(R"), and the
	809	rules for producing these forms are fairly simple \*(-- or at least,
	810	simple rules can be formulated that work for most words, and you can
	811	then treat the exceptions as just \*(L"irregular\*(R", at least relative to
	812	your ad hoc rules. A simpler inflection system (simpler rules, fewer
	813	forms) means that design decisions are less crucial to maintaining
	814	sanity, whereas the same decisions could incur
	815	overhead-versus-scalability problems in languages like Russian. It
	816	may \fIalso\fR be likely that code (possibly in Perl, as with
	817	Lingua::EN::Inflect, for English nouns) has already
	818	been written for the language in question, whether simple or complex.
	819	.PP
	820	Moreover, a third possibility may even be simpler than anything
	821	discussed above: \*(L"Just require that all possible (or at least
	822	applicable) forms be provided in the call to the given language's quant
	823	method, as in:\*(R"
	824	.PP
	825	.Vb 1
	826	\& "I found [quant,_1,file,files]."
	827	.Ve
	828	.PP
	829	That way, quant just has to choose which form it needs, without having
	830	to look up or generate anything. While possibly not optimal for
	831	Russian, this should work well for most other languages, where
	832	quantification is not as complicated an operation.
	833	.Sh "The Devil in the Details"
	834	.IX Subsection "The Devil in the Details"
	835	There's plenty more to Maketext than described above \*(-- for example,
	836	there's the details of how language tags (\*(L"en\-US\*(R", \*(L"i\-pwn\*(R", \*(L"fi\*(R",
	837	etc.) or locale IDs (\*(L"en_US\*(R") interact with actual module naming
	838	(\*(L"BogoQuery/Locale/en_us.pm\*(R"), and what magic can ensue; there's the
	839	details of how to record (and possibly negotiate) what character
	840	encoding Maketext will return text in (\s-1UTF8\s0? Latin\-1? \s-1KOI8\s0?). There's
	841	the interesting fact that Maketext is for localization, but nowhere
	842	actually has a "\f(CW\*(C`use locale;\*(C'\fR" anywhere in it. For the curious,
	843	there's the somewhat frightening details of how I actually
	844	implement something like data inheritance so that searches across
	845	modules' \f(CW%Lexicon\fR hashes can parallel how Perl implements method
	846	inheritance.
	847	.PP
	848	And, most importantly, there's all the practical details of how to
	849	actually go about deriving from Maketext so you can use it for your
	850	interfaces, and the various tools and conventions for starting out and
	851	maintaining individual language modules.
	852	.PP
	853	That is all covered in the documentation for Locale::Maketext and the
	854	modules that come with it, available in \s-1CPAN\s0. After having read this
	855	article, which covers the why's of Maketext, the documentation,
	856	which covers the how's of it, should be quite straightforward.
	857	.Sh "The Proof in the Pudding: Localizing Web Sites"
	858	.IX Subsection "The Proof in the Pudding: Localizing Web Sites"
	859	Maketext and gettext have a notable difference: gettext is in C,
	860	accessible thru C library calls, whereas Maketext is in Perl, and
	861	really can't work without a Perl interpreter (although I suppose
	862	something like it could be written for C). Accidents of history (and
	863	not necessarily lucky ones) have made \*(C+ the most common language for
	864	the implementation of applications like word processors, Web browsers,
	865	and even many in-house applications like custom query systems. Current
	866	conditions make it somewhat unlikely that the next one of any of these
	867	kinds of applications will be written in Perl, albeit clearly more for
	868	reasons of custom and inertia than out of consideration of what is the
	869	right tool for the job.
	870	.PP
	871	However, other accidents of history have made Perl a well-accepted
	872	language for design of server-side programs (generally in \s-1CGI\s0 form)
	873	for Web site interfaces. Localization of static pages in Web sites is
	874	trivial, feasible either with simple language-negotiation features in
	875	servers like Apache, or with some kind of server-side inclusions of
	876	language-appropriate text into layout templates. However, I think
	877	that the localization of Perl-based search systems (or other kinds of
	878	dynamic content) in Web sites, be they public or access\-restricted,
	879	is where Maketext will see the greatest use.
	880	.PP
	881	I presume that it would be only the exceptional Web site that gets
	882	localized for English \fIand\fR Chinese \fIand\fR Italian \fIand\fR Arabic
	883	\&\fIand\fR Russian, to recall the languages from the beginning of this
	884	article \*(-- to say nothing of German, Spanish, French, Japanese,
	885	Finnish, and Hindi, to name a few languages that benefit from large
	886	numbers of programmers or Web viewers or both.
	887	.PP
	888	However, the ever-increasing internationalization of the Web (whether
	889	measured in terms of amount of content, of numbers of content writers
	890	or programmers, or of size of content audiences) makes it increasingly
	891	likely that the interface to the average Web-based dynamic content
	892	service will be localized for two or maybe three languages. It is my
	893	hope that Maketext will make that task as simple as possible, and will
	894	remove previous barriers to localization for languages dissimilar to
	895	English.
	896	.PP
	897	.Vb 1
	898	\& __END__
	899	.Ve
	900	.PP
	901	Sean M. Burke (sburke@cpan.org) has a Master's in linguistics
	902	from Northwestern University; he specializes in language technology.
	903	Jordan Lachler (lachler@unm.edu) is a PhD student in the Department of
	904	Linguistics at the University of New Mexico; he specializes in
	905	morphology and pedagogy of North American native languages.
	906	.Sh "References"
	907	.IX Subsection "References"
	908	Alvestrand, Harald Tveit. 1995. \fI\s-1RFC\s0 1766: Tags for the
	909	Identification of Languages.\fR
	910	\&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1766.txt\*(C'\fR
	911	[Now see \s-1RFC\s0 3066.]
	912	.PP
	913	Callon, Ross, editor. 1996. \fI\s-1RFC\s0 1925: The Twelve
	914	Networking Truths.\fR
	915	\&\f(CW\*(C`ftp://ftp.isi.edu/in\-notes/rfc1925.txt\*(C'\fR
	916	.PP
	917	Drepper, Ulrich, Peter Miller,
	918	and Franc\*,ois Pinard. 1995\-2001. \s-1GNU\s0
	919	\&\f(CW\*(C`gettext\*(C'\fR. Available in \f(CW\*(C`ftp://prep.ai.mit.edu/pub/gnu/\*(C'\fR, with
	920	extensive docs in the distribution tarball. [Since
	921	I wrote this article in 1998, I now see that the
	922	gettext docs are now trying more to come to terms with
	923	plurality. Whether useful conclusions have come from it
	924	is another question altogether. \*(-- \s-1SMB\s0, May 2001]
	925	.PP
	926	Forbes, Nevill. 1964. \fIRussian Grammar.\fR Third Edition, revised
	927	by J. C. Dumbreck. Oxford University Press.