Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLLOCALE 1" | |
132 | .TH PERLLOCALE 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perllocale \- Perl locale handling (internationalization and localization) | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | Perl supports language-specific notions of data such as \*(L"is this | |
138 | a letter\*(R", \*(L"what is the uppercase equivalent of this letter\*(R", and | |
139 | \&\*(L"which of these letters comes first\*(R". These are important issues, | |
140 | especially for languages other than English\*(--but also for English: it | |
141 | would be nai\*:ve to imagine that \f(CW\*(C`A\-Za\-z\*(C'\fR defines all the \*(L"letters\*(R" | |
142 | needed to write in English. Perl is also aware that some character other | |
143 | than '.' may be preferred as a decimal point, and that output date | |
144 | representations may be language\-specific. The process of making an | |
145 | application take account of its users' preferences in such matters is | |
146 | called \fBinternationalization\fR (often abbreviated as \fBi18n\fR); telling | |
147 | such an application about a particular set of preferences is known as | |
148 | \&\fBlocalization\fR (\fBl10n\fR). | |
149 | .PP | |
150 | Perl can understand language-specific data via the standardized (\s-1ISO\s0 C, | |
151 | \&\s-1XPG4\s0, \s-1POSIX\s0 1.c) method called \*(L"the locale system\*(R". The locale system is | |
152 | controlled per application using one pragma, one function call, and | |
153 | several environment variables. | |
154 | .PP | |
155 | \&\fB\s-1NOTE\s0\fR: This feature is new in Perl 5.004, and does not apply unless an | |
156 | application specifically requests it\*(--see \*(L"Backward compatibility\*(R". | |
157 | The one exception is that \fIwrite()\fR now \fBalways\fR uses the current locale | |
158 | \&\- see \*(L"\s-1NOTES\s0\*(R". | |
159 | .SH "PREPARING TO USE LOCALES" | |
160 | .IX Header "PREPARING TO USE LOCALES" | |
161 | If Perl applications are to understand and present your data | |
162 | correctly according to a locale of your choice, \fBall\fR of the following | |
163 | must be true: | |
164 | .IP "\(bu" 4 | |
165 | \&\fBYour operating system must support the locale system\fR. If it does, | |
166 | you should find that the \fIsetlocale()\fR function is a documented part of | |
167 | its C library. | |
168 | .IP "\(bu" 4 | |
169 | \&\fBDefinitions for locales that you use must be installed\fR. You, or | |
170 | your system administrator, must make sure that this is the case. The | |
171 | available locales, the location in which they are kept, and the manner | |
172 | in which they are installed all vary from system to system. Some systems | |
173 | provide only a few, hard-wired locales and do not allow more to be | |
174 | added. Others allow you to add \*(L"canned\*(R" locales provided by the system | |
175 | supplier. Still others allow you or the system administrator to define | |
176 | and add arbitrary locales. (You may have to ask your supplier to | |
177 | provide canned locales that are not delivered with your operating | |
178 | system.) Read your system documentation for further illumination. | |
179 | .IP "\(bu" 4 | |
180 | \&\fBPerl must believe that the locale system is supported\fR. If it does, | |
181 | \&\f(CW\*(C`perl \-V:d_setlocale\*(C'\fR will say that the value for \f(CW\*(C`d_setlocale\*(C'\fR is | |
182 | \&\f(CW\*(C`define\*(C'\fR. | |
183 | .PP | |
184 | If you want a Perl application to process and present your data | |
185 | according to a particular locale, the application code should include | |
186 | the \f(CW\*(C`use\ locale\*(C'\fR pragma (see \*(L"The use locale pragma\*(R") where | |
187 | appropriate, and \fBat least one\fR of the following must be true: | |
188 | .IP "\(bu" 4 | |
189 | \&\fBThe locale-determining environment variables (see \*(L"\s-1ENVIRONMENT\s0\*(R") | |
190 | must be correctly set up\fR at the time the application is started, either | |
191 | by yourself or by whoever set up your system account. | |
192 | .IP "\(bu" 4 | |
193 | \&\fBThe application must set its own locale\fR using the method described in | |
194 | \&\*(L"The setlocale function\*(R". | |
195 | .SH "USING LOCALES" | |
196 | .IX Header "USING LOCALES" | |
197 | .Sh "The use locale pragma" | |
198 | .IX Subsection "The use locale pragma" | |
199 | By default, Perl ignores the current locale. The \f(CW\*(C`use\ locale\*(C'\fR | |
200 | pragma tells Perl to use the current locale for some operations: | |
201 | .IP "\(bu" 4 | |
202 | \&\fBThe comparison operators\fR (\f(CW\*(C`lt\*(C'\fR, \f(CW\*(C`le\*(C'\fR, \f(CW\*(C`cmp\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, and \f(CW\*(C`gt\*(C'\fR) and | |
203 | the \s-1POSIX\s0 string collation functions \fIstrcoll()\fR and \fIstrxfrm()\fR use | |
204 | \&\f(CW\*(C`LC_COLLATE\*(C'\fR. \fIsort()\fR is also affected if used without an | |
205 | explicit comparison function, because it uses \f(CW\*(C`cmp\*(C'\fR by default. | |
206 | .Sp | |
207 | \&\fBNote:\fR \f(CW\*(C`eq\*(C'\fR and \f(CW\*(C`ne\*(C'\fR are unaffected by locale: they always | |
208 | perform a char-by-char comparison of their scalar operands. What's | |
209 | more, if \f(CW\*(C`cmp\*(C'\fR finds that its operands are equal according to the | |
210 | collation sequence specified by the current locale, it goes on to | |
211 | perform a char-by-char comparison, and only returns \fI0\fR (equal) if the | |
212 | operands are char-for-char identical. If you really want to know whether | |
213 | two strings\*(--which \f(CW\*(C`eq\*(C'\fR and \f(CW\*(C`cmp\*(C'\fR may consider different\*(--are equal | |
214 | as far as collation in the locale is concerned, see the discussion in | |
215 | \&\*(L"Category \s-1LC_COLLATE:\s0 Collation\*(R". | |
216 | .IP "\(bu" 4 | |
217 | \&\fBRegular expressions and case-modification functions\fR (\fIuc()\fR, \fIlc()\fR, | |
218 | \&\fIucfirst()\fR, and \fIlcfirst()\fR) use \f(CW\*(C`LC_CTYPE\*(C'\fR. | |
219 | .IP "\(bu" 4 | |
220 | \&\fBThe formatting functions\fR (\fIprintf()\fR, \fIsprintf()\fR and \fIwrite()\fR) use | |
221 | \&\f(CW\*(C`LC_NUMERIC\*(C'\fR. | |
222 | .IP "\(bu" 4 | |
223 | \&\fBThe \s-1POSIX\s0 date formatting function\fR (\fIstrftime()\fR) uses \f(CW\*(C`LC_TIME\*(C'\fR. | |
224 | .PP | |
225 | \&\f(CW\*(C`LC_COLLATE\*(C'\fR, \f(CW\*(C`LC_CTYPE\*(C'\fR, and so on, are discussed further in | |
226 | \&\*(L"\s-1LOCALE\s0 \s-1CATEGORIES\s0\*(R". | |
227 | .PP | |
228 | The default behavior is restored with the \f(CW\*(C`no\ locale\*(C'\fR pragma, or | |
229 | upon reaching the end of the block enclosing \f(CW\*(C`use locale\*(C'\fR. | |
230 | .PP | |
231 | The string result of any operation that uses locale | |
232 | information is tainted, as it is possible for a locale to be | |
233 | untrustworthy. See \*(L"\s-1SECURITY\s0\*(R". | |
234 | .Sh "The setlocale function" | |
235 | .IX Subsection "The setlocale function" | |
236 | You can switch locales as often as you wish at run time with the | |
237 | \&\fIPOSIX::setlocale()\fR function: | |
238 | .PP | |
239 | .Vb 2 | |
240 | \& # This functionality not usable prior to Perl 5.004 | |
241 | \& require 5.004; | |
242 | .Ve | |
243 | .PP | |
244 | .Vb 4 | |
245 | \& # Import locale-handling tool set from POSIX module. | |
246 | \& # This example uses: setlocale -- the function call | |
247 | \& # LC_CTYPE -- explained below | |
248 | \& use POSIX qw(locale_h); | |
249 | .Ve | |
250 | .PP | |
251 | .Vb 2 | |
252 | \& # query and save the old locale | |
253 | \& $old_locale = setlocale(LC_CTYPE); | |
254 | .Ve | |
255 | .PP | |
256 | .Vb 2 | |
257 | \& setlocale(LC_CTYPE, "fr_CA.ISO8859-1"); | |
258 | \& # LC_CTYPE now in locale "French, Canada, codeset ISO 8859-1" | |
259 | .Ve | |
260 | .PP | |
261 | .Vb 3 | |
262 | \& setlocale(LC_CTYPE, ""); | |
263 | \& # LC_CTYPE now reset to default defined by LC_ALL/LC_CTYPE/LANG | |
264 | \& # environment variables. See below for documentation. | |
265 | .Ve | |
266 | .PP | |
267 | .Vb 2 | |
268 | \& # restore the old locale | |
269 | \& setlocale(LC_CTYPE, $old_locale); | |
270 | .Ve | |
271 | .PP | |
272 | The first argument of \fIsetlocale()\fR gives the \fBcategory\fR, the second the | |
273 | \&\fBlocale\fR. The category tells in what aspect of data processing you | |
274 | want to apply locale-specific rules. Category names are discussed in | |
275 | \&\*(L"\s-1LOCALE\s0 \s-1CATEGORIES\s0\*(R" and \*(L"\s-1ENVIRONMENT\s0\*(R". The locale is the name of a | |
276 | collection of customization information corresponding to a particular | |
277 | combination of language, country or territory, and codeset. Read on for | |
278 | hints on the naming of locales: not all systems name locales as in the | |
279 | example. | |
280 | .PP | |
281 | If no second argument is provided and the category is something other | |
282 | than \s-1LC_ALL\s0, the function returns a string naming the current locale | |
283 | for the category. You can use this value as the second argument in a | |
284 | subsequent call to \fIsetlocale()\fR. | |
285 | .PP | |
286 | If no second argument is provided and the category is \s-1LC_ALL\s0, the | |
287 | result is implementation\-dependent. It may be a string of | |
288 | concatenated locale names (separator also implementation\-dependent) | |
289 | or a single locale name. Please consult your \fIsetlocale\fR\|(3) for | |
290 | details. | |
291 | .PP | |
292 | If a second argument is given and it corresponds to a valid locale, | |
293 | the locale for the category is set to that value, and the function | |
294 | returns the now-current locale value. You can then use this in yet | |
295 | another call to \fIsetlocale()\fR. (In some implementations, the return | |
296 | value may sometimes differ from the value you gave as the second | |
297 | argument\*(--think of it as an alias for the value you gave.) | |
298 | .PP | |
299 | As the example shows, if the second argument is an empty string, the | |
300 | category's locale is returned to the default specified by the | |
301 | corresponding environment variables. Generally, this results in a | |
302 | return to the default that was in force when Perl started up: changes | |
303 | to the environment made by the application after startup may or may not | |
304 | be noticed, depending on your system's C library. | |
305 | .PP | |
306 | If the second argument does not correspond to a valid locale, the locale | |
307 | for the category is not changed, and the function returns \fIundef\fR. | |
308 | .PP | |
309 | For further information about the categories, consult \fIsetlocale\fR\|(3). | |
310 | .Sh "Finding locales" | |
311 | .IX Subsection "Finding locales" | |
312 | For locales available in your system, consult also \fIsetlocale\fR\|(3) to | |
313 | see whether it leads to the list of available locales (search for the | |
314 | \&\fI\s-1SEE\s0 \s-1ALSO\s0\fR section). If that fails, try the following command lines: | |
315 | .PP | |
316 | .Vb 1 | |
317 | \& locale -a | |
318 | .Ve | |
319 | .PP | |
320 | .Vb 1 | |
321 | \& nlsinfo | |
322 | .Ve | |
323 | .PP | |
324 | .Vb 1 | |
325 | \& ls /usr/lib/nls/loc | |
326 | .Ve | |
327 | .PP | |
328 | .Vb 1 | |
329 | \& ls /usr/lib/locale | |
330 | .Ve | |
331 | .PP | |
332 | .Vb 1 | |
333 | \& ls /usr/lib/nls | |
334 | .Ve | |
335 | .PP | |
336 | .Vb 1 | |
337 | \& ls /usr/share/locale | |
338 | .Ve | |
339 | .PP | |
340 | and see whether they list something resembling these | |
341 | .PP | |
342 | .Vb 7 | |
343 | \& en_US.ISO8859-1 de_DE.ISO8859-1 ru_RU.ISO8859-5 | |
344 | \& en_US.iso88591 de_DE.iso88591 ru_RU.iso88595 | |
345 | \& en_US de_DE ru_RU | |
346 | \& en de ru | |
347 | \& english german russian | |
348 | \& english.iso88591 german.iso88591 russian.iso88595 | |
349 | \& english.roman8 russian.koi8r | |
350 | .Ve | |
351 | .PP | |
352 | Sadly, even though the calling interface for \fIsetlocale()\fR has been | |
353 | standardized, names of locales and the directories where the | |
354 | configuration resides have not been. The basic form of the name is | |
355 | \&\fIlanguage_territory\fR\fB.\fR\fIcodeset\fR, but the latter parts after | |
356 | \&\fIlanguage\fR are not always present. The \fIlanguage\fR and \fIcountry\fR | |
357 | are usually from the standards \fB\s-1ISO\s0 639\fR and \fB\s-1ISO\s0 3166\fR, the | |
358 | two-letter abbreviations for the languages and the countries of the | |
359 | world, respectively. The \fIcodeset\fR part often mentions some \fB\s-1ISO\s0 | |
360 | 8859\fR character set, the Latin codesets. For example, \f(CW\*(C`ISO 8859\-1\*(C'\fR | |
361 | is the so-called \*(L"Western European codeset\*(R" that can be used to encode | |
362 | most Western European languages adequately. Again, there are several | |
363 | ways to write even the name of that one standard. Lamentably. | |
364 | .PP | |
365 | Two special locales are worth particular mention: \*(L"C\*(R" and \*(L"\s-1POSIX\s0\*(R". | |
366 | Currently these are effectively the same locale: the difference is | |
367 | mainly that the first one is defined by the C standard, the second by | |
368 | the \s-1POSIX\s0 standard. They define the \fBdefault locale\fR in which | |
369 | every program starts in the absence of locale information in its | |
370 | environment. (The \fIdefault\fR default locale, if you will.) Its language | |
371 | is (American) English and its character codeset \s-1ASCII\s0. | |
372 | .PP | |
373 | \&\fB\s-1NOTE\s0\fR: Not all systems have the \*(L"\s-1POSIX\s0\*(R" locale (not all systems are | |
374 | POSIX\-conformant), so use \*(L"C\*(R" when you need explicitly to specify this | |
375 | default locale. | |
376 | .Sh "\s-1LOCALE\s0 \s-1PROBLEMS\s0" | |
377 | .IX Subsection "LOCALE PROBLEMS" | |
378 | You may encounter the following warning message at Perl startup: | |
379 | .PP | |
380 | .Vb 6 | |
381 | \& perl: warning: Setting locale failed. | |
382 | \& perl: warning: Please check that your locale settings: | |
383 | \& LC_ALL = "En_US", | |
384 | \& LANG = (unset) | |
385 | \& are supported and installed on your system. | |
386 | \& perl: warning: Falling back to the standard locale ("C"). | |
387 | .Ve | |
388 | .PP | |
389 | This means that your locale settings had \s-1LC_ALL\s0 set to \*(L"En_US\*(R" and | |
390 | \&\s-1LANG\s0 exists but has no value. Perl tried to believe you but could not. | |
391 | Instead, Perl gave up and fell back to the \*(L"C\*(R" locale, the default locale | |
392 | that is supposed to work no matter what. This usually means your locale | |
393 | settings were wrong, they mention locales your system has never heard | |
394 | of, or the locale installation in your system has problems (for example, | |
395 | some system files are broken or missing). There are quick and temporary | |
396 | fixes to these problems, as well as more thorough and lasting fixes. | |
397 | .Sh "Temporarily fixing locale problems" | |
398 | .IX Subsection "Temporarily fixing locale problems" | |
399 | The two quickest fixes are either to render Perl silent about any | |
400 | locale inconsistencies or to run Perl under the default locale \*(L"C\*(R". | |
401 | .PP | |
402 | Perl's moaning about locale problems can be silenced by setting the | |
403 | environment variable \s-1PERL_BADLANG\s0 to a zero value, for example \*(L"0\*(R". | |
404 | This method really just sweeps the problem under the carpet: you tell | |
405 | Perl to shut up even when Perl sees that something is wrong. Do not | |
406 | be surprised if later something locale-dependent misbehaves. | |
407 | .PP | |
408 | Perl can be run under the \*(L"C\*(R" locale by setting the environment | |
409 | variable \s-1LC_ALL\s0 to \*(L"C\*(R". This method is perhaps a bit more civilized | |
410 | than the \s-1PERL_BADLANG\s0 approach, but setting \s-1LC_ALL\s0 (or | |
411 | other locale variables) may affect other programs as well, not just | |
412 | Perl. In particular, external programs run from within Perl will see | |
413 | these changes. If you make the new settings permanent (read on), all | |
414 | programs you run see the changes. See \*(L"\s-1ENVIRONMENT\s0\*(R" for | |
415 | the full list of relevant environment variables and \*(L"\s-1USING\s0 \s-1LOCALES\s0\*(R" | |
416 | for their effects in Perl. Effects in other programs are | |
417 | easily deducible. For example, the variable \s-1LC_COLLATE\s0 may well affect | |
418 | your \fBsort\fR program (or whatever the program that arranges \*(L"records\*(R" | |
419 | alphabetically in your system is called). | |
420 | .PP | |
421 | You can test out changing these variables temporarily, and if the | |
422 | new settings seem to help, put those settings into your shell startup | |
423 | files. Consult your local documentation for the exact details. For example, in | |
424 | Bourne-like shells (\fBsh\fR, \fBksh\fR, \fBbash\fR, \fBzsh\fR): | |
425 | .PP | |
426 | .Vb 2 | |
427 | \& LC_ALL=en_US.ISO8859-1 | |
428 | \& export LC_ALL | |
429 | .Ve | |
430 | .PP | |
431 | This assumes that we saw the locale \*(L"en_US.ISO8859\-1\*(R" using the commands | |
432 | discussed above. We decided to try that instead of the above faulty | |
433 | locale \*(L"En_US\*(R"\-\-and in Cshish shells (\fBcsh\fR, \fBtcsh\fR) | |
434 | .PP | |
435 | .Vb 1 | |
436 | \& setenv LC_ALL en_US.ISO8859-1 | |
437 | .Ve | |
438 | .PP | |
439 | or if you have the \*(L"env\*(R" application you can do in any shell | |
440 | .PP | |
441 | .Vb 1 | |
442 | \& env LC_ALL=en_US.ISO8859-1 perl ... | |
443 | .Ve | |
444 | .PP | |
445 | If you do not know what shell you have, consult your local | |
446 | helpdesk or the equivalent. | |
447 | .Sh "Permanently fixing locale problems" | |
448 | .IX Subsection "Permanently fixing locale problems" | |
449 | The slower but superior fixes are those where you may be able to fix | |
450 | the misconfiguration of your own environment variables yourself. The | |
451 | mis(sing)configuration of the whole system's locales usually requires | |
452 | the help of your friendly system administrator. | |
453 | .PP | |
454 | First, see earlier in this document about \*(L"Finding locales\*(R". That tells | |
455 | how to find which locales are really supported\*(--and more importantly, | |
456 | installed\*(--on your system. In our example error message, environment | |
457 | variables affecting the locale are listed in the order of decreasing | |
458 | importance (and unset variables do not matter). Therefore, having | |
459 | \&\s-1LC_ALL\s0 set to \*(L"En_US\*(R" must have been the bad choice, as shown by the | |
460 | error message. First try fixing locale settings listed first. | |
461 | .PP | |
462 | Second, if using the listed commands you see something \fBexactly\fR | |
463 | (prefix matches do not count and case usually counts) like \*(L"En_US\*(R" | |
464 | without the quotes, then you should be okay because you are using a | |
465 | locale name that should be installed and available in your system. | |
466 | In this case, see \*(L"Permanently fixing your system's locale configuration\*(R". | |
467 | .Sh "Permanently fixing your system's locale configuration" | |
468 | .IX Subsection "Permanently fixing your system's locale configuration" | |
469 | This is when you see something like: | |
470 | .PP | |
471 | .Vb 4 | |
472 | \& perl: warning: Please check that your locale settings: | |
473 | \& LC_ALL = "En_US", | |
474 | \& LANG = (unset) | |
475 | \& are supported and installed on your system. | |
476 | .Ve | |
477 | .PP | |
478 | but then cannot see that \*(L"En_US\*(R" listed by the above-mentioned | |
479 | commands. You may see things like \*(L"en_US.ISO8859\-1\*(R", but that isn't | |
480 | the same. In this case, try running under a locale | |
481 | that you can list and which somehow matches what you tried. The | |
482 | rules for matching locale names are a bit vague because | |
483 | standardization is weak in this area. See | |
484 | \&\*(L"Finding locales\*(R" again for the general rules. | |
485 | .Sh "Fixing system locale configuration" | |
486 | .IX Subsection "Fixing system locale configuration" | |
487 | Contact a system administrator (preferably your own) and report the exact | |
488 | error message you get, and ask them to read this same documentation you | |
489 | are now reading. They should be able to check whether there is something | |
490 | wrong with the locale configuration of the system. The \*(L"Finding locales\*(R" | |
491 | section is unfortunately a bit vague about the exact commands and places | |
492 | because these things are not that standardized. | |
493 | .Sh "The localeconv function" | |
494 | .IX Subsection "The localeconv function" | |
495 | The \fIPOSIX::localeconv()\fR function allows you to get particulars of the | |
496 | locale-dependent numeric formatting information specified by the current | |
497 | \&\f(CW\*(C`LC_NUMERIC\*(C'\fR and \f(CW\*(C`LC_MONETARY\*(C'\fR locales. (If you just want the name of | |
498 | the current locale for a particular category, use \fIPOSIX::setlocale()\fR | |
499 | with a single parameter\*(--see \*(L"The setlocale function\*(R".) | |
500 | .PP | |
501 | .Vb 1 | |
502 | \& use POSIX qw(locale_h); | |
503 | .Ve | |
504 | .PP | |
505 | .Vb 2 | |
506 | \& # Get a reference to a hash of locale-dependent info | |
507 | \& $locale_values = localeconv(); | |
508 | .Ve | |
509 | .PP | |
510 | .Vb 4 | |
511 | \& # Output sorted list of the values | |
512 | \& for (sort keys %$locale_values) { | |
513 | \& printf "%-20s = %s\en", $_, $locale_values->{$_} | |
514 | \& } | |
515 | .Ve | |
516 | .PP | |
517 | \&\fIlocaleconv()\fR takes no arguments, and returns \fBa reference to\fR a hash. | |
518 | The keys of this hash are variable names for formatting, such as | |
519 | \&\f(CW\*(C`decimal_point\*(C'\fR and \f(CW\*(C`thousands_sep\*(C'\fR. The values are the | |
520 | corresponding, er, values. See \*(L"localeconv\*(R" in \s-1POSIX\s0 for a longer | |
521 | example listing the categories an implementation might be expected to | |
522 | provide; some provide more and others fewer. You don't need an | |
523 | explicit \f(CW\*(C`use locale\*(C'\fR, because \fIlocaleconv()\fR always observes the | |
524 | current locale. | |
525 | .PP | |
526 | Here's a simple-minded example program that rewrites its command-line | |
527 | parameters as integers correctly formatted in the current locale: | |
528 | .PP | |
529 | .Vb 3 | |
530 | \& # See comments in previous example | |
531 | \& require 5.004; | |
532 | \& use POSIX qw(locale_h); | |
533 | .Ve | |
534 | .PP | |
535 | .Vb 3 | |
536 | \& # Get some of locale's numeric formatting parameters | |
537 | \& my ($thousands_sep, $grouping) = | |
538 | \& @{localeconv()}{'thousands_sep', 'grouping'}; | |
539 | .Ve | |
540 | .PP | |
541 | .Vb 2 | |
542 | \& # Apply defaults if values are missing | |
543 | \& $thousands_sep = ',' unless $thousands_sep; | |
544 | .Ve | |
545 | .PP | |
546 | .Vb 16 | |
547 | \& # grouping and mon_grouping are packed lists | |
548 | \& # of small integers (characters) telling the | |
549 | \& # grouping (thousand_seps and mon_thousand_seps | |
550 | \& # being the group dividers) of numbers and | |
551 | \& # monetary quantities. The integers' meanings: | |
552 | \& # 255 means no more grouping, 0 means repeat | |
553 | \& # the previous grouping, 1-254 means use that | |
554 | \& # as the current grouping. Grouping goes from | |
555 | \& # right to left (low to high digits). In the | |
556 | \& # below we cheat slightly by never using anything | |
557 | \& # else than the first grouping (whatever that is). | |
558 | \& if ($grouping) { | |
559 | \& @grouping = unpack("C*", $grouping); | |
560 | \& } else { | |
561 | \& @grouping = (3); | |
562 | \& } | |
563 | .Ve | |
564 | .PP | |
565 | .Vb 8 | |
566 | \& # Format command line params for current locale | |
567 | \& for (@ARGV) { | |
568 | \& $_ = int; # Chop non-integer part | |
569 | \& 1 while | |
570 | \& s/(\ed)(\ed{$grouping[0]}($|$thousands_sep))/$1$thousands_sep$2/; | |
571 | \& print "$_"; | |
572 | \& } | |
573 | \& print "\en"; | |
574 | .Ve | |
575 | .Sh "I18N::Langinfo" | |
576 | .IX Subsection "I18N::Langinfo" | |
577 | Another interface for querying locale-dependent information is the | |
578 | \&\fII18N::Langinfo::langinfo()\fR function, available at least in UNIX-like | |
579 | systems and \s-1VMS\s0. | |
580 | .PP | |
581 | The following example will import the \fIlanginfo()\fR function itself and | |
582 | three constants to be used as arguments to \fIlanginfo()\fR: a constant for | |
583 | the abbreviated first day of the week (the numbering starts from | |
584 | Sunday = 1) and two more constants for the affirmative and negative | |
585 | answers for a yes/no question in the current locale. | |
586 | .PP | |
587 | .Vb 1 | |
588 | \& use I18N::Langinfo qw(langinfo ABDAY_1 YESSTR NOSTR); | |
589 | .Ve | |
590 | .PP | |
591 | .Vb 1 | |
592 | \& my ($abday_1, $yesstr, $nostr) = map { langinfo } qw(ABDAY_1 YESSTR NOSTR); | |
593 | .Ve | |
594 | .PP | |
595 | .Vb 1 | |
596 | \& print "$abday_1? [$yesstr/$nostr] "; | |
597 | .Ve | |
598 | .PP | |
599 | In other words, in the \*(L"C\*(R" (or English) locale the above will probably | |
600 | print something like: | |
601 | .PP | |
602 | .Vb 1 | |
603 | \& Sun? [yes/no] | |
604 | .Ve | |
605 | .PP | |
606 | See I18N::Langinfo for more information. | |
607 | .SH "LOCALE CATEGORIES" | |
608 | .IX Header "LOCALE CATEGORIES" | |
609 | The following subsections describe basic locale categories. Beyond these, | |
610 | some combination categories allow manipulation of more than one | |
611 | basic category at a time. See \*(L"\s-1ENVIRONMENT\s0\*(R" for a discussion of these. | |
612 | .Sh "Category \s-1LC_COLLATE:\s0 Collation" | |
613 | .IX Subsection "Category LC_COLLATE: Collation" | |
614 | In the scope of \f(CW\*(C`use\ locale\*(C'\fR, Perl looks to the \f(CW\*(C`LC_COLLATE\*(C'\fR | |
615 | environment variable to determine the application's notions on collation | |
616 | (ordering) of characters. For example, 'b' follows 'a' in Latin | |
617 | alphabets, but where do 'a\*'' and 'a\*o' belong? And while | |
618 | \&'color' follows 'chocolate' in English, what about in Spanish? | |
619 | .PP | |
620 | The following collations all make sense and you may meet any of them | |
621 | if you \*(L"use locale\*(R". | |
622 | .PP | |
623 | .Vb 4 | |
624 | \& A B C D E a b c d e | |
625 | \& A a B b C c D d E e | |
626 | \& a A b B c C d D e E | |
627 | \& a b c d e A B C D E | |
628 | .Ve | |
629 | .PP | |
630 | Here is a code snippet to tell what \*(L"word\*(R" | |
631 | characters are in the current locale, in that locale's order: | |
632 | .PP | |
633 | .Vb 2 | |
634 | \& use locale; | |
635 | \& print +(sort grep /\ew/, map { chr } 0..255), "\en"; | |
636 | .Ve | |
637 | .PP | |
638 | Compare this with the characters that you see and their order if you | |
639 | state explicitly that the locale should be ignored: | |
640 | .PP | |
641 | .Vb 2 | |
642 | \& no locale; | |
643 | \& print +(sort grep /\ew/, map { chr } 0..255), "\en"; | |
644 | .Ve | |
645 | .PP | |
646 | This machine-native collation (which is what you get unless \f(CW\*(C`use\ locale\*(C'\fR has appeared earlier in the same block) must be used for | |
647 | sorting raw binary data, whereas the locale-dependent collation of the | |
648 | first example is useful for natural text. | |
649 | .PP | |
650 | As noted in \*(L"\s-1USING\s0 \s-1LOCALES\s0\*(R", \f(CW\*(C`cmp\*(C'\fR compares according to the current | |
651 | collation locale when \f(CW\*(C`use locale\*(C'\fR is in effect, but falls back to a | |
652 | char-by-char comparison for strings that the locale says are equal. You | |
653 | can use \fIPOSIX::strcoll()\fR if you don't want this fall\-back: | |
654 | .PP | |
655 | .Vb 3 | |
656 | \& use POSIX qw(strcoll); | |
657 | \& $equal_in_locale = | |
658 | \& !strcoll("space and case ignored", "SpaceAndCaseIgnored"); | |
659 | .Ve | |
660 | .PP | |
661 | $equal_in_locale will be true if the collation locale specifies a | |
662 | dictionary-like ordering that ignores space characters completely and | |
663 | which folds case. | |
664 | .PP | |
665 | If you have a single string that you want to check for \*(L"equality in | |
666 | locale\*(R" against several others, you might think you could gain a little | |
667 | efficiency by using \fIPOSIX::strxfrm()\fR in conjunction with \f(CW\*(C`eq\*(C'\fR: | |
668 | .PP | |
669 | .Vb 8 | |
670 | \& use POSIX qw(strxfrm); | |
671 | \& $xfrm_string = strxfrm("Mixed-case string"); | |
672 | \& print "locale collation ignores spaces\en" | |
673 | \& if $xfrm_string eq strxfrm("Mixed-casestring"); | |
674 | \& print "locale collation ignores hyphens\en" | |
675 | \& if $xfrm_string eq strxfrm("Mixedcase string"); | |
676 | \& print "locale collation ignores case\en" | |
677 | \& if $xfrm_string eq strxfrm("mixed-case string"); | |
678 | .Ve | |
679 | .PP | |
680 | \&\fIstrxfrm()\fR takes a string and maps it into a transformed string for use | |
681 | in char-by-char comparisons against other transformed strings during | |
682 | collation. \*(L"Under the hood\*(R", locale-affected Perl comparison operators | |
683 | call \fIstrxfrm()\fR for both operands, then do a char-by-char | |
684 | comparison of the transformed strings. By calling \fIstrxfrm()\fR explicitly | |
685 | and using a non locale-affected comparison, the example attempts to save | |
686 | a couple of transformations. But in fact, it doesn't save anything: Perl | |
687 | magic (see \*(L"Magic Variables\*(R" in perlguts) creates the transformed version of a | |
688 | string the first time it's needed in a comparison, then keeps this version around | |
689 | in case it's needed again. An example rewritten the easy way with | |
690 | \&\f(CW\*(C`cmp\*(C'\fR runs just about as fast. It also copes with null characters | |
691 | embedded in strings; if you call \fIstrxfrm()\fR directly, it treats the first | |
692 | null it finds as a terminator. Don't expect the transformed strings | |
693 | it produces to be portable across systems\*(--or even from one revision | |
694 | of your operating system to the next. In short, don't call \fIstrxfrm()\fR | |
695 | directly: let Perl do it for you. | |
696 | .PP | |
697 | Note: \f(CW\*(C`use locale\*(C'\fR isn't shown in some of these examples because it isn't | |
698 | needed: \fIstrcoll()\fR and \fIstrxfrm()\fR exist only to generate locale-dependent | |
699 | results, and so always obey the current \f(CW\*(C`LC_COLLATE\*(C'\fR locale. | |
700 | .Sh "Category \s-1LC_CTYPE:\s0 Character Types" | |
701 | .IX Subsection "Category LC_CTYPE: Character Types" | |
702 | In the scope of \f(CW\*(C`use\ locale\*(C'\fR, Perl obeys the \f(CW\*(C`LC_CTYPE\*(C'\fR locale | |
703 | setting. This controls the application's notion of which characters are | |
704 | alphabetic. This affects Perl's \f(CW\*(C`\ew\*(C'\fR regular expression metanotation, | |
705 | which stands for alphanumeric characters\*(--that is, alphabetic, | |
706 | numeric, and the underscore character. | |
707 | (Consult perlre for more information about | |
708 | regular expressions.) Thanks to \f(CW\*(C`LC_CTYPE\*(C'\fR, depending on your locale | |
709 | setting, characters like '\*(ae', '\*(d-', '\*8', and | |
710 | \&'o\*/' may be understood as \f(CW\*(C`\ew\*(C'\fR characters. | |
711 | .PP | |
712 | The \f(CW\*(C`LC_CTYPE\*(C'\fR locale also provides the map used in transliterating | |
713 | characters between lower and uppercase. This affects the case-mapping | |
714 | functions\*(--\fIlc()\fR, \fIlcfirst()\fR, \fIuc()\fR, and \fIucfirst()\fR; case-mapping | |
715 | interpolation with \f(CW\*(C`\el\*(C'\fR, \f(CW\*(C`\eL\*(C'\fR, \f(CW\*(C`\eu\*(C'\fR, or \f(CW\*(C`\eU\*(C'\fR in double-quoted strings | |
716 | and \f(CW\*(C`s///\*(C'\fR substitutions; and case-independent regular expression | |
717 | pattern matching using the \f(CW\*(C`i\*(C'\fR modifier. | |
718 | .PP | |
719 | Finally, \f(CW\*(C`LC_CTYPE\*(C'\fR affects the \s-1POSIX\s0 character-class test | |
720 | functions\*(--\fIisalpha()\fR, \fIislower()\fR, and so on. For example, if you move | |
721 | from the \*(L"C\*(R" locale to a 7\-bit Scandinavian one, you may find\*(--possibly | |
722 | to your surprise\*(--that \*(L"|\*(R" moves from the \fIispunct()\fR class to \fIisalpha()\fR. | |
723 | .PP | |
724 | \&\fBNote:\fR A broken or malicious \f(CW\*(C`LC_CTYPE\*(C'\fR locale definition may result | |
725 | in clearly ineligible characters being considered to be alphanumeric by | |
726 | your application. For strict matching of (mundane) letters and | |
727 | digits\*(--for example, in command strings\*(--locale\-aware applications | |
728 | should use \f(CW\*(C`\ew\*(C'\fR inside a \f(CW\*(C`no locale\*(C'\fR block. See \*(L"\s-1SECURITY\s0\*(R". | |
729 | .Sh "Category \s-1LC_NUMERIC:\s0 Numeric Formatting" | |
730 | .IX Subsection "Category LC_NUMERIC: Numeric Formatting" | |
731 | In the scope of \f(CW\*(C`use\ locale\*(C'\fR, Perl obeys the \f(CW\*(C`LC_NUMERIC\*(C'\fR locale | |
732 | information, which controls an application's idea of how numbers should | |
733 | be formatted for human readability by the \fIprintf()\fR, \fIsprintf()\fR, and | |
734 | \&\fIwrite()\fR functions. String-to-numeric conversion by the \fIPOSIX::strtod()\fR | |
735 | function is also affected. In most implementations the only effect is to | |
736 | change the character used for the decimal point\*(--perhaps from '.' to ','. | |
737 | These functions aren't aware of such niceties as thousands separation and | |
738 | so on. (See \*(L"The localeconv function\*(R" if you care about these things.) | |
739 | .PP | |
740 | Output produced by \fIprint()\fR is also affected by the current locale: it | |
741 | depends on whether \f(CW\*(C`use locale\*(C'\fR or \f(CW\*(C`no locale\*(C'\fR is in effect, and | |
742 | corresponds to what you'd get from \fIprintf()\fR in the current locale or in the \*(L"C\*(R" locale, respectively. The | |
743 | same is true for Perl's internal conversions between numeric and | |
744 | string formats: | |
745 | .PP | |
746 | .Vb 2 | |
747 | \& use POSIX qw(strtod); | |
748 | \& use locale; | |
749 | .Ve | |
750 | .PP | |
751 | .Vb 1 | |
752 | \& $n = 5/2; # Assign numeric 2.5 to $n | |
753 | .Ve | |
754 | .PP | |
755 | .Vb 1 | |
756 | \& $a = " $n"; # Locale-dependent conversion to string | |
757 | .Ve | |
758 | .PP | |
759 | .Vb 1 | |
760 | \& print "half five is $n\en"; # Locale-dependent output | |
761 | .Ve | |
762 | .PP | |
763 | .Vb 1 | |
764 | \& printf "half five is %g\en", $n; # Locale-dependent output | |
765 | .Ve | |
766 | .PP | |
767 | .Vb 2 | |
768 | \& print "DECIMAL POINT IS COMMA\en" | |
769 | \& if $n == (strtod("2,5"))[0]; # Locale-dependent conversion | |
770 | .Ve | |
771 | .PP | |
772 | See also I18N::Langinfo and \f(CW\*(C`RADIXCHAR\*(C'\fR. | |
773 | .Sh "Category \s-1LC_MONETARY:\s0 Formatting of monetary amounts" | |
774 | .IX Subsection "Category LC_MONETARY: Formatting of monetary amounts" | |
775 | The C standard defines the \f(CW\*(C`LC_MONETARY\*(C'\fR category, but no function | |
776 | that is affected by its contents. (Those with experience of standards | |
777 | committees will recognize that the working group decided to punt on the | |
778 | issue.) Consequently, Perl takes no notice of it. If you really want | |
779 | to use \f(CW\*(C`LC_MONETARY\*(C'\fR, you can query its contents\*(--see | |
780 | \&\*(L"The localeconv function\*(R"\*(--and use the information that it returns in your | |
781 | application's own formatting of currency amounts. However, you may well | |
782 | find that the information, voluminous and complex though it may be, still | |
783 | does not quite meet your requirements: currency formatting is a hard nut | |
784 | to crack. | |
785 | .PP | |
786 | See also I18N::Langinfo and \f(CW\*(C`CRNCYSTR\*(C'\fR. | |
787 | .Sh "\s-1LC_TIME\s0" | |
788 | .IX Subsection "LC_TIME" | |
789 | Output produced by \fIPOSIX::strftime()\fR, which builds a formatted | |
790 | human-readable date/time string, is affected by the current \f(CW\*(C`LC_TIME\*(C'\fR | |
791 | locale. Thus, in a French locale, the output produced by the \f(CW%B\fR | |
792 | format element (full month name) for the first month of the year would | |
793 | be \*(L"janvier\*(R". Here's how to get a list of long month names in the | |
794 | current locale: | |
795 | .PP | |
796 | .Vb 5 | |
797 | \& use POSIX qw(strftime); | |
798 | \& for (0..11) { | |
799 | \& $long_month_name[$_] = | |
800 | \& strftime("%B", 0, 0, 0, 1, $_, 96); | |
801 | \& } | |
802 | .Ve | |
803 | .PP | |
804 | Note: \f(CW\*(C`use locale\*(C'\fR isn't needed in this example: as a function that | |
805 | exists only to generate locale-dependent results, \fIstrftime()\fR always | |
806 | obeys the current \f(CW\*(C`LC_TIME\*(C'\fR locale. | |
807 | .PP | |
808 | See also I18N::Langinfo and \f(CW\*(C`ABDAY_1\*(C'\fR..\f(CW\*(C`ABDAY_7\*(C'\fR, \f(CW\*(C`DAY_1\*(C'\fR..\f(CW\*(C`DAY_7\*(C'\fR, | |
809 | \&\f(CW\*(C`ABMON_1\*(C'\fR..\f(CW\*(C`ABMON_12\*(C'\fR, and \f(CW\*(C`MON_1\*(C'\fR..\f(CW\*(C`MON_12\*(C'\fR. | |
810 | .Sh "Other categories" | |
811 | .IX Subsection "Other categories" | |
812 | The remaining locale category, \f(CW\*(C`LC_MESSAGES\*(C'\fR (possibly supplemented | |
813 | by others in particular implementations) is not currently used by | |
814 | Perl\*(--except possibly to affect the behavior of library functions | |
815 | called by extensions outside the standard Perl distribution and by the | |
816 | operating system and its utilities. Note especially that the string | |
817 | value of \f(CW$!\fR and the error messages given by external utilities may | |
818 | be changed by \f(CW\*(C`LC_MESSAGES\*(C'\fR. If you want to have portable error | |
819 | codes, use \f(CW\*(C`%!\*(C'\fR. See Errno. | |
820 | .SH "SECURITY" | |
821 | .IX Header "SECURITY" | |
822 | Although the main discussion of Perl security issues can be found in | |
823 | perlsec, a discussion of Perl's locale handling would be incomplete | |
824 | if it did not draw your attention to locale-dependent security issues. | |
825 | Locales\*(--particularly on systems that allow unprivileged users to | |
826 | build their own locales\*(--are untrustworthy. A malicious (or just plain | |
827 | broken) locale can make a locale-aware application give unexpected | |
828 | results. Here are a few possibilities: | |
829 | .IP "\(bu" 4 | |
830 | Regular expression checks for safe file names or mail addresses using | |
831 | \&\f(CW\*(C`\ew\*(C'\fR may be spoofed by an \f(CW\*(C`LC_CTYPE\*(C'\fR locale that claims that | |
832 | characters such as \*(L">\*(R" and \*(L"|\*(R" are alphanumeric. | |
833 | .IP "\(bu" 4 | |
834 | String interpolation with case\-mapping, as in, say, \f(CW\*(C`$dest = | |
835 | "C:\eU$name.$ext"\*(C'\fR, may produce dangerous results if a bogus \s-1LC_CTYPE\s0 | |
836 | case-mapping table is in effect. | |
837 | .IP "\(bu" 4 | |
838 | A sneaky \f(CW\*(C`LC_COLLATE\*(C'\fR locale could result in the names of students with | |
839 | \&\*(L"D\*(R" grades appearing ahead of those with \*(L"A\*(R"s. | |
840 | .IP "\(bu" 4 | |
841 | An application that takes the trouble to use information in | |
842 | \&\f(CW\*(C`LC_MONETARY\*(C'\fR may format debits as if they were credits and vice versa | |
843 | if that locale has been subverted. Or it might make payments in \s-1US\s0 | |
844 | dollars instead of Hong Kong dollars. | |
845 | .IP "\(bu" 4 | |
846 | The date and day names in dates formatted by \fIstrftime()\fR could be | |
847 | manipulated to advantage by a malicious user able to subvert the | |
848 | \&\f(CW\*(C`LC_TIME\*(C'\fR locale. (\*(L"Look\*(--it says I wasn't in the building on | |
849 | Sunday.\*(R") | |
850 | .PP | |
851 | Such dangers are not peculiar to the locale system: any aspect of an | |
852 | application's environment which may be modified maliciously presents | |
853 | similar challenges. Similarly, they are not specific to Perl: any | |
854 | programming language that allows you to write programs that take | |
855 | account of their environment exposes you to these issues. | |
856 | .PP | |
857 | Perl cannot protect you from all possibilities shown in the | |
858 | examples\*(--there is no substitute for your own vigilance\*(--but, when | |
859 | \&\f(CW\*(C`use locale\*(C'\fR is in effect, Perl uses the tainting mechanism (see | |
860 | perlsec) to mark string results that become locale\-dependent, and | |
861 | which may be untrustworthy in consequence. Here is a summary of the | |
862 | tainting behavior of operators and functions that may be affected by | |
863 | the locale: | |
864 | .IP "\(bu" 4 | |
865 | \&\fBComparison operators\fR (\f(CW\*(C`lt\*(C'\fR, \f(CW\*(C`le\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, \f(CW\*(C`gt\*(C'\fR and \f(CW\*(C`cmp\*(C'\fR): | |
866 | .Sp | |
867 | Scalar true/false (or less/equal/greater) result is never tainted. | |
868 | .IP "\(bu" 4 | |
869 | \&\fBCase-mapping interpolation\fR (with \f(CW\*(C`\el\*(C'\fR, \f(CW\*(C`\eL\*(C'\fR, \f(CW\*(C`\eu\*(C'\fR or \f(CW\*(C`\eU\*(C'\fR): | |
870 | .Sp | |
871 | Result string containing interpolated material is tainted if | |
872 | \&\f(CW\*(C`use locale\*(C'\fR is in effect. | |
873 | .IP "\(bu" 4 | |
874 | \&\fBMatching operator\fR (\f(CW\*(C`m//\*(C'\fR): | |
875 | .Sp | |
876 | Scalar true/false result never tainted. | |
877 | .Sp | |
878 | Subpatterns, either delivered as a list-context result or as \f(CW$1\fR etc. | |
879 | are tainted if \f(CW\*(C`use locale\*(C'\fR is in effect, and the subpattern regular | |
880 | expression contains \f(CW\*(C`\ew\*(C'\fR (to match an alphanumeric character), \f(CW\*(C`\eW\*(C'\fR | |
881 | (non\-alphanumeric character), \f(CW\*(C`\es\*(C'\fR (whitespace character), or \f(CW\*(C`\eS\*(C'\fR | |
882 | (non whitespace character). The matched-pattern variables, $&, $` | |
883 | (pre\-match), $' (post\-match), and $+ (last match) are also tainted if | |
884 | \&\f(CW\*(C`use locale\*(C'\fR is in effect and the regular expression contains \f(CW\*(C`\ew\*(C'\fR, | |
885 | \&\f(CW\*(C`\eW\*(C'\fR, \f(CW\*(C`\es\*(C'\fR, or \f(CW\*(C`\eS\*(C'\fR. | |
886 | .IP "\(bu" 4 | |
887 | \&\fBSubstitution operator\fR (\f(CW\*(C`s///\*(C'\fR): | |
888 | .Sp | |
889 | Has the same behavior as the match operator. Also, the left | |
890 | operand of \f(CW\*(C`=~\*(C'\fR becomes tainted when \f(CW\*(C`use locale\*(C'\fR is in effect | |
891 | if modified as a result of a substitution based on a regular | |
892 | expression match involving \f(CW\*(C`\ew\*(C'\fR, \f(CW\*(C`\eW\*(C'\fR, \f(CW\*(C`\es\*(C'\fR, or \f(CW\*(C`\eS\*(C'\fR; or of | |
893 | case-mapping with \f(CW\*(C`\el\*(C'\fR, \f(CW\*(C`\eL\*(C'\fR, \f(CW\*(C`\eu\*(C'\fR or \f(CW\*(C`\eU\*(C'\fR. | |
894 | .IP "\(bu" 4 | |
895 | \&\fBOutput formatting functions\fR (\fIprintf()\fR and \fIwrite()\fR): | |
896 | .Sp | |
897 | Results are never tainted because otherwise even output from print, | |
898 | for example \f(CW\*(C`print(1/7)\*(C'\fR, should be tainted if \f(CW\*(C`use locale\*(C'\fR is in | |
899 | effect. | |
900 | .IP "\(bu" 4 | |
901 | \&\fBCase-mapping functions\fR (\fIlc()\fR, \fIlcfirst()\fR, \fIuc()\fR, \fIucfirst()\fR): | |
902 | .Sp | |
903 | Results are tainted if \f(CW\*(C`use locale\*(C'\fR is in effect. | |
904 | .IP "\(bu" 4 | |
905 | \&\fB\s-1POSIX\s0 locale-dependent functions\fR (\fIlocaleconv()\fR, \fIstrcoll()\fR, | |
906 | \&\fIstrftime()\fR, \fIstrxfrm()\fR): | |
907 | .Sp | |
908 | Results are never tainted. | |
909 | .IP "\(bu" 4 | |
910 | \&\fB\s-1POSIX\s0 character class tests\fR (\fIisalnum()\fR, \fIisalpha()\fR, \fIisdigit()\fR, | |
911 | \&\fIisgraph()\fR, \fIislower()\fR, \fIisprint()\fR, \fIispunct()\fR, \fIisspace()\fR, \fIisupper()\fR, | |
912 | \&\fIisxdigit()\fR): | |
913 | .Sp | |
914 | True/false results are never tainted. | |
915 | .PP | |
916 | Three examples illustrate locale-dependent tainting. | |
917 | The first program, which ignores its locale, won't run: a value taken | |
918 | directly from the command line may not be used to name an output file | |
919 | when taint checks are enabled. | |
920 | .PP | |
921 | .Vb 2 | |
922 | \& #!/usr/local/bin/perl -T | |
923 | \& # Run with taint checking | |
924 | .Ve | |
925 | .PP | |
926 | .Vb 2 | |
927 | \& # Command line sanity check omitted... | |
928 | \& $tainted_output_file = shift; | |
929 | .Ve | |
930 | .PP | |
931 | .Vb 2 | |
932 | \& open(F, ">$tainted_output_file") | |
933 | \& or warn "Open of $tainted_output_file failed: $!\en"; | |
934 | .Ve | |
935 | .PP | |
936 | The program can be made to run by \*(L"laundering\*(R" the tainted value through | |
937 | a regular expression: the second example\*(--which still ignores locale | |
938 | information\*(--runs, creating the file named on its command line | |
939 | if it can. | |
940 | .PP | |
941 | .Vb 1 | |
942 | \& #!/usr/local/bin/perl -T | |
943 | .Ve | |
944 | .PP | |
945 | .Vb 3 | |
946 | \& $tainted_output_file = shift; | |
947 | \& $tainted_output_file =~ m%[\ew/]+%; | |
948 | \& $untainted_output_file = $&; | |
949 | .Ve | |
950 | .PP | |
951 | .Vb 2 | |
952 | \& open(F, ">$untainted_output_file") | |
953 | \& or warn "Open of $untainted_output_file failed: $!\en"; | |
954 | .Ve | |
955 | .PP | |
956 | Compare this with a similar but locale-aware program: | |
957 | .PP | |
958 | .Vb 1 | |
959 | \& #!/usr/local/bin/perl -T | |
960 | .Ve | |
961 | .PP | |
962 | .Vb 4 | |
963 | \& $tainted_output_file = shift; | |
964 | \& use locale; | |
965 | \& $tainted_output_file =~ m%[\ew/]+%; | |
966 | \& $localized_output_file = $&; | |
967 | .Ve | |
968 | .PP | |
969 | .Vb 2 | |
970 | \& open(F, ">$localized_output_file") | |
971 | \& or warn "Open of $localized_output_file failed: $!\en"; | |
972 | .Ve | |
973 | .PP | |
974 | This third program fails to run because $& is tainted: it is the result | |
975 | of a match involving \f(CW\*(C`\ew\*(C'\fR while \f(CW\*(C`use locale\*(C'\fR is in effect. | |
976 | .SH "ENVIRONMENT" | |
977 | .IX Header "ENVIRONMENT" | |
978 | .IP "\s-1PERL_BADLANG\s0" 12 | |
979 | .IX Item "PERL_BADLANG" | |
980 | A string that can suppress Perl's warning about failed locale settings | |
981 | at startup. Failure can occur if the locale support in the operating | |
982 | system is lacking (broken) in some way\*(--or if you mistyped the name of | |
983 | a locale when you set up your environment. If this environment | |
984 | variable is absent, or has a value that does not evaluate to integer | |
985 | zero\*(--that is, anything other than \*(L"0\*(R" or ""\*(--Perl will complain about locale setting | |
986 | failures. | |
987 | .Sp | |
988 | \&\fB\s-1NOTE\s0\fR: \s-1PERL_BADLANG\s0 only gives you a way to hide the warning message. | |
989 | The message tells about some problem in your system's locale support, | |
990 | and you should investigate what the problem is. | |
991 | .PP | |
992 | The following environment variables are not specific to Perl: They are | |
993 | part of the standardized (\s-1ISO\s0 C, \s-1XPG4\s0, \s-1POSIX\s0 1.c) \fIsetlocale()\fR method | |
994 | for controlling an application's opinion on data. | |
995 | .IP "\s-1LC_ALL\s0" 12 | |
996 | .IX Item "LC_ALL" | |
997 | \&\f(CW\*(C`LC_ALL\*(C'\fR is the \*(L"override\-all\*(R" locale environment variable. If | |
998 | set, it overrides all the rest of the locale environment variables. | |
999 | .IP "\s-1LANGUAGE\s0" 12 | |
1000 | .IX Item "LANGUAGE" | |
1001 | \&\fB\s-1NOTE\s0\fR: \f(CW\*(C`LANGUAGE\*(C'\fR is a \s-1GNU\s0 extension; it affects you only if you | |
1002 | are using the \s-1GNU\s0 libc. This is the case if you are using e.g. Linux. | |
1003 | If you are using \*(L"commercial\*(R" UNIXes you are most probably \fInot\fR | |
1004 | using \s-1GNU\s0 libc and you can ignore \f(CW\*(C`LANGUAGE\*(C'\fR. | |
1005 | .Sp | |
1006 | However, in the case you are using \f(CW\*(C`LANGUAGE\*(C'\fR: it affects the | |
1007 | language of informational, warning, and error messages output by | |
1008 | commands (in other words, it's like \f(CW\*(C`LC_MESSAGES\*(C'\fR) but it has higher | |
1009 | priority than \s-1LC_ALL\s0. Moreover, it's not a single value but | |
1010 | instead a \*(L"path\*(R" (\*(L":\*(R"\-separated list) of \fIlanguages\fR (not locales). | |
1011 | See the \s-1GNU\s0 \f(CW\*(C`gettext\*(C'\fR library documentation for more information. | |
1012 | .IP "\s-1LC_CTYPE\s0" 12 | |
1013 | .IX Item "LC_CTYPE" | |
1014 | In the absence of \f(CW\*(C`LC_ALL\*(C'\fR, \f(CW\*(C`LC_CTYPE\*(C'\fR chooses the character type | |
1015 | locale. In the absence of both \f(CW\*(C`LC_ALL\*(C'\fR and \f(CW\*(C`LC_CTYPE\*(C'\fR, \f(CW\*(C`LANG\*(C'\fR | |
1016 | chooses the character type locale. | |
1017 | .IP "\s-1LC_COLLATE\s0" 12 | |
1018 | .IX Item "LC_COLLATE" | |
1019 | In the absence of \f(CW\*(C`LC_ALL\*(C'\fR, \f(CW\*(C`LC_COLLATE\*(C'\fR chooses the collation | |
1020 | (sorting) locale. In the absence of both \f(CW\*(C`LC_ALL\*(C'\fR and \f(CW\*(C`LC_COLLATE\*(C'\fR, | |
1021 | \&\f(CW\*(C`LANG\*(C'\fR chooses the collation locale. | |
1022 | .IP "\s-1LC_MONETARY\s0" 12 | |
1023 | .IX Item "LC_MONETARY" | |
1024 | In the absence of \f(CW\*(C`LC_ALL\*(C'\fR, \f(CW\*(C`LC_MONETARY\*(C'\fR chooses the monetary | |
1025 | formatting locale. In the absence of both \f(CW\*(C`LC_ALL\*(C'\fR and \f(CW\*(C`LC_MONETARY\*(C'\fR, | |
1026 | \&\f(CW\*(C`LANG\*(C'\fR chooses the monetary formatting locale. | |
1027 | .IP "\s-1LC_NUMERIC\s0" 12 | |
1028 | .IX Item "LC_NUMERIC" | |
1029 | In the absence of \f(CW\*(C`LC_ALL\*(C'\fR, \f(CW\*(C`LC_NUMERIC\*(C'\fR chooses the numeric format | |
1030 | locale. In the absence of both \f(CW\*(C`LC_ALL\*(C'\fR and \f(CW\*(C`LC_NUMERIC\*(C'\fR, \f(CW\*(C`LANG\*(C'\fR | |
1031 | chooses the numeric format locale. | |
1032 | .IP "\s-1LC_TIME\s0" 12 | |
1033 | .IX Item "LC_TIME" | |
1034 | In the absence of \f(CW\*(C`LC_ALL\*(C'\fR, \f(CW\*(C`LC_TIME\*(C'\fR chooses the date and time | |
1035 | formatting locale. In the absence of both \f(CW\*(C`LC_ALL\*(C'\fR and \f(CW\*(C`LC_TIME\*(C'\fR, | |
1036 | \&\f(CW\*(C`LANG\*(C'\fR chooses the date and time formatting locale. | |
1037 | .IP "\s-1LANG\s0" 12 | |
1038 | .IX Item "LANG" | |
1039 | \&\f(CW\*(C`LANG\*(C'\fR is the \*(L"catch\-all\*(R" locale environment variable. If it is set, it | |
1040 | is used as the last resort after the overall \f(CW\*(C`LC_ALL\*(C'\fR and the | |
1041 | category-specific \f(CW\*(C`LC_...\*(C'\fR. | |
1042 | .SH "NOTES" | |
1043 | .IX Header "NOTES" | |
1044 | .Sh "Backward compatibility" | |
1045 | .IX Subsection "Backward compatibility" | |
1046 | Versions of Perl prior to 5.004 \fBmostly\fR ignored locale information, | |
1047 | generally behaving as if something similar to the \f(CW"C"\fR locale were | |
1048 | always in force, even if the program environment suggested otherwise | |
1049 | (see \*(L"The setlocale function\*(R"). By default, Perl still behaves this | |
1050 | way for backward compatibility. If you want a Perl application to pay | |
1051 | attention to locale information, you \fBmust\fR use the \f(CW\*(C`use\ locale\*(C'\fR | |
1052 | pragma (see \*(L"The use locale pragma\*(R") to instruct it to do so. | |
1053 | .PP | |
1054 | Versions of Perl from 5.002 to 5.003 did use the \f(CW\*(C`LC_CTYPE\*(C'\fR | |
1055 | information if available; that is, \f(CW\*(C`\ew\*(C'\fR did understand what | |
1056 | were the letters according to the locale environment variables. | |
1057 | The problem was that the user had no control over the feature: | |
1058 | if the C library supported locales, Perl used them. | |
1059 | .Sh "I18N::Collate obsolete" | |
1060 | .IX Subsection "I18N::Collate obsolete" | |
1061 | In versions of Perl prior to 5.004, per-locale collation was possible | |
1062 | using the \f(CW\*(C`I18N::Collate\*(C'\fR library module. This module is now mildly | |
1063 | obsolete and should be avoided in new applications. The \f(CW\*(C`LC_COLLATE\*(C'\fR | |
1064 | functionality is now integrated into the Perl core language: One can | |
1065 | use locale-specific scalar data completely normally with \f(CW\*(C`use locale\*(C'\fR, | |
1066 | so there is no longer any need to juggle with the scalar references of | |
1067 | \&\f(CW\*(C`I18N::Collate\*(C'\fR. | |
1068 | .Sh "Sort speed and memory use impacts" | |
1069 | .IX Subsection "Sort speed and memory use impacts" | |
1070 | Comparing and sorting by locale is usually slower than the default | |
1071 | sorting; slow-downs of two to four times have been observed. It will | |
1072 | also consume more memory: once a Perl scalar variable has participated | |
1073 | in any string comparison or sorting operation obeying the locale | |
1074 | collation rules, it will take 3\-15 times more memory than before. (The | |
1075 | exact multiplier depends on the string's contents, the operating system | |
1076 | and the locale.) These downsides are dictated more by the operating | |
1077 | system's implementation of the locale system than by Perl. | |
1078 | .Sh "\fIwrite()\fP and \s-1LC_NUMERIC\s0" | |
1079 | .IX Subsection "write() and LC_NUMERIC" | |
1080 | Formats are the only part of Perl that unconditionally uses information | |
1081 | from a program's locale; if a program's environment specifies an | |
1082 | \&\s-1LC_NUMERIC\s0 locale, it is always used to specify the decimal point | |
1083 | character in formatted output. Formatted output cannot be controlled by | |
1084 | \&\f(CW\*(C`use locale\*(C'\fR because the pragma is tied to the block structure of the | |
1085 | program, and, for historical reasons, formats exist outside that block | |
1086 | structure. | |
1087 | .Sh "Freely available locale definitions" | |
1088 | .IX Subsection "Freely available locale definitions" | |
1089 | There is a large collection of locale definitions at | |
1090 | ftp://dkuug.dk/i18n/WG15\-collection . You should be aware that it is | |
1091 | unsupported, and is not claimed to be fit for any purpose. If your | |
1092 | system allows installation of arbitrary locales, you may find the | |
1093 | definitions useful as they are, or as a basis for the development of | |
1094 | your own locales. | |
1095 | .Sh "I18n and l10n" | |
1096 | .IX Subsection "I18n and l10n" | |
1097 | \&\*(L"Internationalization\*(R" is often abbreviated as \fBi18n\fR because its first | |
1098 | and last letters are separated by eighteen others. (You may guess why | |
1099 | the internalin ... internaliti ... i18n tends to get abbreviated.) In | |
1100 | the same way, \*(L"localization\*(R" is often abbreviated to \fBl10n\fR. | |
1101 | .Sh "An imperfect standard" | |
1102 | .IX Subsection "An imperfect standard" | |
1103 | Internationalization, as defined in the C and \s-1POSIX\s0 standards, can be | |
1104 | criticized as incomplete, ungainly, and having too large a granularity. | |
1105 | (Locales apply to a whole process, when it would arguably be more useful | |
1106 | to have them apply to a single thread, window group, or whatever.) They | |
1107 | also have a tendency, like standards groups, to divide the world into | |
1108 | nations, when we all know that the world can equally well be divided | |
1109 | into bankers, bikers, gamers, and so on. But, for now, it's the only | |
1110 | standard we've got. This may be construed as a bug. | |
1111 | .SH "Unicode and UTF\-8" | |
1112 | .IX Header "Unicode and UTF-8" | |
1113 | The support of Unicode is new starting from Perl version 5.6, and | |
1114 | more fully implemented in version 5.8. See perluniintro and | |
1115 | perlunicode for more details. | |
1116 | .PP | |
1117 | Usually locale settings and Unicode do not affect each other, but | |
1118 | there are exceptions; see \*(L"Locales\*(R" in perlunicode for examples. | |
1119 | .SH "BUGS" | |
1120 | .IX Header "BUGS" | |
1121 | .Sh "Broken systems" | |
1122 | .IX Subsection "Broken systems" | |
1123 | In certain systems, the operating system's locale support | |
1124 | is broken and cannot be fixed or used by Perl. Such deficiencies can | |
1125 | and will result in mysterious hangs and/or Perl core dumps when the | |
1126 | \&\f(CW\*(C`use locale\*(C'\fR pragma is in effect. When confronted with such a system, | |
1127 | please report in excruciating detail to <\fIperlbug@perl.org\fR>, and | |
1128 | complain to your vendor: bug fixes may exist for these problems | |
1129 | in your operating system. Sometimes such bug fixes are called an | |
1130 | operating system upgrade. | |
1131 | .SH "SEE ALSO" | |
1132 | .IX Header "SEE ALSO" | |
1133 | I18N::Langinfo, perluniintro, perlunicode, open, | |
1134 | \&\*(L"isalnum\*(R" in \s-1POSIX\s0, \*(L"isalpha\*(R" in \s-1POSIX\s0, | |
1135 | \&\*(L"isdigit\*(R" in \s-1POSIX\s0, \*(L"isgraph\*(R" in \s-1POSIX\s0, \*(L"islower\*(R" in \s-1POSIX\s0, | |
1136 | \&\*(L"isprint\*(R" in \s-1POSIX\s0, \*(L"ispunct\*(R" in \s-1POSIX\s0, \*(L"isspace\*(R" in \s-1POSIX\s0, | |
1137 | \&\*(L"isupper\*(R" in \s-1POSIX\s0, \*(L"isxdigit\*(R" in \s-1POSIX\s0, \*(L"localeconv\*(R" in \s-1POSIX\s0, | |
1138 | \&\*(L"setlocale\*(R" in \s-1POSIX\s0, \*(L"strcoll\*(R" in \s-1POSIX\s0, \*(L"strftime\*(R" in \s-1POSIX\s0, | |
1139 | \&\*(L"strtod\*(R" in \s-1POSIX\s0, \*(L"strxfrm\*(R" in \s-1POSIX\s0. | |
1140 | .SH "HISTORY" | |
1141 | .IX Header "HISTORY" | |
1142 | Jarkko Hietaniemi's original \fIperli18n.pod\fR heavily hacked by Dominic | |
1143 | Dunlop, assisted by the perl5\-porters. Prose worked over a bit by | |
1144 | Tom Christiansen. | |
1145 | .PP | |
1146 | Last update: Thu Jun 11 08:44:13 \s-1MDT\s0 1998 |