Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | # $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $ |
2 | package encoding; | |
3 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
4 | ||
5 | use Encode; | |
6 | use strict; | |
7 | ||
# Debug switch: change the 0 to 1 to enable the diagnostic warn() calls
# that are guarded by "DEBUG and ..." elsewhere in this file.
sub DEBUG () { 0 }

# The pragma does not support EBCDIC platforms (where "A" has code 193),
# so refuse to load there as early as possible.
BEGIN {
    ord("A") == 193 and do {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    };
}

# True when PerlIO::encoding can be loaded and is at least version 0.02;
# stays 0 when the module cannot be loaded at all.
our $HAS_PERLIO = 0;
if ( eval { require PerlIO::encoding; 1 } ) {
    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
}
22 | ||
# Decide whether the encoding $name must NOT be installed in ${^ENCODING}.
#
#   $name - canonical encoding name.
#
# Returns 1 only when all of the following hold, 0 otherwise:
#   * perl is not newer than 5.8.0 ($] <= 5.008),
#   * $name is one of the UTF/UCS encodings listed below,
#   * $Config{perl_patchlevel} is false (i.e. not a maintperl build).
sub _exception {
    my ($name) = @_;

    # 5.8.1 or higher then no
    return 0 if $] > 5.008;

    # UTFs or no
    my @utf_family = qw(
        utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
        UTF-32 UTF-32BE UTF-32LE
    );
    my $is_utf = grep { $_ eq $name } @utf_family;
    return 0 unless $is_utf;

    require Config;
    Config->import();
    our %Config;

    # maintperl then no
    if ( $Config{perl_patchlevel} ) {
        return 0;
    }
    return 1;
}
33 | ||
# Returns the bits of $^H that overlap $locale::hint_bits; the result is
# 0 (false) when $locale::hint_bits is unset or none of its bits are on.
sub in_locale {
    my $locale_bits = $locale::hint_bits || 0;
    return $^H & $locale_bits;
}
35 | ||
# Best-effort detection of the character encoding implied by the locale.
#
# Takes no arguments.  Returns the encoding name as a string, or a false
# value (undef or '') when nothing could be determined.  Croaks when the
# locale only says "euc" and the country/language part does not tell
# which EUC variant is meant.
#
# Detection order:
#   1. langinfo(CODESET), when I18N::Langinfo can be loaded;
#   2. if that gave nothing and in_locale() is true, the codeset part of
#      LC_ALL, then LANG (the text after the ".", minus any "@modifier");
#   3. if that gave nothing either, anything that looks like UTF-8 in
#      LC_ALL or LANG yields 'utf8'.
sub _get_locale_encoding {
    my $locale_encoding;

    # I18N::Langinfo isn't available everywhere, so a failure here is
    # deliberately ignored and the %ENV fallbacks below take over.
    eval {
        require I18N::Langinfo;
        I18N::Langinfo->import(qw(langinfo CODESET));
        $locale_encoding = langinfo( CODESET() );
    };

    my $country_language;

    no warnings 'uninitialized';    # LC_ALL and LANG may be unset

    # NB: "!" is used rather than "not" on purpose.  "not" binds more
    # loosely than "&&", so "not $x && f()" means "not ($x && f())",
    # which made this branch run outside "use locale" and left the
    # UTF-8 heuristic below unreachable.
    if ( !$locale_encoding && in_locale() ) {
        for my $var (qw(LC_ALL LANG)) {

            # language_COUNTRY.codeset[@modifier]; the modifier (such as
            # "@euro") is not part of the encoding name.
            if ( $ENV{$var} =~ /^([^.]+)\.([^.@]+)(?:@.*)?$/ ) {
                ( $country_language, $locale_encoding ) = ( $1, $2 );
                last;
            }
        }

        # LANGUAGE affects only LC_MESSAGES only on glibc
    }
    elsif ( !$locale_encoding ) {
        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
            || $ENV{LANG} =~ /\butf-?8\b/i )
        {
            $locale_encoding = 'utf8';
        }

        # Could do more heuristics based on the country and language
        # parts of LC_ALL and LANG (the parts before the dot (if any)),
        # since we have Locale::Country and Locale::Language available.
        # TODO: get a database of Language -> Encoding mappings
        # (the Estonian database at http://www.eki.ee/letter/
        # would be excellent!) --jhi
    }

    # A bare "euc" is ambiguous: pick the variant from the country or
    # language part, or give up loudly.
    if (   defined $locale_encoding
        && lc($locale_encoding) eq 'euc'
        && defined $country_language )
    {
        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
            $locale_encoding = 'euc-jp';
        }
        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
            $locale_encoding = 'euc-kr';
        }
        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)?$/i ) {
            $locale_encoding = 'euc-cn';
        }
        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
            $locale_encoding = 'euc-tw';
        }
        else {
            require Carp;
            Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous");
        }
    }

    return $locale_encoding;
}
88 | ||
# Implements "use encoding ...".
#
#   $class - the package name (unused).
#   $name  - an encoding name, or ':locale' to detect it from the locale,
#            or ':_get_locale_encoding' to merely export that helper into
#            the caller (used by lib/open.pm).  When undefined,
#            $ENV{PERL_ENCODING} is used.
#   %arg   - options: Filter => true installs a source filter instead of
#            setting ${^ENCODING}; STDIN / STDOUT => encoding name selects
#            a different encoding for that handle, and a false value
#            leaves the handle alone.
#
# Returns nothing for the ':_get_locale_encoding' request, 1 otherwise.
# Croaks when an encoding name is unknown or when setting a layer dies.
sub import {
    my $class = shift;
    my $name  = shift;

    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        {
            no strict 'refs';
            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        }
        return;
    }

    $name = _get_locale_encoding() if $name eq ':locale';
    my %arg = @_;
    $name = $ENV{PERL_ENCODING} unless defined $name;

    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize

    unless ( $arg{Filter} ) {
        DEBUG and warn "_exception($name) = ", _exception($name);
        _exception($name) or ${^ENCODING} = $enc;

        # Without a usable PerlIO::encoding no layers can be set.
        $HAS_PERLIO or return 1;
    }
    else {
        defined( ${^ENCODING} ) and undef ${^ENCODING};

        # implicitly 'use utf8'
        require utf8;    # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        # Best effort: a failure to load or install the filter is ignored.
        eval {
            require Filter::Util::Call;
            Filter::Util::Call->import;
            filter_add(
                sub {
                    my $status = filter_read();
                    if ( $status > 0 ) {
                        $_ = $enc->decode( $_, 1 );
                        DEBUG and warn $_;
                    }
                    $status;
                }
            );
        };
        $@ eq '' and DEBUG and warn "Filter installed";
    }

    # The STDIN/STDOUT layers are left alone when ${^UNICODE} is non-zero.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;

    for my $h (qw(STDIN STDOUT)) {
        my $layer_encoding;
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            $layer_encoding = $arg{$h};
        }
        elsif ( !exists $arg{$h} ) {
            $layer_encoding = $name;    # default: the script encoding
        }
        else {
            next;    # "$h => undef" (or any false value): no transcoding
        }

        # Judge success by the eval's own return value.  Inspecting $@
        # unconditionally would pick up a stale error from an earlier,
        # unrelated eval whenever no eval ran for this handle.
        my $ok = eval {
            no warnings 'uninitialized';
            binmode( $h, ":raw :encoding($layer_encoding)" );
            1;
        };
        unless ($ok) {
            require Carp;
            Carp::croak($@);
        }
    }
    return 1;    # I doubt if we need it, though
}
155 | ||
# Implements "no encoding": clears ${^ENCODING}, resets STDIN and STDOUT
# (to ":raw" when $HAS_PERLIO is true, plain binmode otherwise) and, if
# Filter::Util::Call has been loaded, tries to remove the source filter.
sub unimport {
    no warnings;
    undef ${^ENCODING};

    for my $fh ( \*STDIN, \*STDOUT ) {
        if ($HAS_PERLIO) {
            binmode( $fh, ":raw" );
        }
        else {
            binmode($fh);
        }
    }

    # filter_del() is only attempted when the filter module is loaded;
    # any failure is ignored.
    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}
170 | ||
171 | 1; | |
172 | __END__ | |
173 | ||
174 | =pod | |
175 | ||
176 | =head1 NAME | |
177 | ||
178 | encoding - allows you to write your script in non-ascii or non-utf8 | |
179 | ||
180 | =head1 SYNOPSIS | |
181 | ||
182 | use encoding "greek"; # Perl like Greek to you? | |
183 | use encoding "euc-jp"; # Jperl! | |
184 | ||
185 | # or you can even do this if your shell supports your native encoding | |
186 | ||
187 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? | |
188 | perl -Mencoding=euc-kr -e '...' # Or Korean? | |
189 | ||
190 | # more control | |
191 | ||
192 | # A simple euc-cn => utf-8 converter | |
193 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; | |
194 | ||
195 | # "no encoding;" supported (but not scoped!) | |
196 | no encoding; | |
197 | ||
198 | # an alternate way, Filter | |
199 | use encoding "euc-jp", Filter=>1; | |
200 | # now you can use kanji identifiers -- in euc-jp! | |
201 | ||
202 | # switch on locale - | |
203 | # note that this probably means that unless you have a complete control | |
204 | # over the environments the application is ever going to be run, you should | |
205 | # NOT use the feature of encoding pragma allowing you to write your script | |
206 | # in any recognized encoding because changing locale settings will wreck | |
207 | # the script; you can of course still use the other features of the pragma. | |
208 | use encoding ':locale'; | |
209 | ||
210 | =head1 ABSTRACT | |
211 | ||
212 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode | |
213 | support. You could apply C<substr()> and regexes even to complex CJK | |
214 | characters -- so long as the script was written in UTF-8. But back | |
215 | then, text editors that supported UTF-8 were still rare and many users | |
216 | instead chose to write scripts in legacy encodings, giving up a whole | |
217 | new feature of Perl 5.6. | |
218 | ||
219 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> | |
220 | pragma, you can write your script in any encoding you like (so long | |
221 | as the C<Encode> module supports it) and still enjoy Unicode support. | |
222 | This pragma achieves that by doing the following: | |
223 | ||
224 | =over | |
225 | ||
226 | =item * | |
227 | ||
228 | Internally converts all literals (C<q//, qq//, qr//, qw//, qx//>) from | |
229 | the encoding specified to utf8. In Perl 5.8.1 and later, literals in | |
230 | C<tr///> and the C<DATA> pseudo-filehandle are also converted. | |
231 | ||
232 | =item * | |
233 | ||
234 | Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding | |
235 | specified. | |
236 | ||
237 | =back | |
238 | ||
239 | =head2 Literal Conversions | |
240 | ||
241 | You can write code in EUC-JP as follows: | |
242 | ||
243 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
244 | #<-char-><-char-> # 4 octets | |
245 | s/\bCamel\b/$Rakuda/; | |
246 | ||
247 | And with C<use encoding "euc-jp"> in effect, it is the same thing as | |
248 | the code in UTF-8: | |
249 | ||
250 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters | |
251 | s/\bCamel\b/$Rakuda/; | |
252 | ||
253 | =head2 PerlIO layers for C<STD(IN|OUT)> | |
254 | ||
255 | The B<encoding> pragma also modifies the filehandle layers of | |
256 | STDIN and STDOUT to the specified encoding. Therefore, | |
257 | ||
258 | use encoding "euc-jp"; | |
259 | my $message = "Camel is the symbol of perl.\n"; | |
260 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
261 | $message =~ s/\bCamel\b/$Rakuda/; | |
262 | print $message; | |
263 | ||
264 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", | |
265 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". | |
266 | ||
267 | You can override this by giving extra arguments; see below. | |
268 | ||
269 | =head2 Implicit upgrading for byte strings | |
270 | ||
271 | By default, if strings operating under byte semantics and strings | |
272 | with Unicode character data are concatenated, the new string will | |
273 | be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. | |
274 | ||
275 | The B<encoding> pragma changes this to use the specified encoding | |
276 | instead. For example: | |
277 | ||
278 | use encoding 'utf8'; | |
279 | my $string = chr(20000); # a Unicode string | |
280 | utf8::encode($string); # now it's a UTF-8 encoded byte string | |
281 | # concatenate with another Unicode string | |
282 | print length($string . chr(20000)); | |
283 | ||
284 | Will print C<2>, because C<$string> is upgraded as UTF-8. Without | |
285 | C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> | |
286 | is three octets when interpreted as Latin-1. | |
287 | ||
288 | =head1 FEATURES THAT REQUIRE 5.8.1 | |
289 | ||
290 | Some of the features offered by this pragma require perl 5.8.1. Most | |
291 | of these were done by Inaba Hiroto. Any other features and changes | |
292 | are good for 5.8.0. | |
293 | ||
294 | =over | |
295 | ||
296 | =item "NON-EUC" doublebyte encodings | |
297 | ||
298 | Because perl needs to parse the script before applying this pragma, such | |
299 | encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; | |
300 | \x5c) in the second byte fail because the second byte may | |
301 | accidentally escape the quoting character that follows. Perl 5.8.1 | |
302 | or later fixes this problem. | |
303 | ||
304 | =item tr// | |
305 | ||
306 | C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0. | |
307 | See the section below for details. | |
308 | ||
309 | =item DATA pseudo-filehandle | |
310 | ||
311 | Another feature that was overlooked was C<DATA>. | |
312 | ||
313 | =back | |
314 | ||
315 | =head1 USAGE | |
316 | ||
317 | =over 4 | |
318 | ||
319 | =item use encoding [I<ENCNAME>] ; | |
320 | ||
321 | Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} | |
322 | exists and is non-zero, the PerlIO layers of STDIN and STDOUT are set to | |
323 | ":encoding(I<ENCNAME>)". | |
324 | ||
325 | Note that STDERR WILL NOT be changed. | |
326 | ||
327 | Also note that non-STD file handles remain unaffected. Use C<use | |
328 | open> or C<binmode> to change layers of those. | |
329 | ||
330 | If no encoding is specified, the environment variable C<PERL_ENCODING> | |
331 | is consulted. If no encoding can be found, the error C<Unknown encoding | |
332 | 'I<ENCNAME>'> will be thrown. | |
333 | ||
334 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; | |
335 | ||
336 | You can also individually set encodings of STDIN and STDOUT via the | |
337 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the | |
338 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding | |
339 | completely off. | |
340 | ||
341 | When ${^UNICODE} exists and is non-zero, these options will be completely | |
342 | ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See | |
343 | L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for | |
344 | details (perl 5.8.1 and later). | |
345 | ||
346 | =item use encoding I<ENCNAME> Filter=E<gt>1; | |
347 | ||
348 | This turns the encoding pragma into a source filter. While the | |
349 | default approach just decodes interpolated literals (in qq() and | |
350 | qr()), this will apply a source filter to the entire source code. See | |
351 | L</"The Filter Option"> below for details. | |
352 | ||
353 | =item no encoding; | |
354 | ||
355 | Unsets the script encoding. The layers of STDIN, STDOUT are | |
356 | reset to ":raw" (the default unprocessed raw stream of bytes). | |
357 | ||
358 | =back | |
359 | ||
360 | =head1 The Filter Option | |
361 | ||
362 | The magic of C<use encoding> is not applied to the names of | |
363 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human | |
364 | is a single Han ideograph) work, you still need to write your script | |
365 | in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. | |
366 | ||
367 | What does this mean? Your source code behaves as if it is written in | |
368 | UTF-8 with 'use utf8' in effect. So even if your editor only supports | |
369 | Shift_JIS, for example, you can still try examples in Chapter 15 of | |
370 | C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 | |
371 | identifiers. | |
372 | ||
373 | This option is significantly slower and (as of this writing) non-ASCII | |
374 | identifiers are not very stable WITHOUT this option and with the | |
375 | source code written in UTF-8. | |
376 | ||
377 | =head2 Filter-related changes at Encode version 1.87 | |
378 | ||
379 | =over | |
380 | ||
381 | =item * | |
382 | ||
383 | The Filter option now sets STDIN and STDOUT like non-filter options. | |
384 | And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like | |
385 | non-filter version. | |
386 | ||
387 | =item * | |
388 | ||
389 | C<use utf8> is implicitly declared so you no longer have to C<use | |
390 | utf8> to C<${"\x{4eba}"}++>. | |
391 | ||
392 | =back | |
393 | ||
394 | =head1 CAVEATS | |
395 | ||
396 | =head2 NOT SCOPED | |
397 | ||
398 | The pragma is a per script, not a per block lexical. Only the last | |
399 | C<use encoding> or C<no encoding> matters, and it affects | |
400 | B<the whole script>. However, the C<no encoding> pragma is supported and | |
401 | B<use encoding> can appear as many times as you want in a given script. | |
402 | The multiple use of this pragma is discouraged. | |
403 | ||
404 | For the same reason, the use of this pragma inside modules is also | |
405 | discouraged (though not as strongly discouraged as the case above. | |
406 | See below). | |
407 | ||
408 | If you still have to write a module with this pragma, be very careful | |
409 | of the load order. See the code below: | |
410 | ||
411 | # called module | |
412 | package Module_IN_BAR; | |
413 | use encoding "bar"; | |
414 | # stuff in "bar" encoding here | |
415 | 1; | |
416 | ||
417 | # caller script | |
418 | use encoding "foo"; | |
419 | use Module_IN_BAR; | |
420 | # surprise! use encoding "bar" is in effect. | |
421 | ||
422 | The best way to avoid this oddity is to use this pragma RIGHT AFTER | |
423 | other modules are loaded. i.e. | |
424 | ||
425 | use Module_IN_BAR; | |
426 | use encoding "foo"; | |
427 | ||
428 | =head2 DO NOT MIX MULTIPLE ENCODINGS | |
429 | ||
430 | Notice that only literals (string or regular expression) having only | |
431 | legacy code points are affected: if you mix data like this | |
432 | ||
433 | \xDF\x{100} | |
434 | ||
435 | the data is assumed to be in (Latin 1 and) Unicode, not in your native | |
436 | encoding. In other words, this will match in "greek": | |
437 | ||
438 | "\xDF" =~ /\x{3af}/ | |
439 | ||
440 | but this will not | |
441 | ||
442 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ | |
443 | ||
444 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on | |
445 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL | |
446 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You | |
447 | should not be mixing your legacy data and Unicode in the same string. | |
448 | ||
449 | This pragma also affects encoding of the 0x80..0xFF code point range: | |
450 | normally characters in that range are left as eight-bit bytes (unless | |
451 | they are combined with characters with code points 0x100 or larger, | |
452 | in which case all characters need to become UTF-8 encoded), but if | |
453 | the C<encoding> pragma is present, even the 0x80..0xFF range always | |
454 | gets UTF-8 encoded. | |
455 | ||
456 | After all, the best thing about this pragma is that you don't have to | |
457 | resort to \x{....} just to spell your name in a native encoding. | |
458 | So feel free to put your strings in your encoding in quotes and | |
459 | regexes. | |
460 | ||
461 | =head2 tr/// with ranges | |
462 | ||
463 | The B<encoding> pragma works by decoding string literals in | |
464 | C<q//, qq//, qr//, qw//, qx//> and so forth. In perl 5.8.0, this | |
465 | does not apply to C<tr///>. Therefore, | |
466 | ||
467 | use encoding 'euc-jp'; | |
468 | #.... | |
469 | $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; | |
470 | # -------- -------- -------- -------- | |
471 | ||
472 | Does not work as | |
473 | ||
474 | $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; | |
475 | ||
476 | =over | |
477 | ||
478 | =item Legend of characters above | |
479 | ||
480 | utf8 euc-jp charnames::viacode() | |
481 | ----------------------------------------- | |
482 | \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A | |
483 | \x{3093} \xA4\xF3 HIRAGANA LETTER N | |
484 | \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A | |
485 | \x{30f3} \xA5\xF3 KATAKANA LETTER N | |
486 | ||
487 | =back | |
488 | ||
489 | This counterintuitive behavior has been fixed in perl 5.8.1. | |
490 | ||
491 | =head3 workaround to tr///; | |
492 | ||
493 | In perl 5.8.0, you can work around as follows; | |
494 | ||
495 | use encoding 'euc-jp'; | |
496 | # .... | |
497 | eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; | |
498 | ||
499 | Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind | |
500 | this is the same as the classic idiom that makes C<tr///> 'interpolate'. | |
501 | ||
502 | tr/$from/$to/; # wrong! | |
503 | eval qq{ tr/$from/$to/ }; # workaround. | |
504 | ||
505 | Nevertheless, in case of B<encoding> pragma even C<q//> is affected so | |
506 | C<tr///> not being decoded was obviously against the will of Perl5 | |
507 | Porters so it has been fixed in Perl 5.8.1 or later. | |
508 | ||
509 | =head1 EXAMPLE - Greekperl | |
510 | ||
511 | use encoding "iso 8859-7"; | |
512 | ||
513 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. | |
514 | ||
515 | $a = "\xDF"; | |
516 | $b = "\x{100}"; | |
517 | ||
518 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf | |
519 | ||
520 | $c = $a . $b; | |
521 | ||
522 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". | |
523 | ||
524 | # chr() is affected, and ... | |
525 | ||
526 | print "mega\n" if ord(chr(0xdf)) == 0x3af; | |
527 | ||
528 | # ... ord() is affected by the encoding pragma ... | |
529 | ||
530 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; | |
531 | ||
532 | # ... as are eq and cmp ... | |
533 | ||
534 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); | |
535 | print "exa\n" if ("\x{3af}" cmp pack("C", 0xdf)) == 0; | |
536 | ||
537 | # ... but pack/unpack C are not affected, in case you still | |
538 | # want to go back to your native encoding | |
539 | ||
540 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; | |
541 | ||
542 | =head1 KNOWN PROBLEMS | |
543 | ||
544 | =over | |
545 | ||
546 | =item literals in regex that are longer than 127 bytes | |
547 | ||
548 | For native multibyte encodings (either fixed or variable length), | |
549 | the current implementation of the regular expressions may introduce | |
550 | recoding errors for regular expression literals longer than 127 bytes. | |
551 | ||
552 | =item EBCDIC | |
553 | ||
554 | The encoding pragma is not supported on EBCDIC platforms. | |
555 | (Porters who are willing and able to remove this limitation are | |
556 | welcome.) | |
557 | ||
558 | =item format | |
559 | ||
560 | This pragma doesn't work well with format because PerlIO does not | |
561 | get along very well with it. When format contains non-ascii | |
562 | characters it prints funny or gets "wide character warnings". | |
563 | To understand it, try the code below. | |
564 | ||
565 | # Save this one in utf8 | |
566 | # replace *non-ascii* with a non-ascii string | |
567 | my $camel; | |
568 | format STDOUT = | |
569 | *non-ascii*@>>>>>>> | |
570 | $camel | |
571 | . | |
572 | $camel = "*non-ascii*"; | |
573 | binmode(STDOUT=>':encoding(utf8)'); # bang! | |
574 | write; # funny | |
575 | print $camel, "\n"; # fine | |
576 | ||
577 | Without binmode this happens to work but without binmode, print() | |
578 | fails instead of write(). | |
579 | ||
580 | At any rate, the very use of format is questionable when it comes to | |
581 | unicode characters since you have to consider such things as character | |
582 | width (i.e. double-width for ideographs) and directions (i.e. BIDI for | |
583 | Arabic and Hebrew). | |
584 | ||
585 | =back | |
586 | ||
587 | =head2 The Logic of :locale | |
588 | ||
589 | The logic of C<:locale> is as follows: | |
590 | ||
591 | =over 4 | |
592 | ||
593 | =item 1. | |
594 | ||
595 | If the platform supports the langinfo(CODESET) interface, the codeset | |
596 | returned is used as the default encoding for the open pragma. | |
597 | ||
598 | =item 2. | |
599 | ||
600 | If 1. didn't work but we are under the locale pragma, the environment | |
601 | variables LC_ALL and LANG (in that order) are matched for encodings | |
602 | (the part after C<.>, if any), and if any found, that is used | |
603 | as the default encoding for the open pragma. | |
604 | ||
605 | =item 3. | |
606 | ||
607 | If 1. and 2. didn't work, the environment variables LC_ALL and LANG | |
608 | (in that order) are matched for anything looking like UTF-8, and if | |
609 | any found, C<:utf8> is used as the default encoding for the open | |
610 | pragma. | |
611 | ||
612 | =back | |
613 | ||
614 | If your locale environment variables (LC_ALL, LC_CTYPE, LANG) | |
615 | contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), | |
616 | the default encoding of your STDIN, STDOUT, and STDERR, and of | |
617 | B<any subsequent file open>, is UTF-8. | |
618 | ||
619 | =head1 HISTORY | |
620 | ||
621 | This pragma first appeared in Perl 5.8.0. For features that require | |
622 | 5.8.1 and better, see above. | |
623 | ||
624 | The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. | |
625 | ||
626 | =head1 SEE ALSO | |
627 | ||
628 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, | |
629 | ||
630 | Ch. 15 of C<Programming Perl (3rd Edition)> | |
631 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
632 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
633 | ||
634 | =cut |