Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package encoding; |
2 | our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
3 | ||
4 | use Encode; | |
5 | use strict; | |
6 | ||
7 | BEGIN { | |
8 | if (ord("A") == 193) { | |
9 | require Carp; | |
10 | Carp::croak("encoding pragma does not support EBCDIC platforms"); | |
11 | } | |
12 | } | |
13 | ||
14 | our $HAS_PERLIO = 0; | |
15 | eval { require PerlIO::encoding }; | |
16 | unless ($@){ | |
17 | $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); | |
18 | } | |
19 | ||
20 | sub import { | |
21 | my $class = shift; | |
22 | my $name = shift; | |
23 | my %arg = @_; | |
24 | $name ||= $ENV{PERL_ENCODING}; | |
25 | ||
26 | my $enc = find_encoding($name); | |
27 | unless (defined $enc) { | |
28 | require Carp; | |
29 | Carp::croak("Unknown encoding '$name'"); | |
30 | } | |
31 | unless ($arg{Filter}){ | |
32 | ${^ENCODING} = $enc; # this is all you need, actually. | |
33 | $HAS_PERLIO or return 1; | |
34 | for my $h (qw(STDIN STDOUT)){ | |
35 | if ($arg{$h}){ | |
36 | unless (defined find_encoding($arg{$h})) { | |
37 | require Carp; | |
38 | Carp::croak("Unknown encoding for $h, '$arg{$h}'"); | |
39 | } | |
40 | eval { binmode($h, ":encoding($arg{$h})") }; | |
41 | }else{ | |
42 | unless (exists $arg{$h}){ | |
43 | eval { | |
44 | no warnings 'uninitialized'; | |
45 | binmode($h, ":encoding($name)"); | |
46 | }; | |
47 | } | |
48 | } | |
49 | if ($@){ | |
50 | require Carp; | |
51 | Carp::croak($@); | |
52 | } | |
53 | } | |
54 | }else{ | |
55 | defined(${^ENCODING}) and undef ${^ENCODING}; | |
56 | eval { | |
57 | require Filter::Util::Call ; | |
58 | Filter::Util::Call->import ; | |
59 | binmode(STDIN); | |
60 | binmode(STDOUT); | |
61 | filter_add(sub{ | |
62 | my $status; | |
63 | if (($status = filter_read()) > 0){ | |
64 | $_ = $enc->decode($_, 1); | |
65 | # warn $_; | |
66 | } | |
67 | $status ; | |
68 | }); | |
69 | }; | |
70 | # warn "Filter installed"; | |
71 | } | |
72 | return 1; # I doubt if we need it, though | |
73 | } | |
74 | ||
75 | sub unimport{ | |
76 | no warnings; | |
77 | undef ${^ENCODING}; | |
78 | if ($HAS_PERLIO){ | |
79 | binmode(STDIN, ":raw"); | |
80 | binmode(STDOUT, ":raw"); | |
81 | }else{ | |
82 | binmode(STDIN); | |
83 | binmode(STDOUT); | |
84 | } | |
85 | if ($INC{"Filter/Util/Call.pm"}){ | |
86 | eval { filter_del() }; | |
87 | } | |
88 | } | |
89 | ||
90 | 1; | |
91 | __END__ | |
92 | ||
93 | =pod | |
94 | ||
95 | =head1 NAME | |
96 | ||
97 | encoding - allows you to write your script in non-ascii or non-utf8 | |
98 | ||
99 | =head1 SYNOPSIS | |
100 | ||
101 | use encoding "greek"; # Perl like Greek to you? | |
102 | use encoding "euc-jp"; # Jperl! | |
103 | ||
104 | # or you can even do this if your shell supports your native encoding | |
105 | ||
106 | perl -Mencoding=latin2 -e '...' # Feeling centrally European? | |
107 | perl -Mencoding=euc-kr -e '...' # Or Korean? | |
108 | ||
109 | # more control | |
110 | ||
111 | # A simple euc-cn => utf-8 converter | |
112 | use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; | |
113 | ||
114 | # "no encoding;" supported (but not scoped!) | |
115 | no encoding; | |
116 | ||
117 | # an alternate way, Filter | |
118 | use encoding "euc-jp", Filter=>1; | |
119 | use utf8; | |
120 | # now you can use kanji identifiers -- in euc-jp! | |
121 | ||
122 | =head1 ABSTRACT | |
123 | ||
124 | Let's start with a bit of history: Perl 5.6.0 introduced Unicode | |
125 | support. You could apply C<substr()> and regexes even to complex CJK | |
126 | characters -- so long as the script was written in UTF-8. But back | |
127 | then, text editors that supported UTF-8 were still rare and many users | |
128 | instead chose to write scripts in legacy encodings, giving up a whole | |
129 | new feature of Perl 5.6. | |
130 | ||
131 | Rewind to the future: starting from perl 5.8.0 with the B<encoding> | |
132 | pragma, you can write your script in any encoding you like (so long | |
133 | as the C<Encode> module supports it) and still enjoy Unicode support. | |
134 | You can write code in EUC-JP as follows: | |
135 | ||
136 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
137 | #<-char-><-char-> # 4 octets | |
138 | s/\bCamel\b/$Rakuda/; | |
139 | ||
140 | And with C<use encoding "euc-jp"> in effect, it is the same thing as | |
141 | the code in UTF-8: | |
142 | ||
143 | my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters | |
144 | s/\bCamel\b/$Rakuda/; | |
145 | ||
146 | The B<encoding> pragma also modifies the filehandle disciplines of | |
147 | STDIN, STDOUT, and STDERR to the specified encoding. Therefore, | |
148 | ||
149 | use encoding "euc-jp"; | |
150 | my $message = "Camel is the symbol of perl.\n"; | |
151 | my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji | |
152 | $message =~ s/\bCamel\b/$Rakuda/; | |
153 | print $message; | |
154 | ||
155 | Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", | |
156 | not "\x{99F1}\x{99DD} is the symbol of perl.\n". | |
157 | ||
158 | You can override this by giving extra arguments; see below. | |
159 | ||
160 | =head1 USAGE | |
161 | ||
162 | =over 4 | |
163 | ||
164 | =item use encoding [I<ENCNAME>] ; | |
165 | ||
166 | Sets the script encoding to I<ENCNAME>. Filehandle disciplines of | |
167 | STDIN and STDOUT are set to ":encoding(I<ENCNAME>)". Note that STDERR | |
168 | will not be changed. | |
169 | ||
170 | If no encoding is specified, the environment variable L<PERL_ENCODING> | |
171 | is consulted. If no encoding can be found, the error C<Unknown encoding | |
172 | 'I<ENCNAME>'> will be thrown. | |
173 | ||
174 | Note that non-STD file handles remain unaffected. Use C<use open> or | |
175 | C<binmode> to change disciplines of those. | |
176 | ||
177 | =item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; | |
178 | ||
179 | You can also individually set encodings of STDIN and STDOUT via the | |
180 | C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the | |
181 | first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding | |
182 | completely off. | |
183 | ||
184 | =item no encoding; | |
185 | ||
186 | Unsets the script encoding. The disciplines of STDIN, STDOUT are | |
187 | reset to ":raw" (the default unprocessed raw stream of bytes). | |
188 | ||
189 | =back | |
190 | ||
191 | =head1 CAVEATS | |
192 | ||
193 | =head2 NOT SCOPED | |
194 | ||
195 | The pragma is a per script, not a per block lexical. Only the last | |
196 | C<use encoding> or C<no encoding> matters, and it affects | |
197 | B<the whole script>. However, the <no encoding> pragma is supported and | |
198 | B<use encoding> can appear as many times as you want in a given script. | |
199 | The multiple use of this pragma is discouraged. | |
200 | ||
201 | Because of this nature, the use of this pragma inside the module is | |
202 | strongly discouraged (because the influence of this pragma lasts not | |
203 | only for the module but the script that uses). But if you have to, | |
204 | make sure you say C<no encoding> at the end of the module so you | |
205 | contain the influence of the pragma within the module. | |
206 | ||
207 | =head2 DO NOT MIX MULTIPLE ENCODINGS | |
208 | ||
209 | Notice that only literals (string or regular expression) having only | |
210 | legacy code points are affected: if you mix data like this | |
211 | ||
212 | \xDF\x{100} | |
213 | ||
214 | the data is assumed to be in (Latin 1 and) Unicode, not in your native | |
215 | encoding. In other words, this will match in "greek": | |
216 | ||
217 | "\xDF" =~ /\x{3af}/ | |
218 | ||
219 | but this will not | |
220 | ||
221 | "\xDF\x{100}" =~ /\x{3af}\x{100}/ | |
222 | ||
223 | since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on | |
224 | the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL | |
225 | LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You | |
226 | should not be mixing your legacy data and Unicode in the same string. | |
227 | ||
228 | This pragma also affects encoding of the 0x80..0xFF code point range: | |
229 | normally characters in that range are left as eight-bit bytes (unless | |
230 | they are combined with characters with code points 0x100 or larger, | |
231 | in which case all characters need to become UTF-8 encoded), but if | |
232 | the C<encoding> pragma is present, even the 0x80..0xFF range always | |
233 | gets UTF-8 encoded. | |
234 | ||
235 | After all, the best thing about this pragma is that you don't have to | |
236 | resort to \x{....} just to spell your name in a native encoding. | |
237 | So feel free to put your strings in your encoding in quotes and | |
238 | regexes. | |
239 | ||
240 | =head1 Non-ASCII Identifiers and Filter option | |
241 | ||
242 | The magic of C<use encoding> is not applied to the names of | |
243 | identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human | |
244 | is a single Han ideograph) work, you still need to write your script | |
245 | in UTF-8 or use a source filter. | |
246 | ||
247 | In other words, the same restriction as with Jperl applies. | |
248 | ||
249 | If you dare to experiment, however, you can try the Filter option. | |
250 | ||
251 | =over 4 | |
252 | ||
253 | =item use encoding I<ENCNAME> Filter=E<gt>1; | |
254 | ||
255 | This turns the encoding pragma into a source filter. While the default | |
256 | approach just decodes interpolated literals (in qq() and qr()), this | |
257 | will apply a source filter to the entire source code. In this case, | |
258 | STDIN and STDOUT remain untouched. | |
259 | ||
260 | =back | |
261 | ||
262 | What does this mean? Your source code behaves as if it is written in | |
263 | UTF-8. So even if your editor only supports Shift_JIS, for example, | |
264 | you can still try examples in Chapter 15 of C<Programming Perl, 3rd | |
265 | Ed.>. For instance, you can use UTF-8 identifiers. | |
266 | ||
267 | This option is significantly slower and (as of this writing) non-ASCII | |
268 | identifiers are not very stable WITHOUT this option and with the | |
269 | source code written in UTF-8. | |
270 | ||
271 | To make your script in legacy encoding work with minimum effort, | |
272 | do not use Filter=E<gt>1. | |
273 | ||
274 | =head1 EXAMPLE - Greekperl | |
275 | ||
276 | use encoding "iso 8859-7"; | |
277 | ||
278 | # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. | |
279 | ||
280 | $a = "\xDF"; | |
281 | $b = "\x{100}"; | |
282 | ||
283 | printf "%#x\n", ord($a); # will print 0x3af, not 0xdf | |
284 | ||
285 | $c = $a . $b; | |
286 | ||
287 | # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". | |
288 | ||
289 | # chr() is affected, and ... | |
290 | ||
291 | print "mega\n" if ord(chr(0xdf)) == 0x3af; | |
292 | ||
293 | # ... ord() is affected by the encoding pragma ... | |
294 | ||
295 | print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; | |
296 | ||
297 | # ... as are eq and cmp ... | |
298 | ||
299 | print "peta\n" if "\x{3af}" eq pack("C", 0xdf); | |
300 | print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; | |
301 | ||
302 | # ... but pack/unpack C are not affected, in case you still | |
303 | # want to go back to your native encoding | |
304 | ||
305 | print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; | |
306 | ||
307 | =head1 KNOWN PROBLEMS | |
308 | ||
309 | For native multibyte encodings (either fixed or variable length), | |
310 | the current implementation of the regular expressions may introduce | |
311 | recoding errors for regular expression literals longer than 127 bytes. | |
312 | ||
313 | The encoding pragma is not supported on EBCDIC platforms. | |
314 | (Porters who are willing and able to remove this limitation are | |
315 | welcome.) | |
316 | ||
317 | =head1 SEE ALSO | |
318 | ||
319 | L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, | |
320 | ||
321 | Ch. 15 of C<Programming Perl (3rd Edition)> | |
322 | by Larry Wall, Tom Christiansen, Jon Orwant; | |
323 | O'Reilly & Associates; ISBN 0-596-00027-8 | |
324 | ||
325 | =cut |