Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::Guess; |
2 | use strict; | |
3 | ||
4 | use Encode qw(:fallbacks find_encoding); | |
5 | our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
6 | ||
7 | my $Canon = 'Guess'; | |
8 | sub DEBUG () { 0 } | |
9 | our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); | |
10 | $Encode::Encoding{$Canon} = | |
11 | bless { | |
12 | Name => $Canon, | |
13 | Suspects => { %DEF_SUSPECTS }, | |
14 | } => __PACKAGE__; | |
15 | ||
16 | use base qw(Encode::Encoding); | |
17 | sub needs_lines { 1 } | |
18 | sub perlio_ok { 0 } | |
19 | ||
20 | our @EXPORT = qw(guess_encoding); | |
21 | our $NoUTFAutoGuess = 0; | |
22 | our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf); | |
23 | ||
24 | sub import { # Exporter not used so we do it on our own | |
25 | my $callpkg = caller; | |
26 | for my $item (@EXPORT){ | |
27 | no strict 'refs'; | |
28 | *{"$callpkg\::$item"} = \&{"$item"}; | |
29 | } | |
30 | set_suspects(@_); | |
31 | } | |
32 | ||
33 | sub set_suspects{ | |
34 | my $class = shift; | |
35 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
36 | $self->{Suspects} = { %DEF_SUSPECTS }; | |
37 | $self->add_suspects(@_); | |
38 | } | |
39 | ||
40 | sub add_suspects{ | |
41 | my $class = shift; | |
42 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
43 | for my $c (@_){ | |
44 | my $e = find_encoding($c) or die "Unknown encoding: $c"; | |
45 | $self->{Suspects}{$e->name} = $e; | |
46 | DEBUG and warn "Added: ", $e->name; | |
47 | } | |
48 | } | |
49 | ||
50 | sub decode($$;$){ | |
51 | my ($obj, $octet, $chk) = @_; | |
52 | my $guessed = guess($obj, $octet); | |
53 | unless (ref($guessed)){ | |
54 | require Carp; | |
55 | Carp::croak($guessed); | |
56 | } | |
57 | my $utf8 = $guessed->decode($octet, $chk); | |
58 | $_[1] = $octet if $chk; | |
59 | return $utf8; | |
60 | } | |
61 | ||
62 | sub guess_encoding{ | |
63 | guess($Encode::Encoding{$Canon}, @_); | |
64 | } | |
65 | ||
66 | sub guess { | |
67 | my $class = shift; | |
68 | my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
69 | my $octet = shift; | |
70 | ||
71 | # sanity check | |
72 | return unless defined $octet and length $octet; | |
73 | ||
74 | # cheat 0: utf8 flag; | |
75 | if ( Encode::is_utf8($octet) ) { | |
76 | return find_encoding('utf8') unless $NoUTFAutoGuess; | |
77 | Encode::_utf8_off($octet); | |
78 | } | |
79 | # cheat 1: BOM | |
80 | use Encode::Unicode; | |
81 | unless ($NoUTFAutoGuess) { | |
82 | my $BOM = pack('C3', unpack("C3", $octet)); | |
83 | return find_encoding('utf8') | |
84 | if (defined $BOM and $BOM eq $UTF8_BOM); | |
85 | $BOM = unpack('N', $octet); | |
86 | return find_encoding('UTF-32') | |
87 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); | |
88 | $BOM = unpack('n', $octet); | |
89 | return find_encoding('UTF-16') | |
90 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); | |
91 | if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) | |
92 | my $utf; | |
93 | my ($be, $le) = (0, 0); | |
94 | if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed | |
95 | $utf = "UTF-32"; | |
96 | for my $char (unpack('N*', $octet)){ | |
97 | $char & 0x0000ffff and $be++; | |
98 | $char & 0xffff0000 and $le++; | |
99 | } | |
100 | }else{ # UTF-16(BE|LE) assumed | |
101 | $utf = "UTF-16"; | |
102 | for my $char (unpack('n*', $octet)){ | |
103 | $char & 0x00ff and $be++; | |
104 | $char & 0xff00 and $le++; | |
105 | } | |
106 | } | |
107 | DEBUG and warn "$utf, be == $be, le == $le"; | |
108 | $be == $le | |
109 | and return | |
110 | "Encodings ambiguous between $utf BE and LE ($be, $le)"; | |
111 | $utf .= ($be > $le) ? 'BE' : 'LE'; | |
112 | return find_encoding($utf); | |
113 | } | |
114 | } | |
115 | my %try = %{$obj->{Suspects}}; | |
116 | for my $c (@_){ | |
117 | my $e = find_encoding($c) or die "Unknown encoding: $c"; | |
118 | $try{$e->name} = $e; | |
119 | DEBUG and warn "Added: ", $e->name; | |
120 | } | |
121 | my $nline = 1; | |
122 | for my $line (split /\r\n?|\n/, $octet){ | |
123 | # cheat 2 -- \e in the string | |
124 | if ($line =~ /\e/o){ | |
125 | my @keys = keys %try; | |
126 | delete @try{qw/utf8 ascii/}; | |
127 | for my $k (@keys){ | |
128 | ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; | |
129 | } | |
130 | } | |
131 | my %ok = %try; | |
132 | # warn join(",", keys %try); | |
133 | for my $k (keys %try){ | |
134 | my $scratch = $line; | |
135 | $try{$k}->decode($scratch, FB_QUIET); | |
136 | if ($scratch eq ''){ | |
137 | DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); | |
138 | }else{ | |
139 | use bytes (); | |
140 | DEBUG and | |
141 | warn sprintf("%4d:%-24s not ok; %d bytes left\n", | |
142 | $nline, $k, bytes::length($scratch)); | |
143 | delete $ok{$k}; | |
144 | } | |
145 | } | |
146 | %ok or return "No appropriate encodings found!"; | |
147 | if (scalar(keys(%ok)) == 1){ | |
148 | my ($retval) = values(%ok); | |
149 | return $retval; | |
150 | } | |
151 | %try = %ok; $nline++; | |
152 | } | |
153 | $try{ascii} or | |
154 | return "Encodings too ambiguous: ", join(" or ", keys %try); | |
155 | return $try{ascii}; | |
156 | } | |
157 | ||
158 | ||
159 | ||
160 | 1; | |
161 | __END__ | |
162 | ||
163 | =head1 NAME | |
164 | ||
165 | Encode::Guess -- Guesses encoding from data | |
166 | ||
167 | =head1 SYNOPSIS | |
168 | ||
169 | # if you are sure $data won't contain anything bogus | |
170 | ||
171 | use Encode; | |
172 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; | |
173 | my $utf8 = decode("Guess", $data); | |
174 | my $data = encode("Guess", $utf8); # this doesn't work! | |
175 | ||
176 | # more elaborate way | |
177 | use Encode::Guess; | |
178 | my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); | |
179 | ref($enc) or die "Can't guess: $enc"; # trap error this way | |
180 | $utf8 = $enc->decode($data); | |
181 | # or | |
182 | $utf8 = decode($enc->name, $data) | |
183 | ||
184 | =head1 ABSTRACT | |
185 | ||
186 | Encode::Guess enables you to guess in what encoding a given data is | |
187 | encoded, or at least tries to. | |
188 | ||
189 | =head1 DESCRIPTION | |
190 | ||
191 | By default, it checks only ascii, utf8 and UTF-16/32 with BOM. | |
192 | ||
193 | use Encode::Guess; # ascii/utf8/BOMed UTF | |
194 | ||
195 | To use it more practically, you have to give the names of encodings to | |
196 | check (I<suspects> as follows). The name of suspects can either be | |
197 | canonical names or aliases. | |
198 | ||
199 | CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED. | |
200 | ||
201 | # tries all major Japanese Encodings as well | |
202 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; | |
203 | ||
204 | If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true | |
205 | value, no heuristics will be applied to UTF8/16/32, and the result | |
206 | will be limited to the suspects and C<ascii>. | |
207 | ||
208 | =over 4 | |
209 | ||
210 | =item Encode::Guess->set_suspects | |
211 | ||
212 | You can also change the internal suspects list via C<set_suspects> | |
213 | method. | |
214 | ||
215 | use Encode::Guess; | |
216 | Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); | |
217 | ||
218 | =item Encode::Guess->add_suspects | |
219 | ||
220 | Or you can use C<add_suspects> method. The difference is that | |
221 | C<set_suspects> flushes the current suspects list while | |
222 | C<add_suspects> adds. | |
223 | ||
224 | use Encode::Guess; | |
225 | Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); | |
226 | # now the suspects are euc-jp,shiftjis,7bit-jis, AND | |
227 | # euc-kr,euc-cn, and big5-eten | |
228 | Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); | |
229 | ||
230 | =item Encode::decode("Guess" ...) | |
231 | ||
232 | When you are content with suspects list, you can now | |
233 | ||
234 | my $utf8 = Encode::decode("Guess", $data); | |
235 | ||
236 | =item Encode::Guess->guess($data) | |
237 | ||
238 | But it will croak if: | |
239 | ||
240 | =over | |
241 | ||
242 | =item * | |
243 | ||
244 | Two or more suspects remain | |
245 | ||
246 | =item * | |
247 | ||
248 | No suspects left | |
249 | ||
250 | =back | |
251 | ||
252 | So you should instead try this; | |
253 | ||
254 | my $decoder = Encode::Guess->guess($data); | |
255 | ||
256 | On success, $decoder is an object that is documented in | |
257 | L<Encode::Encoding>. So you can now do this; | |
258 | ||
259 | my $utf8 = $decoder->decode($data); | |
260 | ||
261 | On failure, $decoder now contains an error message so the whole thing | |
262 | would be as follows; | |
263 | ||
264 | my $decoder = Encode::Guess->guess($data); | |
265 | die $decoder unless ref($decoder); | |
266 | my $utf8 = $decoder->decode($data); | |
267 | ||
268 | =item guess_encoding($data, [, I<list of suspects>]) | |
269 | ||
270 | You can also try C<guess_encoding> function which is exported by | |
271 | default. It takes $data to check and it also takes the list of | |
272 | suspects by option. The optional suspect list is I<not reflected> to | |
273 | the internal suspects list. | |
274 | ||
275 | my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); | |
276 | die $decoder unless ref($decoder); | |
277 | my $utf8 = $decoder->decode($data); | |
278 | # check only ascii and utf8 | |
279 | my $decoder = guess_encoding($data); | |
280 | ||
281 | =back | |
282 | ||
283 | =head1 CAVEATS | |
284 | ||
285 | =over 4 | |
286 | ||
287 | =item * | |
288 | ||
289 | Because of the algorithm used, ISO-8859 series and other single-byte | |
290 | encodings do not work well unless either one of ISO-8859 is the only | |
291 | one suspect (besides ascii and utf8). | |
292 | ||
293 | use Encode::Guess; | |
294 | # perhaps ok | |
295 | my $decoder = guess_encoding($data, 'latin1'); | |
296 | # definitely NOT ok | |
297 | my $decoder = guess_encoding($data, qw/latin1 greek/); | |
298 | ||
299 | The reason is that Encode::Guess guesses encoding by trial and error. | |
300 | It first splits $data into lines and tries to decode the line for each | |
301 | suspect. It keeps it going until all but one encoding is eliminated | |
302 | out of suspects list. ISO-8859 series is just too successful for most | |
303 | cases (because it fills almost all code points in \x00-\xff). | |
304 | ||
305 | =item * | |
306 | ||
307 | Do not mix national standard encodings and the corresponding vendor | |
308 | encodings. | |
309 | ||
310 | # a very bad idea | |
311 | my $decoder | |
312 | = guess_encoding($data, qw/shiftjis MacJapanese cp932/); | |
313 | ||
314 | The reason is that vendor encoding is usually a superset of national | |
315 | standard so it becomes too ambiguous for most cases. | |
316 | ||
317 | =item * | |
318 | ||
319 | On the other hand, mixing various national standard encodings | |
320 | automagically works unless $data is too short to allow for guessing. | |
321 | ||
322 | # This is ok if $data is long enough | |
323 | my $decoder = | |
324 | guess_encoding($data, qw/euc-cn | |
325 | euc-jp shiftjis 7bit-jis | |
326 | euc-kr | |
327 | big5-eten/); | |
328 | ||
329 | =item * | |
330 | ||
331 | DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! | |
332 | ||
333 | my $decoder = guess_encoding($data, | |
334 | Encode->encodings(":all")); | |
335 | ||
336 | =back | |
337 | ||
338 | It is, after all, just a guess. You should alway be explicit when it | |
339 | comes to encodings. But there are some, especially Japanese, | |
340 | environment that guess-coding is a must. Use this module with care. | |
341 | ||
342 | =head1 TO DO | |
343 | ||
344 | Encode::Guess does not work on EBCDIC platforms. | |
345 | ||
346 | =head1 SEE ALSO | |
347 | ||
348 | L<Encode>, L<Encode::Encoding> | |
349 | ||
350 | =cut | |
351 |