Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package Encode::Guess; |
2 | use strict; | |
3 | ||
4 | use Encode qw(:fallbacks find_encoding); | |
5 | our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
6 | ||
7 | my $Canon = 'Guess'; | |
8 | our $DEBUG = 0; | |
9 | our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); | |
10 | $Encode::Encoding{$Canon} = | |
11 | bless { | |
12 | Name => $Canon, | |
13 | Suspects => { %DEF_SUSPECTS }, | |
14 | } => __PACKAGE__; | |
15 | ||
16 | use base qw(Encode::Encoding); | |
17 | sub needs_lines { 1 } | |
18 | sub perlio_ok { 0 } | |
19 | ||
20 | our @EXPORT = qw(guess_encoding); | |
21 | ||
22 | sub import { # Exporter not used so we do it on our own | |
23 | my $callpkg = caller; | |
24 | for my $item (@EXPORT){ | |
25 | no strict 'refs'; | |
26 | *{"$callpkg\::$item"} = \&{"$item"}; | |
27 | } | |
28 | set_suspects(@_); | |
29 | } | |
30 | ||
31 | sub set_suspects{ | |
32 | my $class = shift; | |
33 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
34 | $self->{Suspects} = { %DEF_SUSPECTS }; | |
35 | $self->add_suspects(@_); | |
36 | } | |
37 | ||
38 | sub add_suspects{ | |
39 | my $class = shift; | |
40 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
41 | for my $c (@_){ | |
42 | my $e = find_encoding($c) or die "Unknown encoding: $c"; | |
43 | $self->{Suspects}{$e->name} = $e; | |
44 | $DEBUG and warn "Added: ", $e->name; | |
45 | } | |
46 | } | |
47 | ||
48 | sub decode($$;$){ | |
49 | my ($obj, $octet, $chk) = @_; | |
50 | my $guessed = guess($obj, $octet); | |
51 | unless (ref($guessed)){ | |
52 | require Carp; | |
53 | Carp::croak($guessed); | |
54 | } | |
55 | my $utf8 = $guessed->decode($octet, $chk); | |
56 | $_[1] = $octet if $chk; | |
57 | return $utf8; | |
58 | } | |
59 | ||
60 | sub guess_encoding{ | |
61 | guess($Encode::Encoding{$Canon}, @_); | |
62 | } | |
63 | ||
64 | sub guess { | |
65 | my $class = shift; | |
66 | my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; | |
67 | my $octet = shift; | |
68 | ||
69 | # sanity check | |
70 | return unless defined $octet and length $octet; | |
71 | ||
72 | # cheat 0: utf8 flag; | |
73 | Encode::is_utf8($octet) and return find_encoding('utf8'); | |
74 | # cheat 1: BOM | |
75 | use Encode::Unicode; | |
76 | my $BOM = unpack('n', $octet); | |
77 | return find_encoding('UTF-16') | |
78 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); | |
79 | $BOM = unpack('N', $octet); | |
80 | return find_encoding('UTF-32') | |
81 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); | |
82 | ||
83 | my %try = %{$obj->{Suspects}}; | |
84 | for my $c (@_){ | |
85 | my $e = find_encoding($c) or die "Unknown encoding: $c"; | |
86 | $try{$e->name} = $e; | |
87 | $DEBUG and warn "Added: ", $e->name; | |
88 | } | |
89 | my $nline = 1; | |
90 | for my $line (split /\r\n?|\n/, $octet){ | |
91 | # cheat 2 -- \e in the string | |
92 | if ($line =~ /\e/o){ | |
93 | my @keys = keys %try; | |
94 | delete @try{qw/utf8 ascii/}; | |
95 | for my $k (@keys){ | |
96 | ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; | |
97 | } | |
98 | } | |
99 | my %ok = %try; | |
100 | # warn join(",", keys %try); | |
101 | for my $k (keys %try){ | |
102 | my $scratch = $line; | |
103 | $try{$k}->decode($scratch, FB_QUIET); | |
104 | if ($scratch eq ''){ | |
105 | $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); | |
106 | }else{ | |
107 | use bytes (); | |
108 | $DEBUG and | |
109 | warn sprintf("%4d:%-24s not ok; %d bytes left\n", | |
110 | $nline, $k, bytes::length($scratch)); | |
111 | delete $ok{$k}; | |
112 | ||
113 | } | |
114 | } | |
115 | %ok or return "No appropriate encodings found!"; | |
116 | if (scalar(keys(%ok)) == 1){ | |
117 | my ($retval) = values(%ok); | |
118 | return $retval; | |
119 | } | |
120 | %try = %ok; $nline++; | |
121 | } | |
122 | $try{ascii} or | |
123 | return "Encodings too ambiguous: ", join(" or ", keys %try); | |
124 | return $try{ascii}; | |
125 | } | |
126 | ||
127 | ||
128 | ||
129 | 1; | |
130 | __END__ | |
131 | ||
132 | =head1 NAME | |
133 | ||
134 | Encode::Guess -- Guesses encoding from data | |
135 | ||
136 | =head1 SYNOPSIS | |
137 | ||
138 | # if you are sure $data won't contain anything bogus | |
139 | ||
140 | use Encode; | |
141 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; | |
142 | my $utf8 = decode("Guess", $data); | |
143 | my $data = encode("Guess", $utf8); # this doesn't work! | |
144 | ||
145 | # more elaborate way | |
146 | use Encode::Guess, | |
147 | my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); | |
148 | ref($enc) or die "Can't guess: $enc"; # trap error this way | |
149 | $utf8 = $enc->decode($data); | |
150 | # or | |
151 | $utf8 = decode($enc->name, $data) | |
152 | ||
153 | =head1 ABSTRACT | |
154 | ||
155 | Encode::Guess enables you to guess in what encoding a given data is | |
156 | encoded, or at least tries to. | |
157 | ||
158 | =head1 DESCRIPTION | |
159 | ||
160 | By default, it checks only ascii, utf8 and UTF-16/32 with BOM. | |
161 | ||
162 | use Encode::Guess; # ascii/utf8/BOMed UTF | |
163 | ||
164 | To use it more practically, you have to give the names of encodings to | |
165 | check (I<suspects> as follows). The name of suspects can either be | |
166 | canonical names or aliases. | |
167 | ||
168 | # tries all major Japanese Encodings as well | |
169 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; | |
170 | ||
171 | =over 4 | |
172 | ||
173 | =item Encode::Guess->set_suspects | |
174 | ||
175 | You can also change the internal suspects list via C<set_suspects> | |
176 | method. | |
177 | ||
178 | use Encode::Guess; | |
179 | Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); | |
180 | ||
181 | =item Encode::Guess->add_suspects | |
182 | ||
183 | Or you can use C<add_suspects> method. The difference is that | |
184 | C<set_suspects> flushes the current suspects list while | |
185 | C<add_suspects> adds. | |
186 | ||
187 | use Encode::Guess; | |
188 | Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); | |
189 | # now the suspects are euc-jp,shiftjis,7bit-jis, AND | |
190 | # euc-kr,euc-cn, and big5-eten | |
191 | Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); | |
192 | ||
193 | =item Encode::decode("Guess" ...) | |
194 | ||
195 | When you are content with suspects list, you can now | |
196 | ||
197 | my $utf8 = Encode::decode("Guess", $data); | |
198 | ||
199 | =item Encode::Guess->guess($data) | |
200 | ||
201 | But it will croak if Encode::Guess fails to eliminate all other | |
202 | suspects but the right one or no suspect was good. So you should | |
203 | instead try this; | |
204 | ||
205 | my $decoder = Encode::Guess->guess($data); | |
206 | ||
207 | On success, $decoder is an object that is documented in | |
208 | L<Encode::Encoding>. So you can now do this; | |
209 | ||
210 | my $utf8 = $decoder->decode($data); | |
211 | ||
212 | On failure, $decoder now contains an error message so the whole thing | |
213 | would be as follows; | |
214 | ||
215 | my $decoder = Encode::Guess->guess($data); | |
216 | die $decoder unless ref($decoder); | |
217 | my $utf8 = $decoder->decode($data); | |
218 | ||
219 | =item guess_encoding($data, [, I<list of suspects>]) | |
220 | ||
221 | You can also try C<guess_encoding> function which is exported by | |
222 | default. It takes $data to check and it also takes the list of | |
223 | suspects by option. The optional suspect list is I<not reflected> to | |
224 | the internal suspects list. | |
225 | ||
226 | my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); | |
227 | die $decoder unless ref($decoder); | |
228 | my $utf8 = $decoder->decode($data); | |
229 | # check only ascii and utf8 | |
230 | my $decoder = guess_encoding($data); | |
231 | ||
232 | =back | |
233 | ||
234 | =head1 CAVEATS | |
235 | ||
236 | =over 4 | |
237 | ||
238 | =item * | |
239 | ||
240 | Because of the algorithm used, ISO-8859 series and other single-byte | |
241 | encodings do not work well unless either one of ISO-8859 is the only | |
242 | one suspect (besides ascii and utf8). | |
243 | ||
244 | use Encode::Guess; | |
245 | # perhaps ok | |
246 | my $decoder = guess_encoding($data, 'latin1'); | |
247 | # definitely NOT ok | |
248 | my $decoder = guess_encoding($data, qw/latin1 greek/); | |
249 | ||
250 | The reason is that Encode::Guess guesses encoding by trial and error. | |
251 | It first splits $data into lines and tries to decode the line for each | |
252 | suspect. It keeps it going until all but one encoding was eliminated | |
253 | out of suspects list. ISO-8859 series is just too successful for most | |
254 | cases (because it fills almost all code points in \x00-\xff). | |
255 | ||
256 | =item * | |
257 | ||
258 | Do not mix national standard encodings and the corresponding vendor | |
259 | encodings. | |
260 | ||
261 | # a very bad idea | |
262 | my $decoder | |
263 | = guess_encoding($data, qw/shiftjis MacJapanese cp932/); | |
264 | ||
265 | The reason is that vendor encoding is usually a superset of national | |
266 | standard so it becomes too ambiguous for most cases. | |
267 | ||
268 | =item * | |
269 | ||
270 | On the other hand, mixing various national standard encodings | |
271 | automagically works unless $data is too short to allow for guessing. | |
272 | ||
273 | # This is ok if $data is long enough | |
274 | my $decoder = | |
275 | guess_encoding($data, qw/euc-cn | |
276 | euc-jp shiftjis 7bit-jis | |
277 | euc-kr | |
278 | big5-eten/); | |
279 | ||
280 | =item * | |
281 | ||
282 | DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! | |
283 | ||
284 | my $decoder = guess_encoding($data, | |
285 | Encode->encodings(":all")); | |
286 | ||
287 | =back | |
288 | ||
289 | It is, after all, just a guess. You should alway be explicit when it | |
290 | comes to encodings. But there are some, especially Japanese, | |
291 | environment that guess-coding is a must. Use this module with care. | |
292 | ||
293 | =head1 TO DO | |
294 | ||
295 | Encode::Guess does not work on EBCDIC platforms. | |
296 | ||
297 | =head1 SEE ALSO | |
298 | ||
299 | L<Encode>, L<Encode::Encoding> | |
300 | ||
301 | =cut | |
302 |