| 1 | package Encode::Guess; |
| 2 | use strict; |
| 3 | |
| 4 | use Encode qw(:fallbacks find_encoding); |
| 5 | our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
| 6 | |
| 7 | my $Canon = 'Guess'; |
| 8 | sub DEBUG () { 0 } |
| 9 | our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); |
| 10 | $Encode::Encoding{$Canon} = |
| 11 | bless { |
| 12 | Name => $Canon, |
| 13 | Suspects => { %DEF_SUSPECTS }, |
| 14 | } => __PACKAGE__; |
| 15 | |
| 16 | use base qw(Encode::Encoding); |
| 17 | sub needs_lines { 1 } |
| 18 | sub perlio_ok { 0 } |
| 19 | |
| 20 | our @EXPORT = qw(guess_encoding); |
| 21 | our $NoUTFAutoGuess = 0; |
| 22 | our $UTF8_BOM = pack("C3", 0xef, 0xbb, 0xbf); |
| 23 | |
| 24 | sub import { # Exporter not used so we do it on our own |
| 25 | my $callpkg = caller; |
| 26 | for my $item (@EXPORT){ |
| 27 | no strict 'refs'; |
| 28 | *{"$callpkg\::$item"} = \&{"$item"}; |
| 29 | } |
| 30 | set_suspects(@_); |
| 31 | } |
| 32 | |
| 33 | sub set_suspects{ |
| 34 | my $class = shift; |
| 35 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
| 36 | $self->{Suspects} = { %DEF_SUSPECTS }; |
| 37 | $self->add_suspects(@_); |
| 38 | } |
| 39 | |
| 40 | sub add_suspects{ |
| 41 | my $class = shift; |
| 42 | my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; |
| 43 | for my $c (@_){ |
| 44 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
| 45 | $self->{Suspects}{$e->name} = $e; |
| 46 | DEBUG and warn "Added: ", $e->name; |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | sub decode($$;$){ |
| 51 | my ($obj, $octet, $chk) = @_; |
| 52 | my $guessed = guess($obj, $octet); |
| 53 | unless (ref($guessed)){ |
| 54 | require Carp; |
| 55 | Carp::croak($guessed); |
| 56 | } |
| 57 | my $utf8 = $guessed->decode($octet, $chk); |
| 58 | $_[1] = $octet if $chk; |
| 59 | return $utf8; |
| 60 | } |
| 61 | |
| 62 | sub guess_encoding{ |
| 63 | guess($Encode::Encoding{$Canon}, @_); |
| 64 | } |
| 65 | |
| 66 | sub guess { |
| 67 | my $class = shift; |
| 68 | my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; |
| 69 | my $octet = shift; |
| 70 | |
| 71 | # sanity check |
| 72 | return unless defined $octet and length $octet; |
| 73 | |
| 74 | # cheat 0: utf8 flag; |
| 75 | if ( Encode::is_utf8($octet) ) { |
| 76 | return find_encoding('utf8') unless $NoUTFAutoGuess; |
| 77 | Encode::_utf8_off($octet); |
| 78 | } |
| 79 | # cheat 1: BOM |
| 80 | use Encode::Unicode; |
| 81 | unless ($NoUTFAutoGuess) { |
| 82 | my $BOM = pack('C3', unpack("C3", $octet)); |
| 83 | return find_encoding('utf8') |
| 84 | if (defined $BOM and $BOM eq $UTF8_BOM); |
| 85 | $BOM = unpack('N', $octet); |
| 86 | return find_encoding('UTF-32') |
| 87 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000)); |
| 88 | $BOM = unpack('n', $octet); |
| 89 | return find_encoding('UTF-16') |
| 90 | if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe)); |
| 91 | if ($octet =~ /\x00/o){ # if \x00 found, we assume UTF-(16|32)(BE|LE) |
| 92 | my $utf; |
| 93 | my ($be, $le) = (0, 0); |
| 94 | if ($octet =~ /\x00\x00/o){ # UTF-32(BE|LE) assumed |
| 95 | $utf = "UTF-32"; |
| 96 | for my $char (unpack('N*', $octet)){ |
| 97 | $char & 0x0000ffff and $be++; |
| 98 | $char & 0xffff0000 and $le++; |
| 99 | } |
| 100 | }else{ # UTF-16(BE|LE) assumed |
| 101 | $utf = "UTF-16"; |
| 102 | for my $char (unpack('n*', $octet)){ |
| 103 | $char & 0x00ff and $be++; |
| 104 | $char & 0xff00 and $le++; |
| 105 | } |
| 106 | } |
| 107 | DEBUG and warn "$utf, be == $be, le == $le"; |
| 108 | $be == $le |
| 109 | and return |
| 110 | "Encodings ambiguous between $utf BE and LE ($be, $le)"; |
| 111 | $utf .= ($be > $le) ? 'BE' : 'LE'; |
| 112 | return find_encoding($utf); |
| 113 | } |
| 114 | } |
| 115 | my %try = %{$obj->{Suspects}}; |
| 116 | for my $c (@_){ |
| 117 | my $e = find_encoding($c) or die "Unknown encoding: $c"; |
| 118 | $try{$e->name} = $e; |
| 119 | DEBUG and warn "Added: ", $e->name; |
| 120 | } |
| 121 | my $nline = 1; |
| 122 | for my $line (split /\r\n?|\n/, $octet){ |
| 123 | # cheat 2 -- \e in the string |
| 124 | if ($line =~ /\e/o){ |
| 125 | my @keys = keys %try; |
| 126 | delete @try{qw/utf8 ascii/}; |
| 127 | for my $k (@keys){ |
| 128 | ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; |
| 129 | } |
| 130 | } |
| 131 | my %ok = %try; |
| 132 | # warn join(",", keys %try); |
| 133 | for my $k (keys %try){ |
| 134 | my $scratch = $line; |
| 135 | $try{$k}->decode($scratch, FB_QUIET); |
| 136 | if ($scratch eq ''){ |
| 137 | DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); |
| 138 | }else{ |
| 139 | use bytes (); |
| 140 | DEBUG and |
| 141 | warn sprintf("%4d:%-24s not ok; %d bytes left\n", |
| 142 | $nline, $k, bytes::length($scratch)); |
| 143 | delete $ok{$k}; |
| 144 | } |
| 145 | } |
| 146 | %ok or return "No appropriate encodings found!"; |
| 147 | if (scalar(keys(%ok)) == 1){ |
| 148 | my ($retval) = values(%ok); |
| 149 | return $retval; |
| 150 | } |
| 151 | %try = %ok; $nline++; |
| 152 | } |
| 153 | $try{ascii} or |
| 154 | return "Encodings too ambiguous: ", join(" or ", keys %try); |
| 155 | return $try{ascii}; |
| 156 | } |
| 157 | |
| 158 | |
| 159 | |
| 160 | 1; |
| 161 | __END__ |
| 162 | |
| 163 | =head1 NAME |
| 164 | |
| 165 | Encode::Guess -- Guesses encoding from data |
| 166 | |
| 167 | =head1 SYNOPSIS |
| 168 | |
| 169 | # if you are sure $data won't contain anything bogus |
| 170 | |
| 171 | use Encode; |
| 172 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
| 173 | my $utf8 = decode("Guess", $data); |
| 174 | my $data = encode("Guess", $utf8); # this doesn't work! |
| 175 | |
| 176 | # more elaborate way |
| 177 | use Encode::Guess; |
| 178 | my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); |
| 179 | ref($enc) or die "Can't guess: $enc"; # trap error this way |
| 180 | $utf8 = $enc->decode($data); |
| 181 | # or |
| 182 | $utf8 = decode($enc->name, $data) |
| 183 | |
| 184 | =head1 ABSTRACT |
| 185 | |
| 186 | Encode::Guess enables you to guess in what encoding a given data is |
| 187 | encoded, or at least tries to. |
| 188 | |
| 189 | =head1 DESCRIPTION |
| 190 | |
| 191 | By default, it checks only ascii, utf8 and UTF-16/32 with BOM. |
| 192 | |
| 193 | use Encode::Guess; # ascii/utf8/BOMed UTF |
| 194 | |
| 195 | To use it more practically, you have to give the names of encodings to |
| 196 | check (I<suspects> as follows). The name of suspects can either be |
| 197 | canonical names or aliases. |
| 198 | |
| 199 | CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED. |
| 200 | |
| 201 | # tries all major Japanese Encodings as well |
| 202 | use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; |
| 203 | |
| 204 | If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true |
| 205 | value, no heuristics will be applied to UTF8/16/32, and the result |
| 206 | will be limited to the suspects and C<ascii>. |
| 207 | |
| 208 | =over 4 |
| 209 | |
| 210 | =item Encode::Guess->set_suspects |
| 211 | |
| 212 | You can also change the internal suspects list via C<set_suspects> |
| 213 | method. |
| 214 | |
| 215 | use Encode::Guess; |
| 216 | Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); |
| 217 | |
| 218 | =item Encode::Guess->add_suspects |
| 219 | |
| 220 | Or you can use C<add_suspects> method. The difference is that |
| 221 | C<set_suspects> flushes the current suspects list while |
| 222 | C<add_suspects> adds. |
| 223 | |
| 224 | use Encode::Guess; |
| 225 | Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); |
| 226 | # now the suspects are euc-jp,shiftjis,7bit-jis, AND |
| 227 | # euc-kr,euc-cn, and big5-eten |
| 228 | Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); |
| 229 | |
| 230 | =item Encode::decode("Guess" ...) |
| 231 | |
| 232 | When you are content with suspects list, you can now |
| 233 | |
| 234 | my $utf8 = Encode::decode("Guess", $data); |
| 235 | |
| 236 | =item Encode::Guess->guess($data) |
| 237 | |
| 238 | But it will croak if: |
| 239 | |
| 240 | =over |
| 241 | |
| 242 | =item * |
| 243 | |
| 244 | Two or more suspects remain |
| 245 | |
| 246 | =item * |
| 247 | |
| 248 | No suspects left |
| 249 | |
| 250 | =back |
| 251 | |
| 252 | So you should instead try this; |
| 253 | |
| 254 | my $decoder = Encode::Guess->guess($data); |
| 255 | |
| 256 | On success, $decoder is an object that is documented in |
| 257 | L<Encode::Encoding>. So you can now do this; |
| 258 | |
| 259 | my $utf8 = $decoder->decode($data); |
| 260 | |
| 261 | On failure, $decoder now contains an error message so the whole thing |
| 262 | would be as follows; |
| 263 | |
| 264 | my $decoder = Encode::Guess->guess($data); |
| 265 | die $decoder unless ref($decoder); |
| 266 | my $utf8 = $decoder->decode($data); |
| 267 | |
| 268 | =item guess_encoding($data, [, I<list of suspects>]) |
| 269 | |
| 270 | You can also try C<guess_encoding> function which is exported by |
| 271 | default. It takes $data to check and it also takes the list of |
| 272 | suspects by option. The optional suspect list is I<not reflected> to |
| 273 | the internal suspects list. |
| 274 | |
| 275 | my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); |
| 276 | die $decoder unless ref($decoder); |
| 277 | my $utf8 = $decoder->decode($data); |
| 278 | # check only ascii and utf8 |
| 279 | my $decoder = guess_encoding($data); |
| 280 | |
| 281 | =back |
| 282 | |
| 283 | =head1 CAVEATS |
| 284 | |
| 285 | =over 4 |
| 286 | |
| 287 | =item * |
| 288 | |
| 289 | Because of the algorithm used, ISO-8859 series and other single-byte |
| 290 | encodings do not work well unless either one of ISO-8859 is the only |
| 291 | one suspect (besides ascii and utf8). |
| 292 | |
| 293 | use Encode::Guess; |
| 294 | # perhaps ok |
| 295 | my $decoder = guess_encoding($data, 'latin1'); |
| 296 | # definitely NOT ok |
| 297 | my $decoder = guess_encoding($data, qw/latin1 greek/); |
| 298 | |
| 299 | The reason is that Encode::Guess guesses encoding by trial and error. |
| 300 | It first splits $data into lines and tries to decode the line for each |
| 301 | suspect. It keeps it going until all but one encoding is eliminated |
| 302 | out of suspects list. ISO-8859 series is just too successful for most |
| 303 | cases (because it fills almost all code points in \x00-\xff). |
| 304 | |
| 305 | =item * |
| 306 | |
| 307 | Do not mix national standard encodings and the corresponding vendor |
| 308 | encodings. |
| 309 | |
| 310 | # a very bad idea |
| 311 | my $decoder |
| 312 | = guess_encoding($data, qw/shiftjis MacJapanese cp932/); |
| 313 | |
| 314 | The reason is that vendor encoding is usually a superset of national |
| 315 | standard so it becomes too ambiguous for most cases. |
| 316 | |
| 317 | =item * |
| 318 | |
| 319 | On the other hand, mixing various national standard encodings |
| 320 | automagically works unless $data is too short to allow for guessing. |
| 321 | |
| 322 | # This is ok if $data is long enough |
| 323 | my $decoder = |
| 324 | guess_encoding($data, qw/euc-cn |
| 325 | euc-jp shiftjis 7bit-jis |
| 326 | euc-kr |
| 327 | big5-eten/); |
| 328 | |
| 329 | =item * |
| 330 | |
| 331 | DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! |
| 332 | |
| 333 | my $decoder = guess_encoding($data, |
| 334 | Encode->encodings(":all")); |
| 335 | |
| 336 | =back |
| 337 | |
| 338 | It is, after all, just a guess. You should alway be explicit when it |
| 339 | comes to encodings. But there are some, especially Japanese, |
| 340 | environment that guess-coding is a must. Use this module with care. |
| 341 | |
| 342 | =head1 TO DO |
| 343 | |
| 344 | Encode::Guess does not work on EBCDIC platforms. |
| 345 | |
| 346 | =head1 SEE ALSO |
| 347 | |
| 348 | L<Encode>, L<Encode::Encoding> |
| 349 | |
| 350 | =cut |
| 351 | |