git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental - sam-t2/devtools/amd64/lib/perl5/5.8.8/i86pc-solaris-64-ld/Encode/Guess.pm

... / ...

Commit	Line	Data
	package Encode::Guess;
	use strict;

	use Encode qw(:fallbacks find_encoding);

	# Build "MAJOR.MINOR" (each minor part zero-padded to two digits) from the
	# numbers found in the RCS Revision keyword.
	our $VERSION = do {
	    my @r = ( q$Revision: 2.0 $ =~ /\d+/g );
	    sprintf "%d." . "%02d" x $#r, @r;
	};

	# Key under which the guesser object is stored in %Encode::Encoding.
	my $Canon = 'Guess';

	# Constant switch for the diagnostic warn() calls in this file.
	sub DEBUG () { 0 }

	# Suspects that every suspects list starts from.
	our %DEF_SUSPECTS = (
	    ascii => find_encoding('ascii'),
	    utf8  => find_encoding('utf8'),
	);

	# Register the guesser object; it gets its own copy of the defaults.
	$Encode::Encoding{$Canon} = bless(
	    {
	        Name     => $Canon,
	        Suspects => {%DEF_SUSPECTS},
	    },
	    __PACKAGE__
	);

	use base qw(Encode::Encoding);
	sub needs_lines { return 1 }
	sub perlio_ok   { return 0 }

	# Names copied into the caller's namespace by import().
	our @EXPORT = ('guess_encoding');

	# When true, guess() skips its utf8-flag, BOM and NUL-byte shortcuts.
	our $NoUTFAutoGuess = 0;

	# The three octets of a UTF-8 byte order mark.
	our $UTF8_BOM = pack( 'C3', 0xEF, 0xBB, 0xBF );
	23
	# import($class, @suspects)
	#
	# Exporter is not used, so the export is done by hand: every name in
	# @EXPORT is aliased into the calling package, then the argument list
	# (class name first) is passed on to set_suspects().
	sub import {
	    my $target = caller;
	    {
	        no strict 'refs';
	        foreach my $name (@EXPORT) {
	            *{"${target}::${name}"} = \&{$name};
	        }
	    }
	    set_suspects(@_);
	}
	32
	# set_suspects($invocant, @names)
	#
	# Reset the suspects list to a fresh copy of %DEF_SUSPECTS and then add
	# @names to it.  When $invocant is not an object, the registered
	# $Encode::Encoding{$Canon} object is the one that gets modified.
	sub set_suspects {
	    my ( $invocant, @names ) = @_;
	    my $guesser = ref($invocant) ? $invocant : $Encode::Encoding{$Canon};

	    $guesser->{Suspects} = {%DEF_SUSPECTS};
	    return $guesser->add_suspects(@names);
	}
	39
	# add_suspects($invocant, @names)
	#
	# Look up each name with find_encoding() and store the resulting encoding
	# object in the suspects list under its canonical name.  Dies on the first
	# name that find_encoding() cannot resolve.  When $invocant is not an
	# object, the registered $Encode::Encoding{$Canon} object is modified.
	sub add_suspects {
	    my ( $invocant, @names ) = @_;
	    my $guesser = ref($invocant) ? $invocant : $Encode::Encoding{$Canon};

	    foreach my $name (@names) {
	        my $enc = find_encoding($name);
	        die "Unknown encoding: $name" unless $enc;
	        $guesser->{Suspects}{ $enc->name } = $enc;
	        warn "Added: ", $enc->name if DEBUG;
	    }
	}
	49
	# decode($obj, $octets [, $check])
	#
	# Guess the encoding of $octets with guess() and decode with the encoding
	# object it returns.
	#
	#   $obj     passed to guess() as its invocant
	#   $octets  the data to decode
	#   $check   optional check value handed to the guessed encoding's decode()
	#
	# Returns the decoded string.  Croaks when guess() does not return an
	# encoding object: with guess()'s own error message when there is one,
	# and with an explicit message when guess() returned nothing (which it
	# does for undefined or empty input).
	sub decode($$;$) {
	    my ( $obj, $octet, $chk ) = @_;

	    my $guessed = guess( $obj, $octet );
	    unless ( ref($guessed) ) {
	        require Carp;

	        # guess() returns no value at all for undefined/empty input, so
	        # never pass undef to croak.
	        Carp::croak(
	            defined $guessed ? $guessed : 'Empty string, empty guess' );
	    }

	    my $utf8 = $guessed->decode( $octet, $chk );

	    # With a check value set, copy the local $octet back into the
	    # caller's variable after decoding.
	    $_[1] = $octet if $chk;
	    return $utf8;
	}
	61
	# guess_encoding($data, @suspects)
	#
	# Function form of guess(): calls it on the registered
	# $Encode::Encoding{$Canon} object with the caller's arguments and returns
	# whatever guess() returns.
	sub guess_encoding {
	    my $guesser = $Encode::Encoding{$Canon};
	    return guess( $guesser, @_ );
	}
	65
	# guess($invocant, $octets, @extra_suspects)
	#
	# Try to work out which encoding $octets is in.
	#
	#   $invocant        an object whose {Suspects} hash is used; anything that
	#                    is not a reference selects $Encode::Encoding{$Canon}
	#   $octets          the data to examine
	#   @extra_suspects  encoding names tried in addition to the stored
	#                    suspects, for this call only; dies on a name that
	#                    find_encoding() cannot resolve
	#
	# Returns an encoding object on success, an error-message string when no
	# suspect or more than one suspect remains, and nothing when $octets is
	# undefined or empty.  Callers tell success from failure with ref().
	sub guess {
	    my $class = shift;
	    my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
	    my $octet = shift;

	    # sanity check
	    return unless defined $octet and length $octet;

	    # cheat 0: utf8 flag
	    if ( Encode::is_utf8($octet) ) {
	        return find_encoding('utf8') unless $NoUTFAutoGuess;
	        Encode::_utf8_off($octet);
	    }

	    # cheat 1: BOM
	    use Encode::Unicode;
	    unless ($NoUTFAutoGuess) {
	        my $BOM = pack( 'C3', unpack( "C3", $octet ) );
	        return find_encoding('utf8')
	          if ( defined $BOM and $BOM eq $UTF8_BOM );
	        $BOM = unpack( 'N', $octet );
	        return find_encoding('UTF-32')
	          if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
	        $BOM = unpack( 'n', $octet );
	        return find_encoding('UTF-16')
	          if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );

	        # if \x00 found, we assume UTF-(16|32)(BE|LE)
	        if ( $octet =~ /\x00/ ) {
	            my $utf;
	            my ( $be, $le ) = ( 0, 0 );

	            # Read the data as big-endian units: each unit with bits set in
	            # its low half is counted for BE, each unit with bits set in
	            # its high half is counted for LE.
	            if ( $octet =~ /\x00\x00/ ) {    # UTF-32(BE|LE) assumed
	                $utf = "UTF-32";
	                for my $char ( unpack( 'N*', $octet ) ) {
	                    $char & 0x0000ffff and $be++;
	                    $char & 0xffff0000 and $le++;
	                }
	            }
	            else {                           # UTF-16(BE|LE) assumed
	                $utf = "UTF-16";
	                for my $char ( unpack( 'n*', $octet ) ) {
	                    $char & 0x00ff and $be++;
	                    $char & 0xff00 and $le++;
	                }
	            }
	            DEBUG and warn "$utf, be == $be, le == $le";
	            $be == $le
	              and return
	              "Encodings ambiguous between $utf BE and LE ($be, $le)";
	            $utf .= ( $be > $le ) ? 'BE' : 'LE';
	            return find_encoding($utf);
	        }
	    }

	    # Candidates for this call: the stored suspects plus any extra names.
	    my %try = %{ $obj->{Suspects} };
	    for my $c (@_) {
	        my $e = find_encoding($c) or die "Unknown encoding: $c";
	        $try{ $e->name } = $e;
	        DEBUG and warn "Added: ", $e->name;
	    }

	    # Decode line by line, dropping every candidate that cannot consume a
	    # whole line.  The pattern is an alternation: CR, CRLF or LF ends a line.
	    my $nline = 1;
	    for my $line ( split /\r\n?|\n/, $octet ) {

	        # cheat 2 -- \e in the string
	        if ( $line =~ /\e/ ) {
	            my @keys = keys %try;
	            delete @try{qw/utf8 ascii/};
	            for my $k (@keys) {
	                ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
	            }
	        }
	        my %ok = %try;

	        # warn join(",", keys %try);
	        for my $k ( keys %try ) {
	            my $scratch = $line;
	            $try{$k}->decode( $scratch, FB_QUIET );
	            if ( $scratch eq '' ) {
	                DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
	            }
	            else {
	                use bytes ();
	                DEBUG
	                  and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
	                    $nline, $k, bytes::length($scratch) );
	                delete $ok{$k};
	            }
	        }
	        %ok or return "No appropriate encodings found!";
	        if ( scalar( keys(%ok) ) == 1 ) {
	            my ($retval) = values(%ok);
	            return $retval;
	        }
	        %try = %ok;
	        $nline++;
	    }

	    # Several candidates survived every line.  The message is built with
	    # concatenation: a comma here would make return hand back a list, and
	    # a scalar-context caller would receive only its last element.
	    $try{ascii}
	      or return "Encodings too ambiguous: "
	      . join( " or ", sort keys %try );
	    return $try{ascii};
	}
	157
	158
	159
	160	1;
	161	__END__
	162
	163	=head1 NAME
	164
	165	Encode::Guess -- Guesses encoding from data
	166
	167	=head1 SYNOPSIS
	168
	169	# if you are sure $data won't contain anything bogus
	170
	171	use Encode;
	172	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	173	my $utf8 = decode("Guess", $data);
	174	my $data = encode("Guess", $utf8); # this doesn't work!
	175
	176	# more elaborate way
	177	use Encode::Guess;
	178	my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
	179	ref($enc) or die "Can't guess: $enc"; # trap error this way
	180	$utf8 = $enc->decode($data);
	181	# or
	182	$utf8 = decode($enc->name, $data)
	183
	184	=head1 ABSTRACT
	185
	186	Encode::Guess enables you to guess in what encoding a given data is
	187	encoded, or at least tries to.
	188
	189	=head1 DESCRIPTION
	190
	191	By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
	192
	193	use Encode::Guess; # ascii/utf8/BOMed UTF
	194
	195	To use it more practically, you have to give the names of encodings to
	196	check (I<suspects> as follows). The name of suspects can either be
	197	canonical names or aliases.
	198
	199	CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
	200
	201	# tries all major Japanese Encodings as well
	202	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
	203
	204	If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
	205	value, no heuristics will be applied to UTF8/16/32, and the result
	206	will be limited to the suspects and C<ascii>.
	207
	208	=over 4
	209
	210	=item Encode::Guess->set_suspects
	211
	212	You can also change the internal suspects list via C<set_suspects>
	213	method.
	214
	215	use Encode::Guess;
	216	Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
	217
	218	=item Encode::Guess->add_suspects
	219
	220	Or you can use C<add_suspects> method. The difference is that
	221	C<set_suspects> flushes the current suspects list while
	222	C<add_suspects> adds.
	223
	224	use Encode::Guess;
	225	Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
	226	# now the suspects are euc-jp,shiftjis,7bit-jis, AND
	227	# euc-kr,euc-cn, and big5-eten
	228	Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
	229
	230	=item Encode::decode("Guess" ...)
	231
	232	When you are content with suspects list, you can now
	233
	234	my $utf8 = Encode::decode("Guess", $data);
	235
	236	=item Encode::Guess->guess($data)
	237
	238	But it will croak if:
	239
	240	=over
	241
	242	=item *
	243
	244	Two or more suspects remain
	245
	246	=item *
	247
	248	No suspects left
	249
	250	=back
	251
	252	So you should instead try this;
	253
	254	my $decoder = Encode::Guess->guess($data);
	255
	256	On success, $decoder is an object that is documented in
	257	L<Encode::Encoding>. So you can now do this;
	258
	259	my $utf8 = $decoder->decode($data);
	260
	261	On failure, $decoder now contains an error message so the whole thing
	262	would be as follows;
	263
	264	my $decoder = Encode::Guess->guess($data);
	265	die $decoder unless ref($decoder);
	266	my $utf8 = $decoder->decode($data);
	267
	268	=item guess_encoding($data [, I<list of suspects>])
	269
	270	You can also use the C<guess_encoding> function, which is exported by
	271	default. It takes the $data to check and, optionally, a list of
	272	suspects. The optional suspect list is I<not reflected> in
	273	the internal suspects list.
	274
	275	my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
	276	die $decoder unless ref($decoder);
	277	my $utf8 = $decoder->decode($data);
	278	# check only ascii and utf8
	279	my $decoder = guess_encoding($data);
	280
	281	=back
	282
	283	=head1 CAVEATS
	284
	285	=over 4
	286
	287	=item *
	288
	289	Because of the algorithm used, the ISO-8859 series and other single-byte
	290	encodings do not work well unless a single ISO-8859 encoding is the only
	291	suspect (besides ascii and utf8).
	292
	293	use Encode::Guess;
	294	# perhaps ok
	295	my $decoder = guess_encoding($data, 'latin1');
	296	# definitely NOT ok
	297	my $decoder = guess_encoding($data, qw/latin1 greek/);
	298
	299	The reason is that Encode::Guess guesses the encoding by trial and error.
	300	It first splits $data into lines and tries to decode each line with each
	301	suspect. It keeps going until all but one encoding have been eliminated
	302	from the suspects list. The ISO-8859 series is just too successful in most
	303	cases (because it fills almost all code points in \x00-\xff).
	304
	305	=item *
	306
	307	Do not mix national standard encodings and the corresponding vendor
	308	encodings.
	309
	310	# a very bad idea
	311	my $decoder
	312	= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
	313
	314	The reason is that vendor encoding is usually a superset of national
	315	standard so it becomes too ambiguous for most cases.
	316
	317	=item *
	318
	319	On the other hand, mixing various national standard encodings
	320	automagically works unless $data is too short to allow for guessing.
	321
	322	# This is ok if $data is long enough
	323	my $decoder =
	324	guess_encoding($data, qw/euc-cn
	325	euc-jp shiftjis 7bit-jis
	326	euc-kr
	327	big5-eten/);
	328
	329	=item *
	330
	331	DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
	332
	333	my $decoder = guess_encoding($data,
	334	Encode->encodings(":all"));
	335
	336	=back
	337
	338	It is, after all, just a guess. You should always be explicit when it
	339	comes to encodings. But there are some environments, especially Japanese
	340	ones, where guess-coding is a must. Use this module with care.
	341
	342	=head1 TO DO
	343
	344	Encode::Guess does not work on EBCDIC platforms.
	345
	346	=head1 SEE ALSO
	347
	348	L<Encode>, L<Encode::Encoding>
	349
	350	=cut
	351