git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame - tools/perl-5.8.0/lib/5.8.0/sun4-solaris/Encode/Guess.pm

Commit	Line	Data
86530b38 AT	1	package Encode::Guess;
	2	use strict;
	3
	4	use Encode qw(:fallbacks find_encoding);
	5	our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
	6
	7	my $Canon = 'Guess';
	8	our $DEBUG = 0;
	9	our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
	10	$Encode::Encoding{$Canon} =
	11	bless {
	12	Name => $Canon,
	13	Suspects => { %DEF_SUSPECTS },
	14	} => __PACKAGE__;
	15
	16	use base qw(Encode::Encoding);
	17	sub needs_lines { 1 }
	18	sub perlio_ok { 0 }
	19
	20	our @EXPORT = qw(guess_encoding);
	21
	22	sub import { # Exporter not used so we do it on our own
	23	my $callpkg = caller;
	24	for my $item (@EXPORT){
	25	no strict 'refs';
	26	*{"$callpkg\::$item"} = \&{"$item"};
	27	}
	28	set_suspects(@_);
	29	}
	30
	31	sub set_suspects{
	32	my $class = shift;
	33	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
	34	$self->{Suspects} = { %DEF_SUSPECTS };
	35	$self->add_suspects(@_);
	36	}
	37
	38	sub add_suspects{
	39	my $class = shift;
	40	my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
	41	for my $c (@_){
	42	my $e = find_encoding($c) or die "Unknown encoding: $c";
	43	$self->{Suspects}{$e->name} = $e;
	44	$DEBUG and warn "Added: ", $e->name;
	45	}
	46	}
	47
	48	sub decode($$;$){
	49	my ($obj, $octet, $chk) = @_;
	50	my $guessed = guess($obj, $octet);
	51	unless (ref($guessed)){
	52	require Carp;
	53	Carp::croak($guessed);
	54	}
	55	my $utf8 = $guessed->decode($octet, $chk);
	56	$_[1] = $octet if $chk;
	57	return $utf8;
	58	}
	59
	60	sub guess_encoding{
	61	guess($Encode::Encoding{$Canon}, @_);
	62	}
	63
	64	sub guess {
65	my $class = shift;
66	my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
67	my $octet = shift;
68
69	# sanity check
70	return unless defined $octet and length $octet;
71
72	# cheat 0: utf8 flag;
73	Encode::is_utf8($octet) and return find_encoding('utf8');
74	# cheat 1: BOM
75	use Encode::Unicode;
76	my $BOM = unpack('n', $octet);
77	return find_encoding('UTF-16')
78	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe));
79	$BOM = unpack('N', $octet);
80	return find_encoding('UTF-32')
81	if (defined $BOM and ($BOM == 0xFeFF or $BOM == 0xFFFe0000));
82
83	my %try = %{$obj->{Suspects}};
84	for my $c (@_){
85	my $e = find_encoding($c) or die "Unknown encoding: $c";
86	$try{$e->name} = $e;
87	$DEBUG and warn "Added: ", $e->name;
88	}
89	my $nline = 1;
90	for my $line (split /\r\n?\|\n/, $octet){
91	# cheat 2 -- \e in the string
92	if ($line =~ /\e/o){
93	my @keys = keys %try;
94	delete @try{qw/utf8 ascii/};
95	for my $k (@keys){
96	ref($try{$k}) eq 'Encode::XS' and delete $try{$k};
97	}
98	}
99	my %ok = %try;
100	# warn join(",", keys %try);
101	for my $k (keys %try){
102	my $scratch = $line;
103	$try{$k}->decode($scratch, FB_QUIET);
104	if ($scratch eq ''){
105	$DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k);
106	}else{
107	use bytes ();
108	$DEBUG and
109	warn sprintf("%4d:%-24s not ok; %d bytes left\n",
110	$nline, $k, bytes::length($scratch));
111	delete $ok{$k};
112
113	}
114	}
115	%ok or return "No appropriate encodings found!";
116	if (scalar(keys(%ok)) == 1){
117	my ($retval) = values(%ok);
118	return $retval;
119	}
120	%try = %ok; $nline++;
121	}
122	$try{ascii} or
123	return "Encodings too ambiguous: ", join(" or ", keys %try);
124	return $try{ascii};
125	}
126
127
128
129	1;
130	__END__
131
132	=head1 NAME
133
134	Encode::Guess -- Guesses encoding from data
135
136	=head1 SYNOPSIS
137
138	# if you are sure $data won't contain anything bogus
139
140	use Encode;
141	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
142	my $utf8 = decode("Guess", $data);
143	my $data = encode("Guess", $utf8); # this doesn't work!
144
145	# more elaborate way
146	use Encode::Guess,
147	my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
148	ref($enc) or die "Can't guess: $enc"; # trap error this way
149	$utf8 = $enc->decode($data);
150	# or
151	$utf8 = decode($enc->name, $data)
152
153	=head1 ABSTRACT
154
155	Encode::Guess enables you to guess in what encoding a given data is
156	encoded, or at least tries to.
157
158	=head1 DESCRIPTION
159
160	By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
161
162	use Encode::Guess; # ascii/utf8/BOMed UTF
163
164	To use it more practically, you have to give the names of encodings to
165	check (I<suspects> as follows). The name of suspects can either be
166	canonical names or aliases.
167
168	# tries all major Japanese Encodings as well
169	use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
170
171	=over 4
172
173	=item Encode::Guess->set_suspects
174
175	You can also change the internal suspects list via C<set_suspects>
176	method.
177
178	use Encode::Guess;
179	Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
180
181	=item Encode::Guess->add_suspects
182
183	Or you can use C<add_suspects> method. The difference is that
184	C<set_suspects> flushes the current suspects list while
185	C<add_suspects> adds.
186
187	use Encode::Guess;
188	Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
189	# now the suspects are euc-jp,shiftjis,7bit-jis, AND
190	# euc-kr,euc-cn, and big5-eten
191	Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
192
193	=item Encode::decode("Guess" ...)
194
195	When you are content with suspects list, you can now
196
197	my $utf8 = Encode::decode("Guess", $data);
198
199	=item Encode::Guess->guess($data)
200
201	But it will croak if Encode::Guess fails to eliminate all other
202	suspects but the right one or no suspect was good. So you should
203	instead try this;
204
205	my $decoder = Encode::Guess->guess($data);
206
207	On success, $decoder is an object that is documented in
208	L<Encode::Encoding>. So you can now do this;
209
210	my $utf8 = $decoder->decode($data);
211
212	On failure, $decoder now contains an error message so the whole thing
213	would be as follows;
214
215	my $decoder = Encode::Guess->guess($data);
216	die $decoder unless ref($decoder);
217	my $utf8 = $decoder->decode($data);
218
219	=item guess_encoding($data, [, I<list of suspects>])
220
221	You can also try C<guess_encoding> function which is exported by
222	default. It takes $data to check and it also takes the list of
223	suspects by option. The optional suspect list is I<not reflected> to
224	the internal suspects list.
225
226	my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
227	die $decoder unless ref($decoder);
228	my $utf8 = $decoder->decode($data);
229	# check only ascii and utf8
230	my $decoder = guess_encoding($data);
231
232	=back
233
234	=head1 CAVEATS
235
236	=over 4
237
238	=item *
239
240	Because of the algorithm used, ISO-8859 series and other single-byte
241	encodings do not work well unless either one of ISO-8859 is the only
242	one suspect (besides ascii and utf8).
243
244	use Encode::Guess;
245	# perhaps ok
246	my $decoder = guess_encoding($data, 'latin1');
247	# definitely NOT ok
248	my $decoder = guess_encoding($data, qw/latin1 greek/);
249
250	The reason is that Encode::Guess guesses encoding by trial and error.
251	It first splits $data into lines and tries to decode the line for each
252	suspect. It keeps it going until all but one encoding was eliminated
253	out of suspects list. ISO-8859 series is just too successful for most
254	cases (because it fills almost all code points in \x00-\xff).
255
256	=item *
257
258	Do not mix national standard encodings and the corresponding vendor
259	encodings.
260
261	# a very bad idea
262	my $decoder
263	= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
264
265	The reason is that vendor encoding is usually a superset of national
266	standard so it becomes too ambiguous for most cases.
267
268	=item *
269
270	On the other hand, mixing various national standard encodings
271	automagically works unless $data is too short to allow for guessing.
272
273	# This is ok if $data is long enough
274	my $decoder =
275	guess_encoding($data, qw/euc-cn
276	euc-jp shiftjis 7bit-jis
277	euc-kr
278	big5-eten/);
279
280	=item *
281
282	DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
283
284	my $decoder = guess_encoding($data,
285	Encode->encodings(":all"));
286
287	=back
288
289	It is, after all, just a guess. You should alway be explicit when it
290	comes to encodings. But there are some, especially Japanese,
291	environment that guess-coding is a must. Use this module with care.
292
293	=head1 TO DO
294
295	Encode::Guess does not work on EBCDIC platforms.
296
297	=head1 SEE ALSO
298
299	L<Encode>, L<Encode::Encoding>
300
301	=cut
302