git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental

... / ...

Commit	Line	Data
	1	#!/import/bw/tools/local/perl-5.8.0/bin/perl
	2	eval 'exec /import/bw/tools/local/perl-5.8.0/bin/perl -S $0 ${1+"$@"}'
	3	if $running_under_some_shell;
	4	#!./perl
	5	BEGIN {
	6	# @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
	7	# with $ENV{PERL_CORE} set
	8	# In case we need it in future...
	9	require Config; import Config;
	10	}
	11	use strict;
	12	use Getopt::Std;
	13	my @orig_ARGV = @ARGV;
	14	our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
	15
	16	# These may get re-ordered.
	17	# RAW is a do_now as inserted by &enter
	18	# AGG is an aggreagated do_now, as built up by &process
	19
	20	use constant {
	21	RAW_NEXT => 0,
	22	RAW_IN_LEN => 1,
	23	RAW_OUT_BYTES => 2,
	24	RAW_FALLBACK => 3,
	25
	26	AGG_MIN_IN => 0,
	27	AGG_MAX_IN => 1,
	28	AGG_OUT_BYTES => 2,
	29	AGG_NEXT => 3,
	30	AGG_IN_LEN => 4,
	31	AGG_OUT_LEN => 5,
	32	AGG_FALLBACK => 6,
	33	};
	34
	35	# (See the algorithm in encengine.c - we're building structures for it)
	36
	37	# There are two sorts of structures.
	38	# "do_now" (an array, two variants of what needs storing) is whatever we need
	39	# to do now we've read an input byte.
	40	# It's housed in a "do_next" (which is how we got to it), and in turn points
	41	# to a "do_next" which contains all the "do_now"s for the next input byte.
	42
	43	# There will be a "do_next" which is the start state.
	44	# For a single byte encoding it's the only "do_next" - each "do_now" points
	45	# back to it, and each "do_now" will cause bytes. There is no state.
	46
	47	# For a multi-byte encoding where all characters in the input are the same
	48	# length, then there will be a tree of "do_now"->"do_next"->"do_now"
	49	# branching out from the start state, one step for each input byte.
	50	# The leaf "do_now"s will all be at the same distance from the start state,
	51	# only the leaf "do_now"s cause output bytes, and they in turn point back to
	52	# the start state.
	53
	54	# For an encoding where there are varaible length input byte sequences, you
	55	# will encounter a leaf "do_now" sooner for the shorter input sequences, but
	56	# as before the leaves will point back to the start state.
	57
	58	# The system will cope with escape encodings (imagine them as a mostly
	59	# self-contained tree for each escape state, and cross links between trees
	60	# at the state-switching characters) but so far no input format defines these.
	61
	62	# The system will also cope with having output "leaves" in the middle of
	63	# the bifurcating branches, not just at the extremities, but again no
	64	# input format does this yet.
	65
	66	# There are two variants of the "do_now" structure. The first, smaller variant
	67	# is generated by &enter as the input file is read. There is one structure
	68	# for each input byte. Say we are mapping a single byte encoding to a
	69	# single byte encoding, with "ABCD" going "abcd". There will be
	70	# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
	71
	72	# &process then walks the tree, building aggregate "do_now" structres for
	73	# adjacent bytes where possible. The aggregate is for a contiguous range of
	74	# bytes which each produce the same length of output, each move to the
	75	# same next state, and each have the same fallback flag.
	76	# So our 4 RAW "do_now"s above become replaced by a single structure
	77	# containing:
	78	# ["A", "D", "abcd", 1, ...]
	79	# ie, for an input byte $_ in "A".."D", output 1 byte, found as
	80	# substr ("abcd", (ord $_ - ord "A") * 1, 1)
	81	# which maps very nicely into pointer arithmetic in C for encengine.c
	82
	83	sub encode_U
	84	{
	85	# UTF-8 encode long hand - only covers part of perl's range
	86	## my $uv = shift;
	87	# chr() works in native space so convert value from table
	88	# into that space before using chr().
	89	my $ch = chr(utf8::unicode_to_native($_[0]));
	90	# Now get core perl to encode that the way it likes.
	91	utf8::encode($ch);
	92	return $ch;
	93	}
	94
	95	sub encode_S
	96	{
	97	# encode single byte
	98	## my ($ch,$page) = @_; return chr($ch);
	99	return chr $_[0];
	100	}
	101
	102	sub encode_D
	103	{
	104	# encode double byte MS byte first
	105	## my ($ch,$page) = @_; return chr($page).chr($ch);
	106	return chr ($_[1]) . chr $_[0];
	107	}
	108
	109	sub encode_M
	110	{
	111	# encode Multi-byte - single for 0..255 otherwise double
	112	## my ($ch,$page) = @_;
	113	## return &encode_D if $page;
	114	## return &encode_S;
	115	return chr ($_[1]) . chr $_[0] if $_[1];
	116	return chr $_[0];
	117	}
	118
	119	my %encode_types = (U => \&encode_U,
	120	S => \&encode_S,
	121	D => \&encode_D,
	122	M => \&encode_M,
	123	);
	124
	125	# Win32 does not expand globs on command line
	126	eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
	127
	128	my %opt;
	129	# I think these are:
	130	# -Q to disable the duplicate codepoint test
	131	# -S make mapping errors fatal
	132	# -q to remove comments written to output files
	133	# -O to enable the (brute force) substring optimiser
	134	# -o <output> to specify the output file name (else it's the first arg)
	135	# -f <inlist> to give a file with a list of input files (else use the args)
	136	# -n <name> to name the encoding (else use the basename of the input file.
	137	getopts('CM:SQqOo:f:n:',\%opt);
	138
	139	$opt{M} and make_makefile_pl($opt{M}, @ARGV);
	140	$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
	141
	142	# This really should go first, else the die here causes empty (non-erroneous)
	143	# output files to be written.
	144	my @encfiles;
	145	if (exists $opt{'f'}) {
	146	# -F is followed by name of file containing list of filenames
	147	my $flist = $opt{'f'};
	148	open(FLIST,$flist) \|\| die "Cannot open $flist:$!";
	149	chomp(@encfiles = <FLIST>);
	150	close(FLIST);
	151	} else {
	152	@encfiles = @ARGV;
	153	}
	154
	155	my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
	156	chmod(0666,$cname) if -f $cname && !-w $cname;
	157	open(C,">$cname") \|\| die "Cannot open $cname:$!";
	158
	159	my $dname = $cname;
	160	my $hname = $cname;
	161
	162	my ($doC,$doEnc,$doUcm,$doPet);
	163
	164	if ($cname =~ /\.(c\|xs)$/)
	165	{
	166	$doC = 1;
	167	$dname =~ s/(\.[^\.]*)?$/.exh/;
	168	chmod(0666,$dname) if -f $cname && !-w $dname;
	169	open(D,">$dname") \|\| die "Cannot open $dname:$!";
	170	$hname =~ s/(\.[^\.]*)?$/.h/;
	171	chmod(0666,$hname) if -f $cname && !-w $hname;
	172	open(H,">$hname") \|\| die "Cannot open $hname:$!";
	173
	174	foreach my $fh (\C,\D,\*H)
	175	{
	176	print $fh <<"END" unless $opt{'q'};
	177	/*
	178	!!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	179	This file was autogenerated by:
	180	$^X $0 @orig_ARGV
	181	*/
	182	END
	183	}
	184
	185	if ($cname =~ /(\w+)\.xs$/)
	186	{
	187	print C "#include <EXTERN.h>\n";
	188	print C "#include <perl.h>\n";
	189	print C "#include <XSUB.h>\n";
	190	print C "#define U8 U8\n";
	191	}
	192	print C "#include \"encode.h\"\n";
	193
	194	}
	195	elsif ($cname =~ /\.enc$/)
	196	{
	197	$doEnc = 1;
	198	}
	199	elsif ($cname =~ /\.ucm$/)
	200	{
	201	$doUcm = 1;
	202	}
	203	elsif ($cname =~ /\.pet$/)
	204	{
	205	$doPet = 1;
	206	}
	207
	208	my %encoding;
	209	my %strings;
	210	my $saved = 0;
	211	my $subsave = 0;
	212	my $strings = 0;
	213
	214	sub cmp_name
	215	{
	216	if ($a =~ /^.*-(\d+)/)
	217	{
	218	my $an = $1;
	219	if ($b =~ /^.*-(\d+)/)
	220	{
	221	my $r = $an <=> $1;
	222	return $r if $r;
	223	}
	224	}
	225	return $a cmp $b;
	226	}
	227
	228
	229	foreach my $enc (sort cmp_name @encfiles)
	230	{
	231	my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc\|ucm)$/;
	232	$name = $opt{'n'} if exists $opt{'n'};
	233	if (open(E,$enc))
	234	{
	235	if ($sfx eq 'enc')
	236	{
	237	compile_enc(\*E,lc($name));
	238	}
	239	else
	240	{
	241	compile_ucm(\*E,lc($name));
	242	}
	243	}
	244	else
	245	{
	246	warn "Cannot open $enc for $name:$!";
	247	}
	248	}
	249
	250	if ($doC)
	251	{
	252	print STDERR "Writing compiled form\n";
	253	foreach my $name (sort cmp_name keys %encoding)
	254	{
	255	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	256	output(\*C,$name.'_utf8',$e2u);
	257	output(\*C,'utf8_'.$name,$u2e);
	258	# push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
	259	}
	260	foreach my $enc (sort cmp_name keys %encoding)
	261	{
	262	# my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
	263	my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
	264	#my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
	265	my $replen = 0;
	266	$replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
	267	my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
	268	my $sym = "${enc}_encoding";
	269	$sym =~ s/\W+/_/g;
	270	print C "encode_t $sym = \n";
	271	# This is to make null encoding work -- dankogai
	272	for (my $i = (scalar @info) - 1; $i >= 0; --$i){
	273	$info[$i] \|\|= 1;
	274	}
	275	# end of null tweak -- dankogai
	276	print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
	277	}
	278
	279	foreach my $enc (sort cmp_name keys %encoding)
	280	{
	281	my $sym = "${enc}_encoding";
	282	$sym =~ s/\W+/_/g;
	283	print H "extern encode_t $sym;\n";
	284	print D " Encode_XSEncoding(aTHX_ &$sym);\n";
	285	}
	286
	287	if ($cname =~ /(\w+)\.xs$/)
	288	{
	289	my $mod = $1;
	290	print C <<'END';
	291
	292	static void
	293	Encode_XSEncoding(pTHX_ encode_t *enc)
	294	{
	295	dSP;
	296	HV *stash = gv_stashpv("Encode::XS", TRUE);
	297	SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
	298	int i = 0;
	299	PUSHMARK(sp);
	300	XPUSHs(sv);
	301	while (enc->name[i])
	302	{
	303	const char *name = enc->name[i++];
	304	XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
	305	}
	306	PUTBACK;
	307	call_pv("Encode::define_encoding",G_DISCARD);
	308	SvREFCNT_dec(sv);
	309	}
	310
	311	END
	312
	313	print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
	314	print C "BOOT:\n{\n";
	315	print C "#include \"$dname\"\n";
	316	print C "}\n";
	317	}
	318	# Close in void context is bad, m'kay
	319	close(D) or warn "Error closing '$dname': $!";
	320	close(H) or warn "Error closing '$hname': $!";
	321
	322	my $perc_saved = $strings/($strings + $saved) * 100;
	323	my $perc_subsaved = $strings/($strings + $subsave) * 100;
	324	printf STDERR "%d bytes in string tables\n",$strings;
	325	printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
	326	$saved, $perc_saved if $saved;
	327	printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
	328	$subsave, $perc_subsaved if $subsave;
	329	}
	330	elsif ($doEnc)
	331	{
	332	foreach my $name (sort cmp_name keys %encoding)
	333	{
	334	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	335	output_enc(\*C,$name,$e2u);
	336	}
	337	}
	338	elsif ($doUcm)
	339	{
	340	foreach my $name (sort cmp_name keys %encoding)
	341	{
	342	my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
	343	output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
	344	}
	345	}
	346
	347	# writing half meg files and then not checking to see if you just filled the
	348	# disk is bad, m'kay
	349	close(C) or die "Error closing '$cname': $!";
	350
	351	# End of the main program.
	352
	353	sub compile_ucm
	354	{
	355	my ($fh,$name) = @_;
	356	my $e2u = {};
	357	my $u2e = {};
	358	my $cs;
	359	my %attr;
	360	while (<$fh>)
	361	{
	362	s/#.*$//;
	363	last if /^\sCHARMAP\s$/i;
	364	if (/^\s<(\w+)>\s+"?([^"])"?\s*$/i) # " # Grrr
	365	{
	366	$attr{$1} = $2;
	367	}
	368	}
	369	if (!defined($cs = $attr{'code_set_name'}))
	370	{
	371	warn "No <code_set_name> in $name\n";
	372	}
	373	else
	374	{
	375	$name = $cs unless exists $opt{'n'};
	376	}
	377	my $erep;
	378	my $urep;
	379	my $max_el;
	380	my $min_el;
	381	if (exists $attr{'subchar'})
	382	{
	383	#my @byte;
	384	#$attr{'subchar'} =~ /^\s*/cg;
	385	#push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
	386	#$erep = join('',map(chr(hex($_)),@byte));
	387	$erep = $attr{'subchar'};
	388	$erep =~ s/^\s+//; $erep =~ s/\s+$//;
	389	}
	390	print "Reading $name ($cs)\n";
	391	my $nfb = 0;
	392	my $hfb = 0;
	393	while (<$fh>)
	394	{
	395	s/#.*$//;
	396	last if /^\sEND\s+CHARMAP\s$/i;
	397	next if /^\s*$/;
	398	my (@uni, @byte) = ();
	399	my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
	400	or die "Bad line: $_";
	401	while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
	402	push @uni, map { substr($_, 1) } split(/\+/, $1);
	403	}
	404	while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
	405	push @byte, $1;
	406	}
	407	if (@uni)
	408	{
	409	my $uch = join('', map { encode_U(hex($_)) } @uni );
	410	my $ech = join('',map(chr(hex($_)),@byte));
	411	my $el = length($ech);
	412	$max_el = $el if (!defined($max_el) \|\| $el > $max_el);
	413	$min_el = $el if (!defined($min_el) \|\| $el < $min_el);
	414	if (length($fb))
	415	{
	416	$fb = substr($fb,1);
	417	$hfb++;
	418	}
	419	else
	420	{
	421	$nfb++;
	422	$fb = '0';
	423	}
	424	# $fb is fallback flag
	425	# 0 - round trip safe
	426	# 1 - fallback for unicode -> enc
	427	# 2 - skip sub-char mapping
	428	# 3 - fallback enc -> unicode
	429	enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
	430	enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
	431	}
	432	else
	433	{
	434	warn $_;
	435	}
	436	}
	437	if ($nfb && $hfb)
	438	{
	439	die "$nfb entries without fallback, $hfb entries with\n";
	440	}
	441	$encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
	442	}
	443
	444
	445
	446	sub compile_enc
	447	{
	448	my ($fh,$name) = @_;
	449	my $e2u = {};
	450	my $u2e = {};
	451
	452	my $type;
	453	while ($type = <$fh>)
	454	{
	455	last if $type !~ /^\s*#/;
	456	}
	457	chomp($type);
	458	return if $type eq 'E';
	459	# Do the hash lookup once, rather than once per function call. 4% speedup.
	460	my $type_func = $encode_types{$type};
	461	my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
	462	warn "$type encoded $name\n";
	463	my $rep = '';
	464	# Save a defined test by setting these to defined values.
	465	my $min_el = ~0; # A very big integer
	466	my $max_el = 0; # Anything must be longer than 0
	467	{
	468	my $v = hex($def);
	469	$rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
	470	}
	471	my $errors;
	472	my $seen;
	473	# use -Q to silence the seen test. Makefile.PL uses this by default.
	474	$seen = {} unless $opt{Q};
	475	do
	476	{
	477	my $line = <$fh>;
	478	chomp($line);
	479	my $page = hex($line);
	480	my $ch = 0;
	481	my $i = 16;
	482	do
	483	{
	484	# So why is it 1% faster to leave the my here?
	485	my $line = <$fh>;
	486	$line =~ s/\r\n$/\n/;
	487	die "$.:${line}Line should be exactly 65 characters long including
	488	newline (".length($line).")" unless length ($line) == 65;
	489	# Split line into groups of 4 hex digits, convert groups to ints
	490	# This takes 65.35
	491	# map {hex $_} $line =~ /(....)/g
	492	# This takes 63.75 (2.5% less time)
	493	# unpack "n", pack "H", $line
	494	# There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
	495	# Doing it as while ($line =~ /(....)/g) took 74.63
	496	foreach my $val (unpack "n", pack "H", $line)
	497	{
	498	next if $val == 0xFFFD;
	499	my $ech = &$type_func($ch,$page);
	500	if ($val \|\| (!$ch && !$page))
	501	{
	502	my $el = length($ech);
	503	$max_el = $el if $el > $max_el;
	504	$min_el = $el if $el < $min_el;
	505	my $uch = encode_U($val);
	506	if ($seen) {
	507	# We're doing the test.
	508	# We don't need to read this quickly, so storing it as a scalar,
	509	# rather than 3 (anon array, plus the 2 scalars it holds) saves
	510	# RAM and may make us faster on low RAM systems. [see __END__]
	511	if (exists $seen->{$uch})
	512	{
	513	warn sprintf("U%04X is %02X%02X and %04X\n",
	514	$val,$page,$ch,$seen->{$uch});
	515	$errors++;
	516	}
	517	else
	518	{
	519	$seen->{$uch} = $page << 8 \| $ch;
	520	}
	521	}
	522	# Passing 2 extra args each time is 3.6% slower!
	523	# Even with having to add $fallback \|\|= 0 later
	524	enter_fb0($e2u,$ech,$uch);
	525	enter_fb0($u2e,$uch,$ech);
	526	}
	527	else
	528	{
	529	# No character at this position
	530	# enter($e2u,$ech,undef,$e2u);
	531	}
	532	$ch++;
	533	}
	534	} while --$i;
	535	} while --$pages;
	536	die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
	537	if $min_el > $max_el;
	538	die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
	539	$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
	540	}
	541
	542	# my ($a,$s,$d,$t,$fb) = @_;
	543	sub enter {
	544	my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
	545	# state we shift to after this (multibyte) input character defaults to same
	546	# as current state.
	547	$next \|\|= $current;
	548	# Making sure it is defined seems to be faster than {no warnings;} in
	549	# &process, or passing it in as 0 explicity.
	550	# XXX $fallback \|\|= 0;
	551
	552	# Start at the beginning and work forwards through the string to zero.
	553	# effectively we are removing 1 character from the front each time
	554	# but we don't actually edit the string. [this alone seems to be 14% speedup]
	555	# Hence -$pos is the length of the remaining string.
	556	my $pos = -length $inbytes;
	557	while (1) {
	558	my $byte = substr $inbytes, $pos, 1;
	559	# RAW_NEXT => 0,
	560	# RAW_IN_LEN => 1,
	561	# RAW_OUT_BYTES => 2,
	562	# RAW_FALLBACK => 3,
	563	# to unicode an array would seem to be better, because the pages are dense.
	564	# from unicode can be very sparse, favouring a hash.
	565	# hash using the bytes (all length 1) as keys rather than ord value,
	566	# as it's easier to sort these in &process.
	567
	568	# It's faster to always add $fallback even if it's undef, rather than
	569	# choosing between 3 and 4 element array. (hence why we set it defined
	570	# above)
	571	my $do_now = $current->{Raw}{$byte} \|\|= [{},-$pos,'',$fallback];
	572	# When $pos was -1 we were at the last input character.
	573	unless (++$pos) {
	574	$do_now->[RAW_OUT_BYTES] = $outbytes;
	575	$do_now->[RAW_NEXT] = $next;
	576	return;
	577	}
	578	# Tail recursion. The intermdiate state may not have a name yet.
	579	$current = $do_now->[RAW_NEXT];
	580	}
	581	}
	582
	583	# This is purely for optimistation. It's just &enter hard coded for $fallback
	584	# of 0, using only a 3 entry array ref to save memory for every entry.
	585	sub enter_fb0 {
	586	my ($current,$inbytes,$outbytes,$next) = @_;
	587	$next \|\|= $current;
	588
	589	my $pos = -length $inbytes;
	590	while (1) {
	591	my $byte = substr $inbytes, $pos, 1;
	592	my $do_now = $current->{Raw}{$byte} \|\|= [{},-$pos,''];
	593	unless (++$pos) {
	594	$do_now->[RAW_OUT_BYTES] = $outbytes;
	595	$do_now->[RAW_NEXT] = $next;
	596	return;
	597	}
	598	$current = $do_now->[RAW_NEXT];
	599	}
	600	}
	601
	602
	603	sub outstring
	604	{
	605	my ($fh,$name,$s) = @_;
	606	my $sym = $strings{$s};
	607	if ($sym)
	608	{
	609	$saved += length($s);
	610	}
	611	else
	612	{
	613	if ($opt{'O'}) {
	614	foreach my $o (keys %strings)
	615	{
	616	next unless (my $i = index($o,$s)) >= 0;
	617	$sym = $strings{$o};
	618	# gcc things that 0x0e+0x10 (anything with e+) starts to look like
	619	# a hexadecimal floating point constant. Silly gcc. Only p
	620	# introduces a floating point constant. Put the space in to stop it
	621	# getting confused.
	622	$sym .= sprintf(" +0x%02x",$i) if ($i);
	623	$subsave += length($s);
	624	return $strings{$s} = $sym;
	625	}
	626	}
	627	$strings{$s} = $sym = $name;
	628	$strings += length($s);
	629	my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
	630	# Maybe we should assert that these are all <256.
	631	$definition .= join(',',unpack "C*",$s);
	632	# We have a single long line. Split it at convenient commas.
	633	$definition =~ s/(.{74,77},)/$1\n/g;
	634	print $fh "$definition };\n\n";
	635	}
	636	return $sym;
	637	}
	638
	639	sub process
	640	{
	641	my ($name,$a) = @_;
	642	$name =~ s/\W+/_/g;
	643	$a->{Cname} = $name;
	644	my $raw = $a->{Raw};
	645	my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
	646	my @ent;
	647	$agg_max_in = 0;
	648	foreach my $key (sort keys %$raw) {
	649	# RAW_NEXT => 0,
	650	# RAW_IN_LEN => 1,
	651	# RAW_OUT_BYTES => 2,
	652	# RAW_FALLBACK => 3,
	653	my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
	654	# Now we are converting from raw to aggregate, switch from 1 byte strings
	655	# to numbers
	656	my $b = ord $key;
	657	$fallback \|\|= 0;
	658	if ($l &&
	659	# If this == fails, we're going to reset $agg_max_in below anyway.
	660	$b == ++$agg_max_in &&
	661	# References in numeric context give the pointer as an int.
	662	$agg_next == $next &&
	663	$agg_in_len == $in_len &&
	664	$agg_out_len == length $out_bytes &&
	665	$agg_fallback == $fallback
	666	# && length($l->[AGG_OUT_BYTES]) < 16
	667	) {
	668	# my $i = ord($b)-ord($l->[AGG_MIN_IN]);
	669	# we can aggregate this byte onto the end.
	670	$l->[AGG_MAX_IN] = $b;
	671	$l->[AGG_OUT_BYTES] .= $out_bytes;
	672	} else {
	673	# AGG_MIN_IN => 0,
	674	# AGG_MAX_IN => 1,
	675	# AGG_OUT_BYTES => 2,
	676	# AGG_NEXT => 3,
	677	# AGG_IN_LEN => 4,
	678	# AGG_OUT_LEN => 5,
	679	# AGG_FALLBACK => 6,
	680	# Reset the last thing we saw, plus set 5 lexicals to save some derefs.
	681	# (only gains .6% on euc-jp -- is it worth it?)
	682	push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
	683	$agg_in_len = $in_len, $agg_out_len = length $out_bytes,
	684	$agg_fallback = $fallback];
	685	}
	686	if (exists $next->{Cname}) {
	687	$next->{'Forward'} = 1 if $next != $a;
	688	} else {
	689	process(sprintf("%s_%02x",$name,$b),$next);
	690	}
	691	}
	692	# encengine.c rules say that last entry must be for 255
	693	if ($agg_max_in < 255) {
	694	push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
	695	}
	696	$a->{'Entries'} = \@ent;
	697	}
	698
	699	sub outtable
	700	{
	701	my ($fh,$a) = @_;
	702	my $name = $a->{'Cname'};
	703	# String tables
	704	foreach my $b (@{$a->{'Entries'}})
	705	{
	706	next unless $b->[AGG_OUT_LEN];
	707	my $s = $b->[AGG_MIN_IN];
	708	my $e = $b->[AGG_MAX_IN];
	709	outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
	710	}
	711	if ($a->{'Forward'})
	712	{
	713	my $var = $^O eq 'MacOS' ? 'extern' : 'static';
	714	print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
	715	}
	716	$a->{'Done'} = 1;
	717	foreach my $b (@{$a->{'Entries'}})
	718	{
	719	my ($s,$e,$out,$t,$end,$l) = @$b;
	720	outtable($fh,$t) unless $t->{'Done'};
	721	}
	722	print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
	723	foreach my $b (@{$a->{'Entries'}})
	724	{
	725	my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
	726	# $end \|= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
	727	print $fh "{";
	728	if ($l)
	729	{
	730	printf $fh outstring($fh,'',$out);
	731	}
	732	else
	733	{
	734	print $fh "0";
	735	}
	736	print $fh ",",$t->{Cname};
	737	printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
	738	}
	739	print $fh "};\n";
	740	}
	741
	742	sub output
	743	{
	744	my ($fh,$name,$a) = @_;
	745	process($name,$a);
	746	# Sub-tables
	747	outtable($fh,$a);
	748	}
	749
	750	sub output_enc
	751	{
	752	my ($fh,$name,$a) = @_;
	753	die "Changed - fix me for new structure";
	754	foreach my $b (sort keys %$a)
	755	{
	756	my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
	757	}
	758	}
	759
	760	sub decode_U
	761	{
	762	my $s = shift;
	763	}
	764
	765	my @uname;
	766	sub char_names
	767	{
	768	my $s = do "unicore/Name.pl";
	769	die "char_names: unicore/Name.pl: $!\n" unless defined $s;
	770	pos($s) = 0;
	771	while ($s =~ /\G([0-9a-f]+)\t([0-9a-f])\t(.?)\s*\n/igc)
	772	{
	773	my $name = $3;
	774	my $s = hex($1);
	775	last if $s >= 0x10000;
	776	my $e = length($2) ? hex($2) : $s;
	777	for (my $i = $s; $i <= $e; $i++)
	778	{
	779	$uname[$i] = $name;
	780	# print sprintf("U%04X $name\n",$i);
	781	}
	782	}
	783	}
	784
	785	sub output_ucm_page
	786	{
	787	my ($cmap,$a,$t,$pre) = @_;
	788	# warn sprintf("Page %x\n",$pre);
	789	my $raw = $t->{Raw};
	790	foreach my $key (sort keys %$raw) {
	791	# RAW_NEXT => 0,
	792	# RAW_IN_LEN => 1,
	793	# RAW_OUT_BYTES => 2,
	794	# RAW_FALLBACK => 3,
	795	my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
	796	my $u = ord $key;
	797	$fallback \|\|= 0;
	798
	799	if ($next != $a && $next != $t) {
	800	output_ucm_page($cmap,$a,$next,(($pre\|($u &0x3F)) << 6)&0xFFFF);
	801	} elsif (length $out_bytes) {
	802	if ($pre) {
	803	$u = $pre\|($u &0x3f);
	804	}
	805	my $s = sprintf "<U%04X> ",$u;
	806	#foreach my $c (split(//,$out_bytes)) {
	807	# $s .= sprintf "\\x%02X",ord($c);
	808	#}
	809	# 9.5% faster changing that loop to this:
	810	$s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
	811	$s .= sprintf " \|%d # %s\n",($fallback ? 1 : 0),$uname[$u];
	812	push(@$cmap,$s);
	813	} else {
	814	warn join(',',$u, @{$raw->{$key}},$a,$t);
	815	}
	816	}
	817	}
	818
	819	sub output_ucm
	820	{
	821	my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
	822	print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
	823	print $fh "<code_set_name> \"$name\"\n";
	824	char_names();
	825	if (defined $min_el)
	826	{
	827	print $fh "<mb_cur_min> $min_el\n";
	828	}
	829	if (defined $max_el)
	830	{
	831	print $fh "<mb_cur_max> $max_el\n";
	832	}
	833	if (defined $rep)
	834	{
	835	print $fh "<subchar> ";
	836	foreach my $c (split(//,$rep))
	837	{
	838	printf $fh "\\x%02X",ord($c);
	839	}
	840	print $fh "\n";
	841	}
	842	my @cmap;
	843	output_ucm_page(\@cmap,$h,$h,0);
	844	print $fh "#\nCHARMAP\n";
	845	foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
	846	{
	847	print $fh $line;
	848	}
	849	print $fh "END CHARMAP\n";
	850	}
	851
	852	use vars qw(
	853	$_Enc2xs
	854	$_Version
	855	$_Inc
	856	$_E2X
	857	$_Name
	858	$_TableFiles
	859	$_Now
	860	);
	861
	862	sub find_e2x{
	863	eval { require File::Find };
	864	my (@inc, %e2x_dir);
	865	for my $inc (@INC){
	866	push @inc, $inc unless $inc eq '.'; #skip current dir
	867	}
	868	File::Find::find(
	869	sub {
	870	my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	871	$atime,$mtime,$ctime,$blksize,$blocks)
	872	= lstat($_) or return;
	873	-f _ or return;
	874	if (/^.*\.e2x$/o){
	875	$e2x_dir{$File::Find::dir} \|\|= $mtime;
	876	}
	877	return;
	878	}, @inc);
	879	warn join("\n", keys %e2x_dir), "\n";
	880	for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
	881	$_E2X = $d;
	882	# warn "$_E2X => ", scalar localtime($e2x_dir{$d});
	883	return $_E2X;
	884	}
	885	}
	886
	887	sub make_makefile_pl
	888	{
	889	eval { require Encode; };
	890	$@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
	891	# our used for variable expanstion
	892	$_Enc2xs = $0;
	893	$_Version = $VERSION;
	894	$_E2X = find_e2x();
	895	$_Name = shift;
	896	$_TableFiles = join(",", map {qq('$_')} @_);
	897	$_Now = scalar localtime();
	898
	899	eval { require File::Spec; };
	900	_print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
	901	_print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
	902	_print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
	903	_print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
	904	_print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
	905	exit;
	906	}
	907
	908	use vars qw(
	909	$_ModLines
	910	$_LocalVer
	911	);
	912
	913	sub make_configlocal_pm
	914	{
	915	eval { require Encode; };
	916	$@ and die "Unable to require Encode: $@\n";
	917	eval { require File::Spec; };
	918	# our used for variable expanstion
	919	my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
	920	my %LocalMod = ();
	921	for my $d (@INC){
	922	my $inc = File::Spec->catfile($d, "Encode");
	923	-d $inc or next;
	924	opendir my $dh, $inc or die "$inc:$!";
	925	warn "Checking $inc...\n";
	926	for my $f (grep /\.pm$/o, readdir($dh)){
	927	-f File::Spec->catfile($inc, "$f") or next;
	928	$INC{"Encode/$f"} and next;
	929	warn "require Encode/$f;\n";
	930	eval { require "Encode/$f"; };
	931	$@ and die "Can't require Encode/$f: $@\n";
	932	for my $enc (Encode->encodings()){
	933	$in_core{$enc} and next;
	934	$Encode::Config::ExtModule{$enc} and next;
	935	my $mod = "Encode/$f";
	936	$mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
	937	$LocalMod{$enc} \|\|= $mod;
	938	}
	939	}
	940	}
	941	$_ModLines = "";
	942	for my $enc (sort keys %LocalMod){
	943	$_ModLines .=
	944	qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
	945	}
	946	warn $_ModLines;
	947	$_LocalVer = _mkversion();
	948	$_E2X = find_e2x();
	949	$_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
	950	_print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
	951	File::Spec->catfile($_Inc,"ConfigLocal.pm"),
	952	1);
	953	exit;
	954	}
	955
	956	sub _mkversion{
	957	my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
	958	$yyyy += 1900, $mo +=1;
	959	return sprintf("v%04d.%04d.%04d", $yyyy, $mo100+$dd, $hh100+$mm);
	960	}
	961
	962	sub _print_expand{
	963	eval { require File::Basename; };
	964	$@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
	965	File::Basename->import();
	966	my ($src, $dst, $clobber) = @_;
	967	if (!$clobber and -e $dst){
	968	warn "$dst exists. skipping\n";
	969	return;
	970	}
	971	warn "Generating $dst...\n";
	972	open my $in, $src or die "$src : $!";
	973	if ((my $d = dirname($dst)) ne '.'){
	974	-d $d or mkdir $d, 0755 or die "mkdir $d : $!";
	975	}
	976	open my $out, ">$dst" or die "$!";
	977	my $asis = 0;
	978	while (<$in>){
	979	if (/^#### END_OF_HEADER/){
	980	$asis = 1; next;
	981	}
	982	s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
	983	print $out $_;
	984	}
	985	}
	986	__END__
	987
	988	=head1 NAME
	989
	990	enc2xs -- Perl Encode Module Generator
	991
	992	=head1 SYNOPSIS
	993
	994	enc2xs -[options]
	995	enc2xs -M ModName mapfiles...
	996	enc2xs -C
	997
	998	=head1 DESCRIPTION
	999
	1000	F<enc2xs> builds a Perl extension for use by Encode from either
	1001	Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
	1002	Besides being used internally during the build process of the Encode
	1003	module, you can use F<enc2xs> to add your own encoding to perl.
	1004	No knowledge of XS is necessary.
	1005
	1006	=head1 Quick Guide
	1007
	1008	If you want to know as little about Perl as possible but need to
	1009	add a new encoding, just read this chapter and forget the rest.
	1010
	1011	=over 4
	1012
	1013	=item 0.
	1014
	1015	Have a .ucm file ready. You can get it from somewhere or you can write
	1016	your own from scratch or you can grab one from the Encode distribution
	1017	and customize it. For the UCM format, see the next Chapter. In the
	1018	example below, I'll call my theoretical encoding myascii, defined
	1019	in I<my.ucm>. C<$> is a shell prompt.
	1020
	1021	$ ls -F
	1022	my.ucm
	1023
	1024	=item 1.
	1025
	1026	Issue a command as follows;
	1027
	1028	$ enc2xs -M My my.ucm
	1029	generating Makefile.PL
	1030	generating My.pm
	1031	generating README
	1032	generating Changes
	1033
	1034	Now take a look at your current directory. It should look like this.
	1035
	1036	$ ls -F
	1037	Makefile.PL My.pm my.ucm t/
	1038
	1039	The following files were created.
	1040
	1041	Makefile.PL - MakeMaker script
	1042	My.pm - Encode submodule
	1043	t/My.t - test file
	1044
	1045	=over 4
	1046
	1047	=item 1.1.
	1048
	1049	If you want *.ucm installed together with the modules, do as follows;
	1050
	1051	$ mkdir Encode
	1052	$ mv *.ucm Encode
	1053	$ enc2xs -M My Encode/*ucm
	1054
	1055	=back
	1056
	1057	=item 2.
	1058
	1059	Edit the files generated. You don't have to if you have no time AND no
	1060	intention to give it to someone else. But it is a good idea to edit
	1061	the pod and to add more tests.
	1062
	1063	=item 3.
	1064
	1065	Now issue a command all Perl Mongers love:
	1066
	1067	$ perl Makefile.PL
	1068	Writing Makefile for Encode::My
	1069
	1070	=item 4.
	1071
	1072	Now all you have to do is make.
	1073
	1074	$ make
	1075	cp My.pm blib/lib/Encode/My.pm
	1076	/usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
	1077	-o encode_t.c -f encode_t.fnm
	1078	Reading myascii (myascii)
	1079	Writing compiled form
	1080	128 bytes in string tables
	1081	384 bytes (25%) saved spotting duplicates
	1082	1 bytes (99.2%) saved using substrings
	1083	....
	1084	chmod 644 blib/arch/auto/Encode/My/My.bs
	1085	$
	1086
	1087	The time it takes varies depending on how fast your machine is and
	1088	how large your encoding is. Unless you are working on something big
	1089	like euc-tw, it won't take too long.
	1090
	1091	=item 5.
	1092
	1093	You can "make install" already but you should test first.
	1094
	1095	$ make test
	1096	PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
	1097	-e 'use Test::Harness qw(&runtests $verbose); \
	1098	$verbose=0; runtests @ARGV;' t/*.t
	1099	t/My....ok
	1100	All tests successful.
	1101	Files=1, Tests=2, 0 wallclock secs
	1102	( 0.09 cusr + 0.01 csys = 0.09 CPU)
	1103
	1104	=item 6.
	1105
	1106	If you are content with the test result, just "make install"
	1107
	1108	=item 7.
	1109
	1110	If you want to add your encoding to Encode's demand-loading list
	1111	(so you don't have to "use Encode::YourEncoding"), run
	1112
	1113	enc2xs -C
	1114
	1115	to update Encode::ConfigLocal, a module that controls local settings.
	1116	After that, "use Encode;" is enough to load your encodings on demand.
	1117
	1118	=back
	1119
	1120	=head1 The Unicode Character Map
	1121
	1122	Encode uses the Unicode Character Map (UCM) format for source character
	1123	mappings. This format is used by IBM's ICU package and was adopted
	1124	by Nick Ing-Simmons for use with the Encode module. Since UCM is
	1125	more flexible than Tcl's Encoding Map and far more user-friendly,
	1126	this is the recommended formet for Encode now.
	1127
	1128	A UCM file looks like this.
	1129
	1130	#
	1131	# Comments
	1132	#
	1133	<code_set_name> "US-ascii" # Required
	1134	<code_set_alias> "ascii" # Optional
	1135	<mb_cur_min> 1 # Required; usually 1
	1136	<mb_cur_max> 1 # Max. # of bytes/char
	1137	<subchar> \x3F # Substitution char
	1138	#
	1139	CHARMAP
	1140	<U0000> \x00 \|0 # <control>
	1141	<U0001> \x01 \|0 # <control>
	1142	<U0002> \x02 \|0 # <control>
	1143	....
	1144	<U007C> \x7C \|0 # VERTICAL LINE
	1145	<U007D> \x7D \|0 # RIGHT CURLY BRACKET
	1146	<U007E> \x7E \|0 # TILDE
	1147	<U007F> \x7F \|0 # <control>
	1148	END CHARMAP
	1149
	1150	=over 4
	1151
	1152	=item *
	1153
	1154	Anything that follows C<#> is treated as a comment.
	1155
	1156	=item *
	1157
	1158	The header section continues until a line containing the word
	1159	CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
	1160	pair per line. Strings used as values must be quoted. Barewords are
	1161	treated as numbers. I<\xXX> represents a byte.
	1162
	1163	Most of the keywords are self-explanatory. I<subchar> means
	1164	substitution character, not subcharacter. When you decode a Unicode
	1165	sequence to this encoding but no matching character is found, the byte
	1166	sequence defined here will be used. For most cases, the value here is
	1167	\x3F; in ASCII, this is a question mark.
	1168
	1169	=item *
	1170
	1171	CHARMAP starts the character map section. Each line has a form as
	1172	follows:
	1173
	1174	<UXXXX> \xXX.. \|0 # comment
	1175	^ ^ ^
	1176	\| \| +- Fallback flag
	1177	\| +-------- Encoded byte sequence
	1178	+-------------- Unicode Character ID in hex
	1179
	1180	The format is roughly the same as a header section except for the
	1181	fallback flag: \| followed by 0..3. The meaning of the possible
	1182	values is as follows:
	1183
	1184	=over 4
	1185
	1186	=item \|0
	1187
	1188	Round trip safe. A character decoded to Unicode encodes back to the
	1189	same byte sequence. Most characters have this flag.
	1190
	1191	=item \|1
	1192
	1193	Fallback for unicode -> encoding. When seen, enc2xs adds this
	1194	character for the encode map only.
	1195
	1196	=item \|2
	1197
	1198	Skip sub-char mapping should there be no code point.
	1199
	1200	=item \|3
	1201
	1202	Fallback for encoding -> unicode. When seen, enc2xs adds this
	1203	character for the decode map only.
	1204
	1205	=back
	1206
	1207	=item *
	1208
	1209	And finally, END OF CHARMAP ends the section.
	1210
	1211	=back
	1212
	1213	When you are manually creating a UCM file, you should copy ascii.ucm
	1214	or an existing encoding which is close to yours, rather than write
	1215	your own from scratch.
	1216
	1217	When you do so, make sure you leave at least B<U0000> to B<U0020> as
	1218	is, unless your environment is EBCDIC.
	1219
	1220	B<CAVEAT>: not all features in UCM are implemented. For example,
	1221	icu:state is not used. Because of that, you need to write a perl
	1222	module if you want to support algorithmical encodings, notably
	1223	the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
	1224	L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
	1225
	1226	=head2 Coping with duplicate mappings
	1227
	1228	When you create a map, you SHOULD make your mappings round-trip safe.
	1229	That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
	1230	$data> stands for all characters that are marked as C<\|0>. Here is
	1231	how to make sure:
	1232
	1233	=over 4
	1234
	1235	=item *
	1236
	1237	Sort your map in Unicode order.
	1238
	1239	=item *
	1240
	1241	When you have a duplicate entry, mark either one with '\|1' or '\|3'.
	1242
	1243	=item *
	1244
	1245	And make sure the '\|1' or '\|3' entry FOLLOWS the '\|0' entry.
	1246
	1247	=back
	1248
	1249	Here is an example from big5-eten.
	1250
	1251	<U2550> \xF9\xF9 \|0
	1252	<U2550> \xA2\xA4 \|3
	1253
	1254	Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
	1255	this;
	1256
	1257	E to U U to E
	1258	--------------------------------------
	1259	\xF9\xF9 => U2550 U2550 => \xF9\xF9
	1260	\xA2\xA4 => U2550
	1261
	1262	So it is round-trip safe for \xF9\xF9. But if the line above is upside
	1263	down, here is what happens.
	1264
	1265	E to U U to E
	1266	--------------------------------------
	1267	\xA2\xA4 => U2550 U2550 => \xF9\xF9
	1268	(\xF9\xF9 => U2550 is now overwritten!)
	1269
	1270	The Encode package comes with F<ucmlint>, a crude but sufficient
	1271	utility to check the integrity of a UCM file. Check under the
	1272	Encode/bin directory for this.
	1273
	1274
	1275	=head1 Bookmarks
	1276
	1277	=over 4
	1278
	1279	=item *
	1280
	1281	ICU Home Page
	1282	L<http://oss.software.ibm.com/icu/>
	1283
	1284	=item *
	1285
	1286	ICU Character Mapping Tables
	1287	L<http://oss.software.ibm.com/icu/charset/>
	1288
	1289	=item *
	1290
	1291	ICU:Conversion Data
	1292	L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
	1293
	1294	=back
	1295
	1296	=head1 SEE ALSO
	1297
	1298	L<Encode>,
	1299	L<perlmod>,
	1300	L<perlpod>
	1301
	1302	=cut
	1303
	1304	# -Q to disable the duplicate codepoint test
	1305	# -S make mapping errors fatal
	1306	# -q to remove comments written to output files
	1307	# -O to enable the (brute force) substring optimiser
	1308	# -o <output> to specify the output file name (else it's the first arg)
	1309	# -f <inlist> to give a file with a list of input files (else use the args)
	1310	# -n <name> to name the encoding (else use the basename of the input file.
	1311
	1312	With %seen holding array refs:
	1313
	1314	865.66 real 28.80 user 8.79 sys
	1315	7904 maximum resident set size
	1316	1356 average shared memory size
	1317	18566 average unshared data size
	1318	229 average unshared stack size
	1319	46080 page reclaims
	1320	33373 page faults
	1321
	1322	With %seen holding simple scalars:
	1323
	1324	342.16 real 27.11 user 3.54 sys
	1325	8388 maximum resident set size
	1326	1394 average shared memory size
	1327	14969 average unshared data size
	1328	236 average unshared stack size
	1329	28159 page reclaims
	1330	9839 page faults
	1331
	1332	Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
	1333	how %seen is storing things its seen. So it is pathalogically bad on a 16M
	1334	RAM machine, but it's going to help even on modern machines.
	1335	Swapping is bad, m'kay :-)