Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / bin / enc2xs
CommitLineData
86530b38
AT
1#!/import/bw/tools/local/perl-5.8.0/bin/perl
2 eval 'exec /import/bw/tools/local/perl-5.8.0/bin/perl -S $0 ${1+"$@"}'
3 if $running_under_some_shell;
4#!./perl
5BEGIN {
6 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
7 # with $ENV{PERL_CORE} set
8 # In case we need it in future...
9 require Config; import Config;
10}
11use strict;
12use Getopt::Std;
13my @orig_ARGV = @ARGV;
14our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15
16# These may get re-ordered.
17# RAW is a do_now as inserted by &enter
18# AGG is an aggreagated do_now, as built up by &process
19
20use constant {
21 RAW_NEXT => 0,
22 RAW_IN_LEN => 1,
23 RAW_OUT_BYTES => 2,
24 RAW_FALLBACK => 3,
25
26 AGG_MIN_IN => 0,
27 AGG_MAX_IN => 1,
28 AGG_OUT_BYTES => 2,
29 AGG_NEXT => 3,
30 AGG_IN_LEN => 4,
31 AGG_OUT_LEN => 5,
32 AGG_FALLBACK => 6,
33};
34
35# (See the algorithm in encengine.c - we're building structures for it)
36
37# There are two sorts of structures.
38# "do_now" (an array, two variants of what needs storing) is whatever we need
39# to do now we've read an input byte.
40# It's housed in a "do_next" (which is how we got to it), and in turn points
41# to a "do_next" which contains all the "do_now"s for the next input byte.
42
43# There will be a "do_next" which is the start state.
44# For a single byte encoding it's the only "do_next" - each "do_now" points
45# back to it, and each "do_now" will cause bytes. There is no state.
46
47# For a multi-byte encoding where all characters in the input are the same
48# length, then there will be a tree of "do_now"->"do_next"->"do_now"
49# branching out from the start state, one step for each input byte.
50# The leaf "do_now"s will all be at the same distance from the start state,
51# only the leaf "do_now"s cause output bytes, and they in turn point back to
52# the start state.
53
54# For an encoding where there are varaible length input byte sequences, you
55# will encounter a leaf "do_now" sooner for the shorter input sequences, but
56# as before the leaves will point back to the start state.
57
58# The system will cope with escape encodings (imagine them as a mostly
59# self-contained tree for each escape state, and cross links between trees
60# at the state-switching characters) but so far no input format defines these.
61
62# The system will also cope with having output "leaves" in the middle of
63# the bifurcating branches, not just at the extremities, but again no
64# input format does this yet.
65
66# There are two variants of the "do_now" structure. The first, smaller variant
67# is generated by &enter as the input file is read. There is one structure
68# for each input byte. Say we are mapping a single byte encoding to a
69# single byte encoding, with "ABCD" going "abcd". There will be
70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
72# &process then walks the tree, building aggregate "do_now" structres for
73# adjacent bytes where possible. The aggregate is for a contiguous range of
74# bytes which each produce the same length of output, each move to the
75# same next state, and each have the same fallback flag.
76# So our 4 RAW "do_now"s above become replaced by a single structure
77# containing:
78# ["A", "D", "abcd", 1, ...]
79# ie, for an input byte $_ in "A".."D", output 1 byte, found as
80# substr ("abcd", (ord $_ - ord "A") * 1, 1)
81# which maps very nicely into pointer arithmetic in C for encengine.c
82
83sub encode_U
84{
85 # UTF-8 encode long hand - only covers part of perl's range
86 ## my $uv = shift;
87 # chr() works in native space so convert value from table
88 # into that space before using chr().
89 my $ch = chr(utf8::unicode_to_native($_[0]));
90 # Now get core perl to encode that the way it likes.
91 utf8::encode($ch);
92 return $ch;
93}
94
95sub encode_S
96{
97 # encode single byte
98 ## my ($ch,$page) = @_; return chr($ch);
99 return chr $_[0];
100}
101
102sub encode_D
103{
104 # encode double byte MS byte first
105 ## my ($ch,$page) = @_; return chr($page).chr($ch);
106 return chr ($_[1]) . chr $_[0];
107}
108
109sub encode_M
110{
111 # encode Multi-byte - single for 0..255 otherwise double
112 ## my ($ch,$page) = @_;
113 ## return &encode_D if $page;
114 ## return &encode_S;
115 return chr ($_[1]) . chr $_[0] if $_[1];
116 return chr $_[0];
117}
118
119my %encode_types = (U => \&encode_U,
120 S => \&encode_S,
121 D => \&encode_D,
122 M => \&encode_M,
123 );
124
125# Win32 does not expand globs on command line
126eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
127
128my %opt;
129# I think these are:
130# -Q to disable the duplicate codepoint test
131# -S make mapping errors fatal
132# -q to remove comments written to output files
133# -O to enable the (brute force) substring optimiser
134# -o <output> to specify the output file name (else it's the first arg)
135# -f <inlist> to give a file with a list of input files (else use the args)
136# -n <name> to name the encoding (else use the basename of the input file.
137getopts('CM:SQqOo:f:n:',\%opt);
138
139$opt{M} and make_makefile_pl($opt{M}, @ARGV);
140$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
141
142# This really should go first, else the die here causes empty (non-erroneous)
143# output files to be written.
144my @encfiles;
145if (exists $opt{'f'}) {
146 # -F is followed by name of file containing list of filenames
147 my $flist = $opt{'f'};
148 open(FLIST,$flist) || die "Cannot open $flist:$!";
149 chomp(@encfiles = <FLIST>);
150 close(FLIST);
151} else {
152 @encfiles = @ARGV;
153}
154
155my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
156chmod(0666,$cname) if -f $cname && !-w $cname;
157open(C,">$cname") || die "Cannot open $cname:$!";
158
159my $dname = $cname;
160my $hname = $cname;
161
162my ($doC,$doEnc,$doUcm,$doPet);
163
164if ($cname =~ /\.(c|xs)$/)
165 {
166 $doC = 1;
167 $dname =~ s/(\.[^\.]*)?$/.exh/;
168 chmod(0666,$dname) if -f $cname && !-w $dname;
169 open(D,">$dname") || die "Cannot open $dname:$!";
170 $hname =~ s/(\.[^\.]*)?$/.h/;
171 chmod(0666,$hname) if -f $cname && !-w $hname;
172 open(H,">$hname") || die "Cannot open $hname:$!";
173
174 foreach my $fh (\*C,\*D,\*H)
175 {
176 print $fh <<"END" unless $opt{'q'};
177/*
178 !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
179 This file was autogenerated by:
180 $^X $0 @orig_ARGV
181*/
182END
183 }
184
185 if ($cname =~ /(\w+)\.xs$/)
186 {
187 print C "#include <EXTERN.h>\n";
188 print C "#include <perl.h>\n";
189 print C "#include <XSUB.h>\n";
190 print C "#define U8 U8\n";
191 }
192 print C "#include \"encode.h\"\n";
193
194 }
195elsif ($cname =~ /\.enc$/)
196 {
197 $doEnc = 1;
198 }
199elsif ($cname =~ /\.ucm$/)
200 {
201 $doUcm = 1;
202 }
203elsif ($cname =~ /\.pet$/)
204 {
205 $doPet = 1;
206 }
207
208my %encoding;
209my %strings;
210my $saved = 0;
211my $subsave = 0;
212my $strings = 0;
213
214sub cmp_name
215{
216 if ($a =~ /^.*-(\d+)/)
217 {
218 my $an = $1;
219 if ($b =~ /^.*-(\d+)/)
220 {
221 my $r = $an <=> $1;
222 return $r if $r;
223 }
224 }
225 return $a cmp $b;
226}
227
228
229foreach my $enc (sort cmp_name @encfiles)
230 {
231 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
232 $name = $opt{'n'} if exists $opt{'n'};
233 if (open(E,$enc))
234 {
235 if ($sfx eq 'enc')
236 {
237 compile_enc(\*E,lc($name));
238 }
239 else
240 {
241 compile_ucm(\*E,lc($name));
242 }
243 }
244 else
245 {
246 warn "Cannot open $enc for $name:$!";
247 }
248 }
249
250if ($doC)
251 {
252 print STDERR "Writing compiled form\n";
253 foreach my $name (sort cmp_name keys %encoding)
254 {
255 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
256 output(\*C,$name.'_utf8',$e2u);
257 output(\*C,'utf8_'.$name,$u2e);
258 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
259 }
260 foreach my $enc (sort cmp_name keys %encoding)
261 {
262 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
263 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
264 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
265 my $replen = 0;
266 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
267 my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
268 my $sym = "${enc}_encoding";
269 $sym =~ s/\W+/_/g;
270 print C "encode_t $sym = \n";
271 # This is to make null encoding work -- dankogai
272 for (my $i = (scalar @info) - 1; $i >= 0; --$i){
273 $info[$i] ||= 1;
274 }
275 # end of null tweak -- dankogai
276 print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
277 }
278
279 foreach my $enc (sort cmp_name keys %encoding)
280 {
281 my $sym = "${enc}_encoding";
282 $sym =~ s/\W+/_/g;
283 print H "extern encode_t $sym;\n";
284 print D " Encode_XSEncoding(aTHX_ &$sym);\n";
285 }
286
287 if ($cname =~ /(\w+)\.xs$/)
288 {
289 my $mod = $1;
290 print C <<'END';
291
292static void
293Encode_XSEncoding(pTHX_ encode_t *enc)
294{
295 dSP;
296 HV *stash = gv_stashpv("Encode::XS", TRUE);
297 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
298 int i = 0;
299 PUSHMARK(sp);
300 XPUSHs(sv);
301 while (enc->name[i])
302 {
303 const char *name = enc->name[i++];
304 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
305 }
306 PUTBACK;
307 call_pv("Encode::define_encoding",G_DISCARD);
308 SvREFCNT_dec(sv);
309}
310
311END
312
313 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
314 print C "BOOT:\n{\n";
315 print C "#include \"$dname\"\n";
316 print C "}\n";
317 }
318 # Close in void context is bad, m'kay
319 close(D) or warn "Error closing '$dname': $!";
320 close(H) or warn "Error closing '$hname': $!";
321
322 my $perc_saved = $strings/($strings + $saved) * 100;
323 my $perc_subsaved = $strings/($strings + $subsave) * 100;
324 printf STDERR "%d bytes in string tables\n",$strings;
325 printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
326 $saved, $perc_saved if $saved;
327 printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
328 $subsave, $perc_subsaved if $subsave;
329 }
330elsif ($doEnc)
331 {
332 foreach my $name (sort cmp_name keys %encoding)
333 {
334 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
335 output_enc(\*C,$name,$e2u);
336 }
337 }
338elsif ($doUcm)
339 {
340 foreach my $name (sort cmp_name keys %encoding)
341 {
342 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
343 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
344 }
345 }
346
347# writing half meg files and then not checking to see if you just filled the
348# disk is bad, m'kay
349close(C) or die "Error closing '$cname': $!";
350
351# End of the main program.
352
353sub compile_ucm
354{
355 my ($fh,$name) = @_;
356 my $e2u = {};
357 my $u2e = {};
358 my $cs;
359 my %attr;
360 while (<$fh>)
361 {
362 s/#.*$//;
363 last if /^\s*CHARMAP\s*$/i;
364 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
365 {
366 $attr{$1} = $2;
367 }
368 }
369 if (!defined($cs = $attr{'code_set_name'}))
370 {
371 warn "No <code_set_name> in $name\n";
372 }
373 else
374 {
375 $name = $cs unless exists $opt{'n'};
376 }
377 my $erep;
378 my $urep;
379 my $max_el;
380 my $min_el;
381 if (exists $attr{'subchar'})
382 {
383 #my @byte;
384 #$attr{'subchar'} =~ /^\s*/cg;
385 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
386 #$erep = join('',map(chr(hex($_)),@byte));
387 $erep = $attr{'subchar'};
388 $erep =~ s/^\s+//; $erep =~ s/\s+$//;
389 }
390 print "Reading $name ($cs)\n";
391 my $nfb = 0;
392 my $hfb = 0;
393 while (<$fh>)
394 {
395 s/#.*$//;
396 last if /^\s*END\s+CHARMAP\s*$/i;
397 next if /^\s*$/;
398 my (@uni, @byte) = ();
399 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
400 or die "Bad line: $_";
401 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
402 push @uni, map { substr($_, 1) } split(/\+/, $1);
403 }
404 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
405 push @byte, $1;
406 }
407 if (@uni)
408 {
409 my $uch = join('', map { encode_U(hex($_)) } @uni );
410 my $ech = join('',map(chr(hex($_)),@byte));
411 my $el = length($ech);
412 $max_el = $el if (!defined($max_el) || $el > $max_el);
413 $min_el = $el if (!defined($min_el) || $el < $min_el);
414 if (length($fb))
415 {
416 $fb = substr($fb,1);
417 $hfb++;
418 }
419 else
420 {
421 $nfb++;
422 $fb = '0';
423 }
424 # $fb is fallback flag
425 # 0 - round trip safe
426 # 1 - fallback for unicode -> enc
427 # 2 - skip sub-char mapping
428 # 3 - fallback enc -> unicode
429 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
430 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
431 }
432 else
433 {
434 warn $_;
435 }
436 }
437 if ($nfb && $hfb)
438 {
439 die "$nfb entries without fallback, $hfb entries with\n";
440 }
441 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
442}
443
444
445
446sub compile_enc
447{
448 my ($fh,$name) = @_;
449 my $e2u = {};
450 my $u2e = {};
451
452 my $type;
453 while ($type = <$fh>)
454 {
455 last if $type !~ /^\s*#/;
456 }
457 chomp($type);
458 return if $type eq 'E';
459 # Do the hash lookup once, rather than once per function call. 4% speedup.
460 my $type_func = $encode_types{$type};
461 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
462 warn "$type encoded $name\n";
463 my $rep = '';
464 # Save a defined test by setting these to defined values.
465 my $min_el = ~0; # A very big integer
466 my $max_el = 0; # Anything must be longer than 0
467 {
468 my $v = hex($def);
469 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
470 }
471 my $errors;
472 my $seen;
473 # use -Q to silence the seen test. Makefile.PL uses this by default.
474 $seen = {} unless $opt{Q};
475 do
476 {
477 my $line = <$fh>;
478 chomp($line);
479 my $page = hex($line);
480 my $ch = 0;
481 my $i = 16;
482 do
483 {
484 # So why is it 1% faster to leave the my here?
485 my $line = <$fh>;
486 $line =~ s/\r\n$/\n/;
487 die "$.:${line}Line should be exactly 65 characters long including
488 newline (".length($line).")" unless length ($line) == 65;
489 # Split line into groups of 4 hex digits, convert groups to ints
490 # This takes 65.35
491 # map {hex $_} $line =~ /(....)/g
492 # This takes 63.75 (2.5% less time)
493 # unpack "n*", pack "H*", $line
494 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
495 # Doing it as while ($line =~ /(....)/g) took 74.63
496 foreach my $val (unpack "n*", pack "H*", $line)
497 {
498 next if $val == 0xFFFD;
499 my $ech = &$type_func($ch,$page);
500 if ($val || (!$ch && !$page))
501 {
502 my $el = length($ech);
503 $max_el = $el if $el > $max_el;
504 $min_el = $el if $el < $min_el;
505 my $uch = encode_U($val);
506 if ($seen) {
507 # We're doing the test.
508 # We don't need to read this quickly, so storing it as a scalar,
509 # rather than 3 (anon array, plus the 2 scalars it holds) saves
510 # RAM and may make us faster on low RAM systems. [see __END__]
511 if (exists $seen->{$uch})
512 {
513 warn sprintf("U%04X is %02X%02X and %04X\n",
514 $val,$page,$ch,$seen->{$uch});
515 $errors++;
516 }
517 else
518 {
519 $seen->{$uch} = $page << 8 | $ch;
520 }
521 }
522 # Passing 2 extra args each time is 3.6% slower!
523 # Even with having to add $fallback ||= 0 later
524 enter_fb0($e2u,$ech,$uch);
525 enter_fb0($u2e,$uch,$ech);
526 }
527 else
528 {
529 # No character at this position
530 # enter($e2u,$ech,undef,$e2u);
531 }
532 $ch++;
533 }
534 } while --$i;
535 } while --$pages;
536 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
537 if $min_el > $max_el;
538 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
539 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
540}
541
542# my ($a,$s,$d,$t,$fb) = @_;
543sub enter {
544 my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
545 # state we shift to after this (multibyte) input character defaults to same
546 # as current state.
547 $next ||= $current;
548 # Making sure it is defined seems to be faster than {no warnings;} in
549 # &process, or passing it in as 0 explicity.
550 # XXX $fallback ||= 0;
551
552 # Start at the beginning and work forwards through the string to zero.
553 # effectively we are removing 1 character from the front each time
554 # but we don't actually edit the string. [this alone seems to be 14% speedup]
555 # Hence -$pos is the length of the remaining string.
556 my $pos = -length $inbytes;
557 while (1) {
558 my $byte = substr $inbytes, $pos, 1;
559 # RAW_NEXT => 0,
560 # RAW_IN_LEN => 1,
561 # RAW_OUT_BYTES => 2,
562 # RAW_FALLBACK => 3,
563 # to unicode an array would seem to be better, because the pages are dense.
564 # from unicode can be very sparse, favouring a hash.
565 # hash using the bytes (all length 1) as keys rather than ord value,
566 # as it's easier to sort these in &process.
567
568 # It's faster to always add $fallback even if it's undef, rather than
569 # choosing between 3 and 4 element array. (hence why we set it defined
570 # above)
571 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
572 # When $pos was -1 we were at the last input character.
573 unless (++$pos) {
574 $do_now->[RAW_OUT_BYTES] = $outbytes;
575 $do_now->[RAW_NEXT] = $next;
576 return;
577 }
578 # Tail recursion. The intermdiate state may not have a name yet.
579 $current = $do_now->[RAW_NEXT];
580 }
581}
582
583# This is purely for optimistation. It's just &enter hard coded for $fallback
584# of 0, using only a 3 entry array ref to save memory for every entry.
585sub enter_fb0 {
586 my ($current,$inbytes,$outbytes,$next) = @_;
587 $next ||= $current;
588
589 my $pos = -length $inbytes;
590 while (1) {
591 my $byte = substr $inbytes, $pos, 1;
592 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
593 unless (++$pos) {
594 $do_now->[RAW_OUT_BYTES] = $outbytes;
595 $do_now->[RAW_NEXT] = $next;
596 return;
597 }
598 $current = $do_now->[RAW_NEXT];
599 }
600}
601
602
603sub outstring
604{
605 my ($fh,$name,$s) = @_;
606 my $sym = $strings{$s};
607 if ($sym)
608 {
609 $saved += length($s);
610 }
611 else
612 {
613 if ($opt{'O'}) {
614 foreach my $o (keys %strings)
615 {
616 next unless (my $i = index($o,$s)) >= 0;
617 $sym = $strings{$o};
618 # gcc things that 0x0e+0x10 (anything with e+) starts to look like
619 # a hexadecimal floating point constant. Silly gcc. Only p
620 # introduces a floating point constant. Put the space in to stop it
621 # getting confused.
622 $sym .= sprintf(" +0x%02x",$i) if ($i);
623 $subsave += length($s);
624 return $strings{$s} = $sym;
625 }
626 }
627 $strings{$s} = $sym = $name;
628 $strings += length($s);
629 my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
630 # Maybe we should assert that these are all <256.
631 $definition .= join(',',unpack "C*",$s);
632 # We have a single long line. Split it at convenient commas.
633 $definition =~ s/(.{74,77},)/$1\n/g;
634 print $fh "$definition };\n\n";
635 }
636 return $sym;
637}
638
639sub process
640{
641 my ($name,$a) = @_;
642 $name =~ s/\W+/_/g;
643 $a->{Cname} = $name;
644 my $raw = $a->{Raw};
645 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
646 my @ent;
647 $agg_max_in = 0;
648 foreach my $key (sort keys %$raw) {
649 # RAW_NEXT => 0,
650 # RAW_IN_LEN => 1,
651 # RAW_OUT_BYTES => 2,
652 # RAW_FALLBACK => 3,
653 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
654 # Now we are converting from raw to aggregate, switch from 1 byte strings
655 # to numbers
656 my $b = ord $key;
657 $fallback ||= 0;
658 if ($l &&
659 # If this == fails, we're going to reset $agg_max_in below anyway.
660 $b == ++$agg_max_in &&
661 # References in numeric context give the pointer as an int.
662 $agg_next == $next &&
663 $agg_in_len == $in_len &&
664 $agg_out_len == length $out_bytes &&
665 $agg_fallback == $fallback
666 # && length($l->[AGG_OUT_BYTES]) < 16
667 ) {
668 # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
669 # we can aggregate this byte onto the end.
670 $l->[AGG_MAX_IN] = $b;
671 $l->[AGG_OUT_BYTES] .= $out_bytes;
672 } else {
673 # AGG_MIN_IN => 0,
674 # AGG_MAX_IN => 1,
675 # AGG_OUT_BYTES => 2,
676 # AGG_NEXT => 3,
677 # AGG_IN_LEN => 4,
678 # AGG_OUT_LEN => 5,
679 # AGG_FALLBACK => 6,
680 # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
681 # (only gains .6% on euc-jp -- is it worth it?)
682 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
683 $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
684 $agg_fallback = $fallback];
685 }
686 if (exists $next->{Cname}) {
687 $next->{'Forward'} = 1 if $next != $a;
688 } else {
689 process(sprintf("%s_%02x",$name,$b),$next);
690 }
691 }
692 # encengine.c rules say that last entry must be for 255
693 if ($agg_max_in < 255) {
694 push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
695 }
696 $a->{'Entries'} = \@ent;
697}
698
699sub outtable
700{
701 my ($fh,$a) = @_;
702 my $name = $a->{'Cname'};
703 # String tables
704 foreach my $b (@{$a->{'Entries'}})
705 {
706 next unless $b->[AGG_OUT_LEN];
707 my $s = $b->[AGG_MIN_IN];
708 my $e = $b->[AGG_MAX_IN];
709 outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
710 }
711 if ($a->{'Forward'})
712 {
713 my $var = $^O eq 'MacOS' ? 'extern' : 'static';
714 print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
715 }
716 $a->{'Done'} = 1;
717 foreach my $b (@{$a->{'Entries'}})
718 {
719 my ($s,$e,$out,$t,$end,$l) = @$b;
720 outtable($fh,$t) unless $t->{'Done'};
721 }
722 print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
723 foreach my $b (@{$a->{'Entries'}})
724 {
725 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
726 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
727 print $fh "{";
728 if ($l)
729 {
730 printf $fh outstring($fh,'',$out);
731 }
732 else
733 {
734 print $fh "0";
735 }
736 print $fh ",",$t->{Cname};
737 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
738 }
739 print $fh "};\n";
740}
741
742sub output
743{
744 my ($fh,$name,$a) = @_;
745 process($name,$a);
746 # Sub-tables
747 outtable($fh,$a);
748}
749
750sub output_enc
751{
752 my ($fh,$name,$a) = @_;
753 die "Changed - fix me for new structure";
754 foreach my $b (sort keys %$a)
755 {
756 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
757 }
758}
759
760sub decode_U
761{
762 my $s = shift;
763}
764
765my @uname;
766sub char_names
767{
768 my $s = do "unicore/Name.pl";
769 die "char_names: unicore/Name.pl: $!\n" unless defined $s;
770 pos($s) = 0;
771 while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
772 {
773 my $name = $3;
774 my $s = hex($1);
775 last if $s >= 0x10000;
776 my $e = length($2) ? hex($2) : $s;
777 for (my $i = $s; $i <= $e; $i++)
778 {
779 $uname[$i] = $name;
780# print sprintf("U%04X $name\n",$i);
781 }
782 }
783}
784
785sub output_ucm_page
786{
787 my ($cmap,$a,$t,$pre) = @_;
788 # warn sprintf("Page %x\n",$pre);
789 my $raw = $t->{Raw};
790 foreach my $key (sort keys %$raw) {
791 # RAW_NEXT => 0,
792 # RAW_IN_LEN => 1,
793 # RAW_OUT_BYTES => 2,
794 # RAW_FALLBACK => 3,
795 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
796 my $u = ord $key;
797 $fallback ||= 0;
798
799 if ($next != $a && $next != $t) {
800 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
801 } elsif (length $out_bytes) {
802 if ($pre) {
803 $u = $pre|($u &0x3f);
804 }
805 my $s = sprintf "<U%04X> ",$u;
806 #foreach my $c (split(//,$out_bytes)) {
807 # $s .= sprintf "\\x%02X",ord($c);
808 #}
809 # 9.5% faster changing that loop to this:
810 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
811 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
812 push(@$cmap,$s);
813 } else {
814 warn join(',',$u, @{$raw->{$key}},$a,$t);
815 }
816 }
817}
818
819sub output_ucm
820{
821 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
822 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
823 print $fh "<code_set_name> \"$name\"\n";
824 char_names();
825 if (defined $min_el)
826 {
827 print $fh "<mb_cur_min> $min_el\n";
828 }
829 if (defined $max_el)
830 {
831 print $fh "<mb_cur_max> $max_el\n";
832 }
833 if (defined $rep)
834 {
835 print $fh "<subchar> ";
836 foreach my $c (split(//,$rep))
837 {
838 printf $fh "\\x%02X",ord($c);
839 }
840 print $fh "\n";
841 }
842 my @cmap;
843 output_ucm_page(\@cmap,$h,$h,0);
844 print $fh "#\nCHARMAP\n";
845 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
846 {
847 print $fh $line;
848 }
849 print $fh "END CHARMAP\n";
850}
851
852use vars qw(
853 $_Enc2xs
854 $_Version
855 $_Inc
856 $_E2X
857 $_Name
858 $_TableFiles
859 $_Now
860);
861
862sub find_e2x{
863 eval { require File::Find };
864 my (@inc, %e2x_dir);
865 for my $inc (@INC){
866 push @inc, $inc unless $inc eq '.'; #skip current dir
867 }
868 File::Find::find(
869 sub {
870 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
871 $atime,$mtime,$ctime,$blksize,$blocks)
872 = lstat($_) or return;
873 -f _ or return;
874 if (/^.*\.e2x$/o){
875 $e2x_dir{$File::Find::dir} ||= $mtime;
876 }
877 return;
878 }, @inc);
879 warn join("\n", keys %e2x_dir), "\n";
880 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
881 $_E2X = $d;
882 # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
883 return $_E2X;
884 }
885}
886
887sub make_makefile_pl
888{
889 eval { require Encode; };
890 $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
891 # our used for variable expanstion
892 $_Enc2xs = $0;
893 $_Version = $VERSION;
894 $_E2X = find_e2x();
895 $_Name = shift;
896 $_TableFiles = join(",", map {qq('$_')} @_);
897 $_Now = scalar localtime();
898
899 eval { require File::Spec; };
900 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
901 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
902 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
903 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
904 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
905 exit;
906}
907
908use vars qw(
909 $_ModLines
910 $_LocalVer
911 );
912
913sub make_configlocal_pm
914{
915 eval { require Encode; };
916 $@ and die "Unable to require Encode: $@\n";
917 eval { require File::Spec; };
918 # our used for variable expanstion
919 my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
920 my %LocalMod = ();
921 for my $d (@INC){
922 my $inc = File::Spec->catfile($d, "Encode");
923 -d $inc or next;
924 opendir my $dh, $inc or die "$inc:$!";
925 warn "Checking $inc...\n";
926 for my $f (grep /\.pm$/o, readdir($dh)){
927 -f File::Spec->catfile($inc, "$f") or next;
928 $INC{"Encode/$f"} and next;
929 warn "require Encode/$f;\n";
930 eval { require "Encode/$f"; };
931 $@ and die "Can't require Encode/$f: $@\n";
932 for my $enc (Encode->encodings()){
933 $in_core{$enc} and next;
934 $Encode::Config::ExtModule{$enc} and next;
935 my $mod = "Encode/$f";
936 $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
937 $LocalMod{$enc} ||= $mod;
938 }
939 }
940 }
941 $_ModLines = "";
942 for my $enc (sort keys %LocalMod){
943 $_ModLines .=
944 qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
945 }
946 warn $_ModLines;
947 $_LocalVer = _mkversion();
948 $_E2X = find_e2x();
949 $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
950 _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
951 File::Spec->catfile($_Inc,"ConfigLocal.pm"),
952 1);
953 exit;
954}
955
956sub _mkversion{
957 my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
958 $yyyy += 1900, $mo +=1;
959 return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
960}
961
962sub _print_expand{
963 eval { require File::Basename; };
964 $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
965 File::Basename->import();
966 my ($src, $dst, $clobber) = @_;
967 if (!$clobber and -e $dst){
968 warn "$dst exists. skipping\n";
969 return;
970 }
971 warn "Generating $dst...\n";
972 open my $in, $src or die "$src : $!";
973 if ((my $d = dirname($dst)) ne '.'){
974 -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
975 }
976 open my $out, ">$dst" or die "$!";
977 my $asis = 0;
978 while (<$in>){
979 if (/^#### END_OF_HEADER/){
980 $asis = 1; next;
981 }
982 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
983 print $out $_;
984 }
985}
986__END__
987
988=head1 NAME
989
990enc2xs -- Perl Encode Module Generator
991
992=head1 SYNOPSIS
993
994 enc2xs -[options]
995 enc2xs -M ModName mapfiles...
996 enc2xs -C
997
998=head1 DESCRIPTION
999
1000F<enc2xs> builds a Perl extension for use by Encode from either
1001Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1002Besides being used internally during the build process of the Encode
1003module, you can use F<enc2xs> to add your own encoding to perl.
1004No knowledge of XS is necessary.
1005
1006=head1 Quick Guide
1007
1008If you want to know as little about Perl as possible but need to
1009add a new encoding, just read this chapter and forget the rest.
1010
1011=over 4
1012
1013=item 0.
1014
1015Have a .ucm file ready. You can get it from somewhere or you can write
1016your own from scratch or you can grab one from the Encode distribution
1017and customize it. For the UCM format, see the next Chapter. In the
1018example below, I'll call my theoretical encoding myascii, defined
1019in I<my.ucm>. C<$> is a shell prompt.
1020
1021 $ ls -F
1022 my.ucm
1023
1024=item 1.
1025
1026Issue a command as follows;
1027
1028 $ enc2xs -M My my.ucm
1029 generating Makefile.PL
1030 generating My.pm
1031 generating README
1032 generating Changes
1033
1034Now take a look at your current directory. It should look like this.
1035
1036 $ ls -F
1037 Makefile.PL My.pm my.ucm t/
1038
1039The following files were created.
1040
1041 Makefile.PL - MakeMaker script
1042 My.pm - Encode submodule
1043 t/My.t - test file
1044
1045=over 4
1046
1047=item 1.1.
1048
1049If you want *.ucm installed together with the modules, do as follows;
1050
1051 $ mkdir Encode
1052 $ mv *.ucm Encode
1053 $ enc2xs -M My Encode/*ucm
1054
1055=back
1056
1057=item 2.
1058
1059Edit the files generated. You don't have to if you have no time AND no
1060intention to give it to someone else. But it is a good idea to edit
1061the pod and to add more tests.
1062
1063=item 3.
1064
1065Now issue a command all Perl Mongers love:
1066
1067 $ perl Makefile.PL
1068 Writing Makefile for Encode::My
1069
1070=item 4.
1071
1072Now all you have to do is make.
1073
1074 $ make
1075 cp My.pm blib/lib/Encode/My.pm
1076 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1077 -o encode_t.c -f encode_t.fnm
1078 Reading myascii (myascii)
1079 Writing compiled form
1080 128 bytes in string tables
1081 384 bytes (25%) saved spotting duplicates
1082 1 bytes (99.2%) saved using substrings
1083 ....
1084 chmod 644 blib/arch/auto/Encode/My/My.bs
1085 $
1086
1087The time it takes varies depending on how fast your machine is and
1088how large your encoding is. Unless you are working on something big
1089like euc-tw, it won't take too long.
1090
1091=item 5.
1092
1093You can "make install" already but you should test first.
1094
1095 $ make test
1096 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1097 -e 'use Test::Harness qw(&runtests $verbose); \
1098 $verbose=0; runtests @ARGV;' t/*.t
1099 t/My....ok
1100 All tests successful.
1101 Files=1, Tests=2, 0 wallclock secs
1102 ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1103
1104=item 6.
1105
1106If you are content with the test result, just "make install"
1107
1108=item 7.
1109
1110If you want to add your encoding to Encode's demand-loading list
1111(so you don't have to "use Encode::YourEncoding"), run
1112
1113 enc2xs -C
1114
1115to update Encode::ConfigLocal, a module that controls local settings.
1116After that, "use Encode;" is enough to load your encodings on demand.
1117
1118=back
1119
1120=head1 The Unicode Character Map
1121
1122Encode uses the Unicode Character Map (UCM) format for source character
1123mappings. This format is used by IBM's ICU package and was adopted
1124by Nick Ing-Simmons for use with the Encode module. Since UCM is
1125more flexible than Tcl's Encoding Map and far more user-friendly,
1126this is the recommended formet for Encode now.
1127
1128A UCM file looks like this.
1129
1130 #
1131 # Comments
1132 #
1133 <code_set_name> "US-ascii" # Required
1134 <code_set_alias> "ascii" # Optional
1135 <mb_cur_min> 1 # Required; usually 1
1136 <mb_cur_max> 1 # Max. # of bytes/char
1137 <subchar> \x3F # Substitution char
1138 #
1139 CHARMAP
1140 <U0000> \x00 |0 # <control>
1141 <U0001> \x01 |0 # <control>
1142 <U0002> \x02 |0 # <control>
1143 ....
1144 <U007C> \x7C |0 # VERTICAL LINE
1145 <U007D> \x7D |0 # RIGHT CURLY BRACKET
1146 <U007E> \x7E |0 # TILDE
1147 <U007F> \x7F |0 # <control>
1148 END CHARMAP
1149
1150=over 4
1151
1152=item *
1153
1154Anything that follows C<#> is treated as a comment.
1155
1156=item *
1157
1158The header section continues until a line containing the word
1159CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1160pair per line. Strings used as values must be quoted. Barewords are
1161treated as numbers. I<\xXX> represents a byte.
1162
1163Most of the keywords are self-explanatory. I<subchar> means
1164substitution character, not subcharacter. When you decode a Unicode
1165sequence to this encoding but no matching character is found, the byte
1166sequence defined here will be used. For most cases, the value here is
1167\x3F; in ASCII, this is a question mark.
1168
1169=item *
1170
1171CHARMAP starts the character map section. Each line has a form as
1172follows:
1173
1174 <UXXXX> \xXX.. |0 # comment
1175 ^ ^ ^
1176 | | +- Fallback flag
1177 | +-------- Encoded byte sequence
1178 +-------------- Unicode Character ID in hex
1179
1180The format is roughly the same as a header section except for the
1181fallback flag: | followed by 0..3. The meaning of the possible
1182values is as follows:
1183
1184=over 4
1185
1186=item |0
1187
1188Round trip safe. A character decoded to Unicode encodes back to the
1189same byte sequence. Most characters have this flag.
1190
1191=item |1
1192
1193Fallback for unicode -> encoding. When seen, enc2xs adds this
1194character for the encode map only.
1195
1196=item |2
1197
1198Skip sub-char mapping should there be no code point.
1199
1200=item |3
1201
1202Fallback for encoding -> unicode. When seen, enc2xs adds this
1203character for the decode map only.
1204
1205=back
1206
1207=item *
1208
1209And finally, END OF CHARMAP ends the section.
1210
1211=back
1212
1213When you are manually creating a UCM file, you should copy ascii.ucm
1214or an existing encoding which is close to yours, rather than write
1215your own from scratch.
1216
1217When you do so, make sure you leave at least B<U0000> to B<U0020> as
1218is, unless your environment is EBCDIC.
1219
1220B<CAVEAT>: not all features in UCM are implemented. For example,
1221icu:state is not used. Because of that, you need to write a perl
1222module if you want to support algorithmical encodings, notably
1223the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
1224L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1225
1226=head2 Coping with duplicate mappings
1227
1228When you create a map, you SHOULD make your mappings round-trip safe.
1229That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1230$data> stands for all characters that are marked as C<|0>. Here is
1231how to make sure:
1232
1233=over 4
1234
1235=item *
1236
1237Sort your map in Unicode order.
1238
1239=item *
1240
1241When you have a duplicate entry, mark either one with '|1' or '|3'.
1242
1243=item *
1244
1245And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1246
1247=back
1248
1249Here is an example from big5-eten.
1250
1251 <U2550> \xF9\xF9 |0
1252 <U2550> \xA2\xA4 |3
1253
1254Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1255this;
1256
1257 E to U U to E
1258 --------------------------------------
1259 \xF9\xF9 => U2550 U2550 => \xF9\xF9
1260 \xA2\xA4 => U2550
1261
1262So it is round-trip safe for \xF9\xF9. But if the line above is upside
1263down, here is what happens.
1264
1265 E to U U to E
1266 --------------------------------------
1267 \xA2\xA4 => U2550 U2550 => \xF9\xF9
1268 (\xF9\xF9 => U2550 is now overwritten!)
1269
1270The Encode package comes with F<ucmlint>, a crude but sufficient
1271utility to check the integrity of a UCM file. Check under the
1272Encode/bin directory for this.
1273
1274
1275=head1 Bookmarks
1276
1277=over 4
1278
1279=item *
1280
1281ICU Home Page
1282L<http://oss.software.ibm.com/icu/>
1283
1284=item *
1285
1286ICU Character Mapping Tables
1287L<http://oss.software.ibm.com/icu/charset/>
1288
1289=item *
1290
1291ICU:Conversion Data
1292L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
1293
1294=back
1295
1296=head1 SEE ALSO
1297
1298L<Encode>,
1299L<perlmod>,
1300L<perlpod>
1301
1302=cut
1303
1304# -Q to disable the duplicate codepoint test
1305# -S make mapping errors fatal
1306# -q to remove comments written to output files
1307# -O to enable the (brute force) substring optimiser
1308# -o <output> to specify the output file name (else it's the first arg)
1309# -f <inlist> to give a file with a list of input files (else use the args)
1310# -n <name> to name the encoding (else use the basename of the input file.
1311
1312With %seen holding array refs:
1313
1314 865.66 real 28.80 user 8.79 sys
1315 7904 maximum resident set size
1316 1356 average shared memory size
1317 18566 average unshared data size
1318 229 average unshared stack size
1319 46080 page reclaims
1320 33373 page faults
1321
1322With %seen holding simple scalars:
1323
1324 342.16 real 27.11 user 3.54 sys
1325 8388 maximum resident set size
1326 1394 average shared memory size
1327 14969 average unshared data size
1328 236 average unshared stack size
1329 28159 page reclaims
1330 9839 page faults
1331
1332Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1333how %seen is storing things its seen. So it is pathalogically bad on a 16M
1334RAM machine, but it's going to help even on modern machines.
1335Swapping is bad, m'kay :-)