git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/lib/5.8.0/unicore/mktables

... / ...

Commit	Line	Data
	1	#!/usr/bin/perl -w
	2	use strict;
	3	use Carp;
	4
	5	##
	6	## mktables -- create the runtime Perl Unicode files (lib/unicore/*/.pl)
	7	## from the Unicode database files (lib/unicore/*.txt).
	8	##
	9
	10	mkdir("lib", 0755);
	11	mkdir("To", 0755);
	12
	13	##
	14	## Process any args.
	15	##
	16	my $Verbose = 0;
	17	my $MakeTestScript = 0;
	18
	19	while (@ARGV)
	20	{
	21	my $arg = shift @ARGV;
	22	if ($arg eq '-v') {
	23	$Verbose = 1;
	24	} elsif ($arg eq '-q') {
	25	$Verbose = 0;
	26	} elsif ($arg eq '-maketest') {
	27	$MakeTestScript = 1;
	28	} else {
	29	die "usage: $0 [-v\|-q] [-maketest]";
	30	}
	31	}
	32
	33	my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
	34
	35	my $HEADER=<<"EOF";
	36	# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
	37	# This file is built by $0 from e.g. UnicodeData.txt.
	38	# Any changes made here will be lost!
	39
	40	EOF
	41
	42
	43	##
	44	## Given a filename and a reference to an array of lines,
	45	## write the lines to the file only if the contents have not changed.
	46	##
	47	sub WriteIfChanged($\@)
	48	{
	49	my $file = shift;
	50	my $lines = shift;
	51
	52	my $TextToWrite = join '', @$lines;
	53	if (open IN, $file) {
	54	local($/) = undef;
	55	my $PreviousText = <IN>;
	56	close IN;
	57	if ($PreviousText eq $TextToWrite) {
	58	print "$file unchanged.\n" if $Verbose;
	59	return;
	60	}
	61	}
	62	if (not open OUT, ">$file") {
	63	die "$0: can't open $file for output: $!\n";
	64	}
	65	print "$file written.\n" if $Verbose;
	66
	67	print OUT $TextToWrite;
	68	close OUT;
	69	}
	70
	71	##
	72	## The main datastructure (a "Table") represents a set of code points that
	73	## are part of a particular quality (that are part of \pL, \p{InGreek},
	74	## etc.). They are kept as ranges of code points (starting and ending of
	75	## each range).
	76	##
	77	## For example, a range ASCII LETTERS would be represented as:
	78	## [ [ 0x41 => 0x5A, 'UPPER' ],
	79	## [ 0x61 => 0x7A, 'LOWER, ] ]
	80	##
	81	sub RANGE_START() { 0 } ## index into range element
	82	sub RANGE_END() { 1 } ## index into range element
	83	sub RANGE_NAME() { 2 } ## index into range element
	84
	85	## Conceptually, these should really be folded into the 'Table' objects
	86	my %TableInfo;
	87	my %TableDesc;
	88	my %FuzzyNames;
	89	my %AliasInfo;
	90	my %CanonicalToOrig;
	91
	92	##
	93	## Turn something like
	94	## OLD-ITALIC
	95	## into
	96	## OldItalic
	97	##
	98	sub CanonicalName($)
	99	{
	100	my $orig = shift;
	101	my $name = lc $orig;
	102	$name =~ s/(?<![a-z])(\w)/\u$1/g;
	103	$name =~ s/[-_\s]+//g;
	104
	105	$CanonicalToOrig{$name} = $orig if not $CanonicalToOrig{$name};
	106	return $name;
	107	}
	108
	109	##
	110	## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
	111	##
	112	## Called like:
	113	## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1);
	114	##
	115	## Normally, these parameters are set when the Table is created (when the
	116	## Table->New constructor is called), but there are times when it needs to
	117	## be done after-the-fact...)
	118	##
	119	sub New_Prop($$$@)
	120	{
	121	my $Type = shift; ## "Is" or "In";
	122	my $Name = shift;
	123	my $Table = shift;
	124
	125	## remaining args are optional key/val
	126	my %Args = @_;
	127
	128	my $Fuzzy = delete $Args{Fuzzy};
	129	my $Desc = delete $Args{Desc}; # description
	130
	131	$Name = CanonicalName($Name) if $Fuzzy;
	132
	133	## sanity check a few args
	134	if (%Args or ($Type ne 'Is' and $Type ne 'In') or not ref $Table) {
	135	confess "$0: bad args to New_Prop"
	136	}
	137
	138	if (not $TableInfo{$Type}->{$Name})
	139	{
	140	$TableInfo{$Type}->{$Name} = $Table;
	141	$TableDesc{$Type}->{$Name} = $Desc;
	142	if ($Fuzzy) {
	143	$FuzzyNames{$Type}->{$Name} = $Name;
	144	}
	145	}
	146	}
	147
	148
	149	##
	150	## Creates a new Table object.
	151	##
	152	## Args are key/value pairs:
	153	## In => Name -- Name of "In" property to be associated with
	154	## Is => Name -- Name of "Is" property to be associated with
	155	## Fuzzy => Boolean -- True if name can be accessed "fuzzily"
	156	## Desc => String -- Description of the property
	157	##
	158	## No args are required.
	159	##
	160	sub Table::New
	161	{
	162	my $class = shift;
	163	my %Args = @_;
	164
	165	my $Table = bless [], $class;
	166
	167	my $Fuzzy = delete $Args{Fuzzy};
	168	my $Desc = delete $Args{Desc};
	169
	170	for my $Type ('Is', 'In')
	171	{
	172	if (my $Name = delete $Args{$Type}) {
	173	New_Prop($Type => $Name, $Table, Desc => $Desc, Fuzzy => $Fuzzy);
	174	}
	175	}
	176
	177	## shouldn't have any left over
	178	if (%Args) {
	179	confess "$0: bad args to Table->New"
	180	}
	181
	182	return $Table;
	183	}
	184
	185	##
	186	## Returns true if the Table has no code points
	187	##
	188	sub Table::IsEmpty
	189	{
	190	my $Table = shift; #self
	191	return not @$Table;
	192	}
	193
	194	##
	195	## Returns true if the Table has code points
	196	##
	197	sub Table::NotEmpty
	198	{
	199	my $Table = shift; #self
	200	return @$Table;
	201	}
	202
	203	##
	204	## Returns the maximum code point currently in the table.
	205	##
	206	sub Table::Max
	207	{
	208	my $Table = shift; #self
	209	confess "oops" if $Table->IsEmpty; ## must have code points to have a max
	210	return $Table->[-1]->[RANGE_END];
	211	}
	212
	213	##
	214	## Replaces the codepoints in the Table with those in the Table given
	215	## as an arg. (NOTE: this is not a "deep copy").
	216	##
	217	sub Table::Replace($$)
	218	{
	219	my $Table = shift; #self
	220	my $New = shift;
	221
	222	@$Table = @$New;
	223	}
	224
	225	##
	226	## Given a new code point, make the last range of the Table extend to
	227	## include the new (and all intervening) code points.
	228	##
	229	sub Table::Extend
	230	{
	231	my $Table = shift; #self
	232	my $codepoint = shift;
	233
	234	my $PrevMax = $Table->Max;
	235
	236	confess "oops ($codepoint <= $PrevMax)" if $codepoint <= $PrevMax;
	237
	238	$Table->[-1]->[RANGE_END] = $codepoint;
	239	}
	240
	241	##
	242	## Given a code point range start and end (and optional name), blindly
	243	## append them to the list of ranges for the Table.
	244	##
	245	## NOTE: Code points must be added in strictly ascending numeric order.
	246	##
	247	sub Table::RawAppendRange
	248	{
	249	my $Table = shift; #self
	250	my $start = shift;
	251	my $end = shift;
	252	my $name = shift;
	253	$name = "" if not defined $name; ## warning: $name can be "0"
	254
	255	push @$Table, [ $start, # RANGE_START
	256	$end, # RANGE_END
	257	$name ]; # RANGE_NAME
	258	}
	259
	260	##
	261	## Given a code point (and optional name), add it to the Table.
	262	##
	263	## NOTE: Code points must be added in strictly ascending numeric order.
	264	##
	265	sub Table::Append
	266	{
	267	my $Table = shift; #self
	268	my $codepoint = shift;
	269	my $name = shift;
	270	$name = "" if not defined $name; ## warning: $name can be "0"
	271
	272	##
	273	## If we've already got a range working, and this code point is the next
	274	## one in line, and if the name is the same, just extend the current range.
	275	##
	276	if ($Table->NotEmpty
	277	and
	278	$Table->Max == $codepoint - 1
	279	and
	280	$Table->[-1]->[RANGE_NAME] eq $name)
	281	{
	282	$Table->Extend($codepoint);
	283	}
	284	else
	285	{
	286	$Table->RawAppendRange($codepoint, $codepoint, $name);
	287	}
	288	}
	289
	290	##
	291	## Given a code point range starting value and ending value (and name),
	292	## Add the range to teh Table.
	293	##
	294	## NOTE: Code points must be added in strictly ascending numeric order.
	295	##
	296	sub Table::AppendRange
	297	{
	298	my $Table = shift; #self
	299	my $start = shift;
	300	my $end = shift;
	301	my $name = shift;
	302	$name = "" if not defined $name; ## warning: $name can be "0"
	303
	304	$Table->Append($start, $name);
	305	$Table->Extend($end) if $end > $start;
	306	}
	307
	308	##
	309	## Return a new Table that represents all code points not in the Table.
	310	##
	311	sub Table::Invert
	312	{
	313	my $Table = shift; #self
	314
	315	my $New = Table->New();
	316	my $max = -1;
	317	for my $range (@$Table)
	318	{
	319	my $start = $range->[RANGE_START];
	320	my $end = $range->[RANGE_END];
	321	if ($start-1 >= $max+1) {
	322	$New->AppendRange($max+1, $start-1, "");
	323	}
	324	$max = $end;
	325	}
	326	if ($max+1 < $LastUnicodeCodepoint) {
	327	$New->AppendRange($max+1, $LastUnicodeCodepoint);
	328	}
	329	return $New;
	330	}
	331
	332	##
	333	## Merges any number of other tables with $self, returning the new table.
	334	## (existing tables are not modified)
	335	##
	336	##
	337	## Args may be Tables, or individual code points (as integers).
	338	##
	339	## Can be called as either a constructor or a method.
	340	##
	341	sub Table::Merge
	342	{
	343	shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class
	344	my @Tables = @_;
	345
	346	## Accumulate all records from all tables
	347	my @Records;
	348	for my $Arg (@Tables)
	349	{
	350	if (ref $Arg) {
	351	## arg is a table -- get its ranges
	352	push @Records, @$Arg;
	353	} else {
	354	## arg is a codepoint, make a range
	355	push @Records, [ $Arg, $Arg ]
	356	}
	357	}
	358
	359	## sort by range start, with longer ranges coming first.
	360	my ($first, @Rest) = sort {
	361	($a->[RANGE_START] <=> $b->[RANGE_START])
	362	or
	363	($b->[RANGE_END] <=> $b->[RANGE_END])
	364	} @Records;
	365
	366	my $New = Table->New();
	367
	368	## Ensuring the first range is there makes the subsequent loop easier
	369	$New->AppendRange($first->[RANGE_START],
	370	$first->[RANGE_END]);
	371
	372	## Fold in records so long as they add new information.
	373	for my $set (@Rest)
	374	{
	375	my $start = $set->[RANGE_START];
	376	my $end = $set->[RANGE_END];
	377	if ($start > $New->Max) {
	378	$New->AppendRange($start, $end);
	379	} elsif ($end > $New->Max) {
	380	$New->Extend($end);
	381	}
	382	}
	383
	384	return $New;
	385	}
	386
	387	##
	388	## Given a filename, write a representation of the Table to a file.
	389	## May have an optional comment as a 2nd arg.
	390	##
	391	sub Table::Write
	392	{
	393	my $Table = shift; #self
	394	my $filename = shift;
	395	my $comment = shift;
	396
	397	my @OUT = $HEADER;
	398	if (defined $comment) {
	399	$comment =~ s/\s+\Z//;
	400	$comment =~ s/^/# /gm;
	401	push @OUT, "#\n$comment\n#\n";
	402	}
	403	push @OUT, "return <<'END';\n";
	404
	405	for my $set (@$Table)
	406	{
	407	my $start = $set->[RANGE_START];
	408	my $end = $set->[RANGE_END];
	409	my $name = $set->[RANGE_NAME];
	410
	411	if ($start == $end) {
	412	push @OUT, sprintf "%04X\t\t%s\n", $start, $name;
	413	} else {
	414	push @OUT, sprintf "%04X\t%04X\t%s\n", $start, $end, $name;
	415	}
	416	}
	417
	418	push @OUT, "END\n";
	419
	420	WriteIfChanged($filename, @OUT);
	421	}
	422
	423	## This used only for making the test script.
	424	## helper function
	425	sub IsUsable($)
	426	{
	427	my $code = shift;
	428	return 0 if $code <= 0x0000; ## don't use null
	429	return 0 if $code >= $LastUnicodeCodepoint; ## keep in range
	430	return 0 if ($code >= 0xD800 and $code <= 0xDFFF); ## no surrogates
	431	return 0 if ($code >= 0xFDD0 and $code <= 0xFDEF); ## utf8.c says no good
	432	return 0 if (($code & 0xFFFF) == 0xFFFE); ## utf8.c says no good
	433	return 0 if (($code & 0xFFFF) == 0xFFFF); ## utf8.c says no good
	434	return 1;
	435	}
	436
	437	## Return a code point that's part of the table.
	438	## Returns nothing if the table is empty (or covers only surrogates).
	439	## This used only for making the test script.
	440	sub Table::ValidCode
	441	{
	442	my $Table = shift; #self
	443	for my $set (@$Table) {
	444	return $set->[RANGE_END] if IsUsable($set->[RANGE_END]);
	445	}
	446	return ();
	447	}
	448
	449	## Return a code point that's not part of the table
	450	## Returns nothing if the table covers all code points.
	451	## This used only for making the test script.
	452	sub Table::InvalidCode
	453	{
	454	my $Table = shift; #self
	455
	456	return 0x1234 if $Table->IsEmpty();
	457
	458	for my $set (@$Table)
	459	{
	460	if (IsUsable($set->[RANGE_END] + 1))
	461	{
	462	return $set->[RANGE_END] + 1;
	463	}
	464
	465	if (IsUsable($set->[RANGE_START] - 1))
	466	{
	467	return $set->[RANGE_START] - 1;
	468	}
	469	}
	470	return ();
	471	}
	472
	473	###########################################################################
	474	###########################################################################
	475	###########################################################################
	476
	477
	478	##
	479	## Called like:
	480	## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1);
	481	##
	482	## The args must be in that order, although the Fuzzy pair may be omitted.
	483	##
	484	## This creates 'IsAll' as an alias for 'IsAny'
	485	##
	486	sub New_Alias($$$@)
	487	{
	488	my $Type = shift; ## "Is" or "In"
	489	my $Alias = shift;
	490	my $SameAs = shift; # expecting "SameAs" -- just ignored
	491	my $Name = shift;
	492
	493	## remaining args are optional key/val
	494	my %Args = @_;
	495
	496	my $Fuzzy = delete $Args{Fuzzy};
	497
	498	## sanity check a few args
	499	if (%Args or ($Type ne 'Is' and $Type ne 'In') or $SameAs ne 'SameAs') {
	500	confess "$0: bad args to New_Alias"
	501	}
	502
	503	$Alias = CanonicalName($Alias) if $Fuzzy;
	504
	505	if (not $TableInfo{$Type}->{$Name})
	506	{
	507	my $CName = CanonicalName($Name);
	508	if ($TableInfo{$Type}->{$CName}) {
	509	confess "$0: Use canonical form '$CName' instead of '$Name' for alias.";
	510	} else {
	511	confess "$0: don't have orignial $Type => $Name to make alias";
	512	}
	513	}
	514	if ($TableInfo{$Alias}) {
	515	confess "$0: already have original $Type => $Alias; can't make alias";
	516	}
	517	$AliasInfo{$Type}->{$Name} = $Alias;
	518	if ($Fuzzy) {
	519	$FuzzyNames{$Type}->{$Alias} = $Name;
	520	}
	521
	522	}
	523
	524
	525	## All assigned code points
	526	my $Assigned = Table->New(Is => 'Assigned',
	527	Desc => "All assigned code points",
	528	Fuzzy => 0);
	529
	530	my $Name = Table->New(); ## all characters, individually by name
	531	my $General = Table->New(); ## all characters, grouped by category
	532	my %General;
	533	my %Cat;
	534
	535	##
	536	## Process UnicodeData.txt (Categories, etc.)
	537	##
	538	sub UnicodeData_Txt()
	539	{
	540	my $Bidi = Table->New();
	541	my $Deco = Table->New();
	542	my $Comb = Table->New();
	543	my $Number = Table->New();
	544	my $Mirrored = Table->New(Is => 'Mirrored',
	545	Desc => "Mirrored in bidirectional text",
	546	Fuzzy => 0);
	547
	548	my %DC;
	549	my %Bidi;
	550	my %Deco;
	551	$Deco{Canon} = Table->New(Is => 'Canon',
	552	Desc => 'Decomposes to multiple characters',
	553	Fuzzy => 0);
	554	$Deco{Compat} = Table->New(Is => 'Compat',
	555	Desc => 'Compatible with a more-basic character',
	556	Fuzzy => 0);
	557
	558	## Initialize Perl-generated categories
	559	## (Categories from UnicodeData.txt are auto-initialized in gencat)
	560	$Cat{Alnum} =
	561	Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0);
	562	$Cat{Alpha} =
	563	Table->New(Is => 'Alpha', Desc => "[[:Alpha:]]", Fuzzy => 0);
	564	$Cat{ASCII} =
	565	Table->New(Is => 'ASCII', Desc => "[[:ASCII:]]", Fuzzy => 0);
	566	$Cat{Blank} =
	567	Table->New(Is => 'Blank', Desc => "[[:Blank:]]", Fuzzy => 0);
	568	$Cat{Cntrl} =
	569	Table->New(Is => 'Cntrl', Desc => "[[:Cntrl:]]", Fuzzy => 0);
	570	$Cat{Digit} =
	571	Table->New(Is => 'Digit', Desc => "[[:Digit:]]", Fuzzy => 0);
	572	$Cat{Graph} =
	573	Table->New(Is => 'Graph', Desc => "[[:Graph:]]", Fuzzy => 0);
	574	$Cat{Lower} =
	575	Table->New(Is => 'Lower', Desc => "[[:Lower:]]", Fuzzy => 0);
	576	$Cat{Print} =
	577	Table->New(Is => 'Print', Desc => "[[:Print:]]", Fuzzy => 0);
	578	$Cat{Punct} =
	579	Table->New(Is => 'Punct', Desc => "[[:Punct:]]", Fuzzy => 0);
	580	$Cat{Space} =
	581	Table->New(Is => 'Space', Desc => "[[:Space:]]", Fuzzy => 0);
	582	$Cat{Title} =
	583	Table->New(Is => 'Title', Desc => "[[:Title:]]", Fuzzy => 0);
	584	$Cat{Upper} =
	585	Table->New(Is => 'Upper', Desc => "[[:Upper:]]", Fuzzy => 0);
	586	$Cat{XDigit} =
	587	Table->New(Is => 'XDigit', Desc => "[[:XDigit:]]", Fuzzy => 0);
	588	$Cat{Word} =
	589	Table->New(Is => 'Word', Desc => "[[:Word:]]", Fuzzy => 0);
	590	$Cat{SpacePerl} =
	591	Table->New(Is => 'SpacePerl', Desc => '\s', Fuzzy => 0);
	592
	593	my %To;
	594	$To{Upper} = Table->New();
	595	$To{Lower} = Table->New();
	596	$To{Title} = Table->New();
	597	$To{Digit} = Table->New();
	598
	599	sub gencat($$$$)
	600	{
	601	my ($name, ## Name ("LATIN CAPITAL LETTER A")
	602	$cat, ## Category ("Lu", "Zp", "Nd", etc.)
	603	$code, ## Code point (as an integer)
	604	$op) = @_;
	605
	606	my $MajorCat = substr($cat, 0, 1); ## L, M, Z, S, etc
	607
	608	$Assigned->$op($code);
	609	$Name->$op($code, $name);
	610	$General->$op($code, $cat);
	611
	612	## add to the sub category (e.g. "Lu", "Nd", "Cf", ..)
	613	$Cat{$cat} \|\|= Table->New(Is => $cat,
	614	Desc => "General Category '$cat'",
	615	Fuzzy => 0);
	616	$Cat{$cat}->$op($code);
	617
	618	## add to the major category (e.g. "L", "N", "C", ...)
	619	$Cat{$MajorCat} \|\|= Table->New(Is => $MajorCat,
	620	Desc => "Major Category '$MajorCat'",
	621	Fuzzy => 0);
	622	$Cat{$MajorCat}->$op($code);
	623
	624	($General{$name} \|\|= Table->New)->$op($code, $name);
	625
	626	# 005F: SPACING UNDERSCORE
	627	$Cat{Word}->$op($code) if $cat =~ /^[LMN]/ \|\| $code == 0x005F;
	628	$Cat{Alnum}->$op($code) if $cat =~ /^[LMN]/;
	629	$Cat{Alpha}->$op($code) if $cat =~ /^[LM]/;
	630
	631
	632
	633	$Cat{Space}->$op($code) if $cat =~ /^Z/
	634	\|\| $code == 0x0009 # 0009: HORIZONTAL TAB
	635	\|\| $code == 0x000A # 000A: LINE FEED
	636	\|\| $code == 0x000B # 000B: VERTICAL TAB
	637	\|\| $code == 0x000C # 000C: FORM FEED
	638	\|\| $code == 0x000D; # 000D: CARRIAGE RETURN
	639
	640
	641	$Cat{SpacePerl}->$op($code) if $cat =~ /^Z/
	642	\|\| $code == 0x0009 # 0009: HORIZONTAL TAB
	643	\|\| $code == 0x000A # 000A: LINE FEED
	644	\|\| $code == 0x000C # 000C: FORM FEED
	645	\|\| $code == 0x000D # 000D: CARRIAGE RETURN
	646	\|\| $code == 0x0085 # 0085: <NEXT LINE>
	647	\|\| $code == 0x2028 # 2028: LINE SEPARATOR
	648	\|\| $code == 0x2029;# 2029: PARAGRAPH SEP.
	649
	650	$Cat{Blank}->$op($code) if $cat =~ /^Z[^lp]$/
	651	\|\| $code == 0x0009 # 0009: HORIZONTAL TAB
	652	\|\| $code == 0x0020; # 0020: SPACE
	653
	654	$Cat{Digit}->$op($code) if $cat eq "Nd";
	655	$Cat{Upper}->$op($code) if $cat eq "Lu";
	656	$Cat{Lower}->$op($code) if $cat eq "Ll";
	657	$Cat{Title}->$op($code) if $cat eq "Lt";
	658	$Cat{ASCII}->$op($code) if $code <= 0x007F;
	659	$Cat{Cntrl}->$op($code) if $cat =~ /^C/;
	660	$Cat{Graph}->$op($code) if $cat =~ /^([LMNPS]\|Co)/;
	661	$Cat{Print}->$op($code) if $cat =~ /^([LMNPS]\|Co\|Zs)/;
	662	$Cat{Punct}->$op($code) if $cat =~ /^P/;
	663
	664	$Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9
	665	\|\| ($code >= 0x41 && $code <= 0x46) ## A..F
	666	\|\| ($code >= 0x61 && $code <= 0x66); ## a..f
	667	}
	668
	669	## open ane read file.....
	670	if (not open IN, "UnicodeData.txt") {
	671	die "$0: UnicodeData.txt: $!\n";
	672	}
	673
	674	##
	675	## For building \p{_CombAbove} and \p{_CanonDCIJ}
	676	##
	677	my %_Above_HexCodes; ## Hexcodes for chars with $comb == 230 ("ABOVE")
	678
	679	my %CodeToDeco; ## Maps code to decomp. list for chars with first
	680	## decomp. char an "i" or "j" (for \p{_CanonDCIJ})
	681
	682	## This is filled in as we go....
	683	my $CombAbove = Table->New(Is => '_CombAbove',
	684	Desc => '(for internal casefolding use)',
	685	Fuzzy => 0);
	686
	687	while (<IN>)
	688	{
	689	next unless /^[0-9A-Fa-f]+;/;
	690	s/\s+$//;
	691
	692	my ($hexcode, ## code point in hex (e.g. "0041")
	693	$name, ## character name (e.g. "LATIN CAPITAL LETTER A")
	694	$cat, ## category (e.g. "Lu")
	695	$comb, ## Canonical combining class (e.t. "230")
	696	$bidi, ## directional category (e.g. "L")
	697	$deco, ## decomposition mapping
	698	$decimal, ## decimal digit value
	699	$digit, ## digit value
	700	$number, ## numeric value
	701	$mirrored, ## mirrored
	702	$unicode10, ## name in Unicode 1.0
	703	$comment, ## comment field
	704	$upper, ## uppercase mapping
	705	$lower, ## lowercase mapping
	706	$title, ## titlecase mapping
	707	) = split(/\s;\s/);
	708
	709	# Note that in Unicode 3.2 there will be names like
	710	# LINE FEED (LF), which probably means that \N{} needs
	711	# to cope also with LINE FEED and LF.
	712	$name = $unicode10 if $name eq '<control>' && $unicode10 ne '';
	713
	714	my $code = hex($hexcode);
	715
	716	if ($comb and $comb == 230) {
	717	$CombAbove->Append($code);
	718	$_Above_HexCodes{$hexcode} = 1;
	719	}
	720
	721	## Used in building \p{_CanonDCIJ}
	722	if ($deco and $deco =~ m/^006[9A]\b/) {
	723	$CodeToDeco{$code} = $deco;
	724	}
	725
	726	##
	727	## There are a few pairs of lines like:
	728	## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
	729	## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
	730	## that define ranges.
	731	##
	732	if ($name =~ /^<(.+), (First\|Last)>$/)
	733	{
	734	$name = $1;
	735	gencat($name, $cat, $code, $2 eq 'First' ? 'Append' : 'Extend');
	736	#New_Prop(In => $name, $General{$name}, Fuzzy => 1);
	737	}
	738	else
	739	{
	740	## normal (single-character) lines
	741	gencat($name, $cat, $code, 'Append');
	742
	743	# No Append() here since since several codes may map into one.
	744	$To{Upper}->RawAppendRange($code, $code, $upper) if $upper;
	745	$To{Lower}->RawAppendRange($code, $code, $lower) if $lower;
	746	$To{Title}->RawAppendRange($code, $code, $title) if $title;
	747	$To{Digit}->Append($code, $decimal) if length $decimal;
	748
	749	$Bidi->Append($code, $bidi);
	750	$Comb->Append($code, $comb) if $comb;
	751	$Number->Append($code, $number) if length $number;
	752
	753	$Mirrored->Append($code) if $mirrored eq "Y";
	754
	755	$Bidi{$bidi} \|\|= Table->New(Is => "Bidi$bidi",
	756	Desc => "Bi-directional category '$bidi'",
	757	Fuzzy => 0);
	758	$Bidi{$bidi}->Append($code);
	759
	760	if ($deco)
	761	{
	762	$Deco->Append($code, $deco);
	763	if ($deco =~/^<(\w+)>/)
	764	{
	765	$Deco{Compat}->Append($code);
	766
	767	$DC{$1} \|\|= Table->New(Is => "DC$1",
	768	Desc => "Compatible with '$1'",
	769	Fuzzy => 0);
	770	$DC{$1}->Append($code);
	771	}
	772	else
	773	{
	774	$Deco{Canon}->Append($code);
	775	}
	776	}
	777	}
	778	}
	779	close IN;
	780
	781	##
	782	## Tidy up a few special cases....
	783	##
	784
	785	$Cat{Cn} = $Assigned->Invert; ## Cn is everything that doesn't exist
	786	New_Prop(Is => 'Cn',
	787	$Cat{Cn},
	788	Desc => "General Category 'Cn' [not functional in Perl]",
	789	Fuzzy => 0);
	790
	791	## Unassigned is the same as 'Cn'
	792	New_Alias(Is => 'Unassigned', SameAs => 'Cn', Fuzzy => 0);
	793
	794	$Cat{C}->Replace($Cat{C}->Merge($Cat{Cn})); ## Now merge in Cn into C
	795
	796
	797	# L& is Ll, Lu, and Lt.
	798	New_Prop(Is => 'L&',
	799	Table->Merge(@Cat{qw[Ll Lu Lt]}),
	800	Desc => '[\p{Ll}\p{Lu}\p{Lt}]',
	801	Fuzzy => 0);
	802
	803	## Any and All are all code points.
	804	my $Any = Table->New(Is => 'Any',
	805	Desc => sprintf("[\\x{0000}-\\x{%X}]",
	806	$LastUnicodeCodepoint),
	807	Fuzzy => 0);
	808	$Any->RawAppendRange(0, $LastUnicodeCodepoint);
	809
	810	New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 0);
	811
	812	##
	813	## Build special properties for Perl's internal case-folding needs:
	814	## \p{_CaseIgnorable}
	815	## \p{_CanonDCIJ}
	816	## \p{_CombAbove}
	817	## _CombAbove was built above. Others are built here....
	818	##
	819
	820	## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010]
	821	New_Prop(Is => '_CaseIgnorable',
	822	Table->Merge($Cat{Mn},
	823	0x00AD, #SOFT HYPHEN
	824	0x2010), #HYPHEN
	825	Desc => '(for internal casefolding use)',
	826	Fuzzy => 0);
	827
	828
	829	## \p{_CanonDCIJ} is fairly complex...
	830	my $CanonCDIJ = Table->New(Is => '_CanonDCIJ',
	831	Desc => '(for internal casefolding use)',
	832	Fuzzy => 0);
	833	## It contains the ASCII 'i' and 'j'....
	834	$CanonCDIJ->Append(0x0069); # ASCII ord("i")
	835	$CanonCDIJ->Append(0x006A); # ASCII ord("j")
	836	## ...and any character with a decomposition that starts with either of
	837	## those code points, but only if the decomposition does not have any
	838	## combining character with the "ABOVE" canonical combining class.
	839	for my $code (sort { $a <=> $b} keys %CodeToDeco)
	840	{
	841	## Need to ensure that all decomposition characters do not have
	842	## a %HexCodeToComb in %AboveCombClasses.
	843	my $want = 1;
	844	for my $deco_hexcode (split / /, $CodeToDeco{$code})
	845	{
	846	if (exists $_Above_HexCodes{$deco_hexcode}) {
	847	## one of the decmposition chars has an ABOVE combination
	848	## class, so we're not interested in this one
	849	$want = 0;
	850	last;
	851	}
	852	}
	853	if ($want) {
	854	$CanonCDIJ->Append($code);
	855	}
	856	}
	857
	858
	859
	860	##
	861	## Now dump the files.
	862	##
	863	$Name->Write("Name.pl");
	864	$Bidi->Write("Bidirectional.pl");
	865	$Comb->Write("CombiningClass.pl");
	866	$Deco->Write("Decomposition.pl");
	867	$Number->Write("Number.pl");
	868	$General->Write("Category.pl");
	869
	870	for my $to (sort keys %To) {
	871	$To{$to}->Write("To/$to.pl");
	872	}
	873	}
	874
	875	##
	876	## Process LineBreak.txt
	877	##
	878	sub LineBreak_Txt()
	879	{
	880	if (not open IN, "LineBreak.txt") {
	881	die "$0: LineBreak.txt: $!\n";
	882	}
	883
	884	my $Lbrk = Table->New();
	885	my %Lbrk;
	886
	887	while (<IN>)
	888	{
	889	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(\w+)/;
	890
	891	my ($first, $last, $lbrk) = (hex($1), hex($2\|\|""), $3);
	892
	893	$Lbrk->Append($first, $lbrk);
	894
	895	$Lbrk{$lbrk} \|\|= Table->New(Is => "Lbrk$lbrk",
	896	Desc => "Linebreak category '$lbrk'",
	897	Fuzzy => 0);
	898	$Lbrk{$lbrk}->Append($first);
	899
	900	if ($last) {
	901	$Lbrk->Extend($last);
	902	$Lbrk{$lbrk}->Extend($last);
	903	}
	904	}
	905	close IN;
	906
	907	$Lbrk->Write("Lbrk.pl");
	908	}
	909
	910	##
	911	## Process ArabicShaping.txt.
	912	##
	913	sub ArabicShaping_txt()
	914	{
	915	if (not open IN, "ArabicShaping.txt") {
	916	die "$0: ArabicShaping.txt: $!\n";
	917	}
	918
	919	my $ArabLink = Table->New();
	920	my $ArabLinkGroup = Table->New();
	921
	922	while (<IN>)
	923	{
	924	next unless /^[0-9A-Fa-f]+;/;
	925	s/\s+$//;
	926
	927	my ($hexcode, $name, $link, $linkgroup) = split(/\s;\s/);
	928	my $code = hex($hexcode);
	929	$ArabLink->Append($code, $link);
	930	$ArabLinkGroup->Append($code, $linkgroup);
	931	}
	932	close IN;
	933
	934	$ArabLink->Write("ArabLink.pl");
	935	$ArabLinkGroup->Write("ArabLnkGrp.pl");
	936	}
	937
	938	##
	939	## Process Jamo.txt.
	940	##
	941	sub Jamo_txt()
	942	{
	943	if (not open IN, "Jamo.txt") {
	944	die "$0: Jamo.txt: $!\n";
	945	}
	946	my $Short = Table->New();
	947
	948	while (<IN>)
	949	{
	950	next unless /^([0-9A-Fa-f]+)\s;\s(\w*)/;
	951	my ($code, $short) = (hex($1), $2);
	952
	953	$Short->Append($code, $short);
	954	}
	955	close IN;
	956	$Short->Write("JamoShort.pl");
	957	}
	958
	959	##
	960	## Process Scripts.txt.
	961	##
	962	sub Scripts_txt()
	963	{
	964	my @ScriptInfo;
	965
	966	if (not open(IN, "Scripts.txt")) {
	967	die "$0: Scripts.txt: $!\n";
	968	}
	969	while (<IN>) {
	970	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(.+?)\s*\#/;
	971
	972	# Wait until all the scripts have been read since
	973	# they are not listed in numeric order.
	974	push @ScriptInfo, [ hex($1), hex($2\|\|""), $3 ];
	975	}
	976	close IN;
	977
	978	# Now append the scripts properties in their code point order.
	979
	980	my %Script;
	981	my $Scripts = Table->New();
	982
	983	for my $script (sort { $a->[0] <=> $b->[0] } @ScriptInfo)
	984	{
	985	my ($first, $last, $name) = @$script;
	986	$Scripts->Append($first, $name);
	987
	988	$Script{$name} \|\|= Table->New(Is => $name,
	989	Desc => "Script '$name'",
	990	Fuzzy => 1);
	991	$Script{$name}->Append($first, $name);
	992
	993	if ($last) {
	994	$Scripts->Extend($last);
	995	$Script{$name}->Extend($last);
	996	}
	997	}
	998
	999	$Scripts->Write("Scripts.pl");
	1000
	1001	## Common is everything not explicitly assigned to a Script
	1002	##
	1003	## *shouldn't this be intersected with \p{Assigned}? ****
	1004	##
	1005	New_Prop(Is => 'Common',
	1006	$Scripts->Invert,
	1007	Desc => 'Pseudo-Script of codepoints not in other Unicode scripts',
	1008	Fuzzy => 1);
	1009	}
	1010
	1011	##
	1012	## Given a name like "Close Punctuation", return a regex (that when applied
	1013	## with /i) matches any valid form of that name (e.g. "ClosePunctuation",
	1014	## "Close-Punctuation", etc.)
	1015	##
	1016	## Accept any space, dash, or underbar where in the official name there is
	1017	## space or a dash (or underbar, but there never is).
	1018	##
	1019	##
	1020	sub NameToRegex($)
	1021	{
	1022	my $Name = shift;
	1023	$Name =~ s/[- _]/(?:[-_]\|\\s+)?/g;
	1024	return $Name;
	1025	}
	1026
	1027	##
	1028	## Process Blocks.txt.
	1029	##
	1030	sub Blocks_txt()
	1031	{
	1032	my $Blocks = Table->New();
	1033	my %Blocks;
	1034
	1035	if (not open IN, "Blocks.txt") {
	1036	die "$0: Blocks.txt: $!\n";
	1037	}
	1038
	1039	while (<IN>)
	1040	{
	1041	#next if not /Private Use$/;
	1042	next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s;\s(.+?)\s*$/;
	1043
	1044	my ($first, $last, $name) = (hex($1), hex($2), $3);
	1045
	1046	$Blocks->Append($first, $name);
	1047
	1048	$Blocks{$name} \|\|= Table->New(In => $name,
	1049	Desc => "Block '$name'",
	1050	Fuzzy => 1);
	1051	$Blocks{$name}->Append($first, $name);
	1052
	1053	if ($last and $last != $first) {
	1054	$Blocks->Extend($last);
	1055	$Blocks{$name}->Extend($last);
	1056	}
	1057	}
	1058	close IN;
	1059
	1060	$Blocks->Write("Blocks.pl");
	1061	}
	1062
	1063	##
	1064	## Read in the PropList.txt. It contains extended properties not
	1065	## listed in the UnicodeData.txt, such as 'Other_Alphabetic':
	1066	## alphabetic but not of the general category L; many modifiers
	1067	## belong to this extended property category: while they are not
	1068	## alphabets, they are alphabetic in nature.
	1069	##
	1070	sub PropList_txt()
	1071	{
	1072	my @PropInfo;
	1073
	1074	if (not open IN, "PropList.txt") {
	1075	die "$0: PropList.txt: $!\n";
	1076	}
	1077
	1078	while (<IN>)
	1079	{
	1080	next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s;\s(.+?)\s*\#/;
	1081
	1082	# Wait until all the extended properties have been read since
	1083	# they are not listed in numeric order.
	1084	push @PropInfo, [ hex($1), hex($2\|\|""), $3 ];
	1085	}
	1086	close IN;
	1087
	1088	# Now append the extended properties in their code point order.
	1089	my $Props = Table->New();
	1090	my %Prop;
	1091
	1092	for my $prop (sort { $a->[0] <=> $b->[0] } @PropInfo)
	1093	{
	1094	my ($first, $last, $name) = @$prop;
	1095	$Props->Append($first, $name);
	1096
	1097	$Prop{$name} \|\|= Table->New(Is => $name,
	1098	Desc => "Extended property '$name'",
	1099	Fuzzy => 1);
	1100	$Prop{$name}->Append($first, $name);
	1101
	1102	if ($last) {
	1103	$Props->Extend($last);
	1104	$Prop{$name}->Extend($last);
	1105	}
	1106	}
	1107
	1108	# Alphabetic is L and Other_Alphabetic.
	1109	New_Prop(Is => 'Alphabetic',
	1110	Table->Merge($Cat{L}, $Prop{Other_Alphabetic}),
	1111	Desc => '[\p{L}\p{OtherAlphabetic}]', # use canonical names here
	1112	Fuzzy => 1);
	1113
	1114	# Lowercase is Ll and Other_Lowercase.
	1115	New_Prop(Is => 'Lowercase',
	1116	Table->Merge($Cat{Ll}, $Prop{Other_Lowercase}),
	1117	Desc => '[\p{Ll}\p{OtherLowercase}]', # use canonical names here
	1118	Fuzzy => 1);
	1119
	1120	# Uppercase is Lu and Other_Uppercase.
	1121	New_Prop(Is => 'Uppercase',
	1122	Table->Merge($Cat{Lu}, $Prop{Other_Uppercase}),
	1123	Desc => '[\p{Lu}\p{Other_Uppercase}]', # use canonical names here
	1124	Fuzzy => 1);
	1125
	1126	# Math is Sm and Other_Math.
	1127	New_Prop(Is => 'Math',
	1128	Table->Merge($Cat{Sm}, $Prop{Other_Math}),
	1129	Desc => '[\p{Sm}\p{OtherMath}]', # use canonical names here
	1130	Fuzzy => 1);
	1131
	1132	# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
	1133	New_Prop(Is => 'ID_Start',
	1134	Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl]}),
	1135	Desc => '[\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]',
	1136	Fuzzy => 1);
	1137
	1138	# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
	1139	New_Prop(Is => 'ID_Continue',
	1140	Table->Merge(@Cat{qw[Ll Lu Lt Lm Lo Nl Mn Mc Nd Pc ]}),
	1141	Desc => '[\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]',
	1142	Fuzzy => 1);
	1143	}
	1144
	1145	sub Make_GC_Aliases()
	1146	{
	1147	##
	1148	## The mapping from General Category long forms to short forms is
	1149	## currently hardwired here since no simple data file in the UCD
	1150	## seems to do that. Unicode 3.2 will assumedly correct this.
	1151	##
	1152	my %Is = (
	1153	'Letter' => 'L',
	1154	'Uppercase_Letter' => 'Lu',
	1155	'Lowercase_Letter' => 'Ll',
	1156	'Titlecase_Letter' => 'Lt',
	1157	'Modifier_Letter' => 'Lm',
	1158	'Other_Letter' => 'Lo',
	1159
	1160	'Mark' => 'M',
	1161	'Non_Spacing_Mark' => 'Mn',
	1162	'Spacing_Mark' => 'Mc',
	1163	'Enclosing_Mark' => 'Me',
	1164
	1165	'Separator' => 'Z',
	1166	'Space_Separator' => 'Zs',
	1167	'Line_Separator' => 'Zl',
	1168	'Paragraph_Separator' => 'Zp',
	1169
	1170	'Number' => 'N',
	1171	'Decimal_Number' => 'Nd',
	1172	'Letter_Number' => 'Nl',
	1173	'Other_Number' => 'No',
	1174
	1175	'Punctuation' => 'P',
	1176	'Connector_Punctuation' => 'Pc',
	1177	'Dash_Punctuation' => 'Pd',
	1178	'Open_Punctuation' => 'Ps',
	1179	'Close_Punctuation' => 'Pe',
	1180	'Initial_Punctuation' => 'Pi',
	1181	'Final_Punctuation' => 'Pf',
	1182	'Other_Punctuation' => 'Po',
	1183
	1184	'Symbol' => 'S',
	1185	'Math_Symbol' => 'Sm',
	1186	'Currency_Symbol' => 'Sc',
	1187	'Modifier_Symbol' => 'Sk',
	1188	'Other_Symbol' => 'So',
	1189
	1190	'Other' => 'C',
	1191	'Control' => 'Cc',
	1192	'Format' => 'Cf',
	1193	'Surrogate' => 'Cs',
	1194	'Private Use' => 'Co',
	1195	'Unassigned' => 'Cn',
	1196	);
	1197
	1198	## make the aliases....
	1199	while (my ($Alias, $Name) = each %Is) {
	1200	New_Alias(Is => $Alias, SameAs => $Name, Fuzzy => 1);
	1201	}
	1202	}
	1203
	1204
	1205	##
	1206	## These are used in:
	1207	## MakePropTestScript()
	1208	## WriteAllMappings()
	1209	## for making the test script.
	1210	##
	1211	my %FuzzyNameToTest;
	1212	my %ExactNameToTest;
	1213
	1214
	1215	## This used only for making the test script
	1216	sub GenTests($$$$)
	1217	{
	1218	my $FH = shift;
	1219	my $Prop = shift;
	1220	my $MatchCode = shift;
	1221	my $FailCode = shift;
	1222
	1223	if (defined $MatchCode) {
	1224	printf $FH qq/Expect(1, "\\x{%04X}", '\\p{$Prop}' );\n/, $MatchCode;
	1225	printf $FH qq/Expect(0, "\\x{%04X}", '\\p{^$Prop}');\n/, $MatchCode;
	1226	printf $FH qq/Expect(0, "\\x{%04X}", '\\P{$Prop}' );\n/, $MatchCode;
	1227	printf $FH qq/Expect(1, "\\x{%04X}", '\\P{^$Prop}');\n/, $MatchCode;
	1228	}
	1229	if (defined $FailCode) {
	1230	printf $FH qq/Expect(0, "\\x{%04X}", '\\p{$Prop}' );\n/, $FailCode;
	1231	printf $FH qq/Expect(1, "\\x{%04X}", '\\p{^$Prop}');\n/, $FailCode;
	1232	printf $FH qq/Expect(1, "\\x{%04X}", '\\P{$Prop}' );\n/, $FailCode;
	1233	printf $FH qq/Expect(0, "\\x{%04X}", '\\P{^$Prop}');\n/, $FailCode;
	1234	}
	1235	}
	1236
	1237	## This used only for making the test script
	1238	sub ExpectError($$)
	1239	{
	1240	my $FH = shift;
	1241	my $prop = shift;
	1242
	1243	print $FH qq/Error('\\p{$prop}');\n/;
	1244	print $FH qq/Error('\\P{$prop}');\n/;
	1245	}
	1246
	1247	## This used only for making the test script
	1248	my @GoodSeps = (
	1249	" ",
	1250	"-",
	1251	" \t ",
	1252	"",
	1253	"",
	1254	"_",
	1255	);
	1256	my @BadSeps = (
	1257	"--",
	1258	"__",
	1259	" _",
	1260	"/"
	1261	);
	1262
	1263	## This used only for making the test script
	1264	sub RandomlyFuzzifyName($;$)
	1265	{
	1266	my $Name = shift;
	1267	my $WantError = shift; ## if true, make an error
	1268
	1269	my @parts;
	1270	for my $part (split /[-\s_]+/, $Name)
	1271	{
	1272	if (@parts) {
	1273	if ($WantError and rand() < 0.3) {
	1274	push @parts, $BadSeps[rand(@BadSeps)];
	1275	$WantError = 0;
	1276	} else {
	1277	push @parts, $GoodSeps[rand(@GoodSeps)];
	1278	}
	1279	}
	1280	my $switch = int rand(4);
	1281	if ($switch == 0) {
	1282	push @parts, uc $part;
	1283	} elsif ($switch == 1) {
	1284	push @parts, lc $part;
	1285	} elsif ($switch == 2) {
	1286	push @parts, ucfirst $part;
	1287	} else {
	1288	push @parts, $part;
	1289	}
	1290	}
	1291	my $new = join('', @parts);
	1292
	1293	if ($WantError) {
	1294	if (rand() >= 0.5) {
	1295	$new .= $BadSeps[rand(@BadSeps)];
	1296	} else {
	1297	$new = $BadSeps[rand(@BadSeps)] . $new;
	1298	}
	1299	}
	1300	return $new;
	1301	}
	1302
	1303	## This used only for making the test script
	1304	sub MakePropTestScript()
	1305	{
	1306	## this written directly -- it's huge.
	1307	if (not open OUT, ">TestProp.pl") {
	1308	die "$0: TestProp.pl: $!\n";
	1309	}
	1310	print OUT <DATA>;
	1311
	1312	while (my ($Name, $Table) = each %ExactNameToTest)
	1313	{
	1314	GenTests(*OUT, $Name, $Table->ValidCode, $Table->InvalidCode);
	1315	ExpectError(*OUT, uc $Name) if uc $Name ne $Name;
	1316	ExpectError(*OUT, lc $Name) if lc $Name ne $Name;
	1317	}
	1318
	1319
	1320	while (my ($Name, $Table) = each %FuzzyNameToTest)
	1321	{
	1322	my $Orig = $CanonicalToOrig{$Name};
	1323	my %Names = (
	1324	$Name => 1,
	1325	$Orig => 1,
	1326	RandomlyFuzzifyName($Orig) => 1
	1327	);
	1328
	1329	for my $N (keys %Names) {
	1330	GenTests(*OUT, $N, $Table->ValidCode, $Table->InvalidCode);
	1331	}
	1332
	1333	ExpectError(*OUT, RandomlyFuzzifyName($Orig, 'ERROR'));
	1334	}
	1335
	1336	print OUT "Finished();\n";
	1337	close OUT;
	1338	}
	1339
	1340
	1341	##
	1342	## These are used only in:
	1343	## RegisterFileForName()
	1344	## WriteAllMappings()
	1345	##
	1346	my %Exact; ## will become %utf8::Exact;
	1347	my %Canonical; ## will become %utf8::Canonical;
	1348	my %CaComment; ## Comment for %Canonical entry of same key
	1349
	1350	##
	1351	## Given info about a name and a datafile that it should be associated with,
	1352	## register that assocation in %Exact and %Canonical.
	1353	sub RegisterFileForName($$$$)
	1354	{
	1355	my $Type = shift;
	1356	my $Name = shift;
	1357	my $IsFuzzy = shift;
	1358	my $filename = shift;
	1359
	1360	##
	1361	## Now in details for the mapping. $Type eq 'Is' has the
	1362	## Is removed, as it will be removed in utf8_heavy when this
	1363	## data is being checked. In keeps its "In", but a second
	1364	## sans-In record is written if it doesn't conflict with
	1365	## anything already there.
	1366	##
	1367	if (not $IsFuzzy)
	1368	{
	1369	if ($Type eq 'Is') {
	1370	die "oops[$Name]" if $Exact{$Name};
	1371	$Exact{$Name} = $filename;
	1372	} else {
	1373	die "oops[$Type$Name]" if $Exact{"$Type$Name"};
	1374	$Exact{"$Type$Name"} = $filename;
	1375	$Exact{$Name} = $filename if not $Exact{$Name};
	1376	}
	1377	}
	1378	else
	1379	{
	1380	my $CName = lc $Name;
	1381	if ($Type eq 'Is') {
	1382	die "oops[$CName]" if $Canonical{$CName};
	1383	$Canonical{$CName} = $filename;
	1384	$CaComment{$CName} = $Name if $Name =~ tr/A-Z// >= 2;
	1385	} else {
	1386	die "oops[$Type$CName]" if $Canonical{lc "$Type$CName"};
	1387	$Canonical{lc "$Type$CName"} = $filename;
	1388	$CaComment{lc "$Type$CName"} = "$Type$Name";
	1389	if (not $Canonical{$CName}) {
	1390	$Canonical{$CName} = $filename;
	1391	$CaComment{$CName} = "$Type$Name";
	1392	}
	1393	}
	1394	}
	1395	}
	1396
	1397	##
	1398	## Writes the info accumulated in
	1399	##
	1400	## %TableInfo;
	1401	## %FuzzyNames;
	1402	## %AliasInfo;
	1403	##
	1404	##
	1405	sub WriteAllMappings()
	1406	{
	1407	my @MAP;
	1408
	1409	my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts)
	1410
	1411	## 'Is' MUST come first, so its names have precidence over 'In's
	1412	for my $Type ('Is', 'In')
	1413	{
	1414	my %RawNameToFile; ## a per-$Type cache
	1415
	1416	for my $Name (sort {length $a <=> length $b} keys %{$TableInfo{$Type}})
	1417	{
	1418	## Note: $Name is already canonical
	1419	my $Table = $TableInfo{$Type}->{$Name};
	1420	my $IsFuzzy = $FuzzyNames{$Type}->{$Name};
	1421
	1422	## Need an 8.3 safe filename (which means "an 8 safe" $filename)
	1423	my $filename;
	1424	{
	1425	## 'Is' items lose 'Is' from the basename.
	1426	$filename = $Type eq 'Is' ? $Name : "$Type$Name";
	1427
	1428	$filename =~ s/[^\w_]+/_/g; # "L&" -> "L_"
	1429	substr($filename, 8) = '' if length($filename) > 8;
	1430
	1431	##
	1432	## Make sure the basename doesn't conflict with something we
	1433	## might have already written. If we have, say,
	1434	## InGreekExtended1
	1435	## InGreekExtended2
	1436	## they become
	1437	## InGreekE
	1438	## InGreek2
	1439	##
	1440	while (my $num = $BaseNames{lc $filename}++)
	1441	{
	1442	$num++; ## so basenames with numbers start with '2', which
	1443	## just looks more natural.
	1444	## Want to append $num, but if it'll make the basename longer
	1445	## than 8 characters, pre-truncate $filename so that the result
	1446	## is acceptable.
	1447	my $delta = length($filename) + length($num) - 8;
	1448	if ($delta > 0) {
	1449	substr($filename, -$delta) = $num;
	1450	} else {
	1451	$filename .= $num;
	1452	}
	1453	}
	1454	};
	1455
	1456	##
	1457	## Construct a nice comment to add to the file, and build data
	1458	## for the "./Properties" file along the way.
	1459	##
	1460	my $Comment;
	1461	{
	1462	my $Desc = $TableDesc{$Type}->{$Name} \|\| "";
	1463	## get list of names this table is reference by
	1464	my @Supported = $Name;
	1465	while (my ($Orig, $Alias) = each %{ $AliasInfo{$Type} })
	1466	{
	1467	if ($Orig eq $Name) {
	1468	push @Supported, $Alias;
	1469	}
	1470	}
	1471
	1472	my $TypeToShow = $Type eq 'Is' ? "" : $Type;
	1473	my $OrigProp;
	1474
	1475	$Comment = "This file supports:\n";
	1476	for my $N (@Supported)
	1477	{
	1478	my $IsFuzzy = $FuzzyNames{$Type}->{$N};
	1479	my $Prop = "\\p{$TypeToShow$Name}";
	1480	$OrigProp = $Prop if not $OrigProp; #cache for aliases
	1481	if ($IsFuzzy) {
	1482	$Comment .= "\t$Prop (and fuzzy permutations)\n";
	1483	} else {
	1484	$Comment .= "\t$Prop\n";
	1485	}
	1486	my $MyDesc = ($N eq $Name) ? $Desc : "Alias for $OrigProp ($Desc)";
	1487
	1488	push @MAP, sprintf("%s %-42s %s\n",
	1489	$IsFuzzy ? '*' : ' ', $Prop, $MyDesc);
	1490	}
	1491	if ($Desc) {
	1492	$Comment .= "\nMeaning: $Desc\n";
	1493	}
	1494
	1495	}
	1496	##
	1497	## Okay, write the file...
	1498	##
	1499	$Table->Write("lib/$filename.pl", $Comment);
	1500
	1501	## and register it
	1502	$RawNameToFile{$Name} = $filename;
	1503	RegisterFileForName($Type => $Name, $IsFuzzy, $filename);
	1504
	1505	if ($IsFuzzy)
	1506	{
	1507	my $CName = CanonicalName($Type . '_'. $Name);
	1508	$FuzzyNameToTest{$Name} = $Table if !$FuzzyNameToTest{$Name};
	1509	$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
	1510	} else {
	1511	$ExactNameToTest{$Name} = $Table;
	1512	}
	1513
	1514	}
	1515
	1516	## Register aliase info
	1517	for my $Name (sort {length $a <=> length $b} keys %{$AliasInfo{$Type}})
	1518	{
	1519	my $Alias = $AliasInfo{$Type}->{$Name};
	1520	my $IsFuzzy = $FuzzyNames{$Type}->{$Alias};
	1521	my $filename = $RawNameToFile{$Name};
	1522	die "oops [$Alias]->[$Name]" if not $filename;
	1523	RegisterFileForName($Type => $Alias, $IsFuzzy, $filename);
	1524
	1525	my $Table = $TableInfo{$Type}->{$Name};
	1526	die "oops" if not $Table;
	1527	if ($IsFuzzy)
	1528	{
	1529	my $CName = CanonicalName($Type .'_'. $Alias);
	1530	$FuzzyNameToTest{$Alias} = $Table if !$FuzzyNameToTest{$Alias};
	1531	$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
	1532	} else {
	1533	$ExactNameToTest{$Alias} = $Table;
	1534	}
	1535	}
	1536	}
	1537
	1538	##
	1539	## Write out the property list
	1540	##
	1541	{
	1542	my @OUT = (
	1543	"##\n",
	1544	"## This file created by $0\n",
	1545	"## List of built-in \\p{...}/\\P{...} properties.\n",
	1546	"##\n",
	1547	"## '*' means name may be 'fuzzy'\n",
	1548	"##\n\n",
	1549	sort { substr($a,2) cmp substr($b, 2) } @MAP,
	1550	);
	1551	WriteIfChanged('Properties', @OUT);
	1552	}
	1553
	1554	use Text::Tabs (); ## using this makes the files about half the size
	1555
	1556	## Write Exact.pl
	1557	{
	1558	my @OUT = (
	1559	$HEADER,
	1560	"##\n",
	1561	"## Data in this file used by ../utf8_heavy.pl\n",
	1562	"##\n\n",
	1563	"## Mapping from name to filename in ./lib\n",
	1564	"%utf8::Exact = (\n",
	1565	);
	1566
	1567	for my $Name (sort keys %Exact)
	1568	{
	1569	my $File = $Exact{$Name};
	1570	$Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name ";
	1571	my $Text = sprintf("%-15s => %s,\n", $Name, qq/'$File'/);
	1572	push @OUT, Text::Tabs::unexpand($Text);
	1573	}
	1574	push @OUT, ");\n1;\n";
	1575
	1576	WriteIfChanged('Exact.pl', @OUT);
	1577	}
	1578
	1579	## Write Canonical.pl
	1580	{
	1581	my @OUT = (
	1582	$HEADER,
	1583	"##\n",
	1584	"## Data in this file used by ../utf8_heavy.pl\n",
	1585	"##\n\n",
	1586	"## Mapping from lc(canonical name) to filename in ./lib\n",
	1587	"%utf8::Canonical = (\n",
	1588	);
	1589	my $Trail = ""; ## used just to keep the spacing pretty
	1590	for my $Name (sort keys %Canonical)
	1591	{
	1592	my $File = $Canonical{$Name};
	1593	if ($CaComment{$Name}) {
	1594	push @OUT, "\n" if not $Trail;
	1595	push @OUT, " # $CaComment{$Name}\n";
	1596	$Trail = "\n";
	1597	} else {
	1598	$Trail = "";
	1599	}
	1600	$Name = $Name =~ m/\W/ ? qq/'$Name'/ : " $Name ";
	1601	my $Text = sprintf(" %-41s => %s,\n$Trail", $Name, qq/'$File'/);
	1602	push @OUT, Text::Tabs::unexpand($Text);
	1603	}
	1604	push @OUT, ");\n1\n";
	1605	WriteIfChanged('Canonical.pl', @OUT);
	1606	}
	1607
	1608	MakePropTestScript() if $MakeTestScript;
	1609	}
	1610
	1611
	1612	sub SpecialCasing_txt()
	1613	{
	1614	#
	1615	# Read in the special cases.
	1616	#
	1617
	1618	my %CaseInfo;
	1619
	1620	if (not open IN, "SpecialCasing.txt") {
	1621	die "$0: SpecialCasing.txt: $!\n";
	1622	}
	1623	while (<IN>) {
	1624	next unless /^[0-9A-Fa-f]+;/;
	1625	s/\#.*//;
	1626	s/\s+$//;
	1627
	1628	my ($code, $lower, $title, $upper, $condition) = split(/\s;\s/);
	1629
	1630	if ($condition) { # not implemented yet
	1631	print "# SKIPPING $_\n" if $Verbose;
	1632	next;
	1633	}
	1634
	1635	# Wait until all the special cases have been read since
	1636	# they are not listed in numeric order.
	1637	my $ix = hex($code);
	1638	push @{$CaseInfo{Lower}}, [ $ix, $code, $lower ]
	1639	unless $code eq $lower;
	1640	push @{$CaseInfo{Title}}, [ $ix, $code, $title ]
	1641	unless $code eq $title;
	1642	push @{$CaseInfo{Upper}}, [ $ix, $code, $upper ]
	1643	unless $code eq $upper;
	1644	}
	1645	close IN;
	1646
	1647	# Now write out the special cases properties in their code point order.
	1648	# Prepend them to the To/{Upper,Lower,Title}.pl.
	1649
	1650	for my $case (qw(Lower Title Upper))
	1651	{
	1652	my $NormalCase = do "To/$case.pl" \|\| die "$0: $@\n";
	1653
	1654	my @OUT = (
	1655	$HEADER, "\n",
	1656	"%utf8::ToSpec$case =\n(\n",
	1657	);
	1658
	1659	for my $prop (sort { $a->[0] <=> $b->[0] } @{$CaseInfo{$case}}) {
	1660	my ($ix, $code, $to) = @$prop;
	1661	my $tostr =
	1662	join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
	1663	push @OUT, sprintf qq['%04X' => "$tostr",\n], $ix;
	1664	# Remove any single-character mappings for
	1665	# the same character since we are going for
	1666	# the special casing rules.
	1667	$NormalCase =~ s/^$code\t\t\w+\n//m;
	1668	}
	1669	push @OUT, (
	1670	");\n\n",
	1671	"return <<'END';\n",
	1672	$NormalCase,
	1673	"END\n"
	1674	);
	1675	WriteIfChanged("To/$case.pl", @OUT);
	1676	}
	1677	}
	1678
	1679	#
	1680	# Read in the case foldings.
	1681	#
	1682	# We will do full case folding, C + F + I (see CaseFolding.txt).
	1683	#
	1684	sub CaseFolding_txt()
	1685	{
	1686	if (not open IN, "CaseFolding.txt") {
	1687	die "$0: CaseFolding.txt: $!\n";
	1688	}
	1689
	1690	my $Fold = Table->New();
	1691	my %Fold;
	1692
	1693	while (<IN>) {
	1694	# Skip status 'S', simple case folding
	1695	next unless /^([0-9A-Fa-f]+)\s;\s([CFI])\s;\s([0-9A-Fa-f]+(?: [0-9A-Fa-f]+))\s;/;
	1696
	1697	my ($code, $status, $fold) = (hex($1), $2, $3);
	1698
	1699	if ($status eq 'C') { # Common: one-to-one folding
	1700	# No append() since several codes may fold into one.
	1701	$Fold->RawAppendRange($code, $code, $fold);
	1702	} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
	1703	$Fold{$code} = $fold;
	1704	}
	1705	}
	1706	close IN;
	1707
	1708	$Fold->Write("To/Fold.pl");
	1709
	1710	#
	1711	# Prepend the special foldings to the common foldings.
	1712	#
	1713	my $CommonFold = do "To/Fold.pl" \|\| die "$0: To/Fold.pl: $!\n";
	1714
	1715	my @OUT = (
	1716	$HEADER, "\n",
	1717	"%utf8::ToSpecFold =\n(\n",
	1718	);
	1719	for my $code (sort { $a <=> $b } keys %Fold) {
	1720	my $foldstr =
	1721	join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
	1722	push @OUT, sprintf qq['%04X' => "$foldstr",\n], $code;
	1723	}
	1724	push @OUT, (
	1725	");\n\n",
	1726	"return <<'END';\n",
	1727	$CommonFold,
	1728	"END\n",
	1729	);
	1730
	1731	WriteIfChanged("To/Fold.pl", @OUT);
	1732	}
	1733
## Do it....
##
## Main sequence: each step is called for its side effects, and the
## order of the calls matters.

## Build the property tables and their aliases.
## NOTE(review): presumably these fill %TableInfo, %FuzzyNames and
## %AliasInfo, which WriteAllMappings() writes out -- confirm.
UnicodeData_Txt();
Make_GC_Aliases();
PropList_txt();

Scripts_txt();
Blocks_txt();

## Writes the lib/*.pl tables, Properties, Exact.pl and Canonical.pl
## (and TestProp.pl when -maketest was given).
WriteAllMappings();

LineBreak_Txt();
ArabicShaping_txt();
Jamo_txt();
## SpecialCasing_txt() loads To/{Lower,Title,Upper}.pl and rewrites them,
## so those files must already exist at this point (presumably written by
## an earlier step -- confirm).
SpecialCasing_txt();
CaseFolding_txt();

exit(0);
	1752
	1753	## TRAILING CODE IS USED BY MakePropTestScript()
	1754	__DATA__
	1755	use strict;
	1756	use warnings;
	1757
	1758	my $Tests = 0;
	1759	my $Fails = 0;
	1760
	1761	sub Expect($$$)
	1762	{
	1763	my $Expect = shift;
	1764	my $String = shift;
	1765	my $Regex = shift;
	1766	my $Line = (caller)[2];
	1767
	1768	$Tests++;
	1769	my $RegObj;
	1770	my $result = eval {
	1771	$RegObj = qr/$Regex/;
	1772	$String =~ $RegObj ? 1 : 0
	1773	};
	1774
	1775	if (not defined $result) {
	1776	print "couldn't compile /$Regex/ on $0 line $Line: $@\n";
	1777	$Fails++;
	1778	} elsif ($result ^ $Expect) {
	1779	print "bad result (expected $Expect) on $0 line $Line: $@\n";
	1780	$Fails++;
	1781	}
	1782	}
	1783
	1784	sub Error($)
	1785	{
	1786	my $Regex = shift;
	1787	$Tests++;
	1788	if (eval { 'x' =~ qr/$Regex/; 1 }) {
	1789	$Fails++;
	1790	my $Line = (caller)[2];
	1791	print "expected error for /$Regex/ on $0 line $Line: $@\n";
	1792	}
	1793	}
	1794
	1795	sub Finished()
	1796	{
	1797	if ($Fails == 0) {
	1798	print "All $Tests tests passed.\n";
	1799	exit(0);
	1800	} else {
	1801	print "$Tests tests, $Fails failed!\n";
	1802	exit(-1);
	1803	}
	1804	}