Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package charnames; |
2 | use strict; | |
3 | use warnings; | |
4 | use Carp; | |
5 | our $VERSION = '1.01'; | |
6 | ||
7 | use bytes (); # for $bytes::hint_bits | |
8 | $charnames::hint_bits = 0x20000; | |
9 | ||
10 | my %alias1 = ( | |
11 | # Icky 3.2 names with parentheses. | |
12 | 'LINE FEED' => 'LINE FEED (LF)', | |
13 | 'FORM FEED' => 'FORM FEED (FF)', | |
14 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)', | |
15 | 'NEXT LINE' => 'NEXT LINE (NEL)', | |
16 | # Convenience. | |
17 | 'LF' => 'LINE FEED (LF)', | |
18 | 'FF' => 'FORM FEED (FF)', | |
19 | 'CR' => 'CARRIAGE RETURN (CR)', | |
20 | 'NEL' => 'NEXT LINE (NEL)', | |
21 | # More convenience. For futher convencience, | |
22 | # it is suggested some way using using the NamesList | |
23 | # aliases is implemented. | |
24 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER', | |
25 | 'ZWJ' => 'ZERO WIDTH JOINER', | |
26 | 'BOM' => 'BYTE ORDER MARK', | |
27 | ); | |
28 | ||
29 | my %alias2 = ( | |
30 | # Pre-3.2 compatibility (only for the first 256 characters). | |
31 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION', | |
32 | 'VERTICAL TABULATION' => 'LINE TABULATION', | |
33 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR', | |
34 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE', | |
35 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO', | |
36 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE', | |
37 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD', | |
38 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD', | |
39 | ); | |
40 | ||
41 | my $txt; | |
42 | ||
43 | # This is not optimized in any way yet | |
44 | sub charnames | |
45 | { | |
46 | my $name = shift; | |
47 | ||
48 | if (exists $alias1{$name}) { | |
49 | $name = $alias1{$name}; | |
50 | } | |
51 | if (exists $alias2{$name}) { | |
52 | require warnings; | |
53 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead}); | |
54 | $name = $alias2{$name}; | |
55 | } | |
56 | ||
57 | my $ord; | |
58 | my @off; | |
59 | my $fname; | |
60 | ||
61 | if ($name eq "BYTE ORDER MARK") { | |
62 | $fname = $name; | |
63 | $ord = 0xFEFF; | |
64 | } else { | |
65 | ## Suck in the code/name list as a big string. | |
66 | ## Lines look like: | |
67 | ## "0052\t\tLATIN CAPITAL LETTER R\n" | |
68 | $txt = do "unicore/Name.pl" unless $txt; | |
69 | ||
70 | ## @off will hold the index into the code/name string of the start and | |
71 | ## end of the name as we find it. | |
72 | ||
73 | ## If :full, look for the the name exactly | |
74 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { | |
75 | @off = ($-[0], $+[0]); | |
76 | } | |
77 | ||
78 | ## If we didn't get above, and :short allowed, look for the short name. | |
79 | ## The short name is like "greek:Sigma" | |
80 | unless (@off) { | |
81 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { | |
82 | my ($script, $cname) = ($1,$2); | |
83 | my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); | |
84 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { | |
85 | @off = ($-[0], $+[0]); | |
86 | } | |
87 | } | |
88 | } | |
89 | ||
90 | ## If we still don't have it, check for the name among the loaded | |
91 | ## scripts. | |
92 | if (not @off) | |
93 | { | |
94 | my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); | |
95 | for my $script ( @{$^H{charnames_scripts}} ) | |
96 | { | |
97 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { | |
98 | @off = ($-[0], $+[0]); | |
99 | last; | |
100 | } | |
101 | } | |
102 | } | |
103 | ||
104 | ## If we don't have it by now, give up. | |
105 | unless (@off) { | |
106 | carp "Unknown charname '$name'"; | |
107 | return "\x{FFFD}"; | |
108 | } | |
109 | ||
110 | ## | |
111 | ## Now know where in the string the name starts. | |
112 | ## The code, in hex, is before that. | |
113 | ## | |
114 | ## The code can be 4-6 characters long, so we've got to sort of | |
115 | ## go look for it, just after the newline that comes before $off[0]. | |
116 | ## | |
117 | ## This would be much easier if unicore/Name.pl had info in | |
118 | ## a name/code order, instead of code/name order. | |
119 | ## | |
120 | ## The +1 after the rindex() is to skip past the newline we're finding, | |
121 | ## or, if the rindex() fails, to put us to an offset of zero. | |
122 | ## | |
123 | my $hexstart = rindex($txt, "\n", $off[0]) + 1; | |
124 | ||
125 | ## we know where it starts, so turn into number - | |
126 | ## the ordinal for the char. | |
127 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart); | |
128 | } | |
129 | ||
130 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect? | |
131 | use bytes; | |
132 | return chr $ord if $ord <= 255; | |
133 | my $hex = sprintf "%04x", $ord; | |
134 | if (not defined $fname) { | |
135 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; | |
136 | } | |
137 | croak "Character 0x$hex with name '$fname' is above 0xFF"; | |
138 | } | |
139 | ||
140 | no warnings 'utf8'; # allow even illegal characters | |
141 | return pack "U", $ord; | |
142 | } | |
143 | ||
144 | sub import | |
145 | { | |
146 | shift; ## ignore class name | |
147 | ||
148 | if (not @_) | |
149 | { | |
150 | carp("`use charnames' needs explicit imports list"); | |
151 | } | |
152 | $^H |= $charnames::hint_bits; | |
153 | $^H{charnames} = \&charnames ; | |
154 | ||
155 | ## | |
156 | ## fill %h keys with our @_ args. | |
157 | ## | |
158 | my %h; | |
159 | @h{@_} = (1) x @_; | |
160 | ||
161 | $^H{charnames_full} = delete $h{':full'}; | |
162 | $^H{charnames_short} = delete $h{':short'}; | |
163 | $^H{charnames_scripts} = [map uc, keys %h]; | |
164 | ||
165 | ## | |
166 | ## If utf8? warnings are enabled, and some scripts were given, | |
167 | ## see if at least we can find one letter of each script. | |
168 | ## | |
169 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) | |
170 | { | |
171 | $txt = do "unicore/Name.pl" unless $txt; | |
172 | ||
173 | for my $script (@{$^H{charnames_scripts}}) | |
174 | { | |
175 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) { | |
176 | warnings::warn('utf8', "No such script: '$script'"); | |
177 | } | |
178 | } | |
179 | } | |
180 | } | |
181 | ||
182 | require Unicode::UCD; # for Unicode::UCD::_getcode() | |
183 | ||
184 | my %viacode; | |
185 | ||
186 | sub viacode | |
187 | { | |
188 | if (@_ != 1) { | |
189 | carp "charnames::viacode() expects one argument"; | |
190 | return () | |
191 | } | |
192 | ||
193 | my $arg = shift; | |
194 | my $code = Unicode::UCD::_getcode($arg); | |
195 | ||
196 | my $hex; | |
197 | ||
198 | if (defined $code) { | |
199 | $hex = sprintf "%04X", $arg; | |
200 | } else { | |
201 | carp("unexpected arg \"$arg\" to charnames::viacode()"); | |
202 | return; | |
203 | } | |
204 | ||
205 | if ($code > 0x10FFFF) { | |
206 | carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex; | |
207 | return; | |
208 | } | |
209 | ||
210 | return $viacode{$hex} if exists $viacode{$hex}; | |
211 | ||
212 | $txt = do "unicore/Name.pl" unless $txt; | |
213 | ||
214 | if ($txt =~ m/^$hex\t\t(.+)/m) { | |
215 | return $viacode{$hex} = $1; | |
216 | } else { | |
217 | return; | |
218 | } | |
219 | } | |
220 | ||
221 | my %vianame; | |
222 | ||
223 | sub vianame | |
224 | { | |
225 | if (@_ != 1) { | |
226 | carp "charnames::vianame() expects one name argument"; | |
227 | return () | |
228 | } | |
229 | ||
230 | my $arg = shift; | |
231 | ||
232 | return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; | |
233 | ||
234 | return $vianame{$arg} if exists $vianame{$arg}; | |
235 | ||
236 | $txt = do "unicore/Name.pl" unless $txt; | |
237 | ||
238 | my $pos = index $txt, "\t\t$arg\n"; | |
239 | if ($[ <= $pos) { | |
240 | my $posLF = rindex $txt, "\n", $pos; | |
241 | (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; | |
242 | return $vianame{$arg} = hex $code; | |
243 | ||
244 | # If $pos is at the 1st line, $posLF must be $[ - 1 (not found); | |
245 | # then $posLF + 1 equals to $[ (at the beginning of $txt). | |
246 | # Otherwise $posLF is the position of "\n"; | |
247 | # then $posLF + 1 must be the position of the next to "\n" | |
248 | # (the beginning of the line). | |
249 | # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t", | |
250 | # "10300\t", "100000", etc. So we can get the code via removing TAB. | |
251 | } else { | |
252 | return; | |
253 | } | |
254 | } | |
255 | ||
256 | ||
257 | 1; | |
258 | __END__ | |
259 | ||
260 | =head1 NAME | |
261 | ||
262 | charnames - define character names for C<\N{named}> string literal escapes | |
263 | ||
264 | =head1 SYNOPSIS | |
265 | ||
266 | use charnames ':full'; | |
267 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n"; | |
268 | ||
269 | use charnames ':short'; | |
270 | print "\N{greek:Sigma} is an upper-case sigma.\n"; | |
271 | ||
272 | use charnames qw(cyrillic greek); | |
273 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n"; | |
274 | ||
275 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE" | |
276 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330" | |
277 | ||
278 | =head1 DESCRIPTION | |
279 | ||
280 | Pragma C<use charnames> supports arguments C<:full>, C<:short> and | |
281 | script names. If C<:full> is present, for expansion of | |
282 | C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of | |
283 | standard Unicode names of chars. If C<:short> is present, and | |
284 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up | |
285 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used | |
286 | with script name arguments, then for C<\N{CHARNAME}> the name | |
287 | C<CHARNAME> is looked up as a letter in the given scripts (in the | |
288 | specified order). | |
289 | ||
290 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> | |
291 | this pragma looks for the names | |
292 | ||
293 | SCRIPTNAME CAPITAL LETTER CHARNAME | |
294 | SCRIPTNAME SMALL LETTER CHARNAME | |
295 | SCRIPTNAME LETTER CHARNAME | |
296 | ||
297 | in the table of standard Unicode names. If C<CHARNAME> is lowercase, | |
298 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant | |
299 | is ignored. | |
300 | ||
301 | Note that C<\N{...}> is compile-time, it's a special form of string | |
302 | constant used inside double-quoted strings: in other words, you cannot | |
303 | use variables inside the C<\N{...}>. If you want similar run-time | |
304 | functionality, use charnames::vianame(). | |
305 | ||
306 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F) | |
307 | as of Unicode 3.1, there are no official Unicode names but you can use | |
308 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In | |
309 | Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429 | |
310 | has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081, | |
311 | U+0084, and U+0099 do not have names even in ISO 6429. | |
312 | ||
313 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}" | |
314 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}". | |
315 | ||
316 | =head1 CUSTOM TRANSLATORS | |
317 | ||
318 | The mechanism of translation of C<\N{...}> escapes is general and not | |
319 | hardwired into F<charnames.pm>. A module can install custom | |
320 | translations (inside the scope which C<use>s the module) with the | |
321 | following magic incantation: | |
322 | ||
323 | use charnames (); # for $charnames::hint_bits | |
324 | sub import { | |
325 | shift; | |
326 | $^H |= $charnames::hint_bits; | |
327 | $^H{charnames} = \&translator; | |
328 | } | |
329 | ||
330 | Here translator() is a subroutine which takes C<CHARNAME> as an | |
331 | argument, and returns text to insert into the string instead of the | |
332 | C<\N{CHARNAME}> escape. Since the text to insert should be different | |
333 | in C<bytes> mode and out of it, the function should check the current | |
334 | state of C<bytes>-flag as in: | |
335 | ||
336 | use bytes (); # for $bytes::hint_bits | |
337 | sub translator { | |
338 | if ($^H & $bytes::hint_bits) { | |
339 | return bytes_translator(@_); | |
340 | } | |
341 | else { | |
342 | return utf8_translator(@_); | |
343 | } | |
344 | } | |
345 | ||
346 | =head1 charnames::viacode(code) | |
347 | ||
348 | Returns the full name of the character indicated by the numeric code. | |
349 | The example | |
350 | ||
351 | print charnames::viacode(0x2722); | |
352 | ||
353 | prints "FOUR TEARDROP-SPOKED ASTERISK". | |
354 | ||
355 | Returns undef if no name is known for the code. | |
356 | ||
357 | This works only for the standard names, and does not yet apply | |
358 | to custom translators. | |
359 | ||
360 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK | |
361 | SPACE", not "BYTE ORDER MARK". | |
362 | ||
363 | =head1 charnames::vianame(name) | |
364 | ||
365 | Returns the code point indicated by the name. | |
366 | The example | |
367 | ||
368 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK"); | |
369 | ||
370 | prints "2722". | |
371 | ||
372 | Returns undef if the name is unknown. | |
373 | ||
374 | This works only for the standard names, and does not yet apply | |
375 | to custom translators. | |
376 | ||
377 | =head1 ALIASES | |
378 | ||
379 | A few aliases have been defined for convenience: instead of having | |
380 | to use the official names | |
381 | ||
382 | LINE FEED (LF) | |
383 | FORM FEED (FF) | |
384 | CARRIAGE RETURN (CR) | |
385 | NEXT LINE (NEL) | |
386 | ||
387 | (yes, with parentheses) one can use | |
388 | ||
389 | LINE FEED | |
390 | FORM FEED | |
391 | CARRIAGE RETURN | |
392 | NEXT LINE | |
393 | LF | |
394 | FF | |
395 | CR | |
396 | NEL | |
397 | ||
398 | One can also use | |
399 | ||
400 | BYTE ORDER MARK | |
401 | BOM | |
402 | ||
403 | and | |
404 | ||
405 | ZWNJ | |
406 | ZWJ | |
407 | ||
408 | for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER. | |
409 | ||
410 | For backward compatibility one can use the old names for | |
411 | certain C0 and C1 controls | |
412 | ||
413 | old new | |
414 | ||
415 | HORIZONTAL TABULATION CHARACTER TABULATION | |
416 | VERTICAL TABULATION LINE TABULATION | |
417 | FILE SEPARATOR INFORMATION SEPARATOR FOUR | |
418 | GROUP SEPARATOR INFORMATION SEPARATOR THREE | |
419 | RECORD SEPARATOR INFORMATION SEPARATOR TWO | |
420 | UNIT SEPARATOR INFORMATION SEPARATOR ONE | |
421 | PARTIAL LINE DOWN PARTIAL LINE FORWARD | |
422 | PARTIAL LINE UP PARTIAL LINE BACKWARD | |
423 | ||
424 | but the old names in addition to giving the character | |
425 | will also give a warning about being deprecated. | |
426 | ||
427 | =head1 ILLEGAL CHARACTERS | |
428 | ||
429 | If you ask by name for a character that does not exist, a warning is | |
430 | given and the Unicode I<replacement character> "\x{FFFD}" is returned. | |
431 | ||
432 | If you ask by code for a character that does not exist, no warning is | |
433 | given and C<undef> is returned. (Though if you ask for a code point | |
434 | past U+10FFFF you do get a warning.) | |
435 | ||
436 | =head1 BUGS | |
437 | ||
438 | Since evaluation of the translation function happens in a middle of | |
439 | compilation (of a string literal), the translation function should not | |
440 | do any C<eval>s or C<require>s. This restriction should be lifted in | |
441 | a future version of Perl. | |
442 | ||
443 | =cut |