Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | package Encode::Alias; |
2 | use strict; | |
3 | no warnings 'redefine'; | |
4 | use Encode; | |
5 | our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; | |
6 | sub DEBUG () { 0 } | |
7 | ||
8 | use base qw(Exporter); | |
9 | ||
10 | # Public, encouraged API is exported by default | |
11 | ||
12 | our @EXPORT = | |
13 | qw ( | |
14 | define_alias | |
15 | find_alias | |
16 | ); | |
17 | ||
18 | our @Alias; # ordered matching list | |
19 | our %Alias; # cached known aliases | |
20 | ||
21 | sub find_alias{ | |
22 | my $class = shift; | |
23 | my $find = shift; | |
24 | unless (exists $Alias{$find}) { | |
25 | $Alias{$find} = undef; # Recursion guard | |
26 | for (my $i=0; $i < @Alias; $i += 2){ | |
27 | my $alias = $Alias[$i]; | |
28 | my $val = $Alias[$i+1]; | |
29 | my $new; | |
30 | if (ref($alias) eq 'Regexp' && $find =~ $alias){ | |
31 | DEBUG and warn "eval $val"; | |
32 | $new = eval $val; | |
33 | DEBUG and $@ and warn "$val, $@"; | |
34 | }elsif (ref($alias) eq 'CODE'){ | |
35 | DEBUG and warn "$alias", "->", "($find)"; | |
36 | $new = $alias->($find); | |
37 | }elsif (lc($find) eq lc($alias)){ | |
38 | $new = $val; | |
39 | } | |
40 | if (defined($new)){ | |
41 | next if $new eq $find; # avoid (direct) recursion on bugs | |
42 | DEBUG and warn "$alias, $new"; | |
43 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new); | |
44 | if ($enc){ | |
45 | $Alias{$find} = $enc; | |
46 | last; | |
47 | } | |
48 | } | |
49 | } | |
50 | # case insensitive search when canonical is not in all lowercase | |
51 | # RT ticket #7835 | |
52 | unless ($Alias{$find}){ | |
53 | my $lcfind = lc($find); | |
54 | for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){ | |
55 | $lcfind eq lc($name) or next; | |
56 | $Alias{$find} = Encode::find_encoding($name); | |
57 | DEBUG and warn "$find => $name"; | |
58 | } | |
59 | } | |
60 | } | |
61 | if (DEBUG){ | |
62 | my $name; | |
63 | if (my $e = $Alias{$find}){ | |
64 | $name = $e->name; | |
65 | }else{ | |
66 | $name = ""; | |
67 | } | |
68 | warn "find_alias($class, $find)->name = $name"; | |
69 | } | |
70 | return $Alias{$find}; | |
71 | } | |
72 | ||
73 | sub define_alias{ | |
74 | while (@_){ | |
75 | my ($alias,$name) = splice(@_,0,2); | |
76 | unshift(@Alias, $alias => $name); # newer one has precedence | |
77 | if (ref($alias)){ | |
78 | # clear %Alias cache to allow overrides | |
79 | my @a = keys %Alias; | |
80 | for my $k (@a){ | |
81 | if (ref($alias) eq 'Regexp' && $k =~ $alias){ | |
82 | DEBUG and warn "delete \$Alias\{$k\}"; | |
83 | delete $Alias{$k}; | |
84 | } | |
85 | elsif (ref($alias) eq 'CODE'){ | |
86 | DEBUG and warn "delete \$Alias\{$k\}"; | |
87 | delete $Alias{$alias->($name)}; | |
88 | } | |
89 | } | |
90 | }else{ | |
91 | DEBUG and warn "delete \$Alias\{$alias\}"; | |
92 | delete $Alias{$alias}; | |
93 | } | |
94 | } | |
95 | } | |
96 | ||
97 | # Allow latin-1 style names as well | |
98 | # 0 1 2 3 4 5 6 7 8 9 10 | |
99 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); | |
100 | # Allow winlatin1 style names as well | |
101 | our %Winlatin2cp = ( | |
102 | 'latin1' => 1252, | |
103 | 'latin2' => 1250, | |
104 | 'cyrillic' => 1251, | |
105 | 'greek' => 1253, | |
106 | 'turkish' => 1254, | |
107 | 'hebrew' => 1255, | |
108 | 'arabic' => 1256, | |
109 | 'baltic' => 1257, | |
110 | 'vietnamese' => 1258, | |
111 | ); | |
112 | ||
113 | init_aliases(); | |
114 | ||
115 | sub undef_aliases{ | |
116 | @Alias = (); | |
117 | %Alias = (); | |
118 | } | |
119 | ||
120 | sub init_aliases | |
121 | { | |
122 | undef_aliases(); | |
123 | # Try all-lower-case version should all else fails | |
124 | define_alias( qr/^(.*)$/ => '"\L$1"' ); | |
125 | ||
126 | # UTF/UCS stuff | |
127 | define_alias( qr/^UTF-?7$/i => '"UTF-7"'); | |
128 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); | |
129 | define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', | |
130 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', | |
131 | qr/^iso-10646-1$/i => '"UCS-2BE"' ); | |
132 | define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', | |
133 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', | |
134 | qr/^UTF-?(16|32)$/i => '"UTF-$1"', | |
135 | ); | |
136 | # ASCII | |
137 | define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); | |
138 | define_alias('C' => 'ascii'); | |
139 | define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); | |
140 | # Allow variants of iso-8859-1 etc. | |
141 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); | |
142 | ||
143 | # At least HP-UX has these. | |
144 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); | |
145 | ||
146 | # More HP stuff. | |
147 | define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); | |
148 | ||
149 | # The Official name of ASCII. | |
150 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); | |
151 | ||
152 | # This is a font issue, not an encoding issue. | |
153 | # (The currency symbol of the Latin 1 upper half | |
154 | # has been redefined as the euro symbol.) | |
155 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); | |
156 | ||
157 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i | |
158 | => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); | |
159 | ||
160 | define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| | |
161 | hebrew|arabic|baltic|vietnamese)$/ix => | |
162 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); | |
163 | ||
164 | # Common names for non-latin preferred MIME names | |
165 | define_alias( 'ascii' => 'US-ascii', | |
166 | 'cyrillic' => 'iso-8859-5', | |
167 | 'arabic' => 'iso-8859-6', | |
168 | 'greek' => 'iso-8859-7', | |
169 | 'hebrew' => 'iso-8859-8', | |
170 | 'thai' => 'iso-8859-11', | |
171 | 'tis620' => 'iso-8859-11', | |
172 | ); | |
173 | ||
174 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. | |
175 | # And Microsoft has their own naming (again, surprisingly). | |
176 | # And windows-* is registered in IANA! | |
177 | define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); | |
178 | ||
179 | # Sometimes seen with a leading zero. | |
180 | # define_alias( qr/\bcp037\b/i => '"cp37"'); | |
181 | ||
182 | # Mac Mappings | |
183 | # predefined in *.ucm; unneeded | |
184 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); | |
185 | define_alias( qr/^mac_(.*)$/i => '"mac$1"'); | |
186 | # Ououououou. gone. They are differente! | |
187 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); | |
188 | ||
189 | # Standardize on the dashed versions. | |
190 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); | |
191 | ||
192 | unless ($Encode::ON_EBCDIC){ | |
193 | # for Encode::CN | |
194 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); | |
195 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); | |
196 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) | |
197 | # CP936 doesn't have vendor-addon for GBK, so they're identical. | |
198 | define_alias( qr/^gbk$/i => '"cp936"'); | |
199 | # This fixes gb2312 vs. euc-cn confusion, practically | |
200 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); | |
201 | # for Encode::JP | |
202 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); | |
203 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); | |
204 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); | |
205 | define_alias( qr/\bujis$/i => '"euc-jp"' ); | |
206 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); | |
207 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); | |
208 | define_alias( qr/\bwindows-31j$/i => '"cp932"' ); | |
209 | # for Encode::KR | |
210 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); | |
211 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); | |
212 | # This fixes ksc5601 vs. euc-kr confusion, practically | |
213 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); | |
214 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); | |
215 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); | |
216 | # for Encode::TW | |
217 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); | |
218 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); | |
219 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); | |
220 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); | |
221 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); | |
222 | } | |
223 | # utf8 is blessed :) | |
224 | define_alias( qr/^UTF-8$/i => '"utf-8-strict"'); | |
225 | # At last, Map white space and _ to '-' | |
226 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); | |
227 | } | |
228 | ||
229 | 1; | |
230 | __END__ | |
231 | ||
232 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 | |
233 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 | |
234 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) | |
235 | # TODO: Armenian encoding ARMSCII-8 | |
236 | # TODO: Hebrew encoding ISO-8859-8-1 | |
237 | # TODO: Thai encoding TCVN | |
238 | # TODO: Vietnamese encodings VPS | |
239 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese | |
240 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic | |
241 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese | |
242 | # Kannada Khmer Korean Laotian Malayalam Mongolian | |
243 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese | |
244 | ||
245 | =head1 NAME | |
246 | ||
247 | Encode::Alias - alias definitions to encodings | |
248 | ||
249 | =head1 SYNOPSIS | |
250 | ||
251 | use Encode; | |
252 | use Encode::Alias; | |
253 | define_alias( newName => ENCODING); | |
254 | ||
255 | =head1 DESCRIPTION | |
256 | ||
257 | Allows newName to be used as an alias for ENCODING. ENCODING may be | |
258 | either the name of an encoding or an encoding object (as described | |
259 | in L<Encode>). | |
260 | ||
261 | Currently I<newName> can be specified in the following ways: | |
262 | ||
263 | =over 4 | |
264 | ||
265 | =item As a simple string. | |
266 | ||
267 | =item As a qr// compiled regular expression, e.g.: | |
268 | ||
269 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); | |
270 | ||
271 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed | |
272 | in order to allow C<$1> etc. to be substituted. The example is one | |
273 | way to alias names as used in X11 fonts to the MIME names for the | |
274 | iso-8859-* family. Note the double quotes inside the single quotes. | |
275 | ||
276 | (or, you don't have to do this yourself because this example is predefined) | |
277 | ||
278 | If you are using a regex here, you have to use the quotes as shown or | |
279 | it won't work. Also note that regex handling is tricky even for the | |
280 | experienced. Use this feature with caution. | |
281 | ||
282 | =item As a code reference, e.g.: | |
283 | ||
284 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); | |
285 | ||
286 | The same effect as the example above in a different way. The coderef | |
287 | takes the alias name as an argument and returns a canonical name on | |
288 | success or undef if not. Note the second argument is not required. | |
289 | Use this with even more caution than the regex version. | |
290 | ||
291 | =back | |
292 | ||
293 | =head3 Changes in code reference aliasing | |
294 | ||
295 | As of Encode 1.87, the older form | |
296 | ||
297 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); | |
298 | ||
299 | no longer works. | |
300 | ||
301 | Encode up to 1.86 internally used "local $_" to implement ths older | |
302 | form. But consider the code below; | |
303 | ||
304 | use Encode; | |
305 | $_ = "eeeee" ; | |
306 | while (/(e)/g) { | |
307 | my $utf = decode('aliased-encoding-name', $1); | |
308 | print "position:",pos,"\n"; | |
309 | } | |
310 | ||
311 | Prior to Encode 1.86 this fails because of "local $_". | |
312 | ||
313 | =head2 Alias overloading | |
314 | ||
315 | You can override predefined aliases by simply applying define_alias(). | |
316 | The new alias is always evaluated first, and when necessary, | |
317 | define_alias() flushes the internal cache to make the new definition | |
318 | available. | |
319 | ||
320 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a | |
321 | # superset of SHIFT_JIS | |
322 | ||
323 | define_alias( qr/shift.*jis$/i => '"cp932"' ); | |
324 | define_alias( qr/sjis$/i => '"cp932"' ); | |
325 | ||
326 | If you want to zap all predefined aliases, you can use | |
327 | ||
328 | Encode::Alias->undef_aliases; | |
329 | ||
330 | to do so. And | |
331 | ||
332 | Encode::Alias->init_aliases; | |
333 | ||
334 | gets the factory settings back. | |
335 | ||
336 | =head1 SEE ALSO | |
337 | ||
338 | L<Encode>, L<Encode::Supported> | |
339 | ||
340 | =cut | |
341 |