| 1 | package Encode::Alias; |
| 2 | use strict; |
| 3 | no warnings 'redefine'; |
| 4 | use Encode; |
| 5 | our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; |
| 6 | sub DEBUG () { 0 } |
| 7 | |
| 8 | use base qw(Exporter); |
| 9 | |
| 10 | # Public, encouraged API is exported by default |
| 11 | |
| 12 | our @EXPORT = |
| 13 | qw ( |
| 14 | define_alias |
| 15 | find_alias |
| 16 | ); |
| 17 | |
| 18 | our @Alias; # ordered matching list |
| 19 | our %Alias; # cached known aliases |
| 20 | |
| 21 | sub find_alias{ |
| 22 | my $class = shift; |
| 23 | my $find = shift; |
| 24 | unless (exists $Alias{$find}) { |
| 25 | $Alias{$find} = undef; # Recursion guard |
| 26 | for (my $i=0; $i < @Alias; $i += 2){ |
| 27 | my $alias = $Alias[$i]; |
| 28 | my $val = $Alias[$i+1]; |
| 29 | my $new; |
| 30 | if (ref($alias) eq 'Regexp' && $find =~ $alias){ |
| 31 | DEBUG and warn "eval $val"; |
| 32 | $new = eval $val; |
| 33 | DEBUG and $@ and warn "$val, $@"; |
| 34 | }elsif (ref($alias) eq 'CODE'){ |
| 35 | DEBUG and warn "$alias", "->", "($find)"; |
| 36 | $new = $alias->($find); |
| 37 | }elsif (lc($find) eq lc($alias)){ |
| 38 | $new = $val; |
| 39 | } |
| 40 | if (defined($new)){ |
| 41 | next if $new eq $find; # avoid (direct) recursion on bugs |
| 42 | DEBUG and warn "$alias, $new"; |
| 43 | my $enc = (ref($new)) ? $new : Encode::find_encoding($new); |
| 44 | if ($enc){ |
| 45 | $Alias{$find} = $enc; |
| 46 | last; |
| 47 | } |
| 48 | } |
| 49 | } |
| 50 | # case insensitive search when canonical is not in all lowercase |
| 51 | # RT ticket #7835 |
| 52 | unless ($Alias{$find}){ |
| 53 | my $lcfind = lc($find); |
| 54 | for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){ |
| 55 | $lcfind eq lc($name) or next; |
| 56 | $Alias{$find} = Encode::find_encoding($name); |
| 57 | DEBUG and warn "$find => $name"; |
| 58 | } |
| 59 | } |
| 60 | } |
| 61 | if (DEBUG){ |
| 62 | my $name; |
| 63 | if (my $e = $Alias{$find}){ |
| 64 | $name = $e->name; |
| 65 | }else{ |
| 66 | $name = ""; |
| 67 | } |
| 68 | warn "find_alias($class, $find)->name = $name"; |
| 69 | } |
| 70 | return $Alias{$find}; |
| 71 | } |
| 72 | |
| 73 | sub define_alias{ |
| 74 | while (@_){ |
| 75 | my ($alias,$name) = splice(@_,0,2); |
| 76 | unshift(@Alias, $alias => $name); # newer one has precedence |
| 77 | if (ref($alias)){ |
| 78 | # clear %Alias cache to allow overrides |
| 79 | my @a = keys %Alias; |
| 80 | for my $k (@a){ |
| 81 | if (ref($alias) eq 'Regexp' && $k =~ $alias){ |
| 82 | DEBUG and warn "delete \$Alias\{$k\}"; |
| 83 | delete $Alias{$k}; |
| 84 | } |
| 85 | elsif (ref($alias) eq 'CODE'){ |
| 86 | DEBUG and warn "delete \$Alias\{$k\}"; |
| 87 | delete $Alias{$alias->($name)}; |
| 88 | } |
| 89 | } |
| 90 | }else{ |
| 91 | DEBUG and warn "delete \$Alias\{$alias\}"; |
| 92 | delete $Alias{$alias}; |
| 93 | } |
| 94 | } |
| 95 | } |
| 96 | |
| 97 | # Allow latin-1 style names as well |
| 98 | # 0 1 2 3 4 5 6 7 8 9 10 |
| 99 | our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); |
| 100 | # Allow winlatin1 style names as well |
| 101 | our %Winlatin2cp = ( |
| 102 | 'latin1' => 1252, |
| 103 | 'latin2' => 1250, |
| 104 | 'cyrillic' => 1251, |
| 105 | 'greek' => 1253, |
| 106 | 'turkish' => 1254, |
| 107 | 'hebrew' => 1255, |
| 108 | 'arabic' => 1256, |
| 109 | 'baltic' => 1257, |
| 110 | 'vietnamese' => 1258, |
| 111 | ); |
| 112 | |
| 113 | init_aliases(); |
| 114 | |
| 115 | sub undef_aliases{ |
| 116 | @Alias = (); |
| 117 | %Alias = (); |
| 118 | } |
| 119 | |
| 120 | sub init_aliases |
| 121 | { |
| 122 | undef_aliases(); |
| 123 | # Try all-lower-case version should all else fails |
| 124 | define_alias( qr/^(.*)$/ => '"\L$1"' ); |
| 125 | |
| 126 | # UTF/UCS stuff |
| 127 | define_alias( qr/^UTF-?7$/i => '"UTF-7"'); |
| 128 | define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); |
| 129 | define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', |
| 130 | qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', |
| 131 | qr/^iso-10646-1$/i => '"UCS-2BE"' ); |
| 132 | define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', |
| 133 | qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', |
| 134 | qr/^UTF-?(16|32)$/i => '"UTF-$1"', |
| 135 | ); |
| 136 | # ASCII |
| 137 | define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); |
| 138 | define_alias('C' => 'ascii'); |
| 139 | define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); |
| 140 | # Allow variants of iso-8859-1 etc. |
| 141 | define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); |
| 142 | |
| 143 | # At least HP-UX has these. |
| 144 | define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); |
| 145 | |
| 146 | # More HP stuff. |
| 147 | define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); |
| 148 | |
| 149 | # The Official name of ASCII. |
| 150 | define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); |
| 151 | |
| 152 | # This is a font issue, not an encoding issue. |
| 153 | # (The currency symbol of the Latin 1 upper half |
| 154 | # has been redefined as the euro symbol.) |
| 155 | define_alias( qr/^(.+)\@euro$/i => '"$1"' ); |
| 156 | |
| 157 | define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i |
| 158 | => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); |
| 159 | |
| 160 | define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| |
| 161 | hebrew|arabic|baltic|vietnamese)$/ix => |
| 162 | '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); |
| 163 | |
| 164 | # Common names for non-latin preferred MIME names |
| 165 | define_alias( 'ascii' => 'US-ascii', |
| 166 | 'cyrillic' => 'iso-8859-5', |
| 167 | 'arabic' => 'iso-8859-6', |
| 168 | 'greek' => 'iso-8859-7', |
| 169 | 'hebrew' => 'iso-8859-8', |
| 170 | 'thai' => 'iso-8859-11', |
| 171 | 'tis620' => 'iso-8859-11', |
| 172 | ); |
| 173 | |
| 174 | # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. |
| 175 | # And Microsoft has their own naming (again, surprisingly). |
| 176 | # And windows-* is registered in IANA! |
| 177 | define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); |
| 178 | |
| 179 | # Sometimes seen with a leading zero. |
| 180 | # define_alias( qr/\bcp037\b/i => '"cp37"'); |
| 181 | |
| 182 | # Mac Mappings |
| 183 | # predefined in *.ucm; unneeded |
| 184 | # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); |
| 185 | define_alias( qr/^mac_(.*)$/i => '"mac$1"'); |
| 186 | # Ououououou. gone. They are differente! |
| 187 | # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); |
| 188 | |
| 189 | # Standardize on the dashed versions. |
| 190 | define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); |
| 191 | |
| 192 | unless ($Encode::ON_EBCDIC){ |
| 193 | # for Encode::CN |
| 194 | define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); |
| 195 | define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); |
| 196 | # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) |
| 197 | # CP936 doesn't have vendor-addon for GBK, so they're identical. |
| 198 | define_alias( qr/^gbk$/i => '"cp936"'); |
| 199 | # This fixes gb2312 vs. euc-cn confusion, practically |
| 200 | define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); |
| 201 | # for Encode::JP |
| 202 | define_alias( qr/\bjis$/i => '"7bit-jis"' ); |
| 203 | define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); |
| 204 | define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); |
| 205 | define_alias( qr/\bujis$/i => '"euc-jp"' ); |
| 206 | define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); |
| 207 | define_alias( qr/\bsjis$/i => '"shiftjis"' ); |
| 208 | define_alias( qr/\bwindows-31j$/i => '"cp932"' ); |
| 209 | # for Encode::KR |
| 210 | define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); |
| 211 | define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); |
| 212 | # This fixes ksc5601 vs. euc-kr confusion, practically |
| 213 | define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); |
| 214 | define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); |
| 215 | define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); |
| 216 | # for Encode::TW |
| 217 | define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); |
| 218 | define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); |
| 219 | define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); |
| 220 | define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); |
| 221 | define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); |
| 222 | } |
| 223 | # utf8 is blessed :) |
| 224 | define_alias( qr/^UTF-8$/i => '"utf-8-strict"'); |
| 225 | # At last, Map white space and _ to '-' |
| 226 | define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); |
| 227 | } |
| 228 | |
| 229 | 1; |
| 230 | __END__ |
| 231 | |
| 232 | # TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 |
| 233 | # TODO: HP-UX '15' encodings japanese15 korean15 roi15 |
| 234 | # TODO: Cyrillic encoding ISO-IR-111 (useful?) |
| 235 | # TODO: Armenian encoding ARMSCII-8 |
| 236 | # TODO: Hebrew encoding ISO-8859-8-1 |
| 237 | # TODO: Thai encoding TCVN |
| 238 | # TODO: Vietnamese encodings VPS |
| 239 | # TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese |
| 240 | # ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic |
| 241 | # Farsi Georgian Gujarati Gurmukhi Hebrew Japanese |
| 242 | # Kannada Khmer Korean Laotian Malayalam Mongolian |
| 243 | # Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese |
| 244 | |
| 245 | =head1 NAME |
| 246 | |
| 247 | Encode::Alias - alias definitions to encodings |
| 248 | |
| 249 | =head1 SYNOPSIS |
| 250 | |
| 251 | use Encode; |
| 252 | use Encode::Alias; |
| 253 | define_alias( newName => ENCODING); |
| 254 | |
| 255 | =head1 DESCRIPTION |
| 256 | |
| 257 | Allows newName to be used as an alias for ENCODING. ENCODING may be |
| 258 | either the name of an encoding or an encoding object (as described |
| 259 | in L<Encode>). |
| 260 | |
| 261 | Currently I<newName> can be specified in the following ways: |
| 262 | |
| 263 | =over 4 |
| 264 | |
| 265 | =item As a simple string. |
| 266 | |
| 267 | =item As a qr// compiled regular expression, e.g.: |
| 268 | |
| 269 | define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); |
| 270 | |
| 271 | In this case, if I<ENCODING> is not a reference, it is C<eval>-ed |
| 272 | in order to allow C<$1> etc. to be substituted. The example is one |
| 273 | way to alias names as used in X11 fonts to the MIME names for the |
| 274 | iso-8859-* family. Note the double quotes inside the single quotes. |
| 275 | |
| 276 | (or, you don't have to do this yourself because this example is predefined) |
| 277 | |
| 278 | If you are using a regex here, you have to use the quotes as shown or |
| 279 | it won't work. Also note that regex handling is tricky even for the |
| 280 | experienced. Use this feature with caution. |
| 281 | |
| 282 | =item As a code reference, e.g.: |
| 283 | |
| 284 | define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); |
| 285 | |
| 286 | The same effect as the example above in a different way. The coderef |
| 287 | takes the alias name as an argument and returns a canonical name on |
| 288 | success or undef if not. Note the second argument is not required. |
| 289 | Use this with even more caution than the regex version. |
| 290 | |
| 291 | =back |
| 292 | |
| 293 | =head3 Changes in code reference aliasing |
| 294 | |
| 295 | As of Encode 1.87, the older form |
| 296 | |
| 297 | define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); |
| 298 | |
| 299 | no longer works. |
| 300 | |
| 301 | Encode up to 1.86 internally used "local $_" to implement ths older |
| 302 | form. But consider the code below; |
| 303 | |
| 304 | use Encode; |
| 305 | $_ = "eeeee" ; |
| 306 | while (/(e)/g) { |
| 307 | my $utf = decode('aliased-encoding-name', $1); |
| 308 | print "position:",pos,"\n"; |
| 309 | } |
| 310 | |
| 311 | Prior to Encode 1.86 this fails because of "local $_". |
| 312 | |
| 313 | =head2 Alias overloading |
| 314 | |
| 315 | You can override predefined aliases by simply applying define_alias(). |
| 316 | The new alias is always evaluated first, and when necessary, |
| 317 | define_alias() flushes the internal cache to make the new definition |
| 318 | available. |
| 319 | |
| 320 | # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a |
| 321 | # superset of SHIFT_JIS |
| 322 | |
| 323 | define_alias( qr/shift.*jis$/i => '"cp932"' ); |
| 324 | define_alias( qr/sjis$/i => '"cp932"' ); |
| 325 | |
| 326 | If you want to zap all predefined aliases, you can use |
| 327 | |
| 328 | Encode::Alias->undef_aliases; |
| 329 | |
| 330 | to do so. And |
| 331 | |
| 332 | Encode::Alias->init_aliases; |
| 333 | |
| 334 | gets the factory settings back. |
| 335 | |
| 336 | =head1 SEE ALSO |
| 337 | |
| 338 | L<Encode>, L<Encode::Supported> |
| 339 | |
| 340 | =cut |
| 341 | |