# Module version, derived from the RCS Revision keyword below: the first
# number is printed as-is and every following number is zero-padded to two
# digits, so revision 2.4 yields "2.04".
our $VERSION = do { my @r = (q
$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x
$#r, @r };
# Public, encouraged API is exported by default
our @Alias; # ordered matching list
our %Alias; # cached known aliases
# Do the lookup only when $find has no entry in the %Alias cache yet.
unless (exists $Alias{$find}) {
$Alias{$find} = undef; # Recursion guard
# Step through @Alias two elements at a time.
for (my $i=0; $i < @Alias; $i += 2){
# Alias given as a compiled regex: it applies when $find matches it.
if (ref($alias) eq 'Regexp' && $find =~ $alias){
DEBUG
and warn "eval $val";
# Report a non-empty $@, but only when DEBUG is true.
DEBUG
and $@
and warn "$val, $@";
# Alias given as a code reference.
}elsif (ref($alias) eq 'CODE'){
DEBUG
and warn "$alias", "->", "($find)";
# Plain string alias: compared with $find case-insensitively.
}elsif (lc($find) eq lc($alias)){
next if $new eq $find; # avoid (direct) recursion on bugs
DEBUG
and warn "$alias, $new";
# A reference in $new is used as-is; anything else is resolved by name
# through Encode::find_encoding.
my $enc = (ref($new)) ?
$new : Encode
::find_encoding
($new);
# case insensitive search when canonical is not in all lowercase
# Candidates are the keys of %Encode::Encoding and %Encode::ExtModule.
for my $name (keys %Encode::Encoding
, keys %Encode::ExtModule
){
# Skip names that differ from $lcfind once lower-cased.
$lcfind eq lc($name) or next;
$Alias{$find} = Encode
::find_encoding
($name);
DEBUG
and warn "$find => $name";
# Taken when the cache holds a true value for $find.
if (my $e = $Alias{$find}){
warn "find_alias($class, $find)->name = $name";
# Take the next alias/name pair off the front of the argument list.
my ($alias,$name) = splice(@_,0,2);
unshift(@Alias, $alias => $name); # newer one has precedence
# clear %Alias cache to allow overrides
# Regex alias: applies to a key $k that the pattern matches.
if (ref($alias) eq 'Regexp' && $k =~ $alias){
DEBUG
and warn "delete \$Alias\{$k\}";
# Code-reference alias.
elsif (ref($alias) eq 'CODE'){
DEBUG
and warn "delete \$Alias\{$k\}";
# Drop the cache entry keyed by whatever the code reference returns for $name.
delete $Alias{$alias->($name)};
DEBUG
and warn "delete \$Alias\{$alias\}";
# Allow latin-1 style names as well
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
# Allow winlatin1 style names as well
# Try the all-lower-case version should all else fail
define_alias( qr/^(.*)$/ => '"\L$1"' );

# UTF-7, UCS-2 and UCS-4 spellings, with or without the inner dashes.
define_alias( qr/^UTF-?7$/i     => '"UTF-7"' );
define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
define_alias(
    qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
    qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
    qr/^iso-10646-1$/i      => '"UCS-2BE"'
);
define_alias
( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
qr/^UTF-?(16|32)$/i => '"UTF-$1"',
# Spellings that resolve to "ascii".
define_alias( qr/^(?:US-?)ascii$/i       => '"ascii"' );
define_alias( 'C'                        => 'ascii' );
define_alias( qr/\bISO[-_]?646[-_]?US$/i => '"ascii"' );

# Allow variants of iso-8859-1 etc.
define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );

# At least HP-UX has these.
define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
define_alias(
    qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
      '"${1}8"'
);

# The Official name of ASCII.
define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
# has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );

# latinN style names: the captured number indexes @Latin2iso, and the
# replacement evaluates to undef when that slot is not defined.
define_alias(
    qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
      'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
);
# win<script> style names (winlatin1, wincyrillic, ...): the captured script
# name is lower-cased and looked up in %Encode::Alias::Winlatin2cp, and the
# result is prefixed with "cp".
# The alternation used to list "baltic" twice and was spread over many lines
# under /x; the duplicate is removed and the pattern is written on one line.
# The set of matched names is unchanged.
define_alias(
    qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|hebrew|arabic|vietnamese)$/i
      => '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
);
# Common names for non-latin preferred MIME names
define_alias
( 'ascii' => 'US-ascii',
'cyrillic' => 'iso-8859-5',
'arabic' => 'iso-8859-6',
'hebrew' => 'iso-8859-8',
'tis620' => 'iso-8859-11',
# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
# And windows-* is registered in IANA!
define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );

# Sometimes seen with a leading zero.
# define_alias( qr/\bcp037\b/i => '"cp37"');

# predefined in *.ucm; unneeded
# define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
define_alias( qr/^mac_(.*)$/i => '"mac$1"' );

# Ououououou. Gone. They are different!
# define_alias( qr/\bmacRomanian$/i => '"macRumanian"');

# Standardize on the dashed versions.
define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
unless ($Encode::ON_EBCDIC
){
# Names resolving to euc-cn / cp936.
define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );

# define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
# CP936 doesn't have vendor-addon for GBK, so they're identical.
define_alias( qr/^gbk$/i => '"cp936"' );

# This fixes gb2312 vs. euc-cn confusion, practically
define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );

# Names resolving to 7bit-jis / euc-jp / shiftjis / cp932.
define_alias( qr/\bjis$/i         => '"7bit-jis"' );
define_alias( qr/\beuc.*jp$/i     => '"euc-jp"' );
define_alias( qr/\bjp.*euc$/i     => '"euc-jp"' );
define_alias( qr/\bujis$/i        => '"euc-jp"' );
define_alias( qr/\bshift.*jis$/i  => '"shiftjis"' );
define_alias( qr/\bsjis$/i        => '"shiftjis"' );
define_alias( qr/\bwindows-31j$/i => '"cp932"' );

# Names resolving to euc-kr / cp949.
define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );

# This fixes ksc5601 vs. euc-kr confusion, practically
define_alias( qr/(?:x-)?uhc$/i         => '"cp949"' );
define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
define_alias( qr/\bks_c_5601-1987$/i   => '"cp949"' );

# Names resolving to big5-eten / big5-hkscs.
define_alias( qr/\bbig-?5$/i              => '"big5-eten"' );
define_alias( qr/\bbig5-?et(?:en)?$/i     => '"big5-eten"' );
define_alias( qr/\btca[-_]?big5$/i        => '"big5-eten"' );
define_alias( qr/\bbig5-?hk(?:scs)?$/i    => '"big5-hkscs"' );
define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
define_alias( qr/^UTF-8$/i => '"utf-8-strict"' );

# Finally, map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Vietnamese encodings VPS
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
# Kannada Khmer Korean Laotian Malayalam Mongolian
# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
Encode::Alias - alias definitions to encodings
define_alias( newName => ENCODING);
Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as described
in L<Encode>).
Currently I<newName> can be specified in the following ways:
=item As a simple string.
=item As a qr// compiled regular expression, e.g.:
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
in order to allow C<$1> etc. to be substituted. The example is one
way to alias names as used in X11 fonts to the MIME names for the
iso-8859-* family. Note the double quotes inside the single quotes.
(You do not have to do this yourself, because this example is predefined.)
If you are using a regex here, you have to use the quotes as shown or
it won't work. Also note that regex handling is tricky even for the
experienced. Use this feature with caution.
=item As a code reference, e.g.:
define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
The same effect as the example above in a different way. The coderef
takes the alias name as an argument and returns a canonical name on
success or undef if not. Note the second argument is not required.
Use this with even more caution than the regex version.
=head3 Changes in code reference aliasing
As of Encode 1.87, the older form
define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
Encode up to 1.86 internally used "local $_" to implement this older
form. But consider the code below:
my $utf = decode('aliased-encoding-name', $1);
print "position:",pos,"\n";
With Encode 1.86 and earlier, this fails because of "local $_".
You can override predefined aliases by simply applying define_alias().
The new alias is always evaluated first, and when necessary,
define_alias() flushes the internal cache to make the new definition
take effect.
# redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
define_alias( qr/shift.*jis$/i => '"cp932"' );
define_alias( qr/sjis$/i => '"cp932"' );
If you want to zap all predefined aliases, you can use
Encode::Alias->undef_aliases;
Encode::Alias->init_aliases;
gets the factory settings back.
L<Encode>, L<Encode::Supported>