| 1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
| 2 | .\" |
| 3 | .\" Standard preamble: |
| 4 | .\" ======================================================================== |
| 5 | .de Sh \" Subsection heading |
| 6 | .br |
| 7 | .if t .Sp |
| 8 | .ne 5 |
| 9 | .PP |
| 10 | \fB\\$1\fR |
| 11 | .PP |
| 12 | .. |
| 13 | .de Sp \" Vertical space (when we can't use .PP) |
| 14 | .if t .sp .5v |
| 15 | .if n .sp |
| 16 | .. |
| 17 | .de Vb \" Begin verbatim text |
| 18 | .ft CW |
| 19 | .nf |
| 20 | .ne \\$1 |
| 21 | .. |
| 22 | .de Ve \" End verbatim text |
| 23 | .ft R |
| 24 | .fi |
| 25 | .. |
| 26 | .\" Set up some character translations and predefined strings. \*(-- will |
| 27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
| 28 | .\" double quote, and \*(R" will give a right double quote. | will give a |
| 29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to |
| 30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' |
| 31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. |
| 32 | .tr \(*W-|\(bv\*(Tr |
| 33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
| 34 | .ie n \{\ |
| 35 | . ds -- \(*W- |
| 36 | . ds PI pi |
| 37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
| 38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
| 39 | . ds L" "" |
| 40 | . ds R" "" |
| 41 | . ds C` "" |
| 42 | . ds C' "" |
| 43 | 'br\} |
| 44 | .el\{\ |
| 45 | . ds -- \|\(em\| |
| 46 | . ds PI \(*p |
| 47 | . ds L" `` |
| 48 | . ds R" '' |
| 49 | 'br\} |
| 50 | .\" |
| 51 | .\" If the F register is turned on, we'll generate index entries on stderr for |
| 52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index |
| 53 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
| 54 | .\" output yourself in some meaningful fashion. |
| 55 | .if \nF \{\ |
| 56 | . de IX |
| 57 | . tm Index:\\$1\t\\n%\t"\\$2" |
| 58 | .. |
| 59 | . nr % 0 |
| 60 | . rr F |
| 61 | .\} |
| 62 | .\" |
| 63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
| 64 | .\" way too many mistakes in technical documents. |
| 65 | .hy 0 |
| 66 | .if n .na |
| 67 | .\" |
| 68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
| 69 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
| 70 | . \" fudge factors for nroff and troff |
| 71 | .if n \{\ |
| 72 | . ds #H 0 |
| 73 | . ds #V .8m |
| 74 | . ds #F .3m |
| 75 | . ds #[ \f1 |
| 76 | . ds #] \fP |
| 77 | .\} |
| 78 | .if t \{\ |
| 79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
| 80 | . ds #V .6m |
| 81 | . ds #F 0 |
| 82 | . ds #[ \& |
| 83 | . ds #] \& |
| 84 | .\} |
| 85 | . \" simple accents for nroff and troff |
| 86 | .if n \{\ |
| 87 | . ds ' \& |
| 88 | . ds ` \& |
| 89 | . ds ^ \& |
| 90 | . ds , \& |
| 91 | . ds ~ ~ |
| 92 | . ds / |
| 93 | .\} |
| 94 | .if t \{\ |
| 95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
| 96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
| 97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
| 98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
| 99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
| 100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
| 101 | .\} |
| 102 | . \" troff and (daisy-wheel) nroff accents |
| 103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
| 104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
| 105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
| 106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
| 107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
| 108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
| 109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
| 110 | .ds ae a\h'-(\w'a'u*4/10)'e |
| 111 | .ds Ae A\h'-(\w'A'u*4/10)'E |
| 112 | . \" corrections for vroff |
| 113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
| 114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
| 115 | . \" for low resolution devices (crt and lpr) |
| 116 | .if \n(.H>23 .if \n(.V>19 \ |
| 117 | \{\ |
| 118 | . ds : e |
| 119 | . ds 8 ss |
| 120 | . ds o a |
| 121 | . ds d- d\h'-1'\(ga |
| 122 | . ds D- D\h'-1'\(hy |
| 123 | . ds th \o'bp' |
| 124 | . ds Th \o'LP' |
| 125 | . ds ae ae |
| 126 | . ds Ae AE |
| 127 | .\} |
| 128 | .rm #[ #] #H #V #F C |
| 129 | .\" ======================================================================== |
| 130 | .\" |
| 131 | .IX Title "Encode::Supported 3" |
| 132 | .TH Encode::Supported 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide" |
| 133 | .SH "NAME" |
| 134 | Encode::Supported \-\- Encodings supported by Encode |
| 135 | .SH "DESCRIPTION" |
| 136 | .IX Header "DESCRIPTION" |
| 137 | .Sh "Encoding Names" |
| 138 | .IX Subsection "Encoding Names" |
| 139 | Encoding names are case insensitive. White space in names |
| 140 | is ignored. In addition, an encoding may have aliases. |
| 141 | Each encoding has one \*(L"canonical\*(R" name. The \*(L"canonical\*(R" |
| 142 | name is chosen from the names of the encoding by picking |
| 143 | the first in the following sequence (with a few exceptions). |
| 144 | .IP "\(bu" 4 |
| 145 | The name used by the Perl community. That includes 'utf8' and 'ascii'. |
| 146 | Unlike aliases, canonical names directly reach the method so such |
| 147 | frequently used words like 'utf8' don't need to do alias lookups. |
| 148 | .IP "\(bu" 4 |
| 149 | The \s-1MIME\s0 name as defined in \s-1IETF\s0 RFCs. This includes all \*(L"iso\-\*(R"s. |
| 150 | .IP "\(bu" 4 |
| 151 | The name in the \s-1IANA\s0 registry. |
| 152 | .IP "\(bu" 4 |
| 153 | The name used by the organization that defined it. |
| 154 | .PP |
| 155 | In case \fIde jure\fR canonical names differ from that of the Encode |
| 156 | module, they are always aliased if the encoding is ever implemented. So you can |
| 157 | safely tell if a given encoding is implemented or not just by passing |
| 158 | the canonical name. |
| 159 | .PP |
| 160 | Because of all the alias issues, and because in the general case |
| 161 | encodings have state, \*(L"Encode\*(R" uses an encoding object internally |
| 162 | once an operation is in progress. |
| 163 | .SH "Supported Encodings" |
| 164 | .IX Header "Supported Encodings" |
| 165 | As of Perl 5.8.0, at least the following encodings are recognized. |
| 166 | Note that unless otherwise specified, they are all case insensitive |
| 167 | (via alias) and all occurrences of spaces are replaced with '\-'. |
| 168 | In other words, \*(L"\s-1ISO\s0 8859 1\*(R" and \*(L"iso\-8859\-1\*(R" are identical. |
| 169 | .PP |
| 170 | Encodings are categorized and implemented in several different modules |
| 171 | but you don't have to \f(CW\*(C`use Encode::XX\*(C'\fR to make them available for |
| 172 | most cases. Encode.pm will automatically load those modules on demand. |
| 173 | .Sh "Built-in Encodings" |
| 174 | .IX Subsection "Built-in Encodings" |
| 175 | The following encodings are always available. |
| 176 | .PP |
| 177 | .Vb 8 |
| 178 | \& Canonical Aliases Comments & References |
| 179 | \& ---------------------------------------------------------------- |
| 180 | \& ascii US-ascii ISO-646-US [ECMA] |
| 181 | \& ascii-ctrl Special Encoding |
| 182 | \& iso-8859-1 latin1 [ISO] |
| 183 | \& null Special Encoding |
| 184 | \& utf8 UTF-8 [RFC2279] |
| 185 | \& ---------------------------------------------------------------- |
| 186 | .Ve |
| 187 | .PP |
| 188 | \&\fInull\fR and \fIascii-ctrl\fR are special. \*(L"null\*(R" fails for all characters |
| 189 | so when you set fallback mode to \s-1PERLQQ\s0, \s-1HTMLCREF\s0 or \s-1XMLCREF\s0, \s-1ALL\s0 |
| 190 | \&\s-1CHARACTERS\s0 will fall back to character references. Ditto for |
| 191 | \&\*(L"ascii\-ctrl\*(R" except for control characters. For fallback modes, see |
| 192 | Encode. |
| 193 | .Sh "Encode::Unicode \*(-- other Unicode encodings" |
| 194 | .IX Subsection "Encode::Unicode other Unicode encodings" |
| 195 | Unicode coding schemes other than native utf8 are supported by |
| 196 | Encode::Unicode, which will be autoloaded on demand. |
| 197 | .PP |
| 198 | .Vb 11 |
| 199 | \& ---------------------------------------------------------------- |
| 200 | \& UCS-2BE UCS-2, iso-10646-1 [IANA, UC] |
| 201 | \& UCS-2LE [UC] |
| 202 | \& UTF-16 [UC] |
| 203 | \& UTF-16BE [UC] |
| 204 | \& UTF-16LE [UC] |
| 205 | \& UTF-32 [UC] |
| 206 | \& UTF-32BE UCS-4 [UC] |
| 207 | \& UTF-32LE [UC] |
| 208 | \& UTF-7 [RFC2152] |
| 209 | \& ---------------------------------------------------------------- |
| 210 | .Ve |
| 211 | .PP |
| 212 | To find how (UCS\-2|UTF\-(16|32))(LE|BE)? differ from one another, |
| 213 | see Encode::Unicode. |
| 214 | .PP |
| 215 | \&\s-1UTF\-7\s0 is a special encoding which \*(L"re\-encodes\*(R" \s-1UTF\-16BE\s0 into a 7\-bit |
| 216 | encoding. It is implemented separately by Encode::Unicode::UTF7. |
| 217 | .Sh "Encode::Byte \*(-- Extended \s-1ASCII\s0" |
| 218 | .IX Subsection "Encode::Byte Extended ASCII" |
| 219 | Encode::Byte implements most single-byte encodings except for |
| 220 | Symbols and \s-1EBCDIC\s0. The following encodings are based on single-byte |
| 221 | encodings implemented as extended \s-1ASCII\s0. Most of them map |
| 222 | \&\ex80\-\exff (upper half) to non-ASCII characters. |
| 223 | .IP "\s-1ISO\-8859\s0 and corresponding vendor mappings" 4 |
| 224 | .IX Item "ISO-8859 and corresponding vendor mappings" |
| 225 | Since there are so many, they are presented in table format with |
| 226 | languages and corresponding encoding names by vendors. Note that |
| 227 | the table is sorted in order of \s-1ISO\-8859\s0 and the corresponding vendor |
| 228 | mappings are slightly different from that of \s-1ISO\s0. See |
| 229 | <http://czyborra.com/charsets/iso8859.html> for details. |
| 230 | .Sp |
| 231 | .Vb 32 |
| 232 | \& Lang/Regions ISO/Other Std. DOS Windows Macintosh Others |
| 233 | \& ---------------------------------------------------------------- |
| 234 | \& N. America (ASCII) cp437 AdobeStandardEncoding |
| 235 | \& cp863 (DOSCanadaF) |
| 236 | \& W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep |
| 237 | \& hp-roman8 |
| 238 | \& cp860 (DOSPortuguese) |
| 239 | \& Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman |
| 240 | \& MacCroatian |
| 241 | \& MacRomanian |
| 242 | \& MacRumanian |
| 243 | \& Latin3[1] iso-8859-3 |
| 244 | \& Latin4[2] iso-8859-4 |
| 245 | \& Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic |
| 246 | \& (See also next section) cp866 MacUkrainian |
| 247 | \& Arabic iso-8859-6 cp864 cp1256 MacArabic |
| 248 | \& cp1006 MacFarsi |
| 249 | \& Greek iso-8859-7 cp737 cp1253 MacGreek |
| 250 | \& cp869 (DOSGreek2) |
| 251 | \& Hebrew iso-8859-8 cp862 cp1255 MacHebrew |
| 252 | \& Turkish iso-8859-9 cp857 cp1254 MacTurkish |
| 253 | \& Nordics iso-8859-10 cp865 |
| 254 | \& cp861 MacIcelandic |
| 255 | \& MacSami |
| 256 | \& Thai iso-8859-11[3] cp874 MacThai |
| 257 | \& (iso-8859-12 is nonexistent. Reserved for Indics?) |
| 258 | \& Baltics iso-8859-13 cp775 cp1257 |
| 259 | \& Celtics iso-8859-14 |
| 260 | \& Latin9 [4] iso-8859-15 |
| 261 | \& Latin10 iso-8859-16 |
| 262 | \& Vietnamese viscii cp1258 MacVietnamese |
| 263 | \& ---------------------------------------------------------------- |
| 264 | .Ve |
| 265 | .Sp |
| 266 | .Vb 5 |
| 267 | \& [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9. |
| 268 | \& [2] Baltics. Now on 8859-10, except for Latvian. |
| 269 | \& [3] TIS 620 + Non-Breaking Space (0xA0 / U+00A0) |
| 270 | \& [4] Nicknamed Latin0; the Euro sign as well as French and Finnish |
| 271 | \& letters that are missing from 8859-1 were added. |
| 272 | .Ve |
| 273 | .Sp |
| 274 | All cp* are also available as ibm\-*, ms\-*, and windows\-* . See also |
| 275 | <http://czyborra.com/charsets/codepages.html>. |
| 276 | .Sp |
| 277 | Macintosh encodings don't seem to be registered in such entities as |
| 278 | \&\s-1IANA\s0. \*(L"Canonical\*(R" names in Encode are based upon Apple's Tech Note |
| 279 | 1150. See <http://developer.apple.com/technotes/tn/tn1150.html> |
| 280 | for details. |
| 281 | .IP "\s-1KOI8\s0 \- De Facto Standard for the Cyrillic world" 4 |
| 282 | .IX Item "KOI8 - De Facto Standard for the Cyrillic world" |
| 283 | Though \s-1ISO\-8859\s0 does have \s-1ISO\-8859\-5\s0, the \s-1KOI8\s0 series is far more |
| 284 | popular in the Net. Encode comes with the following \s-1KOI\s0 charsets. |
| 285 | For gory details, see <http://czyborra.com/charsets/cyrillic.html> |
| 286 | .Sp |
| 287 | .Vb 5 |
| 288 | \& ---------------------------------------------------------------- |
| 289 | \& koi8-f |
| 290 | \& koi8-r cp878 [RFC1489] |
| 291 | \& koi8-u [RFC2319] |
| 292 | \& ---------------------------------------------------------------- |
| 293 | .Ve |
| 294 | .IP "gsm0338 \- Hentai Latin 1" 4 |
| 295 | .IX Item "gsm0338 - Hentai Latin 1" |
| 296 | \&\s-1GSM0338\s0 is for \s-1GSM\s0 handsets. Though it shares alphanumerals with |
| 297 | \&\s-1ASCII\s0, control character ranges and other parts are mapped very |
| 298 | differently, mainly to store Greek characters. There are also escape |
| 299 | sequences (starting with 0x1B) to cover e.g. the Euro sign. Some |
| 300 | special cases like a trailing 0x00 byte or a lone 0x1B byte are not |
| 301 | well-defined and \fIdecode()\fR will return an empty string for them. |
| 302 | One possible workaround is |
| 303 | .Sp |
| 304 | .Vb 3 |
| 305 | \& $gsm =~ s/\ex00\ez/\ex00\ex00/; |
| 306 | \& $uni = decode("gsm0338", $gsm); |
| 307 | \& $uni .= "\exA0" if $gsm =~ /\ex1B\ez/; |
| 308 | .Ve |
| 309 | .Sp |
| 310 | Note that the Encode implementation of \s-1GSM0338\s0 does not implement the |
| 311 | reuse of Latin capital letters as Greek capital letters (for example, |
| 312 | the 0x5A is U+005A (\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 Z), not U+0396 (\s-1GREEK\s0 \s-1CAPITAL\s0 |
| 313 | \&\s-1LETTER\s0 \s-1ZETA\s0)). |
| 314 | .Sp |
| 315 | The \s-1GSM0338\s0 is also covered in Encode::Byte even though it is not |
| 316 | an \*(L"extended \s-1ASCII\s0\*(R" encoding. |
| 317 | .Sh "\s-1CJK:\s0 Chinese, Japanese, Korean (Multibyte)" |
| 318 | .IX Subsection "CJK: Chinese, Japanese, Korean (Multibyte)" |
| 319 | Note that Vietnamese is listed above. Also read \*(L"Encoding vs Charset\*(R" |
| 320 | below. Also note that these are implemented in distinct modules by |
| 321 | countries, due to the size concerns (simplified Chinese is mapped |
| 322 | to '\s-1CN\s0', continental China, while traditional Chinese is mapped to |
| 323 | \&'\s-1TW\s0', Taiwan). Please refer to their respective documentation pages. |
| 324 | .IP "Encode::CN \*(-- Continental China" 4 |
| 325 | .IX Item "Encode::CN Continental China" |
| 326 | .Vb 9 |
| 327 | \& Standard DOS/Win Macintosh Comment/Reference |
| 328 | \& ---------------------------------------------------------------- |
| 329 | \& euc-cn [1] MacChineseSimp |
| 330 | \& (gbk) cp936 [2] |
| 331 | \& gb12345-raw { GB12345 without CES } |
| 332 | \& gb2312-raw { GB2312 without CES } |
| 333 | \& hz |
| 334 | \& iso-ir-165 |
| 335 | \& ---------------------------------------------------------------- |
| 336 | .Ve |
| 337 | .Sp |
| 338 | .Vb 2 |
| 339 | \& [1] GB2312 is aliased to this. See L<Microsoft-related naming mess> |
| 340 | \& [2] gbk is aliased to this. See L<Microsoft-related naming mess> |
| 341 | .Ve |
| 342 | .IP "Encode::JP \*(-- Japan" 4 |
| 343 | .IX Item "Encode::JP Japan" |
| 344 | .Vb 11 |
| 345 | \& Standard DOS/Win Macintosh Comment/Reference |
| 346 | \& ---------------------------------------------------------------- |
| 347 | \& euc-jp |
| 348 | \& shiftjis cp932 macJapanese |
| 349 | \& 7bit-jis |
| 350 | \& iso-2022-jp [RFC1468] |
| 351 | \& iso-2022-jp-1 [RFC2237] |
| 352 | \& jis0201-raw { JIS X 0201 (roman + halfwidth kana) without CES } |
| 353 | \& jis0208-raw { JIS X 0208 (Kanji + fullwidth kana) without CES } |
| 354 | \& jis0212-raw { JIS X 0212 (Extended Kanji) without CES } |
| 355 | \& ---------------------------------------------------------------- |
| 356 | .Ve |
| 357 | .IP "Encode::KR \*(-- Korea" 4 |
| 358 | .IX Item "Encode::KR Korea" |
| 359 | .Vb 8 |
| 360 | \& Standard DOS/Win Macintosh Comment/Reference |
| 361 | \& ---------------------------------------------------------------- |
| 362 | \& euc-kr MacKorean [RFC1557] |
| 363 | \& cp949 [1] |
| 364 | \& iso-2022-kr [RFC1557] |
| 365 | \& johab [KS X 1001:1998, Annex 3] |
| 366 | \& ksc5601-raw { KSC5601 without CES } |
| 367 | \& ---------------------------------------------------------------- |
| 368 | .Ve |
| 369 | .Sp |
| 370 | .Vb 2 |
| 371 | \& [1] ks_c_5601-1987, (x-)?windows-949, and uhc are aliased to this. |
| 372 | \& See below. |
| 373 | .Ve |
| 374 | .IP "Encode::TW \*(-- Taiwan" 4 |
| 375 | .IX Item "Encode::TW Taiwan" |
| 376 | .Vb 5 |
| 377 | \& Standard DOS/Win Macintosh Comment/Reference |
| 378 | \& ---------------------------------------------------------------- |
| 379 | \& big5-eten cp950 MacChineseTrad {big5 aliased to big5-eten} |
| 380 | \& big5-hkscs |
| 381 | \& ---------------------------------------------------------------- |
| 382 | .Ve |
| 383 | .IP "Encode::HanExtra \*(-- More Chinese via \s-1CPAN\s0" 4 |
| 384 | .IX Item "Encode::HanExtra More Chinese via CPAN" |
| 385 | Due to the size concerns, additional Chinese encodings below are |
| 386 | distributed separately on \s-1CPAN\s0, under the name Encode::HanExtra. |
| 387 | .Sp |
| 388 | .Vb 8 |
| 389 | \& Standard DOS/Win Macintosh Comment/Reference |
| 390 | \& ---------------------------------------------------------------- |
| 391 | \& big5ext CMEX's Big5e Extension |
| 392 | \& big5plus CMEX's Big5+ Extension |
| 393 | \& cccii Chinese Character Code for Information Interchange |
| 394 | \& euc-tw EUC (Extended Unix Character) |
| 395 | \& gb18030 GBK with Traditional Characters |
| 396 | \& ---------------------------------------------------------------- |
| 397 | .Ve |
| 398 | .IP "Encode::JIS2K \*(-- \s-1JIS\s0 X 0213 encodings via \s-1CPAN\s0" 4 |
| 399 | .IX Item "Encode::JIS2K JIS X 0213 encodings via CPAN" |
| 400 | Due to size concerns, additional Japanese encodings below are |
| 401 | distributed separately on \s-1CPAN\s0, under the name Encode::JIS2K. |
| 402 | .Sp |
| 403 | .Vb 8 |
| 404 | \& Standard DOS/Win Macintosh Comment/Reference |
| 405 | \& ---------------------------------------------------------------- |
| 406 | \& euc-jisx0213 |
| 407 | \& shiftjisx0213 |
| 408 | \& iso-2022-jp-3 |
| 409 | \& jis0213-1-raw |
| 410 | \& jis0213-2-raw |
| 411 | \& ---------------------------------------------------------------- |
| 412 | .Ve |
| 413 | .Sh "Miscellaneous encodings" |
| 414 | .IX Subsection "Miscellaneous encodings" |
| 415 | .IP "Encode::EBCDIC" 4 |
| 416 | .IX Item "Encode::EBCDIC" |
| 417 | See perlebcdic for details. |
| 418 | .Sp |
| 419 | .Vb 8 |
| 420 | \& ---------------------------------------------------------------- |
| 421 | \& cp37 |
| 422 | \& cp500 |
| 423 | \& cp875 |
| 424 | \& cp1026 |
| 425 | \& cp1047 |
| 426 | \& posix-bc |
| 427 | \& ---------------------------------------------------------------- |
| 428 | .Ve |
| 429 | .IP "Encode::Symbols" 4 |
| 430 | .IX Item "Encode::Symbols" |
| 431 | For symbols and dingbats. |
| 432 | .Sp |
| 433 | .Vb 7 |
| 434 | \& ---------------------------------------------------------------- |
| 435 | \& symbol |
| 436 | \& dingbats |
| 437 | \& MacDingbats |
| 438 | \& AdobeZdingbat |
| 439 | \& AdobeSymbol |
| 440 | \& ---------------------------------------------------------------- |
| 441 | .Ve |
| 442 | .IP "Encode::MIME::Header" 4 |
| 443 | .IX Item "Encode::MIME::Header" |
| 444 | Strictly speaking, \s-1MIME\s0 header encoding documented in \s-1RFC\s0 2047 is more |
| 445 | of encapsulation than encoding. However, their support in the modern |
| 446 | world is imperative so they are supported. |
| 447 | .Sp |
| 448 | .Vb 5 |
| 449 | \& ---------------------------------------------------------------- |
| 450 | \& MIME-Header [RFC2047] |
| 451 | \& MIME-B [RFC2047] |
| 452 | \& MIME-Q [RFC2047] |
| 453 | \& ---------------------------------------------------------------- |
| 454 | .Ve |
| 455 | .IP "Encode::Guess" 4 |
| 456 | .IX Item "Encode::Guess" |
| 457 | This one is not a name of encoding but a utility that lets you pick up |
| 458 | the most appropriate encoding for the data out of the given \fIsuspects\fR. See |
| 459 | Encode::Guess for details. |
| 460 | .SH "Unsupported encodings" |
| 461 | .IX Header "Unsupported encodings" |
| 462 | The following encodings are not supported as yet; some because they |
| 463 | are rarely used, some because of technical difficulties. They may |
| 464 | be supported by external modules via \s-1CPAN\s0 in the future, however. |
| 465 | .IP "\s-1ISO\-2022\-JP\-2\s0 [\s-1RFC1554\s0]" 4 |
| 466 | .IX Item "ISO-2022-JP-2 [RFC1554]" |
| 467 | Not very popular yet. Needs Unicode Database or equivalent to |
| 468 | implement \fIencode()\fR (because it includes \s-1JIS\s0 X 0208/0212, \s-1KSC5601\s0, and |
| 469 | \&\s-1GB2312\s0 simultaneously, whose code points in Unicode overlap. So you |
| 470 | need to look up the database to determine to what character set a given |
| 471 | Unicode character should belong). |
| 472 | .IP "\s-1ISO\-2022\-CN\s0 [\s-1RFC1922\s0]" 4 |
| 473 | .IX Item "ISO-2022-CN [RFC1922]" |
| 474 | Not very popular. Needs \s-1CNS\s0 11643\-1 and \-2 which are not available in |
| 475 | this module. \s-1CNS\s0 11643 is supported (via euc\-tw) in Encode::HanExtra. |
| 476 | Autrijus Tang may add support for this encoding in his module in future. |
| 477 | .IP "Various HP-UX encodings" 4 |
| 478 | .IX Item "Various HP-UX encodings" |
| 479 | The following are unsupported due to the lack of mapping data. |
| 480 | .Sp |
| 481 | .Vb 2 |
| 482 | \& '8' - arabic8, greek8, hebrew8, kana8, thai8, and turkish8 |
| 483 | \& '15' - japanese15, korean15, and roi15 |
| 484 | .Ve |
| 485 | .IP "Cyrillic encoding \s-1ISO\-IR\-111\s0" 4 |
| 486 | .IX Item "Cyrillic encoding ISO-IR-111" |
| 487 | Anton Tagunov doubts its usefulness. |
| 488 | .IP "\s-1ISO\-8859\-8\-1\s0 [Hebrew]" 4 |
| 489 | .IX Item "ISO-8859-8-1 [Hebrew]" |
| 490 | None of the Encode team knows Hebrew enough (\s-1ISO\-8859\-8\s0, cp1255 and |
| 491 | MacHebrew are supported because and just because there were mappings |
| 492 | available at <http://www.unicode.org/>). Contributions welcome. |
| 493 | .IP "\s-1ISIRI\s0 3342, Iran System, \s-1ISIRI\s0 2900 [Farsi]" 4 |
| 494 | .IX Item "ISIRI 3342, Iran System, ISIRI 2900 [Farsi]" |
| 495 | Ditto. |
| 496 | .IP "Thai encoding \s-1TCVN\s0" 4 |
| 497 | .IX Item "Thai encoding TCVN" |
| 498 | Ditto. |
| 499 | .IP "Vietnamese encodings \s-1VPS\s0" 4 |
| 500 | .IX Item "Vietnamese encodings VPS" |
| 501 | Though Jungshik Shin has reported that Mozilla supports this encoding, |
| 502 | it was too late before 5.8.0 for us to add it. In the future, it |
| 503 | may be available via a separate module. See |
| 504 | <http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf> |
| 505 | and |
| 506 | <http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut> |
| 507 | if you are interested in helping us. |
| 508 | .IP "Various Mac encodings" 4 |
| 509 | .IX Item "Various Mac encodings" |
| 510 | The following are unsupported due to the lack of mapping data. |
| 511 | .Sp |
| 512 | .Vb 5 |
| 513 | \& MacArmenian, MacBengali, MacBurmese, MacEthiopic |
| 514 | \& MacExtArabic, MacGeorgian, MacKannada, MacKhmer |
| 515 | \& MacLaotian, MacMalayalam, MacMongolian, MacOriya |
| 516 | \& MacSinhalese, MacTamil, MacTelugu, MacTibetan |
| 517 | \& MacVietnamese |
| 518 | .Ve |
| 519 | .Sp |
| 520 | The rest which are already available are based upon the vendor mappings |
| 521 | at <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> . |
| 522 | .IP "(Mac) Indic encodings" 4 |
| 523 | .IX Item "(Mac) Indic encodings" |
| 524 | The maps for the following are available at <http://www.unicode.org/> |
| 525 | but remain unsupported because those encodings need an algorithmic |
| 526 | approach, currently unsupported by \fIenc2xs\fR: |
| 527 | .Sp |
| 528 | .Vb 3 |
| 529 | \& MacDevanagari |
| 530 | \& MacGurmukhi |
| 531 | \& MacGujarati |
| 532 | .Ve |
| 533 | .Sp |
| 534 | For details, please see \f(CW\*(C`Unicode mapping issues and notes:\*(C'\fR at |
| 535 | <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT> . |
| 536 | .Sp |
| 537 | I believe this issue is prevalent not only for Mac Indics but also in |
| 538 | other Indic encodings, but the above were the only Indic encodings |
| 539 | maps that I could find at <http://www.unicode.org/> . |
| 540 | .SH "Encoding vs. Charset \*(-- terminology" |
| 541 | .IX Header "Encoding vs. Charset terminology" |
| 542 | We are used to using the term (character) \fIencoding\fR and \fIcharacter |
| 543 | set\fR interchangeably. But just as confusing the terms byte and |
| 544 | character is dangerous and the terms should be differentiated when |
| 545 | needed, we need to differentiate \fIencoding\fR and \fIcharacter set\fR. |
| 546 | .PP |
| 547 | To understand that, here is a description of how we make computers |
| 548 | grok our characters. |
| 549 | .IP "\(bu" 4 |
| 550 | First we start with which characters to include. We call this |
| 551 | collection of characters \fIcharacter repertoire\fR. |
| 552 | .IP "\(bu" 4 |
| 553 | Then we have to give each character a unique \s-1ID\s0 so your computer can |
| 554 | tell the difference between 'a' and 'A'. This itemized character |
| 555 | repertoire is now a \fIcharacter set\fR. |
| 556 | .IP "\(bu" 4 |
| 557 | If your computer can grok the character set without further |
| 558 | processing, you can go ahead and use it. This is called a \fIcoded |
| 559 | character set\fR (\s-1CCS\s0) or \fIraw character encoding\fR. \s-1ASCII\s0 is used this |
| 560 | way for most cases. |
| 561 | .IP "\(bu" 4 |
| 562 | But in many cases, especially multi-byte \s-1CJK\s0 encodings, you have to |
| 563 | tweak a little more. Your network connection may not accept any data |
| 564 | with the Most Significant Bit set, and your computer may not be able to |
| 565 | tell if a given byte is a whole character or just half of it. So you |
| 566 | have to \fIencode\fR the character set to use it. |
| 567 | .Sp |
| 568 | A \fIcharacter encoding scheme\fR (\s-1CES\s0) determines how to encode a given |
| 569 | character set, or a set of multiple character sets. 7bit \s-1ISO\-2022\s0 is |
| 570 | an example of a \s-1CES\s0. You switch between character sets via \fIescape |
| 571 | sequences\fR. |
| 572 | .PP |
| 573 | Technically, or mathematically, speaking, a character set encoded in |
| 574 | such a \s-1CES\s0 that maps character by character may form a \s-1CCS\s0. \s-1EUC\s0 is such |
| 575 | an example. The \s-1CES\s0 of \s-1EUC\s0 is as follows: |
| 576 | .IP "\(bu" 4 |
| 577 | Map \s-1ASCII\s0 unchanged. |
| 578 | .IP "\(bu" 4 |
| 579 | Map such a character set that consists of 94 or 96 powered by N |
| 580 | members by adding 0x80 to each byte. |
| 581 | .IP "\(bu" 4 |
| 582 | You can also use 0x8e and 0x8f to indicate that the following sequence of |
| 583 | characters belongs to yet another character set. To each following byte |
| 584 | is added the value 0x80. |
| 585 | .PP |
| 586 | By carefully looking at the encoded byte sequence, you can find that the |
| 587 | byte sequence conforms a unique number. In that sense, \s-1EUC\s0 is a \s-1CCS\s0 |
| 588 | generated by a \s-1CES\s0 above from up to four \s-1CCS\s0 (complicated?). \s-1UTF\-8\s0 |
| 589 | falls into this category. See \*(L"\s-1UTF\-8\s0\*(R" in perlunicode to find out how |
| 590 | \&\s-1UTF\-8\s0 maps Unicode to a byte sequence. |
| 591 | .PP |
| 592 | You may also have found out by now why 7bit \s-1ISO\-2022\s0 cannot comprise |
| 593 | a \s-1CCS\s0. If you look at a byte sequence \ex21\ex21, you can't tell if |
| 594 | it is two !'s or \s-1IDEOGRAPHIC\s0 \s-1SPACE\s0. \s-1EUC\s0 maps the latter to \exA1\exA1 |
| 595 | so you have no trouble differentiating between \*(L"!!\*(R" and \*(L"\ \*(R". |
| 596 | .SH "Encoding Classification (by Anton Tagunov and Dan Kogai)" |
| 597 | .IX Header "Encoding Classification (by Anton Tagunov and Dan Kogai)" |
| 598 | This section tries to classify the supported encodings by their |
| 599 | applicability for information exchange over the Internet and to |
| 600 | choose the most suitable aliases to name them in the context of |
| 601 | such communication. |
| 602 | .IP "\(bu" 4 |
| 603 | To (en|de)code encodings marked by \f(CW\*(C`(**)\*(C'\fR, you need |
| 604 | \&\f(CW\*(C`Encode::HanExtra\*(C'\fR, available from \s-1CPAN\s0. |
| 605 | .PP |
| 606 | Encoding names |
| 607 | .PP |
| 608 | .Vb 3 |
| 609 | \& US-ASCII UTF-8 ISO-8859-* KOI8-R |
| 610 | \& Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1 |
| 611 | \& EUC-KR Big5 GB2312 |
| 612 | .Ve |
| 613 | .PP |
| 614 | are registered with \s-1IANA\s0 as preferred \s-1MIME\s0 names and may |
| 615 | be used over the Internet. |
| 616 | .PP |
| 617 | \&\f(CW\*(C`Shift_JIS\*(C'\fR has been officialized by \s-1JIS\s0 X 0208:1997. |
| 618 | \&\*(L"Microsoft\-related naming mess\*(R" gives details. |
| 619 | .PP |
| 620 | \&\f(CW\*(C`GB2312\*(C'\fR is the \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR. |
| 621 | See \*(L"Microsoft\-related naming mess\*(R" for details. |
| 622 | .PP |
| 623 | \&\f(CW\*(C`GB_2312\-80\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR |
| 624 | with Encode. See Encode::CN for details. |
| 625 | .PP |
| 626 | .Vb 2 |
| 627 | \& EUC-CN |
| 628 | \& KOI8-U [RFC2319] |
| 629 | .Ve |
| 630 | .PP |
| 631 | have not been registered with \s-1IANA\s0 (as of March 2002) but |
| 632 | seem to be supported by major web browsers. |
| 633 | The \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR is \f(CW\*(C`GB2312\*(C'\fR. |
| 634 | .PP |
| 635 | .Vb 1 |
| 636 | \& KS_C_5601-1987 |
| 637 | .Ve |
| 638 | .PP |
| 639 | is heavily misused. |
| 640 | See \*(L"Microsoft\-related naming mess\*(R" for details. |
| 641 | .PP |
| 642 | \&\f(CW\*(C`KS_C_5601\-1987\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`ksc5601\-raw\*(C'\fR |
| 643 | with Encode. See Encode::KR for details. |
| 644 | .PP |
| 645 | .Vb 1 |
| 646 | \& UTF-16 UTF-16BE UTF-16LE |
| 647 | .Ve |
| 648 | .PP |
| 649 | are IANA-registered \f(CW\*(C`charset\*(C'\fRs. See [\s-1RFC\s0 2781] for details. |
| 650 | Jungshik Shin reports that \s-1UTF\-16\s0 with a \s-1BOM\s0 is well accepted |
| 651 | by \s-1MS\s0 \s-1IE\s0 5/6 and \s-1NS\s0 4/6. Beware however that |
| 652 | .IP "\(bu" 4 |
| 653 | \&\f(CW\*(C`UTF\-16\*(C'\fR support in any software you're going to be |
| 654 | using/interoperating with has probably been less tested |
| 655 | than \f(CW\*(C`UTF\-8\*(C'\fR support |
| 656 | .IP "\(bu" 4 |
| 657 | \&\f(CW\*(C`UTF\-8\*(C'\fR coded data seamlessly passes traditional |
| 658 | command piping (\f(CW\*(C`cat\*(C'\fR, \f(CW\*(C`more\*(C'\fR, etc.) while \f(CW\*(C`UTF\-16\*(C'\fR coded |
| 659 | data is likely to cause confusion (with its zero bytes, |
| 660 | for example) |
| 661 | .IP "\(bu" 4 |
| 662 | it is beyond the power of words to describe the way \s-1HTML\s0 browsers |
| 663 | encode non\-\f(CW\*(C`ASCII\*(C'\fR form data. To get a general impression, visit |
| 664 | <http://ppewww.ph.gla.ac.uk/~flavell/charset/form\-i18n.html>. |
| 665 | While encoding of form data has stabilized for \f(CW\*(C`UTF\-8\*(C'\fR encoded pages |
| 666 | (at least \s-1IE\s0 5/6, \s-1NS\s0 6, and Opera 6 behave consistently), be sure to |
| 667 | expect fun (and cross-browser discrepancies) with \f(CW\*(C`UTF\-16\*(C'\fR encoded |
| 668 | pages! |
| 669 | .PP |
| 670 | The rule of thumb is to use \f(CW\*(C`UTF\-8\*(C'\fR unless you know what |
| 671 | you're doing and unless you really benefit from using \f(CW\*(C`UTF\-16\*(C'\fR. |
| 672 | .PP |
| 673 | .Vb 5 |
| 674 | \& ISO-IR-165 [RFC1345] |
| 675 | \& VISCII |
| 676 | \& GB 12345 |
| 677 | \& GB 18030 (**) (see links below) |
| 678 | \& EUC-TW (**) |
| 679 | .Ve |
| 680 | .PP |
| 681 | are totally valid encodings but not registered at \s-1IANA\s0. |
| 682 | The names under which they are listed here are probably the |
| 683 | most widely-known names for these encodings and are recommended |
| 684 | names. |
| 685 | .PP |
| 686 | .Vb 1 |
| 687 | \& BIG5PLUS (**) |
| 688 | .Ve |
| 689 | .PP |
| 690 | is a proprietary name. |
| 691 | .Sh "Microsoft-related naming mess" |
| 692 | .IX Subsection "Microsoft-related naming mess" |
| 693 | Microsoft products misuse the following names: |
| 694 | .IP "\s-1KS_C_5601\-1987\s0" 4 |
| 695 | .IX Item "KS_C_5601-1987" |
| 696 | Microsoft extension to \f(CW\*(C`EUC\-KR\*(C'\fR. |
| 697 | .Sp |
| 698 | Proper names: \f(CW\*(C`CP949\*(C'\fR, \f(CW\*(C`UHC\*(C'\fR, \f(CW\*(C`x\-windows\-949\*(C'\fR (as used by Mozilla). |
| 699 | .Sp |
| 700 | See <http://lists.w3.org/Archives/Public/ietf\-charsets/2001AprJun/0033.html> |
| 701 | for details. |
| 702 | .Sp |
| 703 | Encode aliases \f(CW\*(C`KS_C_5601\-1987\*(C'\fR to \f(CW\*(C`cp949\*(C'\fR to reflect this common |
| 704 | misusage. \fIRaw\fR \f(CW\*(C`KS_C_5601\-1987\*(C'\fR encoding is available as |
| 705 | \&\f(CW\*(C`ksc5601\-raw\*(C'\fR. |
| 706 | .Sp |
| 707 | See Encode::KR for details. |
| 708 | .IP "\s-1GB2312\s0" 4 |
| 709 | .IX Item "GB2312" |
| 710 | Microsoft extension to \f(CW\*(C`EUC\-CN\*(C'\fR. |
| 711 | .Sp |
| 712 | Proper names: \f(CW\*(C`CP936\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR. |
| 713 | .Sp |
| 714 | \&\f(CW\*(C`GB2312\*(C'\fR has been registered in the \f(CW\*(C`EUC\-CN\*(C'\fR meaning at |
| 715 | \&\s-1IANA\s0. This has partially repaired the situation: Microsoft's |
| 716 | \&\f(CW\*(C`GB2312\*(C'\fR has become a superset of the official \f(CW\*(C`GB2312\*(C'\fR. |
| 717 | .Sp |
| 718 | Encode aliases \f(CW\*(C`GB2312\*(C'\fR to \f(CW\*(C`euc\-cn\*(C'\fR in full agreement with |
| 719 | \&\s-1IANA\s0 registration. \f(CW\*(C`cp936\*(C'\fR is supported separately. |
| 720 | \&\fIRaw\fR \f(CW\*(C`GB_2312\-80\*(C'\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR. |
| 721 | .Sp |
| 722 | See Encode::CN for details. |
| 723 | .IP "Big5" 4 |
| 724 | .IX Item "Big5" |
| 725 | Microsoft extension to \f(CW\*(C`Big5\*(C'\fR. |
| 726 | .Sp |
| 727 | Proper name: \f(CW\*(C`CP950\*(C'\fR. |
| 728 | .Sp |
| 729 | Encode separately supports \f(CW\*(C`Big5\*(C'\fR and \f(CW\*(C`cp950\*(C'\fR. |
| 730 | .IP "Shift_JIS" 4 |
| 731 | .IX Item "Shift_JIS" |
| 732 | Microsoft's understanding of \f(CW\*(C`Shift_JIS\*(C'\fR. |
| 733 | .Sp |
| 734 | \&\s-1JIS\s0 has not endorsed the full Microsoft standard, however. |
| 735 | The official \f(CW\*(C`Shift_JIS\*(C'\fR includes only \s-1JIS\s0 X 0201 and \s-1JIS\s0 X 0208 |
| 736 | character sets, while Microsoft has always used \f(CW\*(C`Shift_JIS\*(C'\fR |
| 737 | to encode a wider character repertoire. See \f(CW\*(C`IANA\*(C'\fR registration for |
| 738 | \&\f(CW\*(C`Windows\-31J\*(C'\fR. |
| 739 | .Sp |
| 740 | As a historical predecessor, Microsoft's variant |
| 741 | probably has a stronger claim to the name, though it may be objected |
| 742 | that Microsoft shouldn't have used \s-1JIS\s0 as part of the name |
| 743 | in the first place. |
| 744 | .Sp |
| 745 | Unambiguous name: \f(CW\*(C`CP932\*(C'\fR. \f(CW\*(C`IANA\*(C'\fR name (also used by Mozilla, and |
| 746 | provided as an alias by Encode): \f(CW\*(C`Windows\-31J\*(C'\fR. |
| 747 | .Sp |
| 748 | Encode separately supports \f(CW\*(C`Shift_JIS\*(C'\fR and \f(CW\*(C`cp932\*(C'\fR. |
| 749 | .SH "Glossary" |
| 750 | .IX Header "Glossary" |
| 751 | .IP "character repertoire" 4 |
| 752 | .IX Item "character repertoire" |
| 753 | A collection of unique characters. A \fIcharacter\fR set in the strictest |
| 754 | sense. At this stage, characters are not numbered. |
| 755 | .IP "coded character set (\s-1CCS\s0)" 4 |
| 756 | .IX Item "coded character set (CCS)" |
| 757 | A character set that is mapped in a way computers can use directly. |
| 758 | Many character encodings, including \s-1EUC\s0, fall in this category. |
| 759 | .IP "character encoding scheme (\s-1CES\s0)" 4 |
| 760 | .IX Item "character encoding scheme (CES)" |
| 761 | An algorithm to map a character set to a byte sequence. You don't |
| 762 | have to be able to tell which character set a given byte sequence |
| 763 | belongs to. 7\-bit \s-1ISO\-2022\s0 is a \s-1CES\s0 but it cannot be a \s-1CCS\s0. \s-1EUC\s0 is an |
| 764 | example of an encoding that is both a \s-1CCS\s0 and a \s-1CES\s0. |
| 765 | .IP "charset (in \s-1MIME\s0 context)" 4 |
| 766 | .IX Item "charset (in MIME context)" |
| 767 | has long been used in the meaning of \f(CW\*(C`encoding\*(C'\fR, \s-1CES\s0. |
| 768 | .Sp |
| 769 | While the word combination \f(CW\*(C`character set\*(C'\fR has lost this meaning |
| 770 | in \s-1MIME\s0 context since [\s-1RFC\s0 2130], the \f(CW\*(C`charset\*(C'\fR abbreviation has |
| 771 | retained it. This is how [\s-1RFC\s0 2277] and [\s-1RFC\s0 2278] bless \f(CW\*(C`charset\*(C'\fR: |
| 772 | .Sp |
| 773 | .Vb 7 |
| 774 | \& This document uses the term "charset" to mean a set of rules for |
| 775 | \& mapping from a sequence of octets to a sequence of characters, such |
| 776 | \& as the combination of a coded character set and a character encoding |
| 777 | \& scheme; this is also what is used as an identifier in MIME "charset=" |
| 778 | \& parameters, and registered in the IANA charset registry ... (Note |
| 779 | \& that this is NOT a term used by other standards bodies, such as ISO). |
| 780 | \& [RFC 2277] |
| 781 | .Ve |
| 782 | .IP "\s-1EUC\s0" 4 |
| 783 | .IX Item "EUC" |
| 784 | Extended Unix Code. See \s-1ISO\-2022\s0. |
| 785 | .IP "\s-1ISO\-2022\s0" 4 |
| 786 | .IX Item "ISO-2022" |
| 787 | A \s-1CES\s0 that was carefully designed to coexist with \s-1ASCII\s0. There are a 7 |
| 788 | bit version and an 8 bit version. |
| 789 | .Sp |
| 790 | The 7 bit version switches character set via escape sequence so it |
| 791 | cannot form a \s-1CCS\s0. Since this is more difficult to handle in programs |
| 792 | than the 8 bit version, the 7 bit version is not very popular except for |
| 793 | iso\-2022\-jp, the \fIde facto\fR standard \s-1CES\s0 for e\-mails. |
| 794 | .Sp |
| 795 | The 8 bit version can form a \s-1CCS\s0. \s-1EUC\s0 and \s-1ISO\-8859\s0 are two examples |
| 796 | thereof. Pre\-5.6 perl could use them as string literals. |
| 797 | .IP "\s-1UCS\s0" 4 |
| 798 | .IX Item "UCS" |
| 799 | Short for \fIUniversal Character Set\fR. When you say just \s-1UCS\s0, it means |
| 800 | \&\fIUnicode\fR. |
| 801 | .IP "\s-1UCS\-2\s0" 4 |
| 802 | .IX Item "UCS-2" |
| 803 | \&\s-1ISO/IEC\s0 10646 encoding form: Universal Character Set coded in two |
| 804 | octets. |
| 805 | .IP "Unicode" 4 |
| 806 | .IX Item "Unicode" |
| 807 | A character set that aims to include all character repertoires of the |
| 808 | world. Many character sets in various national as well as industrial |
| 809 | standards have become, in a way, just subsets of Unicode. |
| 810 | .IP "\s-1UTF\s0" 4 |
| 811 | .IX Item "UTF" |
| 812 | Short for \fIUnicode Transformation Format\fR. Determines how to map a |
| 813 | Unicode character into a byte sequence. |
| 814 | .IP "\s-1UTF\-16\s0" 4 |
| 815 | .IX Item "UTF-16" |
| 816 | A \s-1UTF\s0 in 16\-bit encoding. Can either be in big endian or little |
| 817 | endian. The big endian version is called \s-1UTF\-16BE\s0 (equal to \s-1UCS\-2\s0 + |
| 818 | surrogate support) and the little endian version is called \s-1UTF\-16LE\s0. |
| 819 | .SH "See Also" |
| 820 | .IX Header "See Also" |
| 821 | Encode, |
| 822 | Encode::Byte, |
| 823 | Encode::CN, Encode::JP, Encode::KR, Encode::TW, |
| 824 | Encode::EBCDIC, Encode::Symbol, |
| 825 | Encode::MIME::Header, Encode::Guess |
| 826 | .SH "References" |
| 827 | .IX Header "References" |
| 828 | .IP "\s-1ECMA\s0" 4 |
| 829 | .IX Item "ECMA" |
| 830 | European Computer Manufacturers Association |
| 831 | <http://www.ecma.ch> |
| 832 | .RS 4 |
| 833 | .ie n .IP "\s-1ECMA\-035\s0 (eq ""ISO\-2022"")" 4 |
| 834 | .el .IP "\s-1ECMA\-035\s0 (eq \f(CWISO\-2022\fR)" 4 |
| 835 | .IX Item "ECMA-035 (eq ISO-2022)" |
| 836 | <http://www.ecma.ch/ecma1/STAND/ECMA\-035.HTM> |
| 837 | .Sp |
| 838 | The specification of \s-1ISO\-2022\s0 is available from the link above. |
| 839 | .RE |
| 840 | .RS 4 |
| 841 | .RE |
| 842 | .IP "\s-1IANA\s0" 4 |
| 843 | .IX Item "IANA" |
| 844 | Internet Assigned Numbers Authority |
| 845 | <http://www.iana.org/> |
| 846 | .RS 4 |
| 847 | .IP "Assigned Charset Names by \s-1IANA\s0" 4 |
| 848 | .IX Item "Assigned Charset Names by IANA" |
| 849 | <http://www.iana.org/assignments/character\-sets> |
| 850 | .Sp |
| 851 | Most of the \f(CW\*(C`canonical names\*(C'\fR in Encode derive from this list |
| 852 | so you can directly apply the string you have extracted from \s-1MIME\s0 |
| 853 | header of mails and web pages. |
| 854 | .RE |
| 855 | .RS 4 |
| 856 | .RE |
| 857 | .IP "\s-1ISO\s0" 4 |
| 858 | .IX Item "ISO" |
| 859 | International Organization for Standardization |
| 860 | <http://www.iso.ch/> |
| 861 | .IP "\s-1RFC\s0" 4 |
| 862 | .IX Item "RFC" |
| 863 | Request For Comments \*(-- need I say more? |
| 864 | <http://www.rfc\-editor.org/>, <http://www.rfc.net/>, |
| 865 | <http://www.faqs.org/rfcs/> |
| 866 | .IP "\s-1UC\s0" 4 |
| 867 | .IX Item "UC" |
| 868 | Unicode Consortium |
| 869 | <http://www.unicode.org/> |
| 870 | .RS 4 |
| 871 | .IP "Unicode Glossary" 4 |
| 872 | .IX Item "Unicode Glossary" |
| 873 | <http://www.unicode.org/glossary/> |
| 874 | .Sp |
| 875 | The glossary of this document is based upon this site. |
| 876 | .RE |
| 877 | .RS 4 |
| 878 | .RE |
| 879 | .Sh "Other Notable Sites" |
| 880 | .IX Subsection "Other Notable Sites" |
| 881 | .IP "czyborra.com" 4 |
| 882 | .IX Item "czyborra.com" |
| 883 | <http://czyborra.com/> |
| 884 | .Sp |
| 885 | Contains a lot of useful information, especially gory details of \s-1ISO\s0 |
| 886 | vs. vendor mappings. |
| 887 | .IP "\s-1CJK\s0.inf" 4 |
| 888 | .IX Item "CJK.inf" |
| 889 | <http://www.oreilly.com/people/authors/lunde/cjk_inf.html> |
| 890 | .Sp |
| 891 | Somewhat obsolete (last update in 1996), but still useful. Also try |
| 892 | .Sp |
| 893 | <ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf> |
| 894 | .Sp |
| 895 | You will find brief info on \f(CW\*(C`EUC\-CN\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR and mostly on \f(CW\*(C`GB 18030\*(C'\fR. |
| 896 | .IP "Jungshik Shin's Hangul \s-1FAQ\s0" 4 |
| 897 | .IX Item "Jungshik Shin's Hangul FAQ" |
| 898 | <http://jshin.net/faq> |
| 899 | .Sp |
| 900 | And especially its subject 8. |
| 901 | .Sp |
| 902 | <http://jshin.net/faq/qa8.html> |
| 903 | .Sp |
| 904 | A comprehensive overview of the Korean (\f(CW\*(C`KS *\*(C'\fR) standards. |
| 905 | .ie n .IP "debian.org: ""Introduction to i18n""" 4 |
| 906 | .el .IP "debian.org: ``Introduction to i18n''" 4 |
| 907 | .IX Item "debian.org: Introduction to i18n" |
| 908 | A brief description for most of the mentioned \s-1CJK\s0 encodings is |
| 909 | contained in |
| 910 | <http://www.debian.org/doc/manuals/intro\-i18n/ch\-codes.en.html> |
| 911 | .Sh "Offline sources" |
| 912 | .IX Subsection "Offline sources" |
| 913 | .ie n .IP """CJKV Information Processing"" by Ken Lunde" 4 |
| 914 | .el .IP "\f(CWCJKV Information Processing\fR by Ken Lunde" 4 |
| 915 | .IX Item "CJKV Information Processing by Ken Lunde" |
| 916 | \&\s-1CJKV\s0 Information Processing |
| 917 | 1999 O'Reilly & Associates, \s-1ISBN\s0 : 1\-56592\-224\-7 |
| 918 | .Sp |
| 919 | The modern successor of \f(CW\*(C`CJK.inf\*(C'\fR. |
| 920 | .Sp |
| 921 | Features a comprehensive coverage of \s-1CJKV\s0 character sets and |
| 922 | encodings along with many other issues faced by anyone trying |
| 923 | to better support \s-1CJKV\s0 languages/scripts in all the areas of |
| 924 | information processing. |
| 925 | .Sp |
| 926 | To purchase this book, visit |
| 927 | <http://www.oreilly.com/catalog/cjkvinfo/> |
| 928 | or your favourite bookstore. |