Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / amd64 / man / man3 / Encode::Supported.3
CommitLineData
920dae64
AT
1.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Encode::Supported 3"
132.TH Encode::Supported 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
133.SH "NAME"
134Encode::Supported \-\- Encodings supported by Encode
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137.Sh "Encoding Names"
138.IX Subsection "Encoding Names"
139Encoding names are case insensitive. White space in names
140is ignored. In addition, an encoding may have aliases.
141Each encoding has one \*(L"canonical\*(R" name. The \*(L"canonical\*(R"
142name is chosen from the names of the encoding by picking
143the first in the following sequence (with a few exceptions).
144.IP "\(bu" 4
145The name used by the Perl community. That includes 'utf8' and 'ascii'.
146Unlike aliases, canonical names directly reach the method so such
147frequently used words like 'utf8' don't need to do alias lookups.
148.IP "\(bu" 4
149The \s-1MIME\s0 name as defined in \s-1IETF\s0 RFCs. This includes all \*(L"iso\-\*(R"s.
150.IP "\(bu" 4
151The name in the \s-1IANA\s0 registry.
152.IP "\(bu" 4
153The name used by the organization that defined it.
154.PP
155In case \fIde jure\fR canonical names differ from those of the Encode
156module, they are always aliased if the encoding is ever implemented. So you can
157safely tell if a given encoding is implemented or not just by passing
158the canonical name.
159.PP
160Because of all the alias issues, and because in the general case
161encodings have state, \*(L"Encode\*(R" uses an encoding object internally
162once an operation is in progress.
163.SH "Supported Encodings"
164.IX Header "Supported Encodings"
165As of Perl 5.8.0, at least the following encodings are recognized.
166Note that unless otherwise specified, they are all case insensitive
167(via alias) and all occurrences of spaces are replaced with '\-'.
168In other words, \*(L"\s-1ISO\s0 8859 1\*(R" and \*(L"iso\-8859\-1\*(R" are identical.
169.PP
170Encodings are categorized and implemented in several different modules
171but you don't have to \f(CW\*(C`use Encode::XX\*(C'\fR to make them available for
172most cases. Encode.pm will automatically load those modules on demand.
173.Sh "Built-in Encodings"
174.IX Subsection "Built-in Encodings"
175The following encodings are always available.
176.PP
177.Vb 8
178\& Canonical Aliases Comments & References
179\& ----------------------------------------------------------------
180\& ascii US-ascii ISO-646-US [ECMA]
181\& ascii-ctrl Special Encoding
182\& iso-8859-1 latin1 [ISO]
183\& null Special Encoding
184\& utf8 UTF-8 [RFC2279]
185\& ----------------------------------------------------------------
186.Ve
187.PP
188\&\fInull\fR and \fIascii-ctrl\fR are special. \*(L"null\*(R" fails for all characters
189so when you set fallback mode to \s-1PERLQQ\s0, \s-1HTMLCREF\s0 or \s-1XMLCREF\s0, \s-1ALL\s0
190\&\s-1CHARACTERS\s0 will fall back to character references. Ditto for
191\&\*(L"ascii\-ctrl\*(R" except for control characters. For fallback modes, see
192Encode.
193.Sh "Encode::Unicode \*(-- other Unicode encodings"
194.IX Subsection "Encode::Unicode other Unicode encodings"
195Unicode coding schemes other than native utf8 are supported by
196Encode::Unicode, which will be autoloaded on demand.
197.PP
198.Vb 11
199\& ----------------------------------------------------------------
200\& UCS-2BE UCS-2, iso-10646-1 [IANA, UC]
201\& UCS-2LE [UC]
202\& UTF-16 [UC]
203\& UTF-16BE [UC]
204\& UTF-16LE [UC]
205\& UTF-32 [UC]
206\& UTF-32BE UCS-4 [UC]
207\& UTF-32LE [UC]
208\& UTF-7 [RFC2152]
209\& ----------------------------------------------------------------
210.Ve
211.PP
212To find how (UCS\-2|UTF\-(16|32))(LE|BE)? differ from one another,
213see Encode::Unicode.
214.PP
215\&\s-1UTF\-7\s0 is a special encoding which \*(L"re\-encodes\*(R" \s-1UTF\-16BE\s0 into a 7\-bit
216encoding. It is implemented separately by Encode::Unicode::UTF7.
217.Sh "Encode::Byte \*(-- Extended \s-1ASCII\s0"
218.IX Subsection "Encode::Byte Extended ASCII"
219Encode::Byte implements most single-byte encodings except for
220Symbols and \s-1EBCDIC\s0. The following encodings are based on single-byte
221encodings implemented as extended \s-1ASCII\s0. Most of them map
222\&\ex80\-\exff (upper half) to non-ASCII characters.
223.IP "\s-1ISO\-8859\s0 and corresponding vendor mappings" 4
224.IX Item "ISO-8859 and corresponding vendor mappings"
225Since there are so many, they are presented in table format with
226languages and corresponding encoding names by vendors. Note that
227the table is sorted in order of \s-1ISO\-8859\s0 and the corresponding vendor
228mappings are slightly different from that of \s-1ISO\s0. See
229<http://czyborra.com/charsets/iso8859.html> for details.
230.Sp
231.Vb 32
232\& Lang/Regions ISO/Other Std. DOS Windows Macintosh Others
233\& ----------------------------------------------------------------
234\& N. America (ASCII) cp437 AdobeStandardEncoding
235\& cp863 (DOSCanadaF)
236\& W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
237\& hp-roman8
238\& cp860 (DOSPortuguese)
239\& Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
240\& MacCroatian
241\& MacRomanian
242\& MacRumanian
243\& Latin3[1] iso-8859-3
244\& Latin4[2] iso-8859-4
245\& Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic
246\& (See also next section) cp866 MacUkrainian
247\& Arabic iso-8859-6 cp864 cp1256 MacArabic
248\& cp1006 MacFarsi
249\& Greek iso-8859-7 cp737 cp1253 MacGreek
250\& cp869 (DOSGreek2)
251\& Hebrew iso-8859-8 cp862 cp1255 MacHebrew
252\& Turkish iso-8859-9 cp857 cp1254 MacTurkish
253\& Nordics iso-8859-10 cp865
254\& cp861 MacIcelandic
255\& MacSami
256\& Thai iso-8859-11[3] cp874 MacThai
257\& (iso-8859-12 is nonexistent. Reserved for Indics?)
258\& Baltics iso-8859-13 cp775 cp1257
259\& Celtics iso-8859-14
260\& Latin9 [4] iso-8859-15
261\& Latin10 iso-8859-16
262\& Vietnamese viscii cp1258 MacVietnamese
263\& ----------------------------------------------------------------
264.Ve
265.Sp
266.Vb 5
267\& [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.
268\& [2] Baltics. Now on 8859-10, except for Latvian.
269\& [3] TIS 620 + Non-Breaking Space (0xA0 / U+00A0)
270\& [4] Nicknamed Latin0; the Euro sign as well as French and Finnish
271\& letters that are missing from 8859-1 were added.
272.Ve
273.Sp
274All cp* are also available as ibm\-*, ms\-*, and windows\-* . See also
275<http://czyborra.com/charsets/codepages.html>.
276.Sp
277Macintosh encodings don't seem to be registered in such entities as
278\&\s-1IANA\s0. \*(L"Canonical\*(R" names in Encode are based upon Apple's Tech Note
2791150. See <http://developer.apple.com/technotes/tn/tn1150.html>
280for details.
281.IP "\s-1KOI8\s0 \- De Facto Standard for the Cyrillic world" 4
282.IX Item "KOI8 - De Facto Standard for the Cyrillic world"
283Though \s-1ISO\-8859\s0 does have \s-1ISO\-8859\-5\s0, the \s-1KOI8\s0 series is far more
284popular in the Net. Encode comes with the following \s-1KOI\s0 charsets.
285For gory details, see <http://czyborra.com/charsets/cyrillic.html>
286.Sp
287.Vb 5
288\& ----------------------------------------------------------------
289\& koi8-f
290\& koi8-r cp878 [RFC1489]
291\& koi8-u [RFC2319]
292\& ----------------------------------------------------------------
293.Ve
294.IP "gsm0338 \- Hentai Latin 1" 4
295.IX Item "gsm0338 - Hentai Latin 1"
296\&\s-1GSM0338\s0 is for \s-1GSM\s0 handsets. Though it shares alphanumerals with
297\&\s-1ASCII\s0, control character ranges and other parts are mapped very
298differently, mainly to store Greek characters. There are also escape
299sequences (starting with 0x1B) to cover e.g. the Euro sign. Some
300special cases like a trailing 0x00 byte or a lone 0x1B byte are not
301well-defined and \fIdecode()\fR will return an empty string for them.
302One possible workaround is
303.Sp
304.Vb 3
305\& $gsm =~ s/\ex00\ez/\ex00\ex00/;
306\& $uni = decode("gsm0338", $gsm);
307\& $uni .= "\exA0" if $gsm =~ /\ex1B\ez/;
308.Ve
309.Sp
310Note that the Encode implementation of \s-1GSM0338\s0 does not implement the
311reuse of Latin capital letters as Greek capital letters (for example,
312the 0x5A is U+005A (\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 Z), not U+0396 (\s-1GREEK\s0 \s-1CAPITAL\s0
313\&\s-1LETTER\s0 \s-1ZETA\s0).
314.Sp
315The \s-1GSM0338\s0 is also covered in Encode::Byte even though it is not
316an \*(L"extended \s-1ASCII\s0\*(R" encoding.
317.Sh "\s-1CJK:\s0 Chinese, Japanese, Korean (Multibyte)"
318.IX Subsection "CJK: Chinese, Japanese, Korean (Multibyte)"
319Note that Vietnamese is listed above. Also read \*(L"Encoding vs Charset\*(R"
320below. Also note that these are implemented in distinct modules by
321countries, due to the size concerns (simplified Chinese is mapped
322to '\s-1CN\s0', continental China, while traditional Chinese is mapped to
323\&'\s-1TW\s0', Taiwan). Please refer to their respective documentation pages.
324.IP "Encode::CN \*(-- Continental China" 4
325.IX Item "Encode::CN Continental China"
326.Vb 9
327\& Standard DOS/Win Macintosh Comment/Reference
328\& ----------------------------------------------------------------
329\& euc-cn [1] MacChineseSimp
330\& (gbk) cp936 [2]
331\& gb12345-raw { GB12345 without CES }
332\& gb2312-raw { GB2312 without CES }
333\& hz
334\& iso-ir-165
335\& ----------------------------------------------------------------
336.Ve
337.Sp
338.Vb 2
339\& [1] GB2312 is aliased to this. See L<Microsoft-related naming mess>
340\& [2] gbk is aliased to this. See L<Microsoft-related naming mess>
341.Ve
342.IP "Encode::JP \*(-- Japan" 4
343.IX Item "Encode::JP Japan"
344.Vb 11
345\& Standard DOS/Win Macintosh Comment/Reference
346\& ----------------------------------------------------------------
347\& euc-jp
348\& shiftjis cp932 macJapanese
349\& 7bit-jis
350\& iso-2022-jp [RFC1468]
351\& iso-2022-jp-1 [RFC2237]
352\& jis0201-raw { JIS X 0201 (roman + halfwidth kana) without CES }
353\& jis0208-raw { JIS X 0208 (Kanji + fullwidth kana) without CES }
354\& jis0212-raw { JIS X 0212 (Extended Kanji) without CES }
355\& ----------------------------------------------------------------
356.Ve
357.IP "Encode::KR \*(-- Korea" 4
358.IX Item "Encode::KR Korea"
359.Vb 8
360\& Standard DOS/Win Macintosh Comment/Reference
361\& ----------------------------------------------------------------
362\& euc-kr MacKorean [RFC1557]
363\& cp949 [1]
364\& iso-2022-kr [RFC1557]
365\& johab [KS X 1001:1998, Annex 3]
366\& ksc5601-raw { KSC5601 without CES }
367\& ----------------------------------------------------------------
368.Ve
369.Sp
370.Vb 2
371\& [1] ks_c_5601-1987, (x-)?windows-949, and uhc are aliased to this.
372\& See below.
373.Ve
374.IP "Encode::TW \*(-- Taiwan" 4
375.IX Item "Encode::TW Taiwan"
376.Vb 5
377\& Standard DOS/Win Macintosh Comment/Reference
378\& ----------------------------------------------------------------
379\& big5-eten cp950 MacChineseTrad {big5 aliased to big5-eten}
380\& big5-hkscs
381\& ----------------------------------------------------------------
382.Ve
383.IP "Encode::HanExtra \*(-- More Chinese via \s-1CPAN\s0" 4
384.IX Item "Encode::HanExtra More Chinese via CPAN"
385Due to the size concerns, additional Chinese encodings below are
386distributed separately on \s-1CPAN\s0, under the name Encode::HanExtra.
387.Sp
388.Vb 8
389\& Standard DOS/Win Macintosh Comment/Reference
390\& ----------------------------------------------------------------
391\& big5ext CMEX's Big5e Extension
392\& big5plus CMEX's Big5+ Extension
393\& cccii Chinese Character Code for Information Interchange
394\& euc-tw EUC (Extended Unix Character)
395\& gb18030 GBK with Traditional Characters
396\& ----------------------------------------------------------------
397.Ve
398.IP "Encode::JIS2K \*(-- \s-1JIS\s0 X 0213 encodings via \s-1CPAN\s0" 4
399.IX Item "Encode::JIS2K JIS X 0213 encodings via CPAN"
400Due to size concerns, additional Japanese encodings below are
401distributed separately on \s-1CPAN\s0, under the name Encode::JIS2K.
402.Sp
403.Vb 8
404\& Standard DOS/Win Macintosh Comment/Reference
405\& ----------------------------------------------------------------
406\& euc-jisx0213
407\& shiftjisx0213
408\& iso-2022-jp-3
409\& jis0213-1-raw
410\& jis0213-2-raw
411\& ----------------------------------------------------------------
412.Ve
413.Sh "Miscellaneous encodings"
414.IX Subsection "Miscellaneous encodings"
415.IP "Encode::EBCDIC" 4
416.IX Item "Encode::EBCDIC"
417See perlebcdic for details.
418.Sp
419.Vb 8
420\& ----------------------------------------------------------------
421\& cp37
422\& cp500
423\& cp875
424\& cp1026
425\& cp1047
426\& posix-bc
427\& ----------------------------------------------------------------
428.Ve
429.IP "Encode::Symbols" 4
430.IX Item "Encode::Symbols"
431For symbols and dingbats.
432.Sp
433.Vb 7
434\& ----------------------------------------------------------------
435\& symbol
436\& dingbats
437\& MacDingbats
438\& AdobeZdingbat
439\& AdobeSymbol
440\& ----------------------------------------------------------------
441.Ve
442.IP "Encode::MIME::Header" 4
443.IX Item "Encode::MIME::Header"
444Strictly speaking, \s-1MIME\s0 header encoding documented in \s-1RFC\s0 2047 is more
445of encapsulation than encoding. However, their support in the modern
446world is imperative so they are supported.
447.Sp
448.Vb 5
449\& ----------------------------------------------------------------
450\& MIME-Header [RFC2047]
451\& MIME-B [RFC2047]
452\& MIME-Q [RFC2047]
453\& ----------------------------------------------------------------
454.Ve
455.IP "Encode::Guess" 4
456.IX Item "Encode::Guess"
457This one is not the name of an encoding but a utility that lets you pick
458the most appropriate encoding for your data out of the given \fIsuspects\fR. See
459Encode::Guess for details.
460.SH "Unsupported encodings"
461.IX Header "Unsupported encodings"
462The following encodings are not supported as yet; some because they
463are rarely used, some because of technical difficulties. They may
464be supported by external modules via \s-1CPAN\s0 in the future, however.
465.IP "\s-1ISO\-2022\-JP\-2\s0 [\s-1RFC1554\s0]" 4
466.IX Item "ISO-2022-JP-2 [RFC1554]"
467Not very popular yet. Needs Unicode Database or equivalent to
468implement \fIencode()\fR (because it includes \s-1JIS\s0 X 0208/0212, \s-1KSC5601\s0, and
469\&\s-1GB2312\s0 simultaneously, whose code points in Unicode overlap. So you
470need to lookup the database to determine to what character set a given
471Unicode character should belong).
472.IP "\s-1ISO\-2022\-CN\s0 [\s-1RFC1922\s0]" 4
473.IX Item "ISO-2022-CN [RFC1922]"
474Not very popular. Needs \s-1CNS\s0 11643\-1 and \-2 which are not available in
475this module. \s-1CNS\s0 11643 is supported (via euc\-tw) in Encode::HanExtra.
476Autrijus Tang may add support for this encoding in his module in future.
477.IP "Various HP-UX encodings" 4
478.IX Item "Various HP-UX encodings"
479The following are unsupported due to the lack of mapping data.
480.Sp
481.Vb 2
482\& '8' - arabic8, greek8, hebrew8, kana8, thai8, and turkish8
483\& '15' - japanese15, korean15, and roi15
484.Ve
485.IP "Cyrillic encoding \s-1ISO\-IR\-111\s0" 4
486.IX Item "Cyrillic encoding ISO-IR-111"
487Anton Tagunov doubts its usefulness.
488.IP "\s-1ISO\-8859\-8\-1\s0 [Hebrew]" 4
489.IX Item "ISO-8859-8-1 [Hebrew]"
490None of the Encode team knows Hebrew enough (\s-1ISO\-8859\-8\s0, cp1255 and
491MacHebrew are supported because and just because there were mappings
492available at <http://www.unicode.org/>). Contributions welcome.
493.IP "\s-1ISIRI\s0 3342, Iran System, \s-1ISIRI\s0 2900 [Farsi]" 4
494.IX Item "ISIRI 3342, Iran System, ISIRI 2900 [Farsi]"
495Ditto.
496.IP "Thai encoding \s-1TCVN\s0" 4
497.IX Item "Thai encoding TCVN"
498Ditto.
499.IP "Vietnamese encodings \s-1VPS\s0" 4
500.IX Item "Vietnamese encodings VPS"
501Though Jungshik Shin has reported that Mozilla supports this encoding,
502it was too late before 5.8.0 for us to add it. In the future, it
503may be available via a separate module. See
504<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf>
505and
506<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut>
507if you are interested in helping us.
508.IP "Various Mac encodings" 4
509.IX Item "Various Mac encodings"
510The following are unsupported due to the lack of mapping data.
511.Sp
512.Vb 5
513\& MacArmenian, MacBengali, MacBurmese, MacEthiopic
514\& MacExtArabic, MacGeorgian, MacKannada, MacKhmer
515\& MacLaotian, MacMalayalam, MacMongolian, MacOriya
516\& MacSinhalese, MacTamil, MacTelugu, MacTibetan
517\& MacVietnamese
518.Ve
519.Sp
520The rest which are already available are based upon the vendor mappings
521at <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> .
522.IP "(Mac) Indic encodings" 4
523.IX Item "(Mac) Indic encodings"
524The maps for the following are available at <http://www.unicode.org/>
525but remain unsupported because those encodings need an algorithmic
526approach, currently unsupported by \fIenc2xs\fR:
527.Sp
528.Vb 3
529\& MacDevanagari
530\& MacGurmukhi
531\& MacGujarati
532.Ve
533.Sp
534For details, please see \f(CW\*(C`Unicode mapping issues and notes:\*(C'\fR at
535<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT> .
536.Sp
537I believe this issue is prevalent not only for Mac Indics but also in
538other Indic encodings, but the above were the only Indic encodings
539maps that I could find at <http://www.unicode.org/> .
540.SH "Encoding vs. Charset \*(-- terminology"
541.IX Header "Encoding vs. Charset terminology"
542We are used to using the term (character) \fIencoding\fR and \fIcharacter
543set\fR interchangeably. But just as confusing the terms byte and
544character is dangerous and the terms should be differentiated when
545needed, we need to differentiate \fIencoding\fR and \fIcharacter set\fR.
546.PP
547To understand that, here is a description of how we make computers
548grok our characters.
549.IP "\(bu" 4
550First we start with which characters to include. We call this
551collection of characters \fIcharacter repertoire\fR.
552.IP "\(bu" 4
553Then we have to give each character a unique \s-1ID\s0 so your computer can
554tell the difference between 'a' and 'A'. This itemized character
555repertoire is now a \fIcharacter set\fR.
556.IP "\(bu" 4
557If your computer can grok the character set without further
558processing, you can go ahead and use it. This is called a \fIcoded
559character set\fR (\s-1CCS\s0) or \fIraw character encoding\fR. \s-1ASCII\s0 is used this
560way for most cases.
561.IP "\(bu" 4
562But in many cases, especially multi-byte \s-1CJK\s0 encodings, you have to
563tweak a little more. Your network connection may not accept any data
564with the Most Significant Bit set, and your computer may not be able to
565tell if a given byte is a whole character or just half of it. So you
566have to \fIencode\fR the character set to use it.
567.Sp
568A \fIcharacter encoding scheme\fR (\s-1CES\s0) determines how to encode a given
569character set, or a set of multiple character sets. 7bit \s-1ISO\-2022\s0 is
570an example of a \s-1CES\s0. You switch between character sets via \fIescape
571sequences\fR.
572.PP
573Technically, or mathematically, speaking, a character set encoded in
574such a \s-1CES\s0 that maps character by character may form a \s-1CCS\s0. \s-1EUC\s0 is such
575an example. The \s-1CES\s0 of \s-1EUC\s0 is as follows:
576.IP "\(bu" 4
577Map \s-1ASCII\s0 unchanged.
578.IP "\(bu" 4
579Map such a character set that consists of 94 or 96 powered by N
580members by adding 0x80 to each byte.
581.IP "\(bu" 4
582You can also use 0x8e and 0x8f to indicate that the following sequence of
583characters belongs to yet another character set. To each following byte
584is added the value 0x80.
585.PP
586By carefully looking at the encoded byte sequence, you can find that the
587byte sequence forms a unique number. In that sense, \s-1EUC\s0 is a \s-1CCS\s0
588generated by a \s-1CES\s0 above from up to four \s-1CCS\s0 (complicated?). \s-1UTF\-8\s0
589falls into this category. See \*(L"\s-1UTF\-8\s0\*(R" in perlunicode to find out how
590\&\s-1UTF\-8\s0 maps Unicode to a byte sequence.
591.PP
592You may also have found out by now why 7bit \s-1ISO\-2022\s0 cannot comprise
593a \s-1CCS\s0. If you look at a byte sequence \ex21\ex21, you can't tell if
594it is two !'s or \s-1IDEOGRAPHIC\s0 \s-1SPACE\s0. \s-1EUC\s0 maps the latter to \exA1\exA1
595so you have no trouble differentiating between \*(L"!!\*(R" and \*(L"\ \*(R".
596.SH "Encoding Classification (by Anton Tagunov and Dan Kogai)"
597.IX Header "Encoding Classification (by Anton Tagunov and Dan Kogai)"
598This section tries to classify the supported encodings by their
599applicability for information exchange over the Internet and to
600choose the most suitable aliases to name them in the context of
601such communication.
602.IP "\(bu" 4
603To (en|de)code encodings marked by \f(CW\*(C`(**)\*(C'\fR, you need
604\&\f(CW\*(C`Encode::HanExtra\*(C'\fR, available from \s-1CPAN\s0.
605.PP
606Encoding names
607.PP
608.Vb 3
609\& US-ASCII UTF-8 ISO-8859-* KOI8-R
610\& Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
611\& EUC-KR Big5 GB2312
612.Ve
613.PP
614are registered with \s-1IANA\s0 as preferred \s-1MIME\s0 names and may
615be used over the Internet.
616.PP
617\&\f(CW\*(C`Shift_JIS\*(C'\fR has been officialized by \s-1JIS\s0 X 0208:1997.
618\&\*(L"Microsoft\-related naming mess\*(R" gives details.
619.PP
620\&\f(CW\*(C`GB2312\*(C'\fR is the \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR.
621See \*(L"Microsoft\-related naming mess\*(R" for details.
622.PP
623\&\f(CW\*(C`GB_2312\-80\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR
624with Encode. See Encode::CN for details.
625.PP
626.Vb 2
627\& EUC-CN
628\& KOI8-U [RFC2319]
629.Ve
630.PP
631have not been registered with \s-1IANA\s0 (as of March 2002) but
632seem to be supported by major web browsers.
633The \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR is \f(CW\*(C`GB2312\*(C'\fR.
634.PP
635.Vb 1
636\& KS_C_5601-1987
637.Ve
638.PP
639is heavily misused.
640See \*(L"Microsoft\-related naming mess\*(R" for details.
641.PP
642\&\f(CW\*(C`KS_C_5601\-1987\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`ksc5601\-raw\*(C'\fR
643with Encode. See Encode::KR for details.
644.PP
645.Vb 1
646\& UTF-16 UTF-16BE UTF-16LE
647.Ve
648.PP
649are IANA-registered \f(CW\*(C`charset\*(C'\fRs. See [\s-1RFC\s0 2781] for details.
650Jungshik Shin reports that \s-1UTF\-16\s0 with a \s-1BOM\s0 is well accepted
651by \s-1MS\s0 \s-1IE\s0 5/6 and \s-1NS\s0 4/6. Beware however that
652.IP "\(bu" 4
653\&\f(CW\*(C`UTF\-16\*(C'\fR support in any software you're going to be
654using/interoperating with has probably been less tested
655than \f(CW\*(C`UTF\-8\*(C'\fR support
656.IP "\(bu" 4
657\&\f(CW\*(C`UTF\-8\*(C'\fR coded data seamlessly passes traditional
658command piping (\f(CW\*(C`cat\*(C'\fR, \f(CW\*(C`more\*(C'\fR, etc.) while \f(CW\*(C`UTF\-16\*(C'\fR coded
659data is likely to cause confusion (with its zero bytes,
660for example)
661.IP "\(bu" 4
662it is beyond the power of words to describe the way \s-1HTML\s0 browsers
663encode non\-\f(CW\*(C`ASCII\*(C'\fR form data. To get a general impression, visit
664<http://ppewww.ph.gla.ac.uk/~flavell/charset/form\-i18n.html>.
665While encoding of form data has stabilized for \f(CW\*(C`UTF\-8\*(C'\fR encoded pages
666(at least \s-1IE\s0 5/6, \s-1NS\s0 6, and Opera 6 behave consistently), be sure to
667expect fun (and cross-browser discrepancies) with \f(CW\*(C`UTF\-16\*(C'\fR encoded
668pages!
669.PP
670The rule of thumb is to use \f(CW\*(C`UTF\-8\*(C'\fR unless you know what
671you're doing and unless you really benefit from using \f(CW\*(C`UTF\-16\*(C'\fR.
672.PP
673.Vb 5
674\& ISO-IR-165 [RFC1345]
675\& VISCII
676\& GB 12345
677\& GB 18030 (**) (see links below)
678\& EUC-TW (**)
679.Ve
680.PP
681are totally valid encodings but not registered at \s-1IANA\s0.
682The names under which they are listed here are probably the
683most widely-known names for these encodings and are recommended
684names.
685.PP
686.Vb 1
687\& BIG5PLUS (**)
688.Ve
689.PP
690is a proprietary name.
691.Sh "Microsoft-related naming mess"
692.IX Subsection "Microsoft-related naming mess"
693Microsoft products misuse the following names:
694.IP "\s-1KS_C_5601\-1987\s0" 4
695.IX Item "KS_C_5601-1987"
696Microsoft extension to \f(CW\*(C`EUC\-KR\*(C'\fR.
697.Sp
698Proper names: \f(CW\*(C`CP949\*(C'\fR, \f(CW\*(C`UHC\*(C'\fR, \f(CW\*(C`x\-windows\-949\*(C'\fR (as used by Mozilla).
699.Sp
700See <http://lists.w3.org/Archives/Public/ietf\-charsets/2001AprJun/0033.html>
701for details.
702.Sp
703Encode aliases \f(CW\*(C`KS_C_5601\-1987\*(C'\fR to \f(CW\*(C`cp949\*(C'\fR to reflect this common
704misusage. \fIRaw\fR \f(CW\*(C`KS_C_5601\-1987\*(C'\fR encoding is available as
705\&\f(CW\*(C`ksc5601\-raw\*(C'\fR.
706.Sp
707See Encode::KR for details.
708.IP "\s-1GB2312\s0" 4
709.IX Item "GB2312"
710Microsoft extension to \f(CW\*(C`EUC\-CN\*(C'\fR.
711.Sp
712Proper names: \f(CW\*(C`CP936\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR.
713.Sp
714\&\f(CW\*(C`GB2312\*(C'\fR has been registered in the \f(CW\*(C`EUC\-CN\*(C'\fR meaning at
715\&\s-1IANA\s0. This has partially repaired the situation: Microsoft's
716\&\f(CW\*(C`GB2312\*(C'\fR has become a superset of the official \f(CW\*(C`GB2312\*(C'\fR.
717.Sp
718Encode aliases \f(CW\*(C`GB2312\*(C'\fR to \f(CW\*(C`euc\-cn\*(C'\fR in full agreement with
719\&\s-1IANA\s0 registration. \f(CW\*(C`cp936\*(C'\fR is supported separately.
720\&\fIRaw\fR \f(CW\*(C`GB_2312\-80\*(C'\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR.
721.Sp
722See Encode::CN for details.
723.IP "Big5" 4
724.IX Item "Big5"
725Microsoft extension to \f(CW\*(C`Big5\*(C'\fR.
726.Sp
727Proper name: \f(CW\*(C`CP950\*(C'\fR.
728.Sp
729Encode separately supports \f(CW\*(C`Big5\*(C'\fR and \f(CW\*(C`cp950\*(C'\fR.
730.IP "Shift_JIS" 4
731.IX Item "Shift_JIS"
732Microsoft's understanding of \f(CW\*(C`Shift_JIS\*(C'\fR.
733.Sp
734\&\s-1JIS\s0 has not endorsed the full Microsoft standard however.
735The official \f(CW\*(C`Shift_JIS\*(C'\fR includes only \s-1JIS\s0 X 0201 and \s-1JIS\s0 X 0208
736character sets, while Microsoft has always used \f(CW\*(C`Shift_JIS\*(C'\fR
737to encode a wider character repertoire. See \f(CW\*(C`IANA\*(C'\fR registration for
738\&\f(CW\*(C`Windows\-31J\*(C'\fR.
739.Sp
740As a historical predecessor, Microsoft's variant
741probably has more rights for the name, though it may be objected
742that Microsoft shouldn't have used \s-1JIS\s0 as part of the name
743in the first place.
744.Sp
745Unambiguous name: \f(CW\*(C`CP932\*(C'\fR. \f(CW\*(C`IANA\*(C'\fR name (also used by Mozilla, and
746provided as an alias by Encode): \f(CW\*(C`Windows\-31J\*(C'\fR.
747.Sp
748Encode separately supports \f(CW\*(C`Shift_JIS\*(C'\fR and \f(CW\*(C`cp932\*(C'\fR.
749.SH "Glossary"
750.IX Header "Glossary"
751.IP "character repertoire" 4
752.IX Item "character repertoire"
753A collection of unique characters. A \fIcharacter\fR set in the strictest
754sense. At this stage, characters are not numbered.
755.IP "coded character set (\s-1CCS\s0)" 4
756.IX Item "coded character set (CCS)"
757A character set that is mapped in a way computers can use directly.
758Many character encodings, including \s-1EUC\s0, fall in this category.
759.IP "character encoding scheme (\s-1CES\s0)" 4
760.IX Item "character encoding scheme (CES)"
761An algorithm to map a character set to a byte sequence. You don't
762have to be able to tell which character set a given byte sequence
763belongs to. 7\-bit \s-1ISO\-2022\s0 is a \s-1CES\s0 but it cannot be a \s-1CCS\s0. \s-1EUC\s0 is an
764example of being both a \s-1CCS\s0 and \s-1CES\s0.
765.IP "charset (in \s-1MIME\s0 context)" 4
766.IX Item "charset (in MIME context)"
767has long been used in the meaning of \f(CW\*(C`encoding\*(C'\fR, \s-1CES\s0.
768.Sp
769While the word combination \f(CW\*(C`character set\*(C'\fR has lost this meaning
770in \s-1MIME\s0 context since [\s-1RFC\s0 2130], the \f(CW\*(C`charset\*(C'\fR abbreviation has
771retained it. This is how [\s-1RFC\s0 2277] and [\s-1RFC\s0 2278] bless \f(CW\*(C`charset\*(C'\fR:
772.Sp
773.Vb 7
774\& This document uses the term "charset" to mean a set of rules for
775\& mapping from a sequence of octets to a sequence of characters, such
776\& as the combination of a coded character set and a character encoding
777\& scheme; this is also what is used as an identifier in MIME "charset="
778\& parameters, and registered in the IANA charset registry ... (Note
779\& that this is NOT a term used by other standards bodies, such as ISO).
780\& [RFC 2277]
781.Ve
782.IP "\s-1EUC\s0" 4
783.IX Item "EUC"
784Extended Unix Code. See \s-1ISO\-2022\s0.
785.IP "\s-1ISO\-2022\s0" 4
786.IX Item "ISO-2022"
787A \s-1CES\s0 that was carefully designed to coexist with \s-1ASCII\s0. There are a 7
788bit version and an 8 bit version.
789.Sp
790The 7 bit version switches character set via escape sequence so it
791cannot form a \s-1CCS\s0. Since this is more difficult to handle in programs
792than the 8 bit version, the 7 bit version is not very popular except for
793iso\-2022\-jp, the \fIde facto\fR standard \s-1CES\s0 for e\-mails.
794.Sp
795The 8 bit version can form a \s-1CCS\s0. \s-1EUC\s0 and \s-1ISO\-8859\s0 are two examples
796thereof. Pre\-5.6 perl could use them as string literals.
797.IP "\s-1UCS\s0" 4
798.IX Item "UCS"
799Short for \fIUniversal Character Set\fR. When you say just \s-1UCS\s0, it means
800\&\fIUnicode\fR.
801.IP "\s-1UCS\-2\s0" 4
802.IX Item "UCS-2"
803\&\s-1ISO/IEC\s0 10646 encoding form: Universal Character Set coded in two
804octets.
805.IP "Unicode" 4
806.IX Item "Unicode"
807A character set that aims to include all character repertoires of the
808world. Many character sets in various national as well as industrial
809standards have become, in a way, just subsets of Unicode.
810.IP "\s-1UTF\s0" 4
811.IX Item "UTF"
812Short for \fIUnicode Transformation Format\fR. Determines how to map a
813Unicode character into a byte sequence.
814.IP "\s-1UTF\-16\s0" 4
815.IX Item "UTF-16"
816A \s-1UTF\s0 in 16\-bit encoding. Can either be in big endian or little
817endian. The big endian version is called \s-1UTF\-16BE\s0 (equal to \s-1UCS\-2\s0 +
818surrogate support) and the little endian version is called \s-1UTF\-16LE\s0.
819.SH "See Also"
820.IX Header "See Also"
821Encode,
822Encode::Byte,
823Encode::CN, Encode::JP, Encode::KR, Encode::TW,
824Encode::EBCDIC, Encode::Symbol,
825Encode::MIME::Header, Encode::Guess
826.SH "References"
827.IX Header "References"
828.IP "\s-1ECMA\s0" 4
829.IX Item "ECMA"
830European Computer Manufacturers Association
831<http://www.ecma.ch>
832.RS 4
833.ie n .IP "\s-1ECMA\-035\s0 (eq ""ISO\-2022"")" 4
834.el .IP "\s-1ECMA\-035\s0 (eq \f(CWISO\-2022\fR)" 4
835.IX Item "ECMA-035 (eq ISO-2022)"
836<http://www.ecma.ch/ecma1/STAND/ECMA\-035.HTM>
837.Sp
838The specification of \s-1ISO\-2022\s0 is available from the link above.
839.RE
840.RS 4
841.RE
842.IP "\s-1IANA\s0" 4
843.IX Item "IANA"
844Internet Assigned Numbers Authority
845<http://www.iana.org/>
846.RS 4
847.IP "Assigned Charset Names by \s-1IANA\s0" 4
848.IX Item "Assigned Charset Names by IANA"
849<http://www.iana.org/assignments/character\-sets>
850.Sp
851Most of the \f(CW\*(C`canonical names\*(C'\fR in Encode derive from this list
852so you can directly apply the string you have extracted from the \s-1MIME\s0
853headers of mails and web pages.
854.RE
855.RS 4
856.RE
857.IP "\s-1ISO\s0" 4
858.IX Item "ISO"
859International Organization for Standardization
860<http://www.iso.ch/>
861.IP "\s-1RFC\s0" 4
862.IX Item "RFC"
863Request For Comments \*(-- need I say more?
864<http://www.rfc\-editor.org/>, <http://www.rfc.net/>,
865<http://www.faqs.org/rfcs/>
866.IP "\s-1UC\s0" 4
867.IX Item "UC"
868Unicode Consortium
869<http://www.unicode.org/>
870.RS 4
871.IP "Unicode Glossary" 4
872.IX Item "Unicode Glossary"
873<http://www.unicode.org/glossary/>
874.Sp
875The glossary of this document is based upon this site.
876.RE
877.RS 4
878.RE
879.Sh "Other Notable Sites"
880.IX Subsection "Other Notable Sites"
881.IP "czyborra.com" 4
882.IX Item "czyborra.com"
883<http://czyborra.com/>
884.Sp
885Contains a lot of useful information, especially gory details of \s-1ISO\s0
886vs. vendor mappings.
887.IP "\s-1CJK\s0.inf" 4
888.IX Item "CJK.inf"
889<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
890.Sp
891Somewhat obsolete (last update in 1996), but still useful. Also try
892.Sp
893<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
894.Sp
895You will find brief info on \f(CW\*(C`EUC\-CN\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR and mostly on \f(CW\*(C`GB 18030\*(C'\fR.
896.IP "Jungshik Shin's Hangul \s-1FAQ\s0" 4
897.IX Item "Jungshik Shin's Hangul FAQ"
898<http://jshin.net/faq>
899.Sp
900And especially its subject 8.
901.Sp
902<http://jshin.net/faq/qa8.html>
903.Sp
904A comprehensive overview of the Korean (\f(CW\*(C`KS *\*(C'\fR) standards.
905.ie n .IP "debian.org: ""Introduction to i18n""" 4
906.el .IP "debian.org: ``Introduction to i18n''" 4
907.IX Item "debian.org: Introduction to i18n"
908A brief description for most of the mentioned \s-1CJK\s0 encodings is
909contained in
910<http://www.debian.org/doc/manuals/intro\-i18n/ch\-codes.en.html>
911.Sh "Offline sources"
912.IX Subsection "Offline sources"
913.ie n .IP """CJKV Information Processing"" by Ken Lunde" 4
914.el .IP "\f(CWCJKV Information Processing\fR by Ken Lunde" 4
915.IX Item "CJKV Information Processing by Ken Lunde"
916\&\s-1CJKV\s0 Information Processing
9171999 O'Reilly & Associates, \s-1ISBN\s0 : 1\-56592\-224\-7
918.Sp
919The modern successor of \f(CW\*(C`CJK.inf\*(C'\fR.
920.Sp
921Features comprehensive coverage of \s-1CJKV\s0 character sets and
922encodings along with many other issues faced by anyone trying
923to better support \s-1CJKV\s0 languages/scripts in all the areas of
924information processing.
925.Sp
926To purchase this book, visit
927<http://www.oreilly.com/catalog/cjkvinfo/>
928or your favourite bookstore.