Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Encode::Supported.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "Encode::Supported 3"
132.TH Encode::Supported 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134Encode::Supported \-\- Encodings supported by Encode
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137.Sh "Encoding Names"
138.IX Subsection "Encoding Names"
139Encoding names are case insensitive. White space in names
140is ignored. In addition, an encoding may have aliases.
141Each encoding has one \*(L"canonical\*(R" name. The \*(L"canonical\*(R"
142name is chosen from the names of the encoding by picking
143the first in the following sequence (with a few exceptions).
144.IP "\(bu" 4
145The name used by the Perl community. That includes 'utf8' and 'ascii'.
146Unlike aliases, canonical names directly reach the method so such
147frequently used words like 'utf8' don't need to do alias lookups.
148.IP "\(bu" 4
149The \s-1MIME\s0 name as defined in \s-1IETF\s0 RFCs. This includes all \*(L"iso\-\*(R"s.
150.IP "\(bu" 4
151The name in the \s-1IANA\s0 registry.
152.IP "\(bu" 4
153The name used by the organization that defined it.
154.PP
155In case \fIde jure\fR canonical names differ from those of the Encode
156module, they are always aliased if they are ever implemented. So you can
157safely tell if a given encoding is implemented or not just by passing
158the canonical name.
159.PP
160Because of all the alias issues, and because in the general case
161encodings have state, \*(L"Encode\*(R" uses an encoding object internally
162once an operation is in progress.
163.SH "Supported Encodings"
164.IX Header "Supported Encodings"
165As of Perl 5.8.0, at least the following encodings are recognized.
166Note that unless otherwise specified, they are all case insensitive
167(via alias) and all occurrences of spaces are replaced with '\-'.
168In other words, \*(L"\s-1ISO\s0 8859 1\*(R" and \*(L"iso\-8859\-1\*(R" are identical.
169.PP
170Encodings are categorized and implemented in several different modules
171but you don't have to \f(CW\*(C`use Encode::XX\*(C'\fR to make them available for
172most cases. Encode.pm will automatically load those modules on demand.
173.Sh "Built-in Encodings"
174.IX Subsection "Built-in Encodings"
175The following encodings are always available.
176.PP
177.Vb 8
178\& Canonical Aliases Comments & References
179\& ----------------------------------------------------------------
180\& ascii US-ascii ISO-646-US [ECMA]
181\& ascii-ctrl Special Encoding
182\& iso-8859-1 latin1 [ISO]
183\& null Special Encoding
184\& utf8 UTF-8 [RFC2279]
185\& ----------------------------------------------------------------
186.Ve
187.PP
188\&\fInull\fR and \fIascii-ctrl\fR are special. \*(L"null\*(R" fails for all characters
189so when you set fallback mode to \s-1PERLQQ\s0, \s-1HTMLCREF\s0 or \s-1XMLCREF\s0, \s-1ALL\s0
190\&\s-1CHARACTERS\s0 will fall back to character references. Ditto for
191\&\*(L"ascii\-ctrl\*(R" except for control characters. For fallback modes, see
192Encode.
193.Sh "Encode::Unicode \*(-- other Unicode encodings"
194.IX Subsection "Encode::Unicode other Unicode encodings"
195Unicode coding schemes other than native utf8 are supported by
196Encode::Unicode, which will be autoloaded on demand.
197.PP
198.Vb 10
199\& ----------------------------------------------------------------
200\& UCS-2BE UCS-2, iso-10646-1 [IANA, UC]
201\& UCS-2LE [UC]
202\& UTF-16 [UC]
203\& UTF-16BE [UC]
204\& UTF-16LE [UC]
205\& UTF-32 [UC]
206\& UTF-32BE UCS-4 [UC]
207\& UTF-32LE [UC]
208\& ----------------------------------------------------------------
209.Ve
210.PP
211To find how (UCS\-2|UTF\-(16|32))(LE|BE)? differ from one another,
212see Encode::Unicode.
213.Sh "Encode::Byte \*(-- Extended \s-1ASCII\s0"
214.IX Subsection "Encode::Byte Extended ASCII"
215Encode::Byte implements most single-byte encodings except for
216Symbols and \s-1EBCDIC\s0. The following encodings are based on single-byte
217encodings implemented as extended \s-1ASCII\s0. Most of them map
218\&\ex80\-\exff (upper half) to non-ASCII characters.
219.IP "\s-1ISO\-8859\s0 and corresponding vendor mappings" 4
220.IX Item "ISO-8859 and corresponding vendor mappings"
221Since there are so many, they are presented in table format with
222languages and corresponding encoding names by vendors. Note that
223the table is sorted in order of \s-1ISO\-8859\s0 and the corresponding vendor
224mappings are slightly different from that of \s-1ISO\s0. See
225<http://czyborra.com/charsets/iso8859.html> for details.
226.Sp
227.Vb 32
228\& Lang/Regions ISO/Other Std. DOS Windows Macintosh Others
229\& ----------------------------------------------------------------
230\& N. America (ASCII) cp437 AdobeStandardEncoding
231\& cp863 (DOSCanadaF)
232\& W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
233\& hp-roman8
234\& cp860 (DOSPortuguese)
235\& Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
236\& MacCroatian
237\& MacRomanian
238\& MacRumanian
239\& Latin3 [1] iso-8859-3
240\& Latin4 [2] iso-8859-4
241\& Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic
242\& (See also next section) cp866 MacUkrainian
243\& Arabic iso-8859-6 cp864 cp1256 MacArabic
244\& cp1006 MacFarsi
245\& Greek iso-8859-7 cp737 cp1253 MacGreek
246\& cp869 (DOSGreek2)
247\& Hebrew iso-8859-8 cp862 cp1255 MacHebrew
248\& Turkish iso-8859-9 cp857 cp1254 MacTurkish
249\& Nordics iso-8859-10 cp865
250\& cp861 MacIcelandic
251\& MacSami
252\& Thai iso-8859-11 [3] cp874 MacThai
253\& (iso-8859-12 is nonexistent. Reserved for Indics?)
254\& Baltics iso-8859-13 cp775 cp1257
255\& Celtics iso-8859-14
256\& Latin9 [4] iso-8859-15
257\& Latin10 iso-8859-16
258\& Vietnamese viscii cp1258 MacVietnamese
259\& ----------------------------------------------------------------
260.Ve
261.Sp
262.Vb 5
263\& [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.
264\& [2] Baltics. Now on 8859-10, except for Latvian.
265\& [3] Also known as TIS 620.
266\& [4] Nicknamed Latin0; the Euro sign as well as French and Finnish
267\& letters that are missing from 8859-1 were added.
268.Ve
269.Sp
270All cp* are also available as ibm\-*, ms\-*, and windows\-* . See also
271<http://czyborra.com/charsets/codepages.html>.
272.Sp
273Macintosh encodings don't seem to be registered in such entities as
274\&\s-1IANA\s0. \*(L"Canonical\*(R" names in Encode are based upon Apple's Tech Note
2751150. See <http://developer.apple.com/technotes/tn/tn1150.html>
276for details.
277.IP "\s-1KOI8\s0 \- De Facto Standard for the Cyrillic world" 4
278.IX Item "KOI8 - De Facto Standard for the Cyrillic world"
279Though \s-1ISO\-8859\s0 does have \s-1ISO\-8859\-5\s0, the \s-1KOI8\s0 series is far more
280popular on the Net. Encode comes with the following \s-1KOI\s0 charsets.
281For gory details, see <http://czyborra.com/charsets/cyrillic.html>.
282.Sp
283.Vb 5
284\& ----------------------------------------------------------------
285\& koi8-f
286\& koi8-r cp878 [RFC1489]
287\& koi8-u [RFC2319]
288\& ----------------------------------------------------------------
289.Ve
290.IP "gsm0338 \- Hentai Latin 1" 4
291.IX Item "gsm0338 - Hentai Latin 1"
292\&\s-1GSM0338\s0 is for \s-1GSM\s0 handsets. Though it shares alphanumerals with
293\&\s-1ASCII\s0, control character ranges and other parts are mapped very
294differently, presumably to store Greek and Cyrillic alphabets.
295This is also covered in Encode::Byte even though it is not an
296\&\*(L"extended \s-1ASCII\s0\*(R" encoding.
297.Sh "\s-1CJK:\s0 Chinese, Japanese, Korean (Multibyte)"
298.IX Subsection "CJK: Chinese, Japanese, Korean (Multibyte)"
299Note that Vietnamese is listed above. Also read \*(L"Encoding vs Charset\*(R"
300below. Also note that these are implemented in distinct modules by
301countries, due to size concerns (simplified Chinese is mapped
302to '\s-1CN\s0', continental China, while traditional Chinese is mapped to
303\&'\s-1TW\s0', Taiwan). Please refer to their respective documentation pages.
304.IP "Encode::CN \*(-- Continental China" 4
305.IX Item "Encode::CN Continental China"
306.Vb 9
307\& Standard DOS/Win Macintosh Comment/Reference
308\& ----------------------------------------------------------------
309\& euc-cn [1] MacChineseSimp
310\& (gbk) cp936 [2]
311\& gb12345-raw { GB12345 without CES }
312\& gb2312-raw { GB2312 without CES }
313\& hz
314\& iso-ir-165
315\& ----------------------------------------------------------------
316.Ve
317.Sp
318.Vb 2
319\& [1] GB2312 is aliased to this. See L<Microsoft-related naming mess>
320\& [2] gbk is aliased to this. See L<Microsoft-related naming mess>
321.Ve
322.IP "Encode::JP \*(-- Japan" 4
323.IX Item "Encode::JP Japan"
324.Vb 11
325\& Standard DOS/Win Macintosh Comment/Reference
326\& ----------------------------------------------------------------
327\& euc-jp
328\& shiftjis cp932 macJapanese
329\& 7bit-jis
330\& iso-2022-jp [RFC1468]
331\& iso-2022-jp-1 [RFC2237]
332\& jis0201-raw { JIS X 0201 (roman + halfwidth kana) without CES }
333\& jis0208-raw { JIS X 0208 (Kanji + fullwidth kana) without CES }
334\& jis0212-raw { JIS X 0212 (Extended Kanji) without CES }
335\& ----------------------------------------------------------------
336.Ve
337.IP "Encode::KR \*(-- Korea" 4
338.IX Item "Encode::KR Korea"
339.Vb 8
340\& Standard DOS/Win Macintosh Comment/Reference
341\& ----------------------------------------------------------------
342\& euc-kr MacKorean [RFC1557]
343\& cp949 [1]
344\& iso-2022-kr [RFC1557]
345\& johab [KS X 1001:1998, Annex 3]
346\& ksc5601-raw { KSC5601 without CES }
347\& ----------------------------------------------------------------
348.Ve
349.Sp
350.Vb 2
351\& [1] ks_c_5601-1987, (x-)?windows-949, and uhc are aliased to this.
352\& See below.
353.Ve
354.IP "Encode::TW \*(-- Taiwan" 4
355.IX Item "Encode::TW Taiwan"
356.Vb 5
357\& Standard DOS/Win Macintosh Comment/Reference
358\& ----------------------------------------------------------------
359\& big5-eten cp950 MacChineseTrad {big5 aliased to big5-eten}
360\& big5-hkscs
361\& ----------------------------------------------------------------
362.Ve
363.IP "Encode::HanExtra \*(-- More Chinese via \s-1CPAN\s0" 4
364.IX Item "Encode::HanExtra More Chinese via CPAN"
365Due to size concerns, additional Chinese encodings below are
366distributed separately on \s-1CPAN\s0, under the name Encode::HanExtra.
367.Sp
368.Vb 8
369\& Standard DOS/Win Macintosh Comment/Reference
370\& ----------------------------------------------------------------
371\& big5ext CMEX's Big5e Extension
372\& big5plus CMEX's Big5+ Extension
373\& cccii Chinese Character Code for Information Interchange
374\& euc-tw EUC (Extended Unix Character)
375\& gb18030 GBK with Traditional Characters
376\& ----------------------------------------------------------------
377.Ve
378.IP "Encode::JIS2K \*(-- \s-1JIS\s0 X 0213 encodings via \s-1CPAN\s0" 4
379.IX Item "Encode::JIS2K JIS X 0213 encodings via CPAN"
380Due to size concerns, additional Japanese encodings below are
381distributed separately on \s-1CPAN\s0, under the name Encode::JIS2K.
382.Sp
383.Vb 8
384\& Standard DOS/Win Macintosh Comment/Reference
385\& ----------------------------------------------------------------
386\& euc-jisx0213
387\& shiftjisx0213
388\& iso-2022-jp-3
389\& jis0213-1-raw
390\& jis0213-2-raw
391\& ----------------------------------------------------------------
392.Ve
393.Sh "Miscellaneous encodings"
394.IX Subsection "Miscellaneous encodings"
395.IP "Encode::EBCDIC" 4
396.IX Item "Encode::EBCDIC"
397See perlebcdic for details.
398.Sp
399.Vb 8
400\& ----------------------------------------------------------------
401\& cp37
402\& cp500
403\& cp875
404\& cp1026
405\& cp1047
406\& posix-bc
407\& ----------------------------------------------------------------
408.Ve
409.IP "Encode::Symbols" 4
410.IX Item "Encode::Symbols"
411For symbols and dingbats.
412.Sp
413.Vb 7
414\& ----------------------------------------------------------------
415\& symbol
416\& dingbats
417\& MacDingbats
418\& AdobeZdingbat
419\& AdobeSymbol
420\& ----------------------------------------------------------------
421.Ve
422.IP "Encode::MIME::Header" 4
423.IX Item "Encode::MIME::Header"
424Strictly speaking, \s-1MIME\s0 header encoding documented in \s-1RFC\s0 2047 is more
425of encapsulation than encoding. But included anyway.
426.Sp
427.Vb 5
428\& ----------------------------------------------------------------
429\& MIME-Header [RFC2047]
430\& MIME-B [RFC2047]
431\& MIME-Q [RFC2047]
432\& ----------------------------------------------------------------
433.Ve
434.IP "Encode::Guess" 4
435.IX Item "Encode::Guess"
436This one is not a name of encoding but a utility that lets you pick up
437the most appropriate encoding for a piece of data out of the given \fIsuspects\fR. See
438Encode::Guess for details.
439.SH "Unsupported encodings"
440.IX Header "Unsupported encodings"
441The following encodings are not supported as yet; some because they
442are rarely used, some because of technical difficulties. They may
443be supported by external modules via \s-1CPAN\s0 in the future, however.
444.IP "\s-1ISO\-2022\-JP\-2\s0 [\s-1RFC1554\s0]" 4
445.IX Item "ISO-2022-JP-2 [RFC1554]"
446Not very popular yet. Needs Unicode Database or equivalent to
447implement \fIencode()\fR (because it includes \s-1JIS\s0 X 0208/0212, \s-1KSC5601\s0, and
448\&\s-1GB2312\s0 simultaneously, whose code points in Unicode overlap. So you
449need to lookup the database to determine to what character set a given
450Unicode character should belong).
451.IP "\s-1ISO\-2022\-CN\s0 [\s-1RFC1922\s0]" 4
452.IX Item "ISO-2022-CN [RFC1922]"
453Not very popular. Needs \s-1CNS\s0 11643\-1 and \-2 which are not available in
454this module. \s-1CNS\s0 11643 is supported (via euc\-tw) in Encode::HanExtra.
455Autrijus Tang may add support for this encoding in his module in future.
456.IP "Various HP-UX encodings" 4
457.IX Item "Various HP-UX encodings"
458The following are unsupported due to the lack of mapping data.
459.Sp
460.Vb 2
461\& '8' - arabic8, greek8, hebrew8, kana8, thai8, and turkish8
462\& '15' - japanese15, korean15, and roi15
463.Ve
464.IP "Cyrillic encoding \s-1ISO\-IR\-111\s0" 4
465.IX Item "Cyrillic encoding ISO-IR-111"
466Anton Tagunov doubts its usefulness.
467.IP "\s-1ISO\-8859\-8\-1\s0 [Hebrew]" 4
468.IX Item "ISO-8859-8-1 [Hebrew]"
469None of the Encode team knows Hebrew well enough (\s-1ISO\-8859\-8\s0, cp1255 and
470MacHebrew are supported because and just because there were mappings
471available at <http://www.unicode.org/>). Contributions welcome.
472.IP "\s-1ISIRI\s0 3342, Iran System, \s-1ISIRI\s0 2900 [Farsi]" 4
473.IX Item "ISIRI 3342, Iran System, ISIRI 2900 [Farsi]"
474Ditto.
475.IP "Thai encoding \s-1TCVN\s0" 4
476.IX Item "Thai encoding TCVN"
477Ditto.
478.IP "Vietnamese encodings \s-1VPS\s0" 4
479.IX Item "Vietnamese encodings VPS"
480Though Jungshik Shin has reported that Mozilla supports this encoding,
481it was too late before 5.8.0 for us to add it. In the future, it
482may be available via a separate module. See
483<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf>
484and
485<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut>
486if you are interested in helping us.
487.IP "Various Mac encodings" 4
488.IX Item "Various Mac encodings"
489The following are unsupported due to the lack of mapping data.
490.Sp
491.Vb 5
492\& MacArmenian, MacBengali, MacBurmese, MacEthiopic
493\& MacExtArabic, MacGeorgian, MacKannada, MacKhmer
494\& MacLaotian, MacMalayalam, MacMongolian, MacOriya
495\& MacSinhalese, MacTamil, MacTelugu, MacTibetan
496\& MacVietnamese
497.Ve
498.Sp
499The rest which are already available are based upon the vendor mappings
500at <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> .
501.IP "(Mac) Indic encodings" 4
502.IX Item "(Mac) Indic encodings"
503The maps for the following are available at <http://www.unicode.org/>
504but remain unsupported because those encodings need an algorithmic
505approach, currently unsupported by \fIenc2xs\fR:
506.Sp
507.Vb 3
508\& MacDevanagari
509\& MacGurmukhi
510\& MacGujarati
511.Ve
512.Sp
513For details, please see \f(CW\*(C`Unicode mapping issues and notes:\*(C'\fR at
514<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT> .
515.Sp
516I believe this issue is prevalent not only for Mac Indics but also in
517other Indic encodings, but the above were the only Indic encodings
518maps that I could find at <http://www.unicode.org/> .
519.SH "Encoding vs. Charset \*(-- terminology"
520.IX Header "Encoding vs. Charset terminology"
521We are used to using the term (character) \fIencoding\fR and \fIcharacter
522set\fR interchangeably. But just as confusing the terms byte and
523character is dangerous and the terms should be differentiated when
524needed, we need to differentiate \fIencoding\fR and \fIcharacter set\fR.
525.PP
526To understand that, here is a description of how we make computers
527grok our characters.
528.IP "\(bu" 4
529First we start with which characters to include. We call this
530collection of characters \fIcharacter repertoire\fR.
531.IP "\(bu" 4
532Then we have to give each character a unique \s-1ID\s0 so your computer can
533tell the difference between 'a' and 'A'. This itemized character
534repertoire is now a \fIcharacter set\fR.
535.IP "\(bu" 4
536If your computer can grok the character set without further
537processing, you can go ahead and use it. This is called a \fIcoded
538character set\fR (\s-1CCS\s0) or \fIraw character encoding\fR. \s-1ASCII\s0 is used this
539way for most cases.
540.IP "\(bu" 4
541But in many cases, especially multi-byte \s-1CJK\s0 encodings, you have to
542tweak a little more. Your network connection may not accept any data
543with the Most Significant Bit set, and your computer may not be able to
544tell if a given byte is a whole character or just half of it. So you
545have to \fIencode\fR the character set to use it.
546.Sp
547A \fIcharacter encoding scheme\fR (\s-1CES\s0) determines how to encode a given
548character set, or a set of multiple character sets. 7bit \s-1ISO\-2022\s0 is
549an example of a \s-1CES\s0. You switch between character sets via \fIescape
550sequences\fR.
551.PP
552Technically, or mathematically, speaking, a character set encoded in
553such a \s-1CES\s0 that maps character by character may form a \s-1CCS\s0. \s-1EUC\s0 is such
554an example. The \s-1CES\s0 of \s-1EUC\s0 is as follows:
555.IP "\(bu" 4
556Map \s-1ASCII\s0 unchanged.
557.IP "\(bu" 4
558Map a character set that consists of 94 or 96 to the power of N
559members by adding 0x80 to each byte.
560.IP "\(bu" 4
561You can also use 0x8e and 0x8f to indicate that the following sequence of
562characters belongs to yet another character set. To each following byte
563is added the value 0x80.
564.PP
565By carefully looking at the encoded byte sequence, you can find that the
566byte sequence corresponds to a unique number. In that sense, \s-1EUC\s0 is a \s-1CCS\s0
567generated by a \s-1CES\s0 above from up to four \s-1CCS\s0 (complicated?). \s-1UTF\-8\s0
568falls into this category. See \*(L"\s-1UTF\-8\s0\*(R" in perlunicode to find out how
569\&\s-1UTF\-8\s0 maps Unicode to a byte sequence.
570.PP
571You may also have found out by now why 7bit \s-1ISO\-2022\s0 cannot comprise
572a \s-1CCS\s0. If you look at a byte sequence \ex21\ex21, you can't tell if
573it is two !'s or \s-1IDEOGRAPHIC\s0 \s-1SPACE\s0. \s-1EUC\s0 maps the latter to \exA1\exA1
574so you have no trouble differentiating between \*(L"!!\*(R" and \*(L"\ \*(R".
575.SH "Encoding Classification (by Anton Tagunov and Dan Kogai)"
576.IX Header "Encoding Classification (by Anton Tagunov and Dan Kogai)"
577This section tries to classify the supported encodings by their
578applicability for information exchange over the Internet and to
579choose the most suitable aliases to name them in the context of
580such communication.
581.IP "\(bu" 4
582To (en|de)code encodings marked by \f(CW\*(C`(**)\*(C'\fR, you need
583\&\f(CW\*(C`Encode::HanExtra\*(C'\fR, available from \s-1CPAN\s0.
584.PP
585Encoding names
586.PP
587.Vb 3
588\& US-ASCII UTF-8 ISO-8859-* KOI8-R
589\& Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
590\& EUC-KR Big5 GB2312
591.Ve
592.PP
593are registered with \s-1IANA\s0 as preferred \s-1MIME\s0 names and may
594be used over the Internet.
595.PP
596\&\f(CW\*(C`Shift_JIS\*(C'\fR has been officialized by \s-1JIS\s0 X 0208:1997.
597\&\*(L"Microsoft\-related naming mess\*(R" gives details.
598.PP
599\&\f(CW\*(C`GB2312\*(C'\fR is the \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR.
600See \*(L"Microsoft\-related naming mess\*(R" for details.
601.PP
602\&\f(CW\*(C`GB_2312\-80\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR
603with Encode. See Encode::CN for details.
604.PP
605.Vb 2
606\& EUC-CN
607\& KOI8-U [RFC2319]
608.Ve
609.PP
610have not been registered with \s-1IANA\s0 (as of March 2002) but
611seem to be supported by major web browsers.
612The \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR is \f(CW\*(C`GB2312\*(C'\fR.
613.PP
614.Vb 1
615\& KS_C_5601-1987
616.Ve
617.PP
618is heavily misused.
619See \*(L"Microsoft\-related naming mess\*(R" for details.
620.PP
621\&\f(CW\*(C`KS_C_5601\-1987\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`ksc5601\-raw\*(C'\fR
622with Encode. See Encode::KR for details.
623.PP
624.Vb 1
625\& UTF-16 UTF-16BE UTF-16LE
626.Ve
627.PP
628are IANA-registered \f(CW\*(C`charset\*(C'\fRs. See [\s-1RFC\s0 2781] for details.
629Jungshik Shin reports that \s-1UTF\-16\s0 with a \s-1BOM\s0 is well accepted
630by \s-1MS\s0 \s-1IE\s0 5/6 and \s-1NS\s0 4/6. Beware however that
631.IP "\(bu" 4
632\&\f(CW\*(C`UTF\-16\*(C'\fR support in any software you're going to be
633using/interoperating with has probably been less tested
634than \f(CW\*(C`UTF\-8\*(C'\fR support
635.IP "\(bu" 4
636\&\f(CW\*(C`UTF\-8\*(C'\fR coded data seamlessly passes traditional
637command piping (\f(CW\*(C`cat\*(C'\fR, \f(CW\*(C`more\*(C'\fR, etc.) while \f(CW\*(C`UTF\-16\*(C'\fR coded
638data is likely to cause confusion (with its zero bytes,
639for example)
640.IP "\(bu" 4
641it is beyond the power of words to describe the way \s-1HTML\s0 browsers
642encode non\-\f(CW\*(C`ASCII\*(C'\fR form data. To get a general impression, visit
643<http://ppewww.ph.gla.ac.uk/~flavell/charset/form\-i18n.html>.
644While encoding of form data has stabilized for \f(CW\*(C`UTF\-8\*(C'\fR encoded pages
645(at least \s-1IE\s0 5/6, \s-1NS\s0 6, and Opera 6 behave consistently), be sure to
646expect fun (and cross-browser discrepancies) with \f(CW\*(C`UTF\-16\*(C'\fR encoded
647pages!
648.PP
649The rule of thumb is to use \f(CW\*(C`UTF\-8\*(C'\fR unless you know what
650you're doing and unless you really benefit from using \f(CW\*(C`UTF\-16\*(C'\fR.
651.PP
652.Vb 5
653\& ISO-IR-165 [RFC1345]
654\& VISCII
655\& GB 12345
656\& GB 18030 (**) (see links below)
657\& EUC-TW (**)
658.Ve
659.PP
660are totally valid encodings but not registered at \s-1IANA\s0.
661The names under which they are listed here are probably the
662most widely-known names for these encodings and are recommended
663names.
664.PP
665.Vb 1
666\& BIG5PLUS (**)
667.Ve
668.PP
669is a proprietary name.
670.Sh "Microsoft-related naming mess"
671.IX Subsection "Microsoft-related naming mess"
672Microsoft products misuse the following names:
673.IP "\s-1KS_C_5601\-1987\s0" 4
674.IX Item "KS_C_5601-1987"
675Microsoft extension to \f(CW\*(C`EUC\-KR\*(C'\fR.
676.Sp
677Proper names: \f(CW\*(C`CP949\*(C'\fR, \f(CW\*(C`UHC\*(C'\fR, \f(CW\*(C`x\-windows\-949\*(C'\fR (as used by Mozilla).
678.Sp
679See <http://lists.w3.org/Archives/Public/ietf\-charsets/2001AprJun/0033.html>
680for details.
681.Sp
682Encode aliases \f(CW\*(C`KS_C_5601\-1987\*(C'\fR to \f(CW\*(C`cp949\*(C'\fR to reflect this common
683misusage. \fIRaw\fR \f(CW\*(C`KS_C_5601\-1987\*(C'\fR encoding is available as
684\&\f(CW\*(C`ksc5601\-raw\*(C'\fR.
685.Sp
686See Encode::KR for details.
687.IP "\s-1GB2312\s0" 4
688.IX Item "GB2312"
689Microsoft extension to \f(CW\*(C`EUC\-CN\*(C'\fR.
690.Sp
691Proper names: \f(CW\*(C`CP936\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR.
692.Sp
693\&\f(CW\*(C`GB2312\*(C'\fR has been registered in the \f(CW\*(C`EUC\-CN\*(C'\fR meaning at
694\&\s-1IANA\s0. This has partially repaired the situation: Microsoft's
695\&\f(CW\*(C`GB2312\*(C'\fR has become a superset of the official \f(CW\*(C`GB2312\*(C'\fR.
696.Sp
697Encode aliases \f(CW\*(C`GB2312\*(C'\fR to \f(CW\*(C`euc\-cn\*(C'\fR in full agreement with
698\&\s-1IANA\s0 registration. \f(CW\*(C`cp936\*(C'\fR is supported separately.
699\&\fIRaw\fR \f(CW\*(C`GB_2312\-80\*(C'\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR.
700.Sp
701See Encode::CN for details.
702.IP "Big5" 4
703.IX Item "Big5"
704Microsoft extension to \f(CW\*(C`Big5\*(C'\fR.
705.Sp
706Proper name: \f(CW\*(C`CP950\*(C'\fR.
707.Sp
708Encode separately supports \f(CW\*(C`Big5\*(C'\fR and \f(CW\*(C`cp950\*(C'\fR.
709.IP "Shift_JIS" 4
710.IX Item "Shift_JIS"
711Microsoft's understanding of \f(CW\*(C`Shift_JIS\*(C'\fR.
712.Sp
713\&\s-1JIS\s0 has not endorsed the full Microsoft standard however.
714The official \f(CW\*(C`Shift_JIS\*(C'\fR includes only \s-1JIS\s0 X 0201 and \s-1JIS\s0 X 0208
715character sets, while Microsoft has always used \f(CW\*(C`Shift_JIS\*(C'\fR
716to encode a wider character repertoire. See \f(CW\*(C`IANA\*(C'\fR registration for
717\&\f(CW\*(C`Windows\-31J\*(C'\fR.
718.Sp
719As a historical predecessor, Microsoft's variant
720probably has more rights for the name, though it may be objected
721that Microsoft shouldn't have used \s-1JIS\s0 as part of the name
722in the first place.
723.Sp
724Unambiguous name: \f(CW\*(C`CP932\*(C'\fR. \f(CW\*(C`IANA\*(C'\fR name (not used?): \f(CW\*(C`Windows\-31J\*(C'\fR.
725.Sp
726Encode separately supports \f(CW\*(C`Shift_JIS\*(C'\fR and \f(CW\*(C`cp932\*(C'\fR.
727.SH "Glossary"
728.IX Header "Glossary"
729.IP "character repertoire" 4
730.IX Item "character repertoire"
731A collection of unique characters. A \fIcharacter\fR set in the strictest
732sense. At this stage, characters are not numbered.
733.IP "coded character set (\s-1CCS\s0)" 4
734.IX Item "coded character set (CCS)"
735A character set that is mapped in a way computers can use directly.
736Many character encodings, including \s-1EUC\s0, fall in this category.
737.IP "character encoding scheme (\s-1CES\s0)" 4
738.IX Item "character encoding scheme (CES)"
739An algorithm to map a character set to a byte sequence. You don't
740have to be able to tell which character set a given byte sequence
741belongs to. 7\-bit \s-1ISO\-2022\s0 is a \s-1CES\s0 but it cannot be a \s-1CCS\s0. \s-1EUC\s0 is an
742example of being both a \s-1CCS\s0 and \s-1CES\s0.
743.IP "charset (in \s-1MIME\s0 context)" 4
744.IX Item "charset (in MIME context)"
745has long been used in the meaning of \f(CW\*(C`encoding\*(C'\fR, \s-1CES\s0.
746.Sp
747While the word combination \f(CW\*(C`character set\*(C'\fR has lost this meaning
748in \s-1MIME\s0 context since [\s-1RFC\s0 2130], the \f(CW\*(C`charset\*(C'\fR abbreviation has
749retained it. This is how [\s-1RFC\s0 2277] and [\s-1RFC\s0 2278] bless \f(CW\*(C`charset\*(C'\fR:
750.Sp
751.Vb 7
752\& This document uses the term "charset" to mean a set of rules for
753\& mapping from a sequence of octets to a sequence of characters, such
754\& as the combination of a coded character set and a character encoding
755\& scheme; this is also what is used as an identifier in MIME "charset="
756\& parameters, and registered in the IANA charset registry ... (Note
757\& that this is NOT a term used by other standards bodies, such as ISO).
758\& [RFC 2277]
759.Ve
760.IP "\s-1EUC\s0" 4
761.IX Item "EUC"
762Extended Unix Character. See \s-1ISO\-2022\s0.
763.IP "\s-1ISO\-2022\s0" 4
764.IX Item "ISO-2022"
765A \s-1CES\s0 that was carefully designed to coexist with \s-1ASCII\s0. There are a 7
766bit version and an 8 bit version.
767.Sp
768The 7 bit version switches character set via escape sequence so it
769cannot form a \s-1CCS\s0. Since this is more difficult to handle in programs
770than the 8 bit version, the 7 bit version is not very popular except for
771iso\-2022\-jp, the \fIde facto\fR standard \s-1CES\s0 for e\-mails.
772.Sp
773The 8 bit version can form a \s-1CCS\s0. \s-1EUC\s0 and \s-1ISO\-8859\s0 are two examples
774thereof. Pre\-5.6 perl could use them as string literals.
775.IP "\s-1UCS\s0" 4
776.IX Item "UCS"
777Short for \fIUniversal Character Set\fR. When you say just \s-1UCS\s0, it means
778\&\fIUnicode\fR.
779.IP "\s-1UCS\-2\s0" 4
780.IX Item "UCS-2"
781\&\s-1ISO/IEC\s0 10646 encoding form: Universal Character Set coded in two
782octets.
783.IP "Unicode" 4
784.IX Item "Unicode"
785A character set that aims to include all character repertoires of the
786world. Many character sets in various national as well as industrial
787standards have become, in a way, just subsets of Unicode.
788.IP "\s-1UTF\s0" 4
789.IX Item "UTF"
790Short for \fIUnicode Transformation Format\fR. Determines how to map a
791Unicode character into a byte sequence.
792.IP "\s-1UTF\-16\s0" 4
793.IX Item "UTF-16"
794A \s-1UTF\s0 in 16\-bit encoding. Can either be in big endian or little
795endian. The big endian version is called \s-1UTF\-16BE\s0 (equal to \s-1UCS\-2\s0 +
796surrogate support) and the little endian version is called \s-1UTF\-16LE\s0.
797.SH "See Also"
798.IX Header "See Also"
799Encode,
800Encode::Byte,
801Encode::CN, Encode::JP, Encode::KR, Encode::TW,
802Encode::EBCDIC, Encode::Symbol,
803Encode::MIME::Header, Encode::Guess
804.SH "References"
805.IX Header "References"
806.IP "\s-1ECMA\s0" 4
807.IX Item "ECMA"
808European Computer Manufacturers Association
809<http://www.ecma.ch>
810.RS 4
811.ie n .IP "\s-1ECMA\-035\s0 (eq ""ISO\-2022"")" 4
812.el .IP "\s-1ECMA\-035\s0 (eq \f(CWISO\-2022\fR)" 4
813.IX Item "ECMA-035 (eq ISO-2022)"
814<http://www.ecma.ch/ecma1/STAND/ECMA\-035.HTM>
815.Sp
816The specification of \s-1ISO\-2022\s0 is available from the link above.
817.RE
818.RS 4
819.RE
820.IP "\s-1IANA\s0" 4
821.IX Item "IANA"
822Internet Assigned Numbers Authority
823<http://www.iana.org/>
824.RS 4
825.IP "Assigned Charset Names by \s-1IANA\s0" 4
826.IX Item "Assigned Charset Names by IANA"
827<http://www.iana.org/assignments/character\-sets>
828.Sp
829Most of the \f(CW\*(C`canonical names\*(C'\fR in Encode derive from this list
830so you can directly apply the string you have extracted from the \s-1MIME\s0
831headers of mails and web pages.
832.RE
833.RS 4
834.RE
835.IP "\s-1ISO\s0" 4
836.IX Item "ISO"
837International Organization for Standardization
838<http://www.iso.ch/>
839.IP "\s-1RFC\s0" 4
840.IX Item "RFC"
841Request For Comments \*(-- need I say more?
842<http://www.rfc\-editor.org/>, <http://www.rfc.net/>,
843<http://www.faqs.org/rfcs/>
844.IP "\s-1UC\s0" 4
845.IX Item "UC"
846Unicode Consortium
847<http://www.unicode.org/>
848.RS 4
849.IP "Unicode Glossary" 4
850.IX Item "Unicode Glossary"
851<http://www.unicode.org/glossary/>
852.Sp
853The glossary of this document is based upon this site.
854.RE
855.RS 4
856.RE
857.Sh "Other Notable Sites"
858.IX Subsection "Other Notable Sites"
859.IP "czyborra.com" 4
860.IX Item "czyborra.com"
861<http://czyborra.com/>
862.Sp
863Contains a lot of useful information, especially gory details of \s-1ISO\s0
864vs. vendor mappings.
865.IP "\s-1CJK\s0.inf" 4
866.IX Item "CJK.inf"
867<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
868.Sp
869Somewhat obsolete (last update in 1996), but still useful. Also try
870.Sp
871<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
872.Sp
873You will find brief info on \f(CW\*(C`EUC\-CN\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR and mostly on \f(CW\*(C`GB 18030\*(C'\fR.
874.IP "Jungshik Shin's Hangul \s-1FAQ\s0" 4
875.IX Item "Jungshik Shin's Hangul FAQ"
876<http://jshin.net/faq>
877.Sp
878And especially its subject 8.
879.Sp
880<http://jshin.net/faq/qa8.html>
881.Sp
882A comprehensive overview of the Korean (\f(CW\*(C`KS *\*(C'\fR) standards.
883.ie n .IP "debian.org: ""Introduction to i18n""" 4
884.el .IP "debian.org: ``Introduction to i18n''" 4
885.IX Item "debian.org: Introduction to i18n"
886A brief description for most of the mentioned \s-1CJK\s0 encodings is
887contained in
888<http://www.debian.org/doc/manuals/intro\-i18n/ch\-codes.en.html>
889.Sh "Offline sources"
890.IX Subsection "Offline sources"
891.ie n .IP """CJKV Information Processing"" by Ken Lunde" 4
892.el .IP "\f(CWCJKV Information Processing\fR by Ken Lunde" 4
893.IX Item "CJKV Information Processing by Ken Lunde"
894\&\s-1CJKV\s0 Information Processing
8951999 O'Reilly & Associates, \s-1ISBN\s0 : 1\-56592\-224\-7
896.Sp
897The modern successor of \f(CW\*(C`CJK.inf\*(C'\fR.
898.Sp
899Features a comprehensive coverage of \s-1CJKV\s0 character sets and
900encodings along with many other issues faced by anyone trying
901to better support \s-1CJKV\s0 languages/scripts in all the areas of
902information processing.
903.Sp
904To purchase this book, visit
905<http://www.oreilly.com/catalog/cjkvinfo/>
906or your favourite bookstore.