git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame - sam-t2/devtools/v8plus/man/man3/Encode::Supported.3

Commit	Line	Data
920dae64 AT	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
65	.hy 0
66	.if n .na
67	.\"
68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69	.\" Fear. Run. Save yourself. No user-serviceable parts.
70	. \" fudge factors for nroff and troff
71	.if n \{\
72	. ds #H 0
73	. ds #V .8m
74	. ds #F .3m
75	. ds #[ \f1
76	. ds #] \fP
77	.\}
78	.if t \{\
79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80	. ds #V .6m
81	. ds #F 0
82	. ds #[ \&
83	. ds #] \&
84	.\}
85	. \" simple accents for nroff and troff
86	.if n \{\
87	. ds ' \&
88	. ds ` \&
89	. ds ^ \&
90	. ds , \&
91	. ds ~ ~
92	. ds /
93	.\}
94	.if t \{\
95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101	.\}
102	. \" troff and (daisy-wheel) nroff accents
103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110	.ds ae a\h'-(\w'a'u*4/10)'e
111	.ds Ae A\h'-(\w'A'u*4/10)'E
112	. \" corrections for vroff
113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115	. \" for low resolution devices (crt and lpr)
116	.if \n(.H>23 .if \n(.V>19 \
117	\{\
118	. ds : e
119	. ds 8 ss
120	. ds o a
121	. ds d- d\h'-1'\(ga
122	. ds D- D\h'-1'\(hy
123	. ds th \o'bp'
124	. ds Th \o'LP'
125	. ds ae ae
126	. ds Ae AE
127	.\}
128	.rm #[ #] #H #V #F C
129	.\" ========================================================================
130	.\"
131	.IX Title "Encode::Supported 3"
132	.TH Encode::Supported 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
133	.SH "NAME"
134	Encode::Supported \-\- Encodings supported by Encode
135	.SH "DESCRIPTION"
136	.IX Header "DESCRIPTION"
137	.Sh "Encoding Names"
138	.IX Subsection "Encoding Names"
139	Encoding names are case insensitive. White space in names
140	is ignored. In addition, an encoding may have aliases.
141	Each encoding has one \*(L"canonical\*(R" name. The \*(L"canonical\*(R"
142	name is chosen from the names of the encoding by picking
143	the first in the following sequence (with a few exceptions).
144	.IP "\(bu" 4
145	The name used by the Perl community. That includes 'utf8' and 'ascii'.
146	Unlike aliases, canonical names directly reach the method so such
147	frequently used words like 'utf8' don't need to do alias lookups.
148	.IP "\(bu" 4
149	The \s-1MIME\s0 name as defined in \s-1IETF\s0 RFCs. This includes all \*(L"iso\-\*(R"s.
150	.IP "\(bu" 4
151	The name in the \s-1IANA\s0 registry.
152	.IP "\(bu" 4
153	The name used by the organization that defined it.
154	.PP
155	In case \fIde jure\fR canonical names differ from that of the Encode
156	module, they are always aliased if it ever be implemented. So you can
157	safely tell if a given encoding is implemented or not just by passing
158	the canonical name.
159	.PP
160	Because of all the alias issues, and because in the general case
161	encodings have state, \*(L"Encode\*(R" uses an encoding object internally
162	once an operation is in progress.
163	.SH "Supported Encodings"
164	.IX Header "Supported Encodings"
165	As of Perl 5.8.0, at least the following encodings are recognized.
166	Note that unless otherwise specified, they are all case insensitive
167	(via alias) and all occurrences of spaces are replaced with '\-'.
168	In other words, \*(L"\s-1ISO\s0 8859 1\*(R" and \*(L"iso\-8859\-1\*(R" are identical.
169	.PP
170	Encodings are categorized and implemented in several different modules
171	but you don't have to \f(CW\*(C`use Encode::XX\*(C'\fR to make them available for
172	most cases. Encode.pm will automatically load those modules on demand.
173	.Sh "Built-in Encodings"
174	.IX Subsection "Built-in Encodings"
175	The following encodings are always available.
176	.PP
177	.Vb 8
178	\& Canonical Aliases Comments & References
179	\& ----------------------------------------------------------------
180	\& ascii US-ascii ISO-646-US [ECMA]
181	\& ascii-ctrl Special Encoding
182	\& iso-8859-1 latin1 [ISO]
183	\& null Special Encoding
184	\& utf8 UTF-8 [RFC2279]
185	\& ----------------------------------------------------------------
186	.Ve
187	.PP
188	\&\fInull\fR and \fIascii-ctrl\fR are special. \*(L"null\*(R" fails for all characters
189	so when you set fallback mode to \s-1PERLQQ\s0, \s-1HTMLCREF\s0 or \s-1XMLCREF\s0, \s-1ALL\s0
190	\&\s-1CHARACTERS\s0 will fall back to character references. Ditto for
191	\&\*(L"ascii\-ctrl\*(R" except for control characters. For fallback modes, see
192	Encode.
193	.Sh "Encode::Unicode \*(-- other Unicode encodings"
194	.IX Subsection "Encode::Unicode other Unicode encodings"
195	Unicode coding schemes other than native utf8 are supported by
196	Encode::Unicode, which will be autoloaded on demand.
197	.PP
198	.Vb 11
199	\& ----------------------------------------------------------------
200	\& UCS-2BE UCS-2, iso-10646-1 [IANA, UC]
201	\& UCS-2LE [UC]
202	\& UTF-16 [UC]
203	\& UTF-16BE [UC]
204	\& UTF-16LE [UC]
205	\& UTF-32 [UC]
206	\& UTF-32BE UCS-4 [UC]
207	\& UTF-32LE [UC]
208	\& UTF-7 [RFC2152]
209	\& ----------------------------------------------------------------
210	.Ve
211	.PP
212	To find how (UCS\-2|UTF\-(16|32))(LE|BE)? differ from one another,
213	see Encode::Unicode.
214	.PP
215	\&\s-1UTF\-7\s0 is a special encoding which \*(L"re\-encodes\*(R" \s-1UTF\-16BE\s0 into a 7\-bit
216	encoding. It is implemented separately by Encode::Unicode::UTF7.
217	.Sh "Encode::Byte \*(-- Extended \s-1ASCII\s0"
218	.IX Subsection "Encode::Byte Extended ASCII"
219	Encode::Byte implements most single-byte encodings except for
220	Symbols and \s-1EBCDIC\s0. The following encodings are based on single-byte
221	encodings implemented as extended \s-1ASCII\s0. Most of them map
222	\&\ex80\-\exff (upper half) to non-ASCII characters.
223	.IP "\s-1ISO\-8859\s0 and corresponding vendor mappings" 4
224	.IX Item "ISO-8859 and corresponding vendor mappings"
225	Since there are so many, they are presented in table format with
226	languages and corresponding encoding names by vendors. Note that
227	the table is sorted in order of \s-1ISO\-8859\s0 and the corresponding vendor
228	mappings are slightly different from that of \s-1ISO\s0. See
229	<http://czyborra.com/charsets/iso8859.html> for details.
230	.Sp
231	.Vb 32
232	\& Lang/Regions ISO/Other Std. DOS Windows Macintosh Others
233	\& ----------------------------------------------------------------
234	\& N. America (ASCII) cp437 AdobeStandardEncoding
235	\& cp863 (DOSCanadaF)
236	\& W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
237	\& hp-roman8
238	\& cp860 (DOSPortuguese)
239	\& Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
240	\& MacCroatian
241	\& MacRomanian
242	\& MacRumanian
243	\& Latin3[1] iso-8859-3
244	\& Latin4[2] iso-8859-4
245	\& Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic
246	\& (See also next section) cp866 MacUkrainian
247	\& Arabic iso-8859-6 cp864 cp1256 MacArabic
248	\& cp1006 MacFarsi
249	\& Greek iso-8859-7 cp737 cp1253 MacGreek
250	\& cp869 (DOSGreek2)
251	\& Hebrew iso-8859-8 cp862 cp1255 MacHebrew
252	\& Turkish iso-8859-9 cp857 cp1254 MacTurkish
253	\& Nordics iso-8859-10 cp865
254	\& cp861 MacIcelandic
255	\& MacSami
256	\& Thai iso-8859-11[3] cp874 MacThai
257	\& (iso-8859-12 is nonexistent. Reserved for Indics?)
258	\& Baltics iso-8859-13 cp775 cp1257
259	\& Celtics iso-8859-14
260	\& Latin9 [4] iso-8859-15
261	\& Latin10 iso-8859-16
262	\& Vietnamese viscii cp1258 MacVietnamese
263	\& ----------------------------------------------------------------
264	.Ve
265	.Sp
266	.Vb 5
267	\& [1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.
268	\& [2] Baltics. Now on 8859-10, except for Latvian.
269	\& [3] TIS 620 + Non-Breaking Space (0xA0 / U+00A0)
270	\& [4] Nicknamed Latin0; the Euro sign as well as French and Finnish
271	\& letters that are missing from 8859-1 were added.
272	.Ve
273	.Sp
274	All cp* are also available as ibm\-*, ms\-*, and windows\-* . See also
275	<http://czyborra.com/charsets/codepages.html>.
276	.Sp
277	Macintosh encodings don't seem to be registered in such entities as
278	\&\s-1IANA\s0. \*(L"Canonical\*(R" names in Encode are based upon Apple's Tech Note
279	1150. See <http://developer.apple.com/technotes/tn/tn1150.html>
280	for details.
281	.IP "\s-1KOI8\s0 \- De Facto Standard for the Cyrillic world" 4
282	.IX Item "KOI8 - De Facto Standard for the Cyrillic world"
283	Though \s-1ISO\-8859\s0 does have \s-1ISO\-8859\-5\s0, the \s-1KOI8\s0 series is far more
284	popular in the Net. Encode comes with the following \s-1KOI\s0 charsets.
285	For gory details, see <http://czyborra.com/charsets/cyrillic.html>
286	.Sp
287	.Vb 5
288	\& ----------------------------------------------------------------
289	\& koi8-f
290	\& koi8-r cp878 [RFC1489]
291	\& koi8-u [RFC2319]
292	\& ----------------------------------------------------------------
293	.Ve
294	.IP "gsm0338 \- Hentai Latin 1" 4
295	.IX Item "gsm0338 - Hentai Latin 1"
296	\&\s-1GSM0338\s0 is for \s-1GSM\s0 handsets. Though it shares alphanumerals with
297	\&\s-1ASCII\s0, control character ranges and other parts are mapped very
298	differently, mainly to store Greek characters. There are also escape
299	sequences (starting with 0x1B) to cover e.g. the Euro sign. Some
300	special cases like a trailing 0x00 byte or a lone 0x1B byte are not
301	well-defined and \fIdecode()\fR will return an empty string for them.
302	One possible workaround is
303	.Sp
304	.Vb 3
305	\& $gsm =~ s/\ex00\ez/\ex00\ex00/;
306	\& $uni = decode("gsm0338", $gsm);
307	\& $uni .= "\exA0" if $gsm =~ /\ex1B\ez/;
308	.Ve
309	.Sp
310	Note that the Encode implementation of \s-1GSM0338\s0 does not implement the
311	reuse of Latin capital letters as Greek capital letters (for example,
312	the 0x5A is U+005A (\s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 Z), not U+0396 (\s-1GREEK\s0 \s-1CAPITAL\s0
313	\&\s-1LETTER\s0 \s-1ZETA\s0).
314	.Sp
315	The \s-1GSM0338\s0 is also covered in Encode::Byte even though it is not
316	an \*(L"extended \s-1ASCII\s0\*(R" encoding.
317	.Sh "\s-1CJK:\s0 Chinese, Japanese, Korean (Multibyte)"
318	.IX Subsection "CJK: Chinese, Japanese, Korean (Multibyte)"
319	Note that Vietnamese is listed above. Also read \*(L"Encoding vs Charset\*(R"
320	below. Also note that these are implemented in distinct modules by
321	countries, due to the size concerns (simplified Chinese is mapped
322	to '\s-1CN\s0', continental China, while traditional Chinese is mapped to
323	\&'\s-1TW\s0', Taiwan). Please refer to their respective documentation pages.
324	.IP "Encode::CN \*(-- Continental China" 4
325	.IX Item "Encode::CN Continental China"
326	.Vb 9
327	\& Standard DOS/Win Macintosh Comment/Reference
328	\& ----------------------------------------------------------------
329	\& euc-cn [1] MacChineseSimp
330	\& (gbk) cp936 [2]
331	\& gb12345-raw { GB12345 without CES }
332	\& gb2312-raw { GB2312 without CES }
333	\& hz
334	\& iso-ir-165
335	\& ----------------------------------------------------------------
336	.Ve
337	.Sp
338	.Vb 2
339	\& [1] GB2312 is aliased to this. See L<Microsoft-related naming mess>
340	\& [2] gbk is aliased to this. See L<Microsoft-related naming mess>
341	.Ve
342	.IP "Encode::JP \*(-- Japan" 4
343	.IX Item "Encode::JP Japan"
344	.Vb 11
345	\& Standard DOS/Win Macintosh Comment/Reference
346	\& ----------------------------------------------------------------
347	\& euc-jp
348	\& shiftjis cp932 macJapanese
349	\& 7bit-jis
350	\& iso-2022-jp [RFC1468]
351	\& iso-2022-jp-1 [RFC2237]
352	\& jis0201-raw { JIS X 0201 (roman + halfwidth kana) without CES }
353	\& jis0208-raw { JIS X 0208 (Kanji + fullwidth kana) without CES }
354	\& jis0212-raw { JIS X 0212 (Extended Kanji) without CES }
355	\& ----------------------------------------------------------------
356	.Ve
357	.IP "Encode::KR \*(-- Korea" 4
358	.IX Item "Encode::KR Korea"
359	.Vb 8
360	\& Standard DOS/Win Macintosh Comment/Reference
361	\& ----------------------------------------------------------------
362	\& euc-kr MacKorean [RFC1557]
363	\& cp949 [1]
364	\& iso-2022-kr [RFC1557]
365	\& johab [KS X 1001:1998, Annex 3]
366	\& ksc5601-raw { KSC5601 without CES }
367	\& ----------------------------------------------------------------
368	.Ve
369	.Sp
370	.Vb 2
371	\& [1] ks_c_5601-1987, (x-)?windows-949, and uhc are aliased to this.
372	\& See below.
373	.Ve
374	.IP "Encode::TW \*(-- Taiwan" 4
375	.IX Item "Encode::TW Taiwan"
376	.Vb 5
377	\& Standard DOS/Win Macintosh Comment/Reference
378	\& ----------------------------------------------------------------
379	\& big5-eten cp950 MacChineseTrad {big5 aliased to big5-eten}
380	\& big5-hkscs
381	\& ----------------------------------------------------------------
382	.Ve
383	.IP "Encode::HanExtra \*(-- More Chinese via \s-1CPAN\s0" 4
384	.IX Item "Encode::HanExtra More Chinese via CPAN"
385	Due to the size concerns, additional Chinese encodings below are
386	distributed separately on \s-1CPAN\s0, under the name Encode::HanExtra.
387	.Sp
388	.Vb 8
389	\& Standard DOS/Win Macintosh Comment/Reference
390	\& ----------------------------------------------------------------
391	\& big5ext CMEX's Big5e Extension
392	\& big5plus CMEX's Big5+ Extension
393	\& cccii Chinese Character Code for Information Interchange
394	\& euc-tw EUC (Extended Unix Character)
395	\& gb18030 GBK with Traditional Characters
396	\& ----------------------------------------------------------------
397	.Ve
398	.IP "Encode::JIS2K \*(-- \s-1JIS\s0 X 0213 encodings via \s-1CPAN\s0" 4
399	.IX Item "Encode::JIS2K JIS X 0213 encodings via CPAN"
400	Due to size concerns, additional Japanese encodings below are
401	distributed separately on \s-1CPAN\s0, under the name Encode::JIS2K.
402	.Sp
403	.Vb 8
404	\& Standard DOS/Win Macintosh Comment/Reference
405	\& ----------------------------------------------------------------
406	\& euc-jisx0213
407	\& shiftjisx0213
408	\& iso-2022-jp-3
409	\& jis0213-1-raw
410	\& jis0213-2-raw
411	\& ----------------------------------------------------------------
412	.Ve
413	.Sh "Miscellaneous encodings"
414	.IX Subsection "Miscellaneous encodings"
415	.IP "Encode::EBCDIC" 4
416	.IX Item "Encode::EBCDIC"
417	See perlebcdic for details.
418	.Sp
419	.Vb 8
420	\& ----------------------------------------------------------------
421	\& cp37
422	\& cp500
423	\& cp875
424	\& cp1026
425	\& cp1047
426	\& posix-bc
427	\& ----------------------------------------------------------------
428	.Ve
429	.IP "Encode::Symbols" 4
430	.IX Item "Encode::Symbols"
431	For symbols and dingbats.
432	.Sp
433	.Vb 7
434	\& ----------------------------------------------------------------
435	\& symbol
436	\& dingbats
437	\& MacDingbats
438	\& AdobeZdingbat
439	\& AdobeSymbol
440	\& ----------------------------------------------------------------
441	.Ve
442	.IP "Encode::MIME::Header" 4
443	.IX Item "Encode::MIME::Header"
444	Strictly speaking, \s-1MIME\s0 header encoding documented in \s-1RFC\s0 2047 is more
445	of encapsulation than encoding. However, their support in modern
446	world is imperative so they are supported.
447	.Sp
448	.Vb 5
449	\& ----------------------------------------------------------------
450	\& MIME-Header [RFC2047]
451	\& MIME-B [RFC2047]
452	\& MIME-Q [RFC2047]
453	\& ----------------------------------------------------------------
454	.Ve
455	.IP "Encode::Guess" 4
456	.IX Item "Encode::Guess"
457	This one is not a name of encoding but a utility that lets you pick up
458	the most appropriate encoding for a data out of given \fIsuspects\fR. See
459	Encode::Guess for details.
460	.SH "Unsupported encodings"
461	.IX Header "Unsupported encodings"
462	The following encodings are not supported as yet; some because they
463	are rarely used, some because of technical difficulties. They may
464	be supported by external modules via \s-1CPAN\s0 in the future, however.
465	.IP "\s-1ISO\-2022\-JP\-2\s0 [\s-1RFC1554\s0]" 4
466	.IX Item "ISO-2022-JP-2 [RFC1554]"
467	Not very popular yet. Needs Unicode Database or equivalent to
468	implement \fIencode()\fR (because it includes \s-1JIS\s0 X 0208/0212, \s-1KSC5601\s0, and
469	\&\s-1GB2312\s0 simultaneously, whose code points in Unicode overlap. So you
470	need to lookup the database to determine to what character set a given
471	Unicode character should belong).
472	.IP "\s-1ISO\-2022\-CN\s0 [\s-1RFC1922\s0]" 4
473	.IX Item "ISO-2022-CN [RFC1922]"
474	Not very popular. Needs \s-1CNS\s0 11643\-1 and \-2 which are not available in
475	this module. \s-1CNS\s0 11643 is supported (via euc\-tw) in Encode::HanExtra.
476	Autrijus Tang may add support for this encoding in his module in future.
477	.IP "Various HP-UX encodings" 4
478	.IX Item "Various HP-UX encodings"
479	The following are unsupported due to the lack of mapping data.
480	.Sp
481	.Vb 2
482	\& '8' - arabic8, greek8, hebrew8, kana8, thai8, and turkish8
483	\& '15' - japanese15, korean15, and roi15
484	.Ve
485	.IP "Cyrillic encoding \s-1ISO\-IR\-111\s0" 4
486	.IX Item "Cyrillic encoding ISO-IR-111"
487	Anton Tagunov doubts its usefulness.
488	.IP "\s-1ISO\-8859\-8\-1\s0 [Hebrew]" 4
489	.IX Item "ISO-8859-8-1 [Hebrew]"
490	None of the Encode team knows Hebrew enough (\s-1ISO\-8859\-8\s0, cp1255 and
491	MacHebrew are supported because and just because there were mappings
492	available at <http://www.unicode.org/>). Contributions welcome.
493	.IP "\s-1ISIRI\s0 3342, Iran System, \s-1ISIRI\s0 2900 [Farsi]" 4
494	.IX Item "ISIRI 3342, Iran System, ISIRI 2900 [Farsi]"
495	Ditto.
496	.IP "Thai encoding \s-1TCVN\s0" 4
497	.IX Item "Thai encoding TCVN"
498	Ditto.
499	.IP "Vietnamese encodings \s-1VPS\s0" 4
500	.IX Item "Vietnamese encodings VPS"
501	Though Jungshik Shin has reported that Mozilla supports this encoding,
502	it was too late before 5.8.0 for us to add it. In the future, it
503	may be available via a separate module. See
504	<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf>
505	and
506	<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut>
507	if you are interested in helping us.
508	.IP "Various Mac encodings" 4
509	.IX Item "Various Mac encodings"
510	The following are unsupported due to the lack of mapping data.
511	.Sp
512	.Vb 5
513	\& MacArmenian, MacBengali, MacBurmese, MacEthiopic
514	\& MacExtArabic, MacGeorgian, MacKannada, MacKhmer
515	\& MacLaotian, MacMalayalam, MacMongolian, MacOriya
516	\& MacSinhalese, MacTamil, MacTelugu, MacTibetan
517	\& MacVietnamese
518	.Ve
519	.Sp
520	The rest which are already available are based upon the vendor mappings
521	at <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> .
522	.IP "(Mac) Indic encodings" 4
523	.IX Item "(Mac) Indic encodings"
524	The maps for the following are available at <http://www.unicode.org/>
525	but remain unsupported because those encodings need an algorithmic
526	approach, currently unsupported by \fIenc2xs\fR:
527	.Sp
528	.Vb 3
529	\& MacDevanagari
530	\& MacGurmukhi
531	\& MacGujarati
532	.Ve
533	.Sp
534	For details, please see \f(CW\*(C`Unicode mapping issues and notes:\*(C'\fR at
535	<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT> .
536	.Sp
537	I believe this issue is prevalent not only for Mac Indics but also in
538	other Indic encodings, but the above were the only Indic encodings
539	maps that I could find at <http://www.unicode.org/> .
540	.SH "Encoding vs. Charset \*(-- terminology"
541	.IX Header "Encoding vs. Charset terminology"
542	We are used to using the term (character) \fIencoding\fR and \fIcharacter
543	set\fR interchangeably. But just as confusing the terms byte and
544	character is dangerous and the terms should be differentiated when
545	needed, we need to differentiate \fIencoding\fR and \fIcharacter set\fR.
546	.PP
547	To understand that, here is a description of how we make computers
548	grok our characters.
549	.IP "\(bu" 4
550	First we start with which characters to include. We call this
551	collection of characters \fIcharacter repertoire\fR.
552	.IP "\(bu" 4
553	Then we have to give each character a unique \s-1ID\s0 so your computer can
554	tell the difference between 'a' and 'A'. This itemized character
555	repertoire is now a \fIcharacter set\fR.
556	.IP "\(bu" 4
557	If your computer can grok the character set without further
558	processing, you can go ahead and use it. This is called a \fIcoded
559	character set\fR (\s-1CCS\s0) or \fIraw character encoding\fR. \s-1ASCII\s0 is used this
560	way for most cases.
561	.IP "\(bu" 4
562	But in many cases, especially multi-byte \s-1CJK\s0 encodings, you have to
563	tweak a little more. Your network connection may not accept any data
564	with the Most Significant Bit set, and your computer may not be able to
565	tell if a given byte is a whole character or just half of it. So you
566	have to \fIencode\fR the character set to use it.
567	.Sp
568	A \fIcharacter encoding scheme\fR (\s-1CES\s0) determines how to encode a given
569	character set, or a set of multiple character sets. 7bit \s-1ISO\-2022\s0 is
570	an example of a \s-1CES\s0. You switch between character sets via \fIescape
571	sequences\fR.
572	.PP
573	Technically, or mathematically, speaking, a character set encoded in
574	such a \s-1CES\s0 that maps character by character may form a \s-1CCS\s0. \s-1EUC\s0 is such
575	an example. The \s-1CES\s0 of \s-1EUC\s0 is as follows:
576	.IP "\(bu" 4
577	Map \s-1ASCII\s0 unchanged.
578	.IP "\(bu" 4
579	Map such a character set that consists of 94 or 96 powered by N
580	members by adding 0x80 to each byte.
581	.IP "\(bu" 4
582	You can also use 0x8e and 0x8f to indicate that the following sequence of
583	characters belongs to yet another character set. To each following byte
584	is added the value 0x80.
585	.PP
586	By carefully looking at the encoded byte sequence, you can find that the
587	byte sequence conforms a unique number. In that sense, \s-1EUC\s0 is a \s-1CCS\s0
588	generated by a \s-1CES\s0 above from up to four \s-1CCS\s0 (complicated?). \s-1UTF\-8\s0
589	falls into this category. See \(L"\s-1UTF\-8\s0\(R" in perlUnicode to find out how
590	\&\s-1UTF\-8\s0 maps Unicode to a byte sequence.
591	.PP
592	You may also have found out by now why 7bit \s-1ISO\-2022\s0 cannot comprise
593	a \s-1CCS\s0. If you look at a byte sequence \ex21\ex21, you can't tell if
594	it is two !'s or \s-1IDEOGRAPHIC\s0 \s-1SPACE\s0. \s-1EUC\s0 maps the latter to \exA1\exA1
595	so you have no trouble differentiating between \*(L"!!\*(R" and \*(L"\ \*(R".
596	.SH "Encoding Classification (by Anton Tagunov and Dan Kogai)"
597	.IX Header "Encoding Classification (by Anton Tagunov and Dan Kogai)"
598	This section tries to classify the supported encodings by their
599	applicability for information exchange over the Internet and to
600	choose the most suitable aliases to name them in the context of
601	such communication.
602	.IP "\(bu" 4
603	To (en|de)code encodings marked by \f(CW\*(C`(**)\*(C'\fR, you need
604	\&\f(CW\*(C`Encode::HanExtra\*(C'\fR, available from \s-1CPAN\s0.
605	.PP
606	Encoding names
607	.PP
608	.Vb 3
609	\& US-ASCII UTF-8 ISO-8859-* KOI8-R
610	\& Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
611	\& EUC-KR Big5 GB2312
612	.Ve
613	.PP
614	are registered with \s-1IANA\s0 as preferred \s-1MIME\s0 names and may
615	be used over the Internet.
616	.PP
617	\&\f(CW\*(C`Shift_JIS\*(C'\fR has been officialized by \s-1JIS\s0 X 0208:1997.
618	\&\*(L"Microsoft\-related naming mess\*(R" gives details.
619	.PP
620	\&\f(CW\*(C`GB2312\*(C'\fR is the \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR.
621	See \*(L"Microsoft\-related naming mess\*(R" for details.
622	.PP
623	\&\f(CW\*(C`GB_2312\-80\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR
624	with Encode. See Encode::CN for details.
625	.PP
626	.Vb 2
627	\& EUC-CN
628	\& KOI8-U [RFC2319]
629	.Ve
630	.PP
631	have not been registered with \s-1IANA\s0 (as of March 2002) but
632	seem to be supported by major web browsers.
633	The \s-1IANA\s0 name for \f(CW\*(C`EUC\-CN\*(C'\fR is \f(CW\*(C`GB2312\*(C'\fR.
634	.PP
635	.Vb 1
636	\& KS_C_5601-1987
637	.Ve
638	.PP
639	is heavily misused.
640	See \*(L"Microsoft\-related naming mess\*(R" for details.
641	.PP
642	\&\f(CW\*(C`KS_C_5601\-1987\*(C'\fR \fIraw\fR encoding is available as \f(CW\*(C`ksc5601\-raw\*(C'\fR
643	with Encode. See Encode::KR for details.
644	.PP
645	.Vb 1
646	\& UTF-16 UTF-16BE UTF-16LE
647	.Ve
648	.PP
649	are IANA-registered \f(CW\*(C`charset\*(C'\fRs. See [\s-1RFC\s0 2781] for details.
650	Jungshik Shin reports that \s-1UTF\-16\s0 with a \s-1BOM\s0 is well accepted
651	by \s-1MS\s0 \s-1IE\s0 5/6 and \s-1NS\s0 4/6. Beware however that
652	.IP "\(bu" 4
653	\&\f(CW\*(C`UTF\-16\*(C'\fR support in any software you're going to be
654	using/interoperating with has probably been less tested
655	than \f(CW\*(C`UTF\-8\*(C'\fR support
656	.IP "\(bu" 4
657	\&\f(CW\*(C`UTF\-8\*(C'\fR coded data seamlessly passes traditional
658	command piping (\f(CW\*(C`cat\*(C'\fR, \f(CW\*(C`more\*(C'\fR, etc.) while \f(CW\*(C`UTF\-16\*(C'\fR coded
659	data is likely to cause confusion (with its zero bytes,
660	for example)
661	.IP "\(bu" 4
662	it is beyond the power of words to describe the way \s-1HTML\s0 browsers
663	encode non\-\f(CW\*(C`ASCII\*(C'\fR form data. To get a general impression, visit
664	<http://ppewww.ph.gla.ac.uk/~flavell/charset/form\-i18n.html>.
665	While encoding of form data has stabilized for \f(CW\*(C`UTF\-8\*(C'\fR encoded pages
666	(at least \s-1IE\s0 5/6, \s-1NS\s0 6, and Opera 6 behave consistently), be sure to
667	expect fun (and cross-browser discrepancies) with \f(CW\*(C`UTF\-16\*(C'\fR encoded
668	pages!
669	.PP
670	The rule of thumb is to use \f(CW\*(C`UTF\-8\*(C'\fR unless you know what
671	you're doing and unless you really benefit from using \f(CW\*(C`UTF\-16\*(C'\fR.
672	.PP
673	.Vb 5
674	\& ISO-IR-165 [RFC1345]
675	\& VISCII
676	\& GB 12345
677	\& GB 18030 (**) (see links below)
678	\& EUC-TW (**)
679	.Ve
680	.PP
681	are totally valid encodings but not registered at \s-1IANA\s0.
682	The names under which they are listed here are probably the
683	most widely-known names for these encodings and are recommended
684	names.
685	.PP
686	.Vb 1
687	\& BIG5PLUS (**)
688	.Ve
689	.PP
690	is a proprietary name.
691	.Sh "Microsoft-related naming mess"
692	.IX Subsection "Microsoft-related naming mess"
693	Microsoft products misuse the following names:
694	.IP "\s-1KS_C_5601\-1987\s0" 4
695	.IX Item "KS_C_5601-1987"
696	Microsoft extension to \f(CW\*(C`EUC\-KR\*(C'\fR.
697	.Sp
698	Proper names: \f(CW\*(C`CP949\*(C'\fR, \f(CW\*(C`UHC\*(C'\fR, \f(CW\*(C`x\-windows\-949\*(C'\fR (as used by Mozilla).
699	.Sp
700	See <http://lists.w3.org/Archives/Public/ietf\-charsets/2001AprJun/0033.html>
701	for details.
702	.Sp
703	Encode aliases \f(CW\*(C`KS_C_5601\-1987\*(C'\fR to \f(CW\*(C`cp949\*(C'\fR to reflect this common
704	misusage. \fIRaw\fR \f(CW\*(C`KS_C_5601\-1987\*(C'\fR encoding is available as
705	\&\f(CW\*(C`ksc5601\-raw\*(C'\fR.
706	.Sp
707	See Encode::KR for details.
708	.IP "\s-1GB2312\s0" 4
709	.IX Item "GB2312"
710	Microsoft extension to \f(CW\*(C`EUC\-CN\*(C'\fR.
711	.Sp
712	Proper names: \f(CW\*(C`CP936\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR.
713	.Sp
714	\&\f(CW\*(C`GB2312\*(C'\fR has been registered in the \f(CW\*(C`EUC\-CN\*(C'\fR meaning at
715	\&\s-1IANA\s0. This has partially repaired the situation: Microsoft's
716	\&\f(CW\*(C`GB2312\*(C'\fR has become a superset of the official \f(CW\*(C`GB2312\*(C'\fR.
717	.Sp
718	Encode aliases \f(CW\*(C`GB2312\*(C'\fR to \f(CW\*(C`euc\-cn\*(C'\fR in full agreement with
719	\&\s-1IANA\s0 registration. \f(CW\*(C`cp936\*(C'\fR is supported separately.
720	\&\fIRaw\fR \f(CW\*(C`GB_2312\-80\*(C'\fR encoding is available as \f(CW\*(C`gb2312\-raw\*(C'\fR.
721	.Sp
722	See Encode::CN for details.
723	.IP "Big5" 4
724	.IX Item "Big5"
725	Microsoft extension to \f(CW\*(C`Big5\*(C'\fR.
726	.Sp
727	Proper name: \f(CW\*(C`CP950\*(C'\fR.
728	.Sp
729	Encode separately supports \f(CW\*(C`Big5\*(C'\fR and \f(CW\*(C`cp950\*(C'\fR.
730	.IP "Shift_JIS" 4
731	.IX Item "Shift_JIS"
732	Microsoft's understanding of \f(CW\*(C`Shift_JIS\*(C'\fR.
733	.Sp
734	\&\s-1JIS\s0 has not endorsed the full Microsoft standard however.
735	The official \f(CW\*(C`Shift_JIS\*(C'\fR includes only \s-1JIS\s0 X 0201 and \s-1JIS\s0 X 0208
736	character sets, while Microsoft has always used \f(CW\*(C`Shift_JIS\*(C'\fR
737	to encode a wider character repertoire. See \f(CW\*(C`IANA\*(C'\fR registration for
738	\&\f(CW\*(C`Windows\-31J\*(C'\fR.
739	.Sp
740	As a historical predecessor, Microsoft's variant
741	probably has more rights for the name, though it may be objected
742	that Microsoft shouldn't have used \s-1JIS\s0 as part of the name
743	in the first place.
744	.Sp
745	Unambiguous name: \f(CW\*(C`CP932\*(C'\fR. \f(CW\*(C`IANA\*(C'\fR name (also used by Mozilla, and
746	provided as an alias by Encode): \f(CW\*(C`Windows\-31J\*(C'\fR.
747	.Sp
748	Encode separately supports \f(CW\*(C`Shift_JIS\*(C'\fR and \f(CW\*(C`cp932\*(C'\fR.
749	.SH "Glossary"
750	.IX Header "Glossary"
751	.IP "character repertoire" 4
752	.IX Item "character repertoire"
753	A collection of unique characters. A \fIcharacter\fR set in the strictest
754	sense. At this stage, characters are not numbered.
755	.IP "coded character set (\s-1CCS\s0)" 4
756	.IX Item "coded character set (CCS)"
757	A character set that is mapped in a way computers can use directly.
758	Many character encodings, including \s-1EUC\s0, fall in this category.
759	.IP "character encoding scheme (\s-1CES\s0)" 4
760	.IX Item "character encoding scheme (CES)"
761	An algorithm to map a character set to a byte sequence. You don't
762	have to be able to tell which character set a given byte sequence
763	belongs to. 7\-bit \s-1ISO\-2022\s0 is a \s-1CES\s0 but it cannot be a \s-1CCS\s0. \s-1EUC\s0 is an
764	example of being both a \s-1CCS\s0 and \s-1CES\s0.
765	.IP "charset (in \s-1MIME\s0 context)" 4
766	.IX Item "charset (in MIME context)"
767	has long been used in the meaning of \f(CW\*(C`encoding\*(C'\fR, \s-1CES\s0.
768	.Sp
769	While the word combination \f(CW\*(C`character set\*(C'\fR has lost this meaning
770	in \s-1MIME\s0 context since [\s-1RFC\s0 2130], the \f(CW\*(C`charset\*(C'\fR abbreviation has
771	retained it. This is how [\s-1RFC\s0 2277] and [\s-1RFC\s0 2278] bless \f(CW\*(C`charset\*(C'\fR:
772	.Sp
773	.Vb 7
774	\& This document uses the term "charset" to mean a set of rules for
775	\& mapping from a sequence of octets to a sequence of characters, such
776	\& as the combination of a coded character set and a character encoding
777	\& scheme; this is also what is used as an identifier in MIME "charset="
778	\& parameters, and registered in the IANA charset registry ... (Note
779	\& that this is NOT a term used by other standards bodies, such as ISO).
780	\& [RFC 2277]
781	.Ve
782	.IP "\s-1EUC\s0" 4
783	.IX Item "EUC"
784	Extended Unix Code. See \s-1ISO\-2022\s0.
785	.IP "\s-1ISO\-2022\s0" 4
786	.IX Item "ISO-2022"
787	A \s-1CES\s0 that was carefully designed to coexist with \s-1ASCII\s0. There are a 7
788	bit version and an 8 bit version.
789	.Sp
790	The 7 bit version switches character set via escape sequence so it
791	cannot form a \s-1CCS\s0. Since this is more difficult to handle in programs
792	than the 8 bit version, the 7 bit version is not very popular except for
793	iso\-2022\-jp, the \fIde facto\fR standard \s-1CES\s0 for e\-mails.
794	.Sp
795	The 8 bit version can form a \s-1CCS\s0. \s-1EUC\s0 and \s-1ISO\-8859\s0 are two examples
796	thereof. Pre\-5.6 perl could use them as string literals.
797	.IP "\s-1UCS\s0" 4
798	.IX Item "UCS"
799	Short for \fIUniversal Character Set\fR. When you say just \s-1UCS\s0, it means
800	\&\fIUnicode\fR.
801	.IP "\s-1UCS\-2\s0" 4
802	.IX Item "UCS-2"
803	\&\s-1ISO/IEC\s0 10646 encoding form: Universal Character Set coded in two
804	octets.
805	.IP "Unicode" 4
806	.IX Item "Unicode"
807	A character set that aims to include all character repertoires of the
808	world. Many character sets in various national as well as industrial
809	standards have become, in a way, just subsets of Unicode.
810	.IP "\s-1UTF\s0" 4
811	.IX Item "UTF"
812	Short for \fIUnicode Transformation Format\fR. Determines how to map a
813	Unicode character into a byte sequence.
814	.IP "\s-1UTF\-16\s0" 4
815	.IX Item "UTF-16"
816	A \s-1UTF\s0 in 16\-bit encoding. Can be in either big endian or little
817	endian. The big endian version is called \s-1UTF\-16BE\s0 (equal to \s-1UCS\-2\s0 +
818	surrogate support) and the little endian version is called \s-1UTF\-16LE\s0.
819	.SH "See Also"
820	.IX Header "See Also"
821	Encode,
822	Encode::Byte,
823	Encode::CN, Encode::JP, Encode::KR, Encode::TW,
824	Encode::EBCDIC, Encode::Symbol,
825	Encode::MIME::Header, Encode::Guess
826	.SH "References"
827	.IX Header "References"
828	.IP "\s-1ECMA\s0" 4
829	.IX Item "ECMA"
830	European Computer Manufacturers Association
831	<http://www.ecma.ch>
832	.RS 4
833	.ie n .IP "\s-1ECMA\-035\s0 (eq ""ISO\-2022"")" 4
834	.el .IP "\s-1ECMA\-035\s0 (eq \f(CWISO\-2022\fR)" 4
835	.IX Item "ECMA-035 (eq ISO-2022)"
836	<http://www.ecma.ch/ecma1/STAND/ECMA\-035.HTM>
837	.Sp
838	The specification of \s-1ISO\-2022\s0 is available from the link above.
839	.RE
840	.RS 4
841	.RE
842	.IP "\s-1IANA\s0" 4
843	.IX Item "IANA"
844	Internet Assigned Numbers Authority
845	<http://www.iana.org/>
846	.RS 4
847	.IP "Assigned Charset Names by \s-1IANA\s0" 4
848	.IX Item "Assigned Charset Names by IANA"
849	<http://www.iana.org/assignments/character\-sets>
850	.Sp
851	Most of the \f(CW\*(C`canonical names\*(C'\fR in Encode derive from this list
852	so you can directly apply the string you have extracted from \s-1MIME\s0
853	header of mails and web pages.
854	.RE
855	.RS 4
856	.RE
857	.IP "\s-1ISO\s0" 4
858	.IX Item "ISO"
859	International Organization for Standardization
860	<http://www.iso.ch/>
861	.IP "\s-1RFC\s0" 4
862	.IX Item "RFC"
863	Request For Comments \*(-- need I say more?
864	<http://www.rfc\-editor.org/>, <http://www.rfc.net/>,
865	<http://www.faqs.org/rfcs/>
866	.IP "\s-1UC\s0" 4
867	.IX Item "UC"
868	Unicode Consortium
869	<http://www.unicode.org/>
870	.RS 4
871	.IP "Unicode Glossary" 4
872	.IX Item "Unicode Glossary"
873	<http://www.unicode.org/glossary/>
874	.Sp
875	The glossary of this document is based upon this site.
876	.RE
877	.RS 4
878	.RE
879	.Sh "Other Notable Sites"
880	.IX Subsection "Other Notable Sites"
881	.IP "czyborra.com" 4
882	.IX Item "czyborra.com"
883	<http://czyborra.com/>
884	.Sp
885	Contains a lot of useful information, especially gory details of \s-1ISO\s0
886	vs. vendor mappings.
887	.IP "\s-1CJK\s0.inf" 4
888	.IX Item "CJK.inf"
889	<http://www.oreilly.com/people/authors/lunde/cjk_inf.html>
890	.Sp
891	Somewhat obsolete (last update in 1996), but still useful. Also try
892	.Sp
893	<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
894	.Sp
895	You will find brief info on \f(CW\*(C`EUC\-CN\*(C'\fR, \f(CW\*(C`GBK\*(C'\fR and mostly on \f(CW\*(C`GB 18030\*(C'\fR.
896	.IP "Jungshik Shin's Hangul \s-1FAQ\s0" 4
897	.IX Item "Jungshik Shin's Hangul FAQ"
898	<http://jshin.net/faq>
899	.Sp
900	And especially its subject 8.
901	.Sp
902	<http://jshin.net/faq/qa8.html>
903	.Sp
904	A comprehensive overview of the Korean (\f(CW\*(C`KS *\*(C'\fR) standards.
905	.ie n .IP "debian.org: ""Introduction to i18n""" 4
906	.el .IP "debian.org: ``Introduction to i18n''" 4
907	.IX Item "debian.org: Introduction to i18n"
908	A brief description for most of the mentioned \s-1CJK\s0 encodings is
909	contained in
910	<http://www.debian.org/doc/manuals/intro\-i18n/ch\-codes.en.html>
911	.Sh "Offline sources"
912	.IX Subsection "Offline sources"
913	.ie n .IP """CJKV Information Processing"" by Ken Lunde" 4
914	.el .IP "\f(CWCJKV Information Processing\fR by Ken Lunde" 4
915	.IX Item "CJKV Information Processing by Ken Lunde"
916	\&\s-1CJKV\s0 Information Processing
917	1999 O'Reilly & Associates, \s-1ISBN\s0 : 1\-56592\-224\-7
918	.Sp
919	The modern successor of \f(CW\*(C`CJK.inf\*(C'\fR.
920	.Sp
921	Features a comprehensive coverage of \s-1CJKV\s0 character sets and
922	encodings along with many other issues faced by anyone trying
923	to better support \s-1CJKV\s0 languages/scripts in all the areas of
924	information processing.
925	.Sp
926	To purchase this book, visit
927	<http://www.oreilly.com/catalog/cjkvinfo/>
928	or your favourite bookstore.