git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "Encode 3"
	132	.TH Encode 3 "2001-09-21" "perl v5.8.8" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	Encode \- character encodings
	135	.SH "SYNOPSIS"
	136	.IX Header "SYNOPSIS"
	137	.Vb 1
	138	\& use Encode;
	139	.Ve
	140	.Sh "Table of Contents"
	141	.IX Subsection "Table of Contents"
	142	Encode consists of a collection of modules whose details are too big
	143	to fit in one document. This \s-1POD\s0 itself explains the top-level APIs
	144	and general topics at a glance. For other topics and more details,
	145	see the PODs below:
	146	.PP
	147	.Vb 10
	148	\& Name Description
	149	\& --------------------------------------------------------
	150	\& Encode::Alias Alias definitions to encodings
	151	\& Encode::Encoding Encode Implementation Base Class
	152	\& Encode::Supported List of Supported Encodings
	153	\& Encode::CN Simplified Chinese Encodings
	154	\& Encode::JP Japanese Encodings
	155	\& Encode::KR Korean Encodings
	156	\& Encode::TW Traditional Chinese Encodings
	157	\& --------------------------------------------------------
	158	.Ve
	159	.SH "DESCRIPTION"
	160	.IX Header "DESCRIPTION"
	161	The \f(CW\*(C`Encode\*(C'\fR module provides the interfaces between Perl's strings
	162	and the rest of the system. Perl strings are sequences of
	163	\&\fBcharacters\fR.
	164	.PP
	165	The repertoire of characters that Perl can represent is at least that
	166	defined by the Unicode Consortium. On most platforms the ordinal
	167	value of a character (as returned by \f(CW\*(C`ord(ch)\*(C'\fR) is the \*(L"Unicode
	168	codepoint\*(R" for the character (the exceptions are those platforms where
	169	the legacy encoding is some variant of \s-1EBCDIC\s0 rather than a super-set
	170	of \s-1ASCII\s0 \- see perlebcdic).
	171	.PP
	172	Traditionally, computer data has been moved around in 8\-bit chunks
	173	often called \*(L"bytes\*(R". These chunks are also known as \*(L"octets\*(R" in
	174	networking standards. Perl is widely used to manipulate data of many
	175	types \- not only strings of characters representing human or computer
	176	languages but also \*(L"binary\*(R" data being the machine's representation of
	177	numbers, pixels in an image \- or just about anything.
	178	.PP
	179	When Perl is processing \*(L"binary data\*(R", the programmer wants Perl to
	180	process \*(L"sequences of bytes\*(R". This is not a problem for Perl \- as a
	181	byte has 256 possible values, it easily fits in Perl's much larger
	182	\&\*(L"logical character\*(R".
	183	.Sh "\s-1TERMINOLOGY\s0"
	184	.IX Subsection "TERMINOLOGY"
	185	.IP "\(bu" 2
	186	\&\fIcharacter\fR: a character in the range 0..(2**32\-1) (or more).
	187	(What Perl's strings are made of.)
	188	.IP "\(bu" 2
	189	\&\fIbyte\fR: a character in the range 0..255
	190	(A special case of a Perl character.)
	191	.IP "\(bu" 2
	192	\&\fIoctet\fR: 8 bits of data, with ordinal values 0..255
	193	(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
	194	.SH "PERL ENCODING API"
	195	.IX Header "PERL ENCODING API"
	196	.ie n .IP "$octets = encode(\s-1ENCODING\s0, $string [, \s-1CHECK\s0])" 2
	197	.el .IP "$octets = encode(\s-1ENCODING\s0, \f(CW$string\fR [, \s-1CHECK\s0])" 2
	198	.IX Item "$octets = encode(ENCODING, $string [, CHECK])"
	199	Encodes a string from Perl's internal form into \fI\s-1ENCODING\s0\fR and returns
	200	a sequence of octets. \s-1ENCODING\s0 can be either a canonical name or
	201	an alias. For encoding names and aliases, see \*(L"Defining Aliases\*(R".
	202	For \s-1CHECK\s0, see \*(L"Handling Malformed Data\*(R".
	203	.Sp
	204	For example, to convert a string from Perl's internal format to
	205	iso\-8859\-1 (also known as Latin1),
	206	.Sp
	207	.Vb 1
	208	\& $octets = encode("iso-8859-1", $string);
	209	.Ve
	210	.Sp
	211	\&\fB\s-1CAVEAT\s0\fR: When you run \f(CW\*(C`$octets = encode("utf8", $string)\*(C'\fR, then \f(CW$octets\fR
	212	\&\fBmay not be equal to\fR \f(CW$string\fR. Though they both contain the same data, the utf8 flag
	213	for \f(CW$octets\fR is \fBalways\fR off. When you encode anything, the utf8 flag of
	214	the result is always off, even when it contains a completely valid utf8
	215	string. See \*(L"The \s-1UTF\-8\s0 flag\*(R" below.
	216	.Sp
	217	If the \f(CW$string\fR is \f(CW\*(C`undef\*(C'\fR then \f(CW\*(C`undef\*(C'\fR is returned.
	218	.ie n .IP "$string = decode(\s-1ENCODING\s0, $octets [, \s-1CHECK\s0])" 2
	219	.el .IP "$string = decode(\s-1ENCODING\s0, \f(CW$octets\fR [, \s-1CHECK\s0])" 2
	220	.IX Item "$string = decode(ENCODING, $octets [, CHECK])"
	221	Decodes a sequence of octets assumed to be in \fI\s-1ENCODING\s0\fR into Perl's
	222	internal form and returns the resulting string. As in \fIencode()\fR,
	223	\&\s-1ENCODING\s0 can be either a canonical name or an alias. For encoding names
	224	and aliases, see \*(L"Defining Aliases\*(R". For \s-1CHECK\s0, see
	225	\&\*(L"Handling Malformed Data\*(R".
	226	.Sp
	227	For example, to convert \s-1ISO\-8859\-1\s0 data to a string in Perl's internal format:
	228	.Sp
	229	.Vb 1
	230	\& $string = decode("iso-8859-1", $octets);
	231	.Ve
	232	.Sp
	233	\&\fB\s-1CAVEAT\s0\fR: When you run \f(CW\*(C`$string = decode("utf8", $octets)\*(C'\fR, then \f(CW$string\fR
	234	\&\fBmay not be equal to\fR \f(CW$octets\fR. Though they both contain the same data,
	235	the utf8 flag for \f(CW$string\fR is on unless \f(CW$octets\fR entirely consists of
	236	\&\s-1ASCII\s0 data (or \s-1EBCDIC\s0 on \s-1EBCDIC\s0 machines). See \*(L"The \s-1UTF\-8\s0 flag\*(R"
	237	below.
	238	.Sp
	239	If the \f(CW$octets\fR is \f(CW\*(C`undef\*(C'\fR then \f(CW\*(C`undef\*(C'\fR is returned.
	240	.IP "[$length =] from_to($octets, \s-1FROM_ENC\s0, \s-1TO_ENC\s0 [, \s-1CHECK\s0])" 2
	241	.IX Item "[$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])"
	242	Converts \fBin-place\fR data between two encodings. The data in \f(CW$octets\fR
	243	must be encoded as octets and not as characters in Perl's internal
	244	format. For example, to convert \s-1ISO\-8859\-1\s0 data to Microsoft's \s-1CP1250\s0
	245	encoding:
	246	.Sp
	247	.Vb 1
	248	\& from_to($octets, "iso-8859-1", "cp1250");
	249	.Ve
	250	.Sp
	251	and to convert it back:
	252	.Sp
	253	.Vb 1
	254	\& from_to($octets, "cp1250", "iso-8859-1");
	255	.Ve
	256	.Sp
	257	Note that because the conversion happens in place, the data to be
	258	converted cannot be a string constant; it must be a scalar variable.
	259	.Sp
	260	\&\fIfrom_to()\fR returns the length of the converted string in octets on
	261	success, \fIundef\fR on error.
	262	.Sp
	263	\&\fB\s-1CAVEAT\s0\fR: The following operations look the same but are not quite the same:
	264	.Sp
	265	.Vb 2
	266	\& from_to($data, "iso-8859-1", "utf8"); #1
	267	\& $data = decode("iso-8859-1", $data); #2
	268	.Ve
	269	.Sp
	270	Both #1 and #2 make \f(CW$data\fR consist of a completely valid \s-1UTF\-8\s0 string
	271	but only #2 turns the utf8 flag on. #1 is equivalent to
	272	.Sp
	273	.Vb 1
	274	\& $data = encode("utf8", decode("iso-8859-1", $data));
	275	.Ve
	276	.Sp
	277	See \*(L"The \s-1UTF\-8\s0 flag\*(R" below.
	278	.IP "$octets = encode_utf8($string);" 2
	279	.IX Item "$octets = encode_utf8($string);"
	280	Equivalent to \f(CW\*(C`$octets = encode("utf8", $string);\*(C'\fR The characters
	281	that comprise \f(CW$string\fR are encoded in Perl's internal format and the
	282	result is returned as a sequence of octets. All possible
	283	characters have a \s-1UTF\-8\s0 representation so this function cannot fail.
	284	.IP "$string = decode_utf8($octets [, \s-1CHECK\s0]);" 2
	285	.IX Item "$string = decode_utf8($octets [, CHECK]);"
	286	Equivalent to \f(CW\*(C`$string = decode("utf8", $octets [, CHECK])\*(C'\fR.
	287	The sequence of octets represented by
	288	\&\f(CW$octets\fR is decoded from \s-1UTF\-8\s0 into a sequence of logical
	289	characters. Not all sequences of octets form valid \s-1UTF\-8\s0 encodings, so
	290	it is possible for this call to fail. For \s-1CHECK\s0, see
	291	\&\*(L"Handling Malformed Data\*(R".
	292	.Sh "Listing available encodings"
	293	.IX Subsection "Listing available encodings"
	294	.Vb 2
	295	\& use Encode;
	296	\& @list = Encode->encodings();
	297	.Ve
	298	.PP
	299	Returns a list of the canonical names of the available encodings that
	300	are loaded. To get a list of all available encodings including the
	301	ones that are not loaded yet, say
	302	.PP
	303	.Vb 1
	304	\& @all_encodings = Encode->encodings(":all");
	305	.Ve
	306	.PP
	307	Or you can give the name of a specific module.
	308	.PP
	309	.Vb 1
	310	\& @with_jp = Encode->encodings("Encode::JP");
	311	.Ve
	312	.PP
	313	When \*(L"::\*(R" is not in the name, \*(L"Encode::\*(R" is assumed.
	314	.PP
	315	.Vb 1
	316	\& @ebcdic = Encode->encodings("EBCDIC");
	317	.Ve
	318	.PP
	319	To find out in detail which encodings are supported by this package,
	320	see Encode::Supported.
	321	.Sh "Defining Aliases"
	322	.IX Subsection "Defining Aliases"
	323	To add a new alias to a given encoding, use:
	324	.PP
	325	.Vb 3
	326	\& use Encode;
	327	\& use Encode::Alias;
	328	\& define_alias(newName => ENCODING);
	329	.Ve
	330	.PP
	331	After that, newName can be used as an alias for \s-1ENCODING\s0.
	332	\&\s-1ENCODING\s0 may be either the name of an encoding or an
	333	\&\fIencoding object\fR.
	334	.PP
	335	But before you do so, make sure the alias is nonexistent with
	336	\&\f(CW\*(C`resolve_alias()\*(C'\fR, which returns the canonical name thereof.
	337	i.e.
	338	.PP
	339	.Vb 3
	340	\& Encode::resolve_alias("latin1") eq "iso-8859-1" # true
	341	\& Encode::resolve_alias("iso-8859-12") # false; nonexistent
	342	\& Encode::resolve_alias($name) eq $name # true if $name is canonical
	343	.Ve
	344	.PP
	345	\&\fIresolve_alias()\fR does not need \f(CW\*(C`use Encode::Alias\*(C'\fR; it can be
	346	exported via \f(CW\*(C`use Encode qw(resolve_alias)\*(C'\fR.
	347	.PP
	348	See Encode::Alias for details.
	349	.SH "Encoding via PerlIO"
	350	.IX Header "Encoding via PerlIO"
	351	If your perl supports \fIPerlIO\fR (which is the default), you can use a PerlIO layer to decode
	352	and encode directly via a filehandle. The following two examples
	353	are totally identical in their functionality.
	354	.PP
	355	.Vb 4
	356	\& # via PerlIO
	357	\& open my $in, "<:encoding(shiftjis)", $infile or die;
	358	\& open my $out, ">:encoding(euc-jp)", $outfile or die;
	359	\& while(<$in>){ print $out $_; }
	360	.Ve
	361	.PP
	362	.Vb 7
	363	\& # via from_to
	364	\& open my $in, "<", $infile or die;
	365	\& open my $out, ">", $outfile or die;
	366	\& while(<$in>){
	367	\& from_to($_, "shiftjis", "euc-jp", 1);
	368	\& print $out $_;
	369	\& }
	370	.Ve
	371	.PP
	372	Unfortunately, not all encodings are PerlIO\-savvy. You can check
	373	if your encoding is supported by PerlIO by calling the \f(CW\*(C`perlio_ok\*(C'\fR
	374	method.
	375	.PP
	376	.Vb 2
	377	\& Encode::perlio_ok("hz"); # False
	378	\& find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
	379	.Ve
	380	.PP
	381	.Vb 2
	382	\& use Encode qw(perlio_ok); # exported upon request
	383	\& perlio_ok("euc-jp")
	384	.Ve
	385	.PP
	386	Fortunately, all encodings that come with Encode core are PerlIO-savvy
	387	except for hz and ISO\-2022\-kr. For gory details, see
	388	Encode::Encoding and Encode::PerlIO.
	389	.SH "Handling Malformed Data"
	390	.IX Header "Handling Malformed Data"
	391	The optional \fI\s-1CHECK\s0\fR argument tells Encode what to do when it
	392	encounters malformed data. Without \s-1CHECK\s0, Encode::FB_DEFAULT ( == 0 )
	393	is assumed.
	394	.PP
	395	As of version 2.12 Encode supports coderef values for \s-1CHECK\s0. See below.
	396	.IP "\fB\s-1NOTE:\s0\fR Not all encodings support this feature" 2
	397	.IX Item "NOTE: Not all encodings support this feature"
	398	Some encodings ignore the \fI\s-1CHECK\s0\fR argument. For example,
	399	Encode::Unicode ignores \fI\s-1CHECK\s0\fR and it always croaks on error.
	400	.PP
	401	Now here is the list of \fI\s-1CHECK\s0\fR values available
	402	.IP "\fI\s-1CHECK\s0\fR = Encode::FB_DEFAULT ( == 0)" 2
	403	.IX Item "CHECK = Encode::FB_DEFAULT ( == 0)"
	404	If \fI\s-1CHECK\s0\fR is 0, (en|de)code will put a \fIsubstitution character\fR in
	405	place of a malformed character. When you encode, <subchar>
	406	will be used. When you decode the code point \f(CW0xFFFD\fR is used. If
	407	the data is supposed to be \s-1UTF\-8\s0, an optional lexical warning
	408	(category utf8) is given.
	409	.IP "\fI\s-1CHECK\s0\fR = Encode::FB_CROAK ( == 1)" 2
	410	.IX Item "CHECK = Encode::FB_CROAK ( == 1)"
	411	If \fI\s-1CHECK\s0\fR is 1, methods will die on error immediately with an error
	412	message. Therefore, when \fI\s-1CHECK\s0\fR is set to 1, you should trap the
	413	error with eval{} unless you really want to let it die.
	414	.IP "\fI\s-1CHECK\s0\fR = Encode::FB_QUIET" 2
	415	.IX Item "CHECK = Encode::FB_QUIET"
	416	If \fI\s-1CHECK\s0\fR is set to Encode::FB_QUIET, (en|de)code will immediately
	417	return the portion of the data that has been processed so far when an
	418	error occurs. The data argument will be overwritten with everything
	419	after that point (that is, the unprocessed part of data). This is
	420	handy when you have to call decode repeatedly in the case where your
	421	source data may contain partial multi-byte character sequences,
	422	(i.e. you are reading with a fixed-width buffer). Here is a sample
	423	code that does exactly this:
	424	.Sp
	425	.Vb 5
	426	\& my $buffer = ''; my $string = '';
	427	\& while(read $fh, $buffer, 256, length($buffer)){
	428	\& $string .= decode($encoding, $buffer, Encode::FB_QUIET);
	429	\& # $buffer now contains the unprocessed partial character
	430	\& }
	431	.Ve
	432	.IP "\fI\s-1CHECK\s0\fR = Encode::FB_WARN" 2
	433	.IX Item "CHECK = Encode::FB_WARN"
	434	This is the same as above, except that it warns on error. Handy when
	435	you are debugging the mode above.
	436	.IP "perlqq mode (\fI\s-1CHECK\s0\fR = Encode::FB_PERLQQ)" 2
	437	.IX Item "perlqq mode (CHECK = Encode::FB_PERLQQ)"
	438	.PD 0
	439	.IP "\s-1HTML\s0 charref mode (\fI\s-1CHECK\s0\fR = Encode::FB_HTMLCREF)" 2
	440	.IX Item "HTML charref mode (CHECK = Encode::FB_HTMLCREF)"
	441	.IP "\s-1XML\s0 charref mode (\fI\s-1CHECK\s0\fR = Encode::FB_XMLCREF)" 2
	442	.IX Item "XML charref mode (CHECK = Encode::FB_XMLCREF)"
	443	.PD
	444	For encodings that are implemented by Encode::XS, \s-1CHECK\s0 ==
	445	Encode::FB_PERLQQ turns (en|de)code into \f(CW\*(C`perlqq\*(C'\fR fallback mode.
	446	.Sp
	447	When you decode, \f(CW\*(C`\ex\f(CI\s-1HH\s0\f(CW\*(C'\fR will be inserted for a malformed character,
	448	where \fI\s-1HH\s0\fR is the hex representation of the octet that could not be
	449	decoded to utf8. And when you encode, \f(CW\*(C`\ex{\f(CI\s-1HHHH\s0\f(CW}\*(C'\fR will be inserted,
	450	where \fI\s-1HHHH\s0\fR is the Unicode \s-1ID\s0 of the character that cannot be found
	451	in the character repertoire of the encoding.
	452	.Sp
	453	\&\s-1HTML/XML\s0 character reference modes are about the same; in place of
	454	\&\f(CW\*(C`\ex{\f(CI\s-1HHHH\s0\f(CW}\*(C'\fR, \s-1HTML\s0 uses \f(CW\*(C`&#\f(CI\s-1NNN\s0\f(CW;\*(C'\fR where \fI\s-1NNN\s0\fR is a decimal number and
	455	\&\s-1XML\s0 uses \f(CW\*(C`&#x\f(CI\s-1HHHH\s0\f(CW;\*(C'\fR where \fI\s-1HHHH\s0\fR is the hexadecimal number.
	456	.Sp
	457	In Encode 2.10 or later, \f(CW\*(C`LEAVE_SRC\*(C'\fR is also implied.
	458	.IP "The bitmask" 2
	459	.IX Item "The bitmask"
	460	These modes are actually set via a bitmask. Here is how the \s-1FB_XX\s0
	461	constants are laid out. You can import the \s-1FB_XX\s0 constants via
	462	\&\f(CW\*(C`use Encode qw(:fallbacks)\*(C'\fR; you can import the generic bitmask
	463	constants via \f(CW\*(C`use Encode qw(:fallback_all)\*(C'\fR.
	464	.Sp
	465	.Vb 8
	466	\&                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
	467	\& DIE_ON_ERR    0x0001               X
	468	\& WARN_ON_ERR   0x0002                                 X
	469	\& RETURN_ON_ERR 0x0004                        X        X
	470	\& LEAVE_SRC     0x0008                                           X
	471	\& PERLQQ        0x0100                                           X
	472	\& HTMLCREF      0x0200
	473	\& XMLCREF       0x0400
	474	.Ve
	475	.Sh "coderef for \s-1CHECK\s0"
	476	.IX Subsection "coderef for CHECK"
	477	As of Encode 2.12 \s-1CHECK\s0 can also be a code reference which takes the
	478	ord value of the unmapped character as an argument and returns a string
	479	that represents the fallback character. For instance,
	480	.PP
	481	.Vb 1
	482	\& $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
	483	.Ve
	484	.PP
	485	Acts like \s-1FB_PERLQQ\s0 but <U+\fI\s-1XXXX\s0\fR> is used instead of
	486	\&\ex{\fI\s-1XXXX\s0\fR}.
	487	.SH "Defining Encodings"
	488	.IX Header "Defining Encodings"
	489	To define a new encoding, use:
	490	.PP
	491	.Vb 2
	492	\& use Encode qw(define_encoding);
	493	\& define_encoding($object, 'canonicalName' [, alias...]);
	494	.Ve
	495	.PP
	496	\&\fIcanonicalName\fR will be associated with \fI$object\fR. The object
	497	should provide the interface described in Encode::Encoding.
	498	If more than two arguments are provided then additional
	499	arguments are taken as aliases for \fI$object\fR.
	500	.PP
	501	See Encode::Encoding for more details.
	502	.SH "The UTF\-8 flag"
	503	.IX Header "The UTF-8 flag"
	504	Before the introduction of utf8 support in perl, the \f(CW\*(C`eq\*(C'\fR operator
	505	just compared the strings represented by two scalars. Beginning with
	506	perl 5.8, \f(CW\*(C`eq\*(C'\fR compares two strings with simultaneous consideration
	507	of \fIthe utf8 flag\fR. To explain why we made it so, I will quote page
	508	402 of \f(CW\*(C`Programming Perl, 3rd ed.\*(C'\fR
	509	.IP "Goal #1:" 2
	510	.IX Item "Goal #1:"
	511	Old byte-oriented programs should not spontaneously break on the old
	512	byte-oriented data they used to work on.
	513	.IP "Goal #2:" 2
	514	.IX Item "Goal #2:"
	515	Old byte-oriented programs should magically start working on the new
	516	character-oriented data when appropriate.
	517	.IP "Goal #3:" 2
	518	.IX Item "Goal #3:"
	519	Programs should run just as fast in the new character-oriented mode
	520	as in the old byte-oriented mode.
	521	.IP "Goal #4:" 2
	522	.IX Item "Goal #4:"
	523	Perl should remain one language, rather than forking into a
	524	byte-oriented Perl and a character-oriented Perl.
	525	.PP
	526	Back when \f(CW\*(C`Programming Perl, 3rd ed.\*(C'\fR was written, not even Perl 5.6.0
	527	was born and many features documented in the book remained
	528	unimplemented for a long time. Perl 5.8 corrected this and the introduction
	529	of the \s-1UTF\-8\s0 flag is one of them. You can think of this perl notion as a
	530	byte-oriented mode (utf8 flag off) and a character-oriented mode (utf8
	531	flag on).
	532	.PP
	533	Here is how Encode takes care of the utf8 flag.
	534	.IP "\(bu" 2
	535	When you encode, the resulting utf8 flag is always off.
	536	.IP "\(bu" 2
	537	When you decode, the resulting utf8 flag is on unless you can
	538	unambiguously represent data. Here is the definition of
	539	dis\-ambiguity.
	540	.Sp
	541	After \f(CW\*(C`$utf8 = decode('foo', $octet);\*(C'\fR,
	542	.Sp
	543	.Vb 6
	544	\& When $octet is... The utf8 flag in $utf8 is
	545	\& ---------------------------------------------
	546	\& In ASCII only (or EBCDIC only) OFF
	547	\& In ISO-8859-1 ON
	548	\& In any other Encoding ON
	549	\& ---------------------------------------------
	550	.Ve
	551	.Sp
	552	As you see, there is one exception, In \s-1ASCII\s0. That way you can assume
	553	Goal #1. And with Encode Goal #2 is assumed but you still have to be
	554	careful in such cases mentioned in \fB\s-1CAVEAT\s0\fR paragraphs.
	555	.Sp
	556	This utf8 flag is not visible in perl scripts, exactly for the same
	557	reason you cannot (or you \fIdon't have to\fR) see if a scalar contains a
	558	string, integer, or floating point number. But you can still peek
	559	and poke these if you will. See the section below.
	560	.Sh "Messing with Perl's Internals"
	561	.IX Subsection "Messing with Perl's Internals"
	562	The following \s-1API\s0 uses parts of Perl's internals in the current
	563	implementation. As such, they are efficient but may change.
	564	.IP "is_utf8(\s-1STRING\s0 [, \s-1CHECK\s0])" 2
	565	.IX Item "is_utf8(STRING [, CHECK])"
	566	[\s-1INTERNAL\s0] Tests whether the \s-1UTF\-8\s0 flag is turned on in the \s-1STRING\s0.
	567	If \s-1CHECK\s0 is true, also checks the data in \s-1STRING\s0 for being well-formed
	568	\&\s-1UTF\-8\s0. Returns true if successful, false otherwise.
	569	.Sp
	570	As of perl 5.8.1, utf8 also has \fIutf8::is_utf8()\fR.
	571	.IP "_utf8_on(\s-1STRING\s0)" 2
	572	.IX Item "_utf8_on(STRING)"
	573	[\s-1INTERNAL\s0] Turns on the \s-1UTF\-8\s0 flag in \s-1STRING\s0. The data in \s-1STRING\s0 is
	574	\&\fBnot\fR checked for being well-formed \s-1UTF\-8\s0. Do not use unless you
	575	\&\fBknow\fR that the \s-1STRING\s0 is well-formed \s-1UTF\-8\s0. Returns the previous
	576	state of the \s-1UTF\-8\s0 flag (so please don't treat the return value as
	577	indicating success or failure), or \f(CW\*(C`undef\*(C'\fR if \s-1STRING\s0 is not a string.
	578	.IP "_utf8_off(\s-1STRING\s0)" 2
	579	.IX Item "_utf8_off(STRING)"
	580	[\s-1INTERNAL\s0] Turns off the \s-1UTF\-8\s0 flag in \s-1STRING\s0. Do not use frivolously.
	581	Returns the previous state of the \s-1UTF\-8\s0 flag (so please don't treat the
	582	return value as indicating success or failure), or \f(CW\*(C`undef\*(C'\fR if \s-1STRING\s0 is
	583	not a string.
	584	.SH "UTF\-8 vs. utf8"
	585	.IX Header "UTF-8 vs. utf8"
	586	.Vb 3
	587	\& ....We now view strings not as sequences of bytes, but as sequences
	588	\& of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
	589	\& computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
	590	.Ve
	591	.PP
	592	That has been perl's notion of \s-1UTF\-8\s0 but official \s-1UTF\-8\s0 is more
	593	strict; its range is much narrower (0 .. 10FFFF), and some sequences are
	594	not allowed (i.e. those used in surrogate pairs, 0xFFFE, et al).
	595	.PP
	596	Now that is overruled by Larry Wall himself.
	597	.PP
	598	.Vb 5
	599	\& From: Larry Wall <larry@wall.org>
	600	\& Date: December 04, 2004 11:51:58 JST
	601	\& To: perl-unicode@perl.org
	602	\& Subject: Re: Make Encode.pm support the real UTF-8
	603	\& Message-Id: <20041204025158.GA28754@wall.org>
	604	.Ve
	605	.PP
	606	.Vb 4
	607	\& On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
	608	\& : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
	609	\& : but "UTF-8" is the name of the standard and should give the
	610	\& : corresponding behaviour.
	611	.Ve
	612	.PP
	613	.Vb 2
	614	\& For what it's worth, that's how I've always kept them straight in my
	615	\& head.
	616	.Ve
	617	.PP
	618	.Vb 2
	619	\& Also for what it's worth, Perl 6 will mostly default to strict but
	620	\& make it easy to switch back to lax.
	621	.Ve
	622	.PP
	623	.Vb 1
	624	\& Larry
	625	.Ve
	626	.PP
	627	Do you copy? As of Perl 5.8.7, \fB\s-1UTF\-8\s0\fR means strict, official \s-1UTF\-8\s0
	628	while \fButf8\fR means liberal, lax, version thereof. And Encode version
	629	2.10 or later thus groks the difference between \f(CW\*(C`UTF\-8\*(C'\fR and \f(CW\*(C`utf8\*(C'\fR.
	630	.PP
	631	.Vb 2
	632	\& encode("utf8", "\ex{FFFF_FFFF}", 1); # okay
	633	\& encode("UTF-8", "\ex{FFFF_FFFF}", 1); # croaks
	634	.Ve
	635	.PP
	636	\&\f(CW\*(C`UTF\-8\*(C'\fR in Encode is actually a canonical name for \f(CW\*(C`utf\-8\-strict\*(C'\fR.
	637	Yes, the hyphen between \*(L"\s-1UTF\s0\*(R" and \*(L"8\*(R" is important. Without it Encode
	638	goes \*(L"liberal\*(R".
	639	.PP
	640	.Vb 4
	641	\& find_encoding("UTF-8")->name # is 'utf-8-strict'
	642	\& find_encoding("utf-8")->name # ditto. names are case insensitive
	643	\& find_encoding("utf_8")->name # ditto. "_" are treated as "-"
	644	\& find_encoding("UTF8")->name # is 'utf8'.
	645	.Ve
	646	.SH "SEE ALSO"
	647	.IX Header "SEE ALSO"
	648	Encode::Encoding,
	649	Encode::Supported,
	650	Encode::PerlIO,
	651	encoding,
	652	perlebcdic,
	653	\&\*(L"open\*(R" in perlfunc,
	654	perlunicode,
	655	utf8,
	656	the Perl Unicode Mailing List <perl\-unicode@perl.org>
	657	.SH "MAINTAINER"
	658	.IX Header "MAINTAINER"
	659	This project was originated by Nick Ing-Simmons and later maintained
	660	by Dan Kogai <dankogai@dan.co.jp>. See \s-1AUTHORS\s0 for a full
	661	list of people involved. For any questions, use
	662	<perl\-unicode@perl.org> so we can all share.