git.subgeniuskitty.com - OpenSPARC-T2-DV/.git/blame_incremental - tools/perl-5.8.0/man/man1/perluniintro.1

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \\|\(em\\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "PERLUNIINTRO 1"
	132	.TH PERLUNIINTRO 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	perluniintro \- Perl Unicode introduction
	135	.SH "DESCRIPTION"
	136	.IX Header "DESCRIPTION"
	137	This document gives a general idea of Unicode and how to use Unicode
	138	in Perl.
	139	.Sh "Unicode"
	140	.IX Subsection "Unicode"
	141	Unicode is a character set standard which plans to codify all of the
	142	writing systems of the world, plus many other symbols.
	143	.PP
	144	Unicode and \s-1ISO/IEC\s0 10646 are coordinated standards that provide code
	145	points for characters in almost all modern character set standards,
	146	covering more than 30 writing systems and hundreds of languages,
	147	including all commercially-important modern languages. All characters
	148	in the largest Chinese, Japanese, and Korean dictionaries are also
	149	encoded. The standards will eventually cover almost all characters in
	150	more than 250 writing systems and thousands of languages.
	151	.PP
	152	A Unicode \fIcharacter\fR is an abstract entity. It is not bound to any
	153	particular integer width, especially not to the C language \f(CW\(C`char\(C'\fR.
	154	Unicode is language-neutral and display\-neutral: it does not encode the
	155	language of the text and it does not define fonts or other graphical
	156	layout details. Unicode operates on characters and on text built from
	157	those characters.
	158	.PP
	159	Unicode defines characters like \f(CW\*(C`LATIN CAPITAL LETTER A\*(C'\fR or \f(CW\*(C`GREEK
	160	SMALL LETTER ALPHA\*(C'\fR and unique numbers for the characters, in this
	161	case 0x0041 and 0x03B1, respectively. These unique numbers are called
	162	\&\fIcode points\fR.
	163	.PP
	164	The Unicode standard prefers using hexadecimal notation for the code
	165	points. If numbers like \f(CW0x0041\fR are unfamiliar to
	166	you, take a peek at a later section, \(L"Hexadecimal Notation\(R".
	167	The Unicode standard uses the notation \f(CW\(C`U+0041 LATIN CAPITAL LETTER A\(C'\fR,
	168	to give the hexadecimal code point and the normative name of
	169	the character.
	170	.PP
	171	Unicode also defines various \fIproperties\fR for the characters, like
	172	\&\(L"uppercase\(R" or \(L"lowercase\(R", \(L"decimal digit\(R", or \(L"punctuation\(R";
	173	these properties are independent of the names of the characters.
	174	Furthermore, various operations on the characters like uppercasing,
	175	lowercasing, and collating (sorting) are defined.
	176	.PP
	177	A Unicode character consists either of a single code point, or a
	178	\&\fIbase character\fR (like \f(CW\(C`LATIN CAPITAL LETTER A\(C'\fR), followed by one or
	179	more \fImodifiers\fR (like \f(CW\(C`COMBINING ACUTE ACCENT\(C'\fR). This sequence of
	180	base character and modifiers is called a \fIcombining character
	181	sequence\fR.
	182	.PP
	183	Whether to call these combining character sequences \(L"characters\(R"
	184	depends on your point of view. If you are a programmer, you probably
	185	would tend towards seeing each element in the sequences as one unit,
	186	or \(L"character\(R". The whole sequence could be seen as one \(L"character\(R",
	187	however, from the user's point of view, since that's probably what it
	188	looks like in the context of the user's language.
	189	.PP
	190	With this \*(L"whole sequence\*(R" view of characters, the total number of
	191	characters is open\-ended. But in the programmer's \*(L"one unit is one
	192	character\*(R" point of view, the concept of \*(L"characters\*(R" is more
	193	deterministic. In this document, we take that second point of view:
	194	one \*(L"character\*(R" is one Unicode code point, be it a base character or
	195	a combining character.
	196	.PP
	197	For some combinations, there are \fIprecomposed\fR characters.
	198	\&\f(CW\(C`LATIN CAPITAL LETTER A WITH ACUTE\(C'\fR, for example, is defined as
	199	a single code point. These precomposed characters are, however,
	200	only available for some combinations, and are mainly
	201	meant to support round-trip conversions between Unicode and legacy
	202	standards (like the \s-1ISO\s0 8859). In the general case, the composing
	203	method is more extensible. To support conversion between
	204	different compositions of the characters, various \fInormalization
	205	forms\fR to standardize representations are also defined.
	206	.PP
	207	Because of backward compatibility with legacy encodings, the \*(L"a unique
	208	number for every character\*(R" idea breaks down a bit: instead, there is
	209	\&\*(L"at least one number for every character\*(R". The same character could
	210	be represented differently in several legacy encodings. The
	211	converse is also not true: some code points do not have an assigned
	212	character. Firstly, there are unallocated code points within
	213	otherwise used blocks. Secondly, there are special Unicode control
	214	characters that do not represent true characters.
	215	.PP
	216	A common myth about Unicode is that it would be \(L"16\-bit\(R", that is,
	217	Unicode is only represented as \f(CW0x10000\fR (or 65536) characters from
	218	\&\f(CW0x0000\fR to \f(CW0xFFFF\fR. \fBThis is untrue.\fR Since Unicode 2.0, Unicode
	219	has been defined all the way up to 21 bits (\f(CW0x10FFFF\fR), and since
	220	Unicode 3.1, characters have been defined beyond \f(CW0xFFFF\fR. The first
	221	\&\f(CW0x10000\fR characters are called the \fIPlane 0\fR, or the \fIBasic
	222	Multilingual Plane\fR (\s-1BMP\s0). With Unicode 3.1, 17 planes in all are
	223	defined\*(--but nowhere near full of defined characters, yet.
	224	.PP
	225	Another myth is that the 256\-character blocks have something to
	226	do with languages\*(--that each block would define the characters used
	227	by a language or a set of languages. \fBThis is also untrue.\fR
	228	The division into blocks exists, but it is almost completely
	229	accidental\*(--an artifact of how the characters have been and
	230	still are allocated. Instead, there is a concept called \fIscripts\fR,
	231	which is more useful: there is \f(CW\(C`Latin\(C'\fR script, \f(CW\(C`Greek\(C'\fR script, and
	232	so on. Scripts usually span varied parts of several blocks.
	233	For further information see Unicode::UCD.
	234	.PP
	235	The Unicode code points are just abstract numbers. To input and
	236	output these abstract numbers, the numbers must be \fIencoded\fR somehow.
	237	Unicode defines several \fIcharacter encoding forms\fR, of which \fI\s-1UTF\-8\s0\fR
	238	is perhaps the most popular. \s-1UTF\-8\s0 is a variable length encoding that
	239	encodes Unicode characters as 1 to 6 bytes (only 4 with the currently
	240	defined characters). Other encodings include \s-1UTF\-16\s0 and \s-1UTF\-32\s0 and their
	241	big\- and little-endian variants (\s-1UTF\-8\s0 is byte-order independent).
	242	The \s-1ISO/IEC\s0 10646 defines the \s-1UCS\-2\s0 and \s-1UCS\-4\s0 encoding forms.
	243	.PP
	244	For more information about encodings\*(--for instance, to learn what
	245	\&\fIsurrogates\fR and \fIbyte order marks\fR (BOMs) are\*(--see perlunicode.
	246	.Sh "Perl's Unicode Support"
	247	.IX Subsection "Perl's Unicode Support"
	248	Starting from Perl 5.6.0, Perl has had the capacity to handle Unicode
	249	natively. Perl 5.8.0, however, is the first recommended release for
	250	serious Unicode work. The maintenance release 5.6.1 fixed many of the
	251	problems of the initial Unicode implementation, but for example
	252	regular expressions still do not work with Unicode in 5.6.1.
	253	.PP
	254	\&\fBStarting from Perl 5.8.0, the use of \f(CB\(C`use utf8\(C'\fB is no longer
	255	necessary.\fR In earlier releases the \f(CW\(C`utf8\(C'\fR pragma was used to declare
	256	that operations in the current block or file would be Unicode\-aware.
	257	This model was found to be wrong, or at least clumsy: the \(L"Unicodeness\(R"
	258	is now carried with the data, instead of being attached to the
	259	operations. Only one case remains where an explicit \f(CW\(C`use utf8\(C'\fR is
	260	needed: if your Perl script itself is encoded in \s-1UTF\-8\s0, you can use
	261	\&\s-1UTF\-8\s0 in your identifier names, and in string and regular expression
	262	literals, by saying \f(CW\(C`use utf8\(C'\fR. This is not the default because
	263	scripts with legacy 8\-bit data in them would break. See utf8.
	264	.Sh "Perl's Unicode Model"
	265	.IX Subsection "Perl's Unicode Model"
	266	Perl supports both pre\-5.6 strings of eight-bit native bytes, and
	267	strings of Unicode characters. The principle is that Perl tries to
	268	keep its data as eight-bit bytes for as long as possible, but as soon
	269	as Unicodeness cannot be avoided, the data is transparently upgraded
	270	to Unicode.
	271	.PP
	272	Internally, Perl currently uses either the native eight-bit
	273	character set of the platform (for example Latin\-1) or
	274	\&\s-1UTF\-8\s0 to encode Unicode strings. Specifically, if all code points in
	275	the string are \f(CW0xFF\fR or less, Perl uses the native eight-bit
	276	character set. Otherwise, it uses \s-1UTF\-8\s0.
	277	.PP
	278	A user of Perl does not normally need to know nor care how Perl
	279	happens to encode its internal strings, but it becomes relevant when
	280	outputting Unicode strings to a stream without a PerlIO layer \*(-- one with
	281	the \(L"default\(R" encoding. In such a case, the raw bytes used internally
	282	(the native character set or \s-1UTF\-8\s0, as appropriate for each string)
	283	will be used, and a \(L"Wide character\(R" warning will be issued if those
	284	strings contain a character beyond 0x00FF.
	285	.PP
	286	For example,
	287	.PP
	288	.Vb 1
	289	\& perl -e 'print "\ex{DF}\en", "\ex{0100}\ex{DF}\en"'
	290	.Ve
	291	.PP
	292	produces a fairly useless mixture of native bytes and \s-1UTF\-8\s0, as well
	293	as a warning:
	294	.PP
	295	.Vb 1
	296	\& Wide character in print at ...
	297	.Ve
	298	.PP
	299	To output \s-1UTF\-8\s0, use the \f(CW\(C`:utf8\(C'\fR output layer. Prepending
	300	.PP
	301	.Vb 1
	302	\& binmode(STDOUT, ":utf8");
	303	.Ve
	304	.PP
	305	to this sample program ensures that the output is completely \s-1UTF\-8\s0,
	306	and removes the program's warning.
	307	.PP
	308	If your locale environment variables (\f(CW\(C`LANGUAGE\(C'\fR, \f(CW\(C`LC_ALL\(C'\fR,
	309	\&\f(CW\(C`LC_CTYPE\(C'\fR, \f(CW\(C`LANG\(C'\fR) contain the strings '\s-1UTF\-8\s0' or '\s-1UTF8\s0',
	310	regardless of case, then the default encoding of your \s-1STDIN\s0, \s-1STDOUT\s0,
	311	and \s-1STDERR\s0 and of \fBany subsequent file open\fR, is \s-1UTF\-8\s0. Note that
	312	this means that Perl expects other software to work, too: if Perl has
	313	been led to believe that \s-1STDIN\s0 should be \s-1UTF\-8\s0, but then \s-1STDIN\s0 coming
	314	in from another command is not \s-1UTF\-8\s0, Perl will complain about the
	315	malformed \s-1UTF\-8\s0.
	316	.PP
	317	All features that combine Unicode and I/O also require using the new
	318	PerlIO feature. Almost all Perl 5.8 platforms do use PerlIO, though:
	319	you can see whether yours is by running \(L"perl \-V\(R" and looking for
	320	\&\f(CW\(C`useperlio=define\(C'\fR.
	321	.Sh "Unicode and \s-1EBCDIC\s0"
	322	.IX Subsection "Unicode and EBCDIC"
	323	Perl 5.8.0 also supports Unicode on \s-1EBCDIC\s0 platforms. There,
	324	Unicode support is somewhat more complex to implement since
	325	additional conversions are needed at every step. Some problems
	326	remain, see perlebcdic for details.
	327	.PP
	328	In any case, the Unicode support on \s-1EBCDIC\s0 platforms is better than
	329	in the 5.6 series, which didn't work much at all for \s-1EBCDIC\s0 platforms.
	330	On \s-1EBCDIC\s0 platforms, the internal Unicode encoding form is UTF-EBCDIC
	331	instead of \s-1UTF\-8\s0. The difference is that \s-1UTF\-8\s0 is \*(L"ASCII\-safe\*(R" in
	332	that \s-1ASCII\s0 characters encode to \s-1UTF\-8\s0 as\-is, while UTF-EBCDIC is
	333	\&\*(L"EBCDIC\-safe\*(R".
	334	.Sh "Creating Unicode"
	335	.IX Subsection "Creating Unicode"
	336	To create Unicode characters in literals for code points above \f(CW0xFF\fR,
	337	use the \f(CW\(C`\ex{...}\(C'\fR notation in double-quoted strings:
	338	.PP
	339	.Vb 1
	340	\& my $smiley = "\ex{263a}";
	341	.Ve
	342	.PP
	343	Similarly, it can be used in regular expression literals
	344	.PP
	345	.Vb 1
	346	\& $smiley =~ /\ex{263a}/;
	347	.Ve
	348	.PP
	349	At run-time you can use \f(CW\(C`chr()\(C'\fR:
	350	.PP
	351	.Vb 1
	352	\& my $hebrew_alef = chr(0x05d0);
	353	.Ve
	354	.PP
	355	See \(L"Further Resources\(R" for how to find all these numeric codes.
	356	.PP
	357	Naturally, \f(CW\(C`ord()\(C'\fR will do the reverse: it turns a character into
	358	a code point.
	359	.PP
	360	Note that \f(CW\(C`\ex..\(C'\fR (no \f(CW\(C`{}\(C'\fR and only two hexadecimal digits), \f(CW\(C`\ex{...}\(C'\fR,
	361	and \f(CW\(C`chr(...)\(C'\fR for arguments less than \f(CW0x100\fR (decimal 256)
	362	generate an eight-bit character for backward compatibility with older
	363	Perls. For arguments of \f(CW0x100\fR or more, Unicode characters are
	364	always produced. If you want to force the production of Unicode
	365	characters regardless of the numeric value, use \f(CW\(C`pack("U", ...)\(C'\fR
	366	instead of \f(CW\(C`\ex..\(C'\fR, \f(CW\(C`\ex{...}\(C'\fR, or \f(CW\(C`chr()\(C'\fR.
	367	.PP
	368	You can also use the \f(CW\(C`charnames\(C'\fR pragma to invoke characters
	369	by name in double-quoted strings:
	370	.PP
	371	.Vb 2
	372	\& use charnames ':full';
	373	\& my $arabic_alef = "\eN{ARABIC LETTER ALEF}";
	374	.Ve
	375	.PP
	376	And, as mentioned above, you can also \f(CW\(C`pack()\(C'\fR numbers into Unicode
	377	characters:
	378	.PP
	379	.Vb 1
	380	\& my $georgian_an = pack("U", 0x10a0);
	381	.Ve
	382	.PP
	383	Note that both \f(CW\*(C`\ex{...}\*(C'\fR and \f(CW\*(C`\eN{...}\*(C'\fR are compile-time string
	384	constants: you cannot use variables in them. If you want similar
	385	run-time functionality, use \f(CW\*(C`chr()\*(C'\fR and \f(CW\*(C`charnames::vianame()\*(C'\fR.
	386	.PP
	387	Also note that if all the code points for pack \(L"U\(R" are below 0x100,
	388	bytes will be generated, just like if you were using \f(CW\(C`chr()\(C'\fR.
	389	.PP
	390	.Vb 1
	391	\& my $bytes = pack("U*", 0x80, 0xFF);
	392	.Ve
	393	.PP
	394	If you want to force the result to Unicode characters, use the special
	395	\&\f(CW"U0"\fR prefix. It consumes no arguments but forces the result to be
	396	in Unicode characters, instead of bytes.
	397	.PP
	398	.Vb 1
	399	\& my $chars = pack("U0U*", 0x80, 0xFF);
	400	.Ve
	401	.Sh "Handling Unicode"
	402	.IX Subsection "Handling Unicode"
	403	Handling Unicode is for the most part transparent: just use the
	404	strings as usual. Functions like \f(CW\(C`index()\(C'\fR, \f(CW\(C`length()\(C'\fR, and
	405	\&\f(CW\(C`substr()\(C'\fR will work on the Unicode characters; regular expressions
	406	will work on the Unicode characters (see perlunicode and perlretut).
	407	.PP
	408	Note that Perl considers each code point in a combining character
	409	sequence to be a separate character, so for example
	410	.PP
	411	.Vb 2
	412	\& use charnames ':full';
	413	\& print length("\eN{LATIN CAPITAL LETTER A}\eN{COMBINING ACUTE ACCENT}"), "\en";
	414	.Ve
	415	.PP
	416	will print 2, not 1. The only exception is that regular expressions
	417	have \f(CW\(C`\eX\(C'\fR for matching a combining character sequence.
	418	.PP
	419	Life is not quite so transparent, however, when working with legacy
	420	encodings, I/O, and certain special cases:
	421	.Sh "Legacy Encodings"
	422	.IX Subsection "Legacy Encodings"
	423	When you combine legacy data and Unicode the legacy data needs
	424	to be upgraded to Unicode. Normally \s-1ISO\s0 8859\-1 (or \s-1EBCDIC\s0, if
	425	applicable) is assumed. You can override this assumption by
	426	using the \f(CW\(C`encoding\(C'\fR pragma, for example
	427	.PP
	428	.Vb 1
	429	\& use encoding 'latin2'; # ISO 8859-2
	430	.Ve
	431	.PP
	432	in which case literals (string or regular expressions), \f(CW\(C`chr()\(C'\fR,
	433	and \f(CW\(C`ord()\(C'\fR in your whole script are assumed to produce Unicode
	434	characters from \s-1ISO\s0 8859\-2 code points. Note that the matching for
	435	encoding names is forgiving: instead of \f(CW\(C`latin2\(C'\fR you could have
	436	said \f(CW\(C`Latin 2\(C'\fR, or \f(CW\(C`iso8859\-2\(C'\fR, or other variations. With just
	437	.PP
	438	.Vb 1
	439	\& use encoding;
	440	.Ve
	441	.PP
	442	the environment variable \f(CW\(C`PERL_ENCODING\(C'\fR will be consulted.
	443	If that variable isn't set, the encoding pragma will fail.
	444	.PP
	445	The \f(CW\(C`Encode\(C'\fR module knows about many encodings and has interfaces
	446	for doing conversions between those encodings:
	447	.PP
	448	.Vb 2
	449	\& use Encode 'from_to';
	450	\& from_to($data, "iso-8859-3", "utf-8"); # from legacy to utf-8
	451	.Ve
	452	.Sh "Unicode I/O"
	453	.IX Subsection "Unicode I/O"
	454	Normally, writing out Unicode data
	455	.PP
	456	.Vb 1
	457	\& print FH $some_string_with_unicode, "\en";
	458	.Ve
	459	.PP
	460	produces raw bytes that Perl happens to use to internally encode the
	461	Unicode string. Perl's internal encoding depends on the system as
	462	well as what characters happen to be in the string at the time. If
	463	any of the characters are at code points \f(CW0x100\fR or above, you will get
	464	a warning. To ensure that the output is explicitly rendered in the
	465	encoding you desire\*(--and to avoid the warning\*(--open the stream with
	466	the desired encoding. Some examples:
	467	.PP
	468	.Vb 1
	469	\& open FH, ">:utf8", "file";
	470	.Ve
	471	.PP
	472	.Vb 3
	473	\& open FH, ">:encoding(ucs2)", "file";
	474	\& open FH, ">:encoding(UTF-8)", "file";
	475	\& open FH, ">:encoding(shift_jis)", "file";
	476	.Ve
	477	.PP
	478	and on already open streams, use \f(CW\(C`binmode()\(C'\fR:
	479	.PP
	480	.Vb 1
	481	\& binmode(STDOUT, ":utf8");
	482	.Ve
	483	.PP
	484	.Vb 3
	485	\& binmode(STDOUT, ":encoding(ucs2)");
	486	\& binmode(STDOUT, ":encoding(UTF-8)");
	487	\& binmode(STDOUT, ":encoding(shift_jis)");
	488	.Ve
	489	.PP
	490	The matching of encoding names is loose: case does not matter, and
	491	many encodings have several aliases. Note that the \f(CW\(C`:utf8\(C'\fR layer
	492	must always be specified exactly like that; it is \fInot\fR subject to
	493	the loose matching of encoding names.
	494	.PP
	495	See PerlIO for the \f(CW\(C`:utf8\(C'\fR layer, PerlIO::encoding and
	496	Encode::PerlIO for the \f(CW\(C`:encoding()\(C'\fR layer, and
	497	Encode::Supported for many encodings supported by the \f(CW\(C`Encode\(C'\fR
	498	module.
	499	.PP
	500	Reading in a file that you know happens to be encoded in one of the
	501	Unicode or legacy encodings does not magically turn the data into
	502	Unicode in Perl's eyes. To do that, specify the appropriate
	503	layer when opening files
	504	.PP
	505	.Vb 2
	506	\& open(my $fh,'<:utf8', 'anything');
	507	\& my $line_of_unicode = <$fh>;
	508	.Ve
	509	.PP
	510	.Vb 2
	511	\& open(my $fh,'<:encoding(Big5)', 'anything');
	512	\& my $line_of_unicode = <$fh>;
	513	.Ve
	514	.PP
	515	The I/O layers can also be specified more flexibly with
	516	the \f(CW\(C`open\(C'\fR pragma. See open, or look at the following example.
	517	.PP
	518	.Vb 7
	519	\& use open ':utf8'; # input and output default layer will be UTF-8
	520	\& open X, ">file";
	521	\& print X chr(0x100), "\en";
	522	\& close X;
	523	\& open Y, "<file";
	524	\& printf "%#x\en", ord(<Y>); # this should print 0x100
	525	\& close Y;
	526	.Ve
	527	.PP
	528	With the \f(CW\(C`open\(C'\fR pragma you can use the \f(CW\(C`:locale\(C'\fR layer
	529	.PP
	530	.Vb 9
	531	\& $ENV{LC_ALL} = $ENV{LANG} = 'ru_RU.KOI8-R';
	532	\& # the :locale will probe the locale environment variables like LC_ALL
	533	\& use open OUT => ':locale'; # russki parusski
	534	\& open(O, ">koi8");
	535	\& print O chr(0x430); # Unicode CYRILLIC SMALL LETTER A = KOI8-R 0xc1
	536	\& close O;
	537	\& open(I, "<koi8");
	538	\& printf "%#x\en", ord(<I>); # this should print 0xc1
	539	\& close I;
	540	.Ve
	541	.PP
	542	or you can also use the \f(CW':encoding(...)'\fR layer
	543	.PP
	544	.Vb 2
	545	\& open(my $epic,'<:encoding(iso-8859-7)','iliad.greek');
	546	\& my $line_of_unicode = <$epic>;
	547	.Ve
	548	.PP
	549	These methods install a transparent filter on the I/O stream that
	550	converts data from the specified encoding when it is read in from the
	551	stream. The result is always Unicode.
	552	.PP
	553	The open pragma affects all the \f(CW\(C`open()\(C'\fR calls after the pragma by
	554	setting default layers. If you want to affect only certain
	555	streams, use explicit layers directly in the \f(CW\(C`open()\(C'\fR call.
	556	.PP
	557	You can switch encodings on an already opened stream by using
	558	\&\f(CW\(C`binmode()\(C'\fR; see \(L"binmode\(R" in perlfunc.
	559	.PP
	560	The \f(CW\(C`:locale\(C'\fR does not currently (as of Perl 5.8.0) work with
	561	\&\f(CW\(C`open()\(C'\fR and \f(CW\(C`binmode()\(C'\fR, only with the \f(CW\(C`open\(C'\fR pragma. The
	562	\&\f(CW\(C`:utf8\(C'\fR and \f(CW\(C`:encoding(...)\(C'\fR methods do work with all of \f(CW\(C`open()\(C'\fR,
	563	\&\f(CW\(C`binmode()\(C'\fR, and the \f(CW\(C`open\(C'\fR pragma.
	564	.PP
	565	Similarly, you may use these I/O layers on output streams to
	566	automatically convert Unicode to the specified encoding when it is
	567	written to the stream. For example, the following snippet copies the
	568	contents of the file \(L"text.jis\(R" (encoded as \s-1ISO\-2022\-JP\s0, aka \s-1JIS\s0) to
	569	the file \(L"text.utf8\(R", encoded as \s-1UTF\-8:\s0
	570	.PP
	571	.Vb 3
	572	\& open(my $nihongo, '<:encoding(iso2022-jp)', 'text.jis');
	573	\& open(my $unicode, '>:utf8', 'text.utf8');
	574	\& while (<$nihongo>) { print $unicode }
	575	.Ve
	576	.PP
	577	The naming of encodings, both by the \f(CW\(C`open()\(C'\fR and by the \f(CW\(C`open\(C'\fR
	578	pragma, is similar to the \f(CW\(C`encoding\(C'\fR pragma in that it allows for
	579	flexible names: \f(CW\(C`koi8\-r\(C'\fR and \f(CW\(C`KOI8R\(C'\fR will both be understood.
	580	.PP
	581	Common encodings recognized by \s-1ISO\s0, \s-1MIME\s0, \s-1IANA\s0, and various other
	582	standardisation organisations are recognised; for a more detailed
	583	list see Encode::Supported.
	584	.PP
	585	\&\f(CW\(C`read()\(C'\fR reads characters and returns the number of characters.
	586	\&\f(CW\(C`seek()\(C'\fR and \f(CW\(C`tell()\(C'\fR operate on byte counts, as do \f(CW\(C`sysread()\(C'\fR
	587	and \f(CW\(C`sysseek()\(C'\fR.
	588	.PP
	589	Notice that because of the default behaviour of not doing any
	590	conversion upon input if there is no default layer,
	591	it is easy to mistakenly write code that keeps on expanding a file
	592	by repeatedly encoding the data:
	593	.PP
	594	.Vb 8
	595	\& # BAD CODE WARNING
	596	\& open F, "file";
	597	\& local $/; ## read in the whole file of 8-bit characters
	598	\& $t = <F>;
	599	\& close F;
	600	\& open F, ">:utf8", "file";
	601	\& print F $t; ## convert to UTF-8 on output
	602	\& close F;
	603	.Ve
	604	.PP
	605	If you run this code twice, the contents of the \fIfile\fR will be twice
	606	\&\s-1UTF\-8\s0 encoded. A \f(CW\*(C`use open ':utf8'\*(C'\fR would have avoided the bug, as
	607	would explicitly opening the \fIfile\fR for input as \s-1UTF\-8\s0.
	608	.PP
	609	\&\fB\s-1NOTE\s0\fR: the \f(CW\(C`:utf8\(C'\fR and \f(CW\(C`:encoding\(C'\fR features work only if your
	610	Perl has been built with the new PerlIO feature.
	611	.Sh "Displaying Unicode As Text"
	612	.IX Subsection "Displaying Unicode As Text"
	613	Sometimes you might want to display Perl scalars containing Unicode as
	614	simple \s-1ASCII\s0 (or \s-1EBCDIC\s0) text. The following subroutine converts
	615	its argument so that Unicode characters with code points greater than
	616	255 are displayed as \f(CW\(C`\ex{...}\(C'\fR, control characters (like \f(CW\(C`\en\(C'\fR) are
	617	displayed as \f(CW\(C`\ex..\(C'\fR, and the rest of the characters as themselves:
	618	.PP
	619	.Vb 9
	620	\& sub nice_string {
	621	\& join("",
	622	\& map { $_ > 255 ? # if wide character...
	623	\& sprintf("\e\ex{%04X}", $_) : # \ex{...}
	624	\& chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
	625	\& sprintf("\e\ex%02X", $_) : # \ex..
	626	\& chr($_) # else as themselves
	627	\& } unpack("U*", $_[0])); # unpack Unicode characters
	628	\& }
	629	.Ve
	630	.PP
	631	For example,
	632	.PP
	633	.Vb 1
	634	\& nice_string("foo\ex{100}bar\en")
	635	.Ve
	636	.PP
	637	returns:
	638	.PP
	639	.Vb 1
	640	\& "foo\ex{0100}bar\ex0A"
	641	.Ve
	642	.Sh "Special Cases"
	643	.IX Subsection "Special Cases"
	644	.IP "\(bu" 4
	645	Bit Complement Operator ~ And \fIvec()\fR
	646	.Sp
	647	The bit complement operator \f(CW\(C`~\(C'\fR may produce surprising results if
	648	used on strings containing characters with ordinal values above
	649	255. In such a case, the results are consistent with the internal
	650	encoding of the characters, but not with much else. So don't do
	651	that. Similarly for \f(CW\(C`vec()\(C'\fR: you will be operating on the
	652	internally-encoded bit patterns of the Unicode characters, not on
	653	the code point values, which is very probably not what you want.
	654	.IP "\(bu" 4
	655	Peeking At Perl's Internal Encoding
	656	.Sp
	657	Normal users of Perl should never care how Perl encodes any particular
	658	Unicode string (because the normal ways to get at the contents of a
	659	string with Unicode\*(--via input and output\*(--should always be via
	660	explicitly-defined I/O layers). But if you must, there are two
	661	ways of looking behind the scenes.
	662	.Sp
	663	One way of peeking inside the internal encoding of Unicode characters
	664	is to use \f(CW\*(C`unpack("C*", ...)\*(C'\fR to get the bytes or \f(CW\*(C`unpack("H*", ...)\*(C'\fR
	665	to display the bytes:
	666	.Sp
	667	.Vb 2
	668	\& # this prints c4 80 for the UTF-8 bytes 0xc4 0x80
	669	\& print join(" ", unpack("H*", pack("U", 0x100))), "\en";
	670	.Ve
	671	.Sp
	672	Yet another way would be to use the Devel::Peek module:
	673	.Sp
	674	.Vb 1
	675	\& perl -MDevel::Peek -e 'Dump(chr(0x100))'
	676	.Ve
	677	.Sp
	678	That shows the \s-1UTF8\s0 flag in \s-1FLAGS\s0 and both the \s-1UTF\-8\s0 bytes
	679	and Unicode characters in \f(CW\(C`PV\(C'\fR. See also later in this document
	680	the discussion about the \f(CW\(C`is_utf8\(C'\fR function of the \f(CW\(C`Encode\(C'\fR module.
	681	.Sh "Advanced Topics"
	682	.IX Subsection "Advanced Topics"
	683	.IP "\(bu" 4
	684	String Equivalence
	685	.Sp
	686	The question of string equivalence turns somewhat complicated
	687	in Unicode: what do you mean by \(L"equal\(R"?
	688	.Sp
	689	(Is \f(CW\(C`LATIN CAPITAL LETTER A WITH ACUTE\(C'\fR equal to
	690	\&\f(CW\(C`LATIN CAPITAL LETTER A\(C'\fR?)
	691	.Sp
	692	The short answer is that by default Perl compares equivalence (\f(CW\(C`eq\(C'\fR,
	693	\&\f(CW\(C`ne\(C'\fR) based only on code points of the characters. In the above
	694	case, the answer is no (because 0x00C1 != 0x0041). But sometimes, any
	695	\&\s-1CAPITAL\s0 \s-1LETTER\s0 As should be considered equal, or even As of any case.
	696	.Sp
	697	The long answer is that you need to consider character normalization
	698	and casing issues: see Unicode::Normalize, Unicode Technical
	699	Reports #15 and #21, \fIUnicode Normalization Forms\fR and \fICase
	700	Mappings\fR, http://www.unicode.org/unicode/reports/tr15/ and
	701	http://www.unicode.org/unicode/reports/tr21/
	702	.Sp
	703	As of Perl 5.8.0, the \(L"Full\(R" case-folding of \fICase
	704	Mappings/SpecialCasing\fR is implemented.
	705	.IP "\(bu" 4
	706	String Collation
	707	.Sp
	708	People like to see their strings nicely sorted\*(--or as Unicode
	709	parlance goes, collated. But again, what do you mean by collate?
	710	.Sp
	711	(Does \f(CW\(C`LATIN CAPITAL LETTER A WITH ACUTE\(C'\fR come before or after
	712	\&\f(CW\(C`LATIN CAPITAL LETTER A WITH GRAVE\(C'\fR?)
	713	.Sp
	714	The short answer is that by default, Perl compares strings (\f(CW\*(C`lt\*(C'\fR,
	715	\&\f(CW\*(C`le\*(C'\fR, \f(CW\*(C`cmp\*(C'\fR, \f(CW\*(C`ge\*(C'\fR, \f(CW\*(C`gt\*(C'\fR) based only on the code points of the
	716	characters. In the above case, the answer is \*(L"after\*(R", since
	717	\&\f(CW0x00C1\fR > \f(CW0x00C0\fR.
	718	.Sp
	719	The long answer is that \*(L"it depends\*(R", and a good answer cannot be
	720	given without knowing (at the very least) the language context.
	721	See Unicode::Collate, and \fIUnicode Collation Algorithm\fR
	722	http://www.unicode.org/unicode/reports/tr10/
	723	.Sh "Miscellaneous"
	724	.IX Subsection "Miscellaneous"
	725	.IP "\(bu" 4
	726	Character Ranges and Classes
	727	.Sp
	728	Character ranges in regular expression character classes (\f(CW\*(C`/[a\-z]/\*(C'\fR)
	729	and in the \f(CW\*(C`tr///\*(C'\fR (also known as \f(CW\*(C`y///\*(C'\fR) operator are not magically
	730	Unicode\-aware. What this means is that \f(CW\*(C`[A\-Za\-z]\*(C'\fR will not magically start
	731	to mean \*(L"all alphabetic letters\*(R" (not that it means that even for
	732	8\-bit characters; there you should be using \f(CW\*(C`/[[:alpha:]]/\*(C'\fR instead).
	733	.Sp
	734	For specifying character classes like that in regular expressions,
	735	you can use the various Unicode properties\*(--\f(CW\*(C`\epL\*(C'\fR, or perhaps
	736	\&\f(CW\*(C`\ep{Alphabetic}\*(C'\fR, in this particular case. You can use Unicode
	737	code points as the end points of character ranges, but there is no
	738	magic associated with specifying a certain range. For further
	739	information\*(--there are dozens of Unicode character classes\*(--see
	740	perlunicode.
	741	.IP "\(bu" 4
	742	String-To-Number Conversions
	743	.Sp
	744	Unicode does define several other decimal\*(--and numeric\*(--characters
	745	besides the familiar 0 to 9, such as the Arabic and Indic digits.
	746	Perl does not support string-to-number conversion for digits other
	747	than \s-1ASCII\s0 0 to 9 (and \s-1ASCII\s0 a to f for hexadecimal).
	748	.Sh "Questions With Answers"
	749	.IX Subsection "Questions With Answers"
	750	.IP "\(bu" 4
	751	Will My Old Scripts Break?
	752	.Sp
	753	Very probably not. Unless you are generating Unicode characters
	754	somehow, old behaviour should be preserved. About the only behaviour
	755	that has changed and which could start generating Unicode is the old
	756	behaviour of \f(CW\*(C`chr()\*(C'\fR where supplying an argument more than 255
	757	produced a character modulo 255. \f(CW\*(C`chr(300)\*(C'\fR, for example, was equal
	758	to \f(CW\*(C`chr(45)\*(C'\fR or \*(L"\-\*(R" (in \s-1ASCII\s0), now it is \s-1LATIN\s0 \s-1CAPITAL\s0 \s-1LETTER\s0 I \s-1WITH\s0
	759	\&\s-1BREVE\s0.
	760	.IP "\(bu" 4
	761	How Do I Make My Scripts Work With Unicode?
	762	.Sp
	763	Very little work should be needed since nothing changes until you
	764	generate Unicode data. The most important thing is getting input as
	765	Unicode; for that, see the earlier I/O discussion.
	766	.IP "\(bu" 4
	767	How Do I Know Whether My String Is In Unicode?
	768	.Sp
	769	You shouldn't care. No, you really shouldn't. No, really. If you
	770	have to care\*(--beyond the cases described above\*(--it means that we
	771	didn't get the transparency of Unicode quite right.
	772	.Sp
	773	Okay, if you insist:
	774	.Sp
	775	.Vb 2
	776	\& use Encode 'is_utf8';
	777	\& print is_utf8($string) ? 1 : 0, "\en";
	778	.Ve
	779	.Sp
	780	But note that this doesn't mean that any of the characters in the
	781	string are necessarily \s-1UTF\-8\s0 encoded, or that any of the characters have
	782	code points greater than 0xFF (255) or even 0x80 (128), or that the
	783	string has any characters at all. All that \f(CW\*(C`is_utf8()\*(C'\fR does is
	784	return the value of the internal \*(L"utf8ness\*(R" flag attached to the
	785	\&\f(CW$string\fR. If the flag is off, the bytes in the scalar are interpreted
	786	as a single byte encoding. If the flag is on, the bytes in the scalar
	787	are interpreted as the (multi\-byte, variable\-length) \s-1UTF\-8\s0 encoded code
	788	points of the characters. Bytes added to a \s-1UTF\-8\s0 encoded string are
	789	automatically upgraded to \s-1UTF\-8\s0. If mixed non\-UTF8 and \s-1UTF\-8\s0 scalars
	790	are merged (double\-quoted interpolation, explicit concatenation, and
	791	printf/sprintf parameter substitution), the result will be \s-1UTF\-8\s0 encoded
	792	as if copies of the byte strings were upgraded to \s-1UTF\-8:\s0 for example,
	793	.Sp
	794	.Vb 3
	795	\& $a = "ab\ex80c";
	796	\& $b = "\ex{100}";
	797	\& print "$a = $b\en";
	798	.Ve
	799	.Sp
	800	the output string will be UTF\-8\-encoded \f(CW\*(C`ab\ex80c = \ex{100}\en\*(C'\fR, but note
	801	that \f(CW$a\fR will stay byte\-encoded.
	802	.Sp
	803	Sometimes you might really need to know the byte length of a string
	804	instead of the character length. For that use either the
	805	\&\f(CW\*(C`Encode::encode_utf8()\*(C'\fR function or the \f(CW\*(C`bytes\*(C'\fR pragma and its only
	806	defined function \f(CW\*(C`length()\*(C'\fR:
	807	.Sp
	808	.Vb 7
	809	\& my $unicode = chr(0x100);
	810	\& print length($unicode), "\en"; # will print 1
	811	\& require Encode;
	812	\& print length(Encode::encode_utf8($unicode)), "\en"; # will print 2
	813	\& use bytes;
	814	\& print length($unicode), "\en"; # will also print 2
	815	\& # (the 0xC4 0x80 of the UTF-8)
	816	.Ve
	817	.IP "\(bu" 4
	818	How Do I Detect Data That's Not Valid In a Particular Encoding?
	819	.Sp
	820	Use the \f(CW\*(C`Encode\*(C'\fR package to try converting it.
	821	For example,
	822	.Sp
	823	.Vb 6
	824	\& use Encode 'decode_utf8';
	825	\& if (decode_utf8($string_of_bytes_that_I_think_is_utf8)) {
	826	\& # valid
	827	\& } else {
	828	\& # invalid
	829	\& }
	830	.Ve
	831	.Sp
	832	For \s-1UTF\-8\s0 only, you can use:
	833	.Sp
	834	.Vb 2
	835	\& use warnings;
	836	\& @chars = unpack("U0U*", $string_of_bytes_that_I_think_is_utf8);
	837	.Ve
	838	.Sp
	839	If invalid, a \f(CW\*(C`Malformed UTF\-8 character (byte 0x##) in unpack\*(C'\fR
	840	warning is produced. The \*(L"U0\*(R" means \*(L"expect strictly \s-1UTF\-8\s0 encoded
	841	Unicode\*(R". Without that the \f(CW\*(C`unpack("U", ...)\*(C'\fR would also accept
	842	data like \f(CW\*(C`chr(0xFF)\*(C'\fR, similarly to the \f(CW\*(C`pack\*(C'\fR as we saw earlier.
	843	.IP "\(bu" 4
	844	How Do I Convert Binary Data Into a Particular Encoding, Or Vice Versa?
	845	.Sp
	846	This probably isn't as useful as you might think.
	847	Normally, you shouldn't need to.
	848	.Sp
	849	In one sense, what you are asking doesn't make much sense: encodings
	850	are for characters, and binary data are not \*(L"characters\*(R", so converting
	851	\&\*(L"data\*(R" into some encoding isn't meaningful unless you know what
	852	character set and encoding the binary data is in, in which case it's
	853	not just binary data, now is it?
	854	.Sp
	855	If you have a raw sequence of bytes that you know should be
	856	interpreted via a particular encoding, you can use \f(CW\*(C`Encode\*(C'\fR:
	857	.Sp
	858	.Vb 2
	859	\& use Encode 'from_to';
	860	\& from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8
	861	.Ve
	862	.Sp
	863	The call to \f(CW\*(C`from_to()\*(C'\fR changes the bytes in \f(CW$data\fR, but nothing
	864	material about the nature of the string has changed as far as Perl is
	865	concerned. Both before and after the call, the string \f(CW$data\fR
	866	contains just a bunch of 8\-bit bytes. As far as Perl is concerned,
	867	the encoding of the string remains as \*(L"system\-native 8\-bit bytes\*(R".
	868	.Sp
	869	You might relate this to a fictional 'Translate' module:
	870	.Sp
	871	.Vb 4
	872	\& use Translate;
	873	\& my $phrase = "Yes";
	874	\& Translate::from_to($phrase, 'english', 'deutsch');
	875	\& ## phrase now contains "Ja"
	876	.Ve
	877	.Sp
	878	The contents of the string changes, but not the nature of the string.
	879	Perl doesn't know any more after the call than before that the
	880	contents of the string indicates the affirmative.
	881	.Sp
	882	Back to converting data. If you have (or want) data in your system's
	883	native 8\-bit encoding (e.g. Latin\-1, \s-1EBCDIC\s0, etc.), you can use
	884	pack/unpack to convert to/from Unicode.
	885	.Sp
	886	.Vb 2
	887	\& $native_string = pack("C*", unpack("U*", $Unicode_string));
	888	\& $Unicode_string = pack("U*", unpack("C*", $native_string));
	889	.Ve
	890	.Sp
	891	If you have a sequence of bytes you \fBknow\fR is valid \s-1UTF\-8\s0,
	892	but Perl doesn't know it yet, you can make Perl a believer, too:
	893	.Sp
	894	.Vb 2
	895	\& use Encode 'decode_utf8';
	896	\& $Unicode = decode_utf8($bytes);
	897	.Ve
	898	.Sp
	899	You can convert well-formed \s-1UTF\-8\s0 to a sequence of bytes, but if
	900	you just want to convert random binary data into \s-1UTF\-8\s0, you can't.
	901	\&\fBNot every random collection of bytes is well-formed \s-1UTF\-8\s0\fR. You can
	902	use \f(CW\*(C`unpack("C*", $string)\*(C'\fR for the former, and you can create
	903	well-formed Unicode data by \f(CW\*(C`pack("U*", 0xff, ...)\*(C'\fR.
	904	.IP "\(bu" 4
	905	How Do I Display Unicode? How Do I Input Unicode?
	906	.Sp
	907	See http://www.alanwood.net/unicode/ and
	908	http://www.cl.cam.ac.uk/~mgk25/unicode.html
	909	.IP "\(bu" 4
	910	How Does Unicode Work With Traditional Locales?
	911	.Sp
	912	In Perl, not very well. Avoid using locales through the \f(CW\*(C`locale\*(C'\fR
	913	pragma. Use only one or the other.
	914	.Sh "Hexadecimal Notation"
	915	.IX Subsection "Hexadecimal Notation"
	916	The Unicode standard prefers using hexadecimal notation because
	917	that more clearly shows the division of Unicode into blocks of 256 characters.
	918	Hexadecimal is also simply shorter than decimal. You can use decimal
	919	notation, too, but learning to use hexadecimal just makes life easier
	920	with the Unicode standard. The \f(CW\*(C`U+HHHH\*(C'\fR notation uses hexadecimal,
	921	for example.
	922	.PP
	923	The \f(CW\*(C`0x\*(C'\fR prefix means a hexadecimal number; the digits are 0\-9 \fIand\fR
	924	a\-f (or A\-F, case doesn't matter). Each hexadecimal digit represents
	925	four bits, or half a byte. \f(CW\*(C`print 0x..., "\en"\*(C'\fR will show a
	926	hexadecimal number in decimal, and \f(CW\*(C`printf "%x\en", $decimal\*(C'\fR will
	927	show a decimal number in hexadecimal. If you have just the
	928	\&\*(L"hex digits\*(R" of a hexadecimal number, you can use the \f(CW\*(C`hex()\*(C'\fR function.
	929	.PP
	930	.Vb 6
	931	\& print 0x0009, "\en"; # 9
	932	\& print 0x000a, "\en"; # 10
	933	\& print 0x000f, "\en"; # 15
	934	\& print 0x0010, "\en"; # 16
	935	\& print 0x0011, "\en"; # 17
	936	\& print 0x0100, "\en"; # 256
	937	.Ve
	938	.PP
	939	.Vb 1
	940	\& print 0x0041, "\en"; # 65
	941	.Ve
	942	.PP
	943	.Vb 2
	944	\& printf "%x\en", 65; # 41
	945	\& printf "%#x\en", 65; # 0x41
	946	.Ve
	947	.PP
	948	.Vb 1
	949	\& print hex("41"), "\en"; # 65
	950	.Ve
	951	.Sh "Further Resources"
	952	.IX Subsection "Further Resources"
	953	.IP "\(bu" 4
	954	Unicode Consortium
	955	.Sp
	956	.Vb 1
	957	\& http://www.unicode.org/
	958	.Ve
	959	.IP "\(bu" 4
	960	Unicode \s-1FAQ\s0
	961	.Sp
	962	.Vb 1
	963	\& http://www.unicode.org/unicode/faq/
	964	.Ve
	965	.IP "\(bu" 4
	966	Unicode Glossary
	967	.Sp
	968	.Vb 1
	969	\& http://www.unicode.org/glossary/
	970	.Ve
	971	.IP "\(bu" 4
	972	Unicode Useful Resources
	973	.Sp
	974	.Vb 1
	975	\& http://www.unicode.org/unicode/onlinedat/resources.html
	976	.Ve
	977	.IP "\(bu" 4
	978	Unicode and Multilingual Support in \s-1HTML\s0, Fonts, Web Browsers and Other Applications
	979	.Sp
	980	.Vb 1
	981	\& http://www.alanwood.net/unicode/
	982	.Ve
	983	.IP "\(bu" 4
	984	\&\s-1UTF\-8\s0 and Unicode \s-1FAQ\s0 for Unix/Linux
	985	.Sp
	986	.Vb 1
	987	\& http://www.cl.cam.ac.uk/~mgk25/unicode.html
	988	.Ve
	989	.IP "\(bu" 4
	990	Legacy Character Sets
	991	.Sp
	992	.Vb 2
	993	\& http://www.czyborra.com/
	994	\& http://www.eki.ee/letter/
	995	.Ve
	996	.IP "\(bu" 4
	997	The Unicode support files live within the Perl installation in the
	998	directory
	999	.Sp
	1000	.Vb 1
	1001	\& $Config{installprivlib}/unicore
	1002	.Ve
	1003	.Sp
	1004	in Perl 5.8.0 or newer, and
	1005	.Sp
	1006	.Vb 1
	1007	\& $Config{installprivlib}/unicode
	1008	.Ve
	1009	.Sp
	1010	in the Perl 5.6 series. (The renaming to \fIlib/unicore\fR was done to
	1011	avoid naming conflicts with lib/Unicode in case-insensitive filesystems.)
	1012	The main Unicode data file is \fIUnicodeData.txt\fR (or \fIUnicode.301\fR in
	1013	Perl 5.6.1.) You can find the \f(CW$Config{installprivlib}\fR by
	1014	.Sp
	1015	.Vb 1
	1016	\& perl "-V:installprivlib"
	1017	.Ve
	1018	.Sp
	1019	You can explore various information from the Unicode data files using
	1020	the \f(CW\*(C`Unicode::UCD\*(C'\fR module.
	1021	.SH "UNICODE IN OLDER PERLS"
	1022	.IX Header "UNICODE IN OLDER PERLS"
	1023	If you cannot upgrade your Perl to 5.8.0 or later, you can still
	1024	do some Unicode processing by using the modules \f(CW\*(C`Unicode::String\*(C'\fR,
	1025	\&\f(CW\*(C`Unicode::Map8\*(C'\fR, and \f(CW\*(C`Unicode::Map\*(C'\fR, available from \s-1CPAN\s0.
	1026	If you have the \s-1GNU\s0 recode installed, you can also use the
	1027	Perl front-end \f(CW\*(C`Convert::Recode\*(C'\fR for character conversions.
	1028	.PP
	1029	The following are fast conversions from \s-1ISO\s0 8859\-1 (Latin\-1) bytes
	1030	to \s-1UTF\-8\s0 bytes and back; the code works even with older Perl 5 versions.
	1031	.PP
	1032	.Vb 2
	1033	\& # ISO 8859-1 to UTF-8
	1034	\& s/([\ex80-\exFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg;
	1035	.Ve
	1036	.PP
	1037	.Vb 2
	1038	\& # UTF-8 to ISO 8859-1
	1039	\& s/([\exC2\exC3])([\ex80-\exBF])/chr(ord($1)<<6&0xC0|ord($2)&0x3F)/eg;
	1040	.Ve
	1041	.SH "SEE ALSO"
	1042	.IX Header "SEE ALSO"
	1043	perlunicode, Encode, encoding, open, utf8, bytes,
	1044	perlretut, Unicode::Collate, Unicode::Normalize, Unicode::UCD
	1045	.SH "ACKNOWLEDGMENTS"
	1046	.IX Header "ACKNOWLEDGMENTS"
	1047	Thanks to the kind readers of the perl5\-porters@perl.org,
	1048	perl\-unicode@perl.org, linux\-utf8@nl.linux.org, and unicore@unicode.org
	1049	mailing lists for their valuable feedback.
	1050	.SH "AUTHOR, COPYRIGHT, AND LICENSE"
	1051	.IX Header "AUTHOR, COPYRIGHT, AND LICENSE"
	1052	Copyright 2001\-2002 Jarkko Hietaniemi <jhi@iki.fi>
	1053	.PP
	1054	This document may be distributed under the same terms as Perl itself.