Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / encoding.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "encoding 3"
132.TH encoding 3 "2002-06-01" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134encoding \- allows you to write your script in non\-ascii or non\-utf8
135.SH "SYNOPSIS"
136.IX Header "SYNOPSIS"
137.Vb 2
138\& use encoding "greek"; # Perl like Greek to you?
139\& use encoding "euc-jp"; # Jperl!
140.Ve
141.PP
142.Vb 1
143\& # or you can even do this if your shell supports your native encoding
144.Ve
145.PP
146.Vb 2
147\& perl -Mencoding=latin2 -e '...' # Feeling centrally European?
148\& perl -Mencoding=euc-kr -e '...' # Or Korean?
149.Ve
150.PP
151.Vb 1
152\& # more control
153.Ve
154.PP
155.Vb 2
156\& # A simple euc-cn => utf-8 converter
157\& use encoding "euc-cn", STDOUT => "utf8"; while(<>){print};
158.Ve
159.PP
160.Vb 2
161\& # "no encoding;" supported (but not scoped!)
162\& no encoding;
163.Ve
164.PP
165.Vb 4
166\& # an alternate way, Filter
167\& use encoding "euc-jp", Filter=>1;
168\& use utf8;
169\& # now you can use kanji identifiers -- in euc-jp!
170.Ve
171.SH "ABSTRACT"
172.IX Header "ABSTRACT"
173Let's start with a bit of history: Perl 5.6.0 introduced Unicode
174support. You could apply \f(CW\*(C`substr()\*(C'\fR and regexes even to complex \s-1CJK\s0
175characters \*(-- so long as the script was written in \s-1UTF\-8\s0. But back
176then, text editors that supported \s-1UTF\-8\s0 were still rare and many users
177instead chose to write scripts in legacy encodings, giving up a whole
178new feature of Perl 5.6.
179.PP
180Rewind to the future: starting from perl 5.8.0 with the \fBencoding\fR
181pragma, you can write your script in any encoding you like (so long
182as the \f(CW\*(C`Encode\*(C'\fR module supports it) and still enjoy Unicode support.
183You can write code in EUC-JP as follows:
184.PP
185.Vb 3
186\& my $Rakuda = "\exF1\exD1\exF1\exCC"; # Camel in Kanji
187\& #<-char-><-char-> # 4 octets
188\& s/\ebCamel\eb/$Rakuda/;
189.Ve
190.PP
191And with \f(CW\*(C`use encoding "euc\-jp"\*(C'\fR in effect, it is the same thing as
192the code in \s-1UTF\-8:\s0
193.PP
194.Vb 2
195\& my $Rakuda = "\ex{99F1}\ex{99DD}"; # two Unicode Characters
196\& s/\ebCamel\eb/$Rakuda/;
197.Ve
198.PP
199The \fBencoding\fR pragma also modifies the filehandle disciplines of
200\&\s-1STDIN\s0 and \s-1STDOUT\s0 (but not \s-1STDERR\s0) to the specified encoding. Therefore,
201.PP
202.Vb 5
203\& use encoding "euc-jp";
204\& my $message = "Camel is the symbol of perl.\en";
205\& my $Rakuda = "\exF1\exD1\exF1\exCC"; # Camel in Kanji
206\& $message =~ s/\ebCamel\eb/$Rakuda/;
207\& print $message;
208.Ve
209.PP
210will print \*(L"\exF1\exD1\exF1\exCC is the symbol of perl.\en\*(R",
211not \*(L"\ex{99F1}\ex{99DD} is the symbol of perl.\en\*(R".
212.PP
213You can override this by giving extra arguments; see below.
214.SH "USAGE"
215.IX Header "USAGE"
216.IP "use encoding [\fI\s-1ENCNAME\s0\fR] ;" 4
217.IX Item "use encoding [ENCNAME] ;"
218Sets the script encoding to \fI\s-1ENCNAME\s0\fR. Filehandle disciplines of
219\&\s-1STDIN\s0 and \s-1STDOUT\s0 are set to ":encoding(\fI\s-1ENCNAME\s0\fR)". Note that \s-1STDERR\s0
220will not be changed.
221.Sp
222If no encoding is specified, the environment variable \s-1PERL_ENCODING\s0
223is consulted. If no encoding can be found, the error \f(CW\*(C`Unknown encoding
224\&'\f(CI\s-1ENCNAME\s0\f(CW'\*(C'\fR will be thrown.
225.Sp
226Note that non-STD file handles remain unaffected. Use \f(CW\*(C`use open\*(C'\fR or
227\&\f(CW\*(C`binmode\*(C'\fR to change disciplines of those.
228.IP "use encoding \fI\s-1ENCNAME\s0\fR [ \s-1STDIN\s0 => \fI\s-1ENCNAME_IN\s0\fR ...] ;" 4
229.IX Item "use encoding ENCNAME [ STDIN => ENCNAME_IN ...] ;"
230You can also individually set encodings of \s-1STDIN\s0 and \s-1STDOUT\s0 via the
231\&\f(CW\*(C`STDIN => \f(CI\s-1ENCNAME\s0\f(CW\*(C'\fR form. In this case, you cannot omit the
232first \fI\s-1ENCNAME\s0\fR. \f(CW\*(C`STDIN => undef\*(C'\fR turns the \s-1IO\s0 transcoding
233completely off.
234.IP "no encoding;" 4
235.IX Item "no encoding;"
236Unsets the script encoding. The disciplines of \s-1STDIN\s0 and \s-1STDOUT\s0 are
237reset to \*(L":raw\*(R" (the default unprocessed raw stream of bytes).
238.SH "CAVEATS"
239.IX Header "CAVEATS"
240.Sh "\s-1NOT\s0 \s-1SCOPED\s0"
241.IX Subsection "NOT SCOPED"
242The pragma is a per script, not a per block lexical. Only the last
243\&\f(CW\*(C`use encoding\*(C'\fR or \f(CW\*(C`no encoding\*(C'\fR matters, and it affects
244\&\fBthe whole script\fR. However, the \f(CW\*(C`no encoding\*(C'\fR pragma is supported and
245\&\fBuse encoding\fR can appear as many times as you want in a given script.
246The multiple use of this pragma is discouraged.
247.PP
248Because of this nature, the use of this pragma inside a module is
249strongly discouraged (because the influence of this pragma lasts not
250only for the module but also for the script that uses it). But if you have to,
251make sure you say \f(CW\*(C`no encoding\*(C'\fR at the end of the module so you
252contain the influence of the pragma within the module.
253.Sh "\s-1DO\s0 \s-1NOT\s0 \s-1MIX\s0 \s-1MULTIPLE\s0 \s-1ENCODINGS\s0"
254.IX Subsection "DO NOT MIX MULTIPLE ENCODINGS"
255Notice that only literals (string or regular expression) having only
256legacy code points are affected: if you mix data like this
257.PP
258.Vb 1
259\& \exDF\ex{100}
260.Ve
261.PP
262the data is assumed to be in (Latin 1 and) Unicode, not in your native
263encoding. In other words, this will match in \*(L"greek\*(R":
264.PP
265.Vb 1
266\& "\exDF" =~ /\ex{3af}/
267.Ve
268.PP
269but this will not
270.PP
271.Vb 1
272\& "\exDF\ex{100}" =~ /\ex{3af}\ex{100}/
273.Ve
274.PP
275since the \f(CW\*(C`\exDF\*(C'\fR (\s-1ISO\s0 8859\-7 \s-1GREEK\s0 \s-1SMALL\s0 \s-1LETTER\s0 \s-1IOTA\s0 \s-1WITH\s0 \s-1TONOS\s0) on
276the left will \fBnot\fR be upgraded to \f(CW\*(C`\ex{3af}\*(C'\fR (Unicode \s-1GREEK\s0 \s-1SMALL\s0
277\&\s-1LETTER\s0 \s-1IOTA\s0 \s-1WITH\s0 \s-1TONOS\s0) because of the \f(CW\*(C`\ex{100}\*(C'\fR in the same string. You
278should not be mixing your legacy data and Unicode in the same string.
279.PP
280This pragma also affects encoding of the 0x80..0xFF code point range:
281normally characters in that range are left as eight-bit bytes (unless
282they are combined with characters with code points 0x100 or larger,
283in which case all characters need to become \s-1UTF\-8\s0 encoded), but if
284the \f(CW\*(C`encoding\*(C'\fR pragma is present, even the 0x80..0xFF range always
285gets \s-1UTF\-8\s0 encoded.
286.PP
287After all, the best thing about this pragma is that you don't have to
288resort to \ex{....} just to spell your name in a native encoding.
289So feel free to put your strings in your encoding in quotes and
290regexes.
291.SH "Non-ASCII Identifiers and Filter option"
292.IX Header "Non-ASCII Identifiers and Filter option"
293The magic of \f(CW\*(C`use encoding\*(C'\fR is not applied to the names of
294identifiers. In order to make \f(CW\*(C`${"\ex{4eba}"}++\*(C'\fR ($human++, where human
295is a single Han ideograph) work, you still need to write your script
296in \s-1UTF\-8\s0 or use a source filter.
297.PP
298In other words, the same restriction as with Jperl applies.
299.PP
300If you dare to experiment, however, you can try the Filter option.
301.IP "use encoding \fI\s-1ENCNAME\s0\fR, Filter=>1;" 4
302.IX Item "use encoding ENCNAME, Filter=>1;"
303This turns the encoding pragma into a source filter. While the default
304approach just decodes interpolated literals (in \fIqq()\fR and \fIqr()\fR), this
305will apply a source filter to the entire source code. In this case,
306\&\s-1STDIN\s0 and \s-1STDOUT\s0 remain untouched.
307.PP
308What does this mean? Your source code behaves as if it is written in
309\&\s-1UTF\-8\s0. So even if your editor only supports Shift_JIS, for example,
310you can still try examples in Chapter 15 of \f(CW\*(C`Programming Perl, 3rd
311Ed.\*(C'\fR. For instance, you can use \s-1UTF\-8\s0 identifiers.
312.PP
313This option is significantly slower. Also, as of this writing, non-ASCII
314identifiers are not very stable even \s-1WITHOUT\s0 this option, with the
315source code written directly in \s-1UTF\-8\s0.
316.PP
317To make your script in legacy encoding work with minimum effort,
318do not use Filter=>1.
319.SH "EXAMPLE \- Greekperl"
320.IX Header "EXAMPLE - Greekperl"
321.Vb 1
322\& use encoding "iso 8859-7";
323.Ve
324.PP
325.Vb 1
326\& # \exDF in ISO 8859-7 (Greek) is \ex{3af} in Unicode.
327.Ve
328.PP
329.Vb 2
330\& $a = "\exDF";
331\& $b = "\ex{100}";
332.Ve
333.PP
334.Vb 1
335\& printf "%#x\en", ord($a); # will print 0x3af, not 0xdf
336.Ve
337.PP
338.Vb 1
339\& $c = $a . $b;
340.Ve
341.PP
342.Vb 1
343\& # $c will be "\ex{3af}\ex{100}", not "\ex{df}\ex{100}".
344.Ve
345.PP
346.Vb 1
347\& # chr() is affected, and ...
348.Ve
349.PP
350.Vb 1
351\& print "mega\en" if ord(chr(0xdf)) == 0x3af;
352.Ve
353.PP
354.Vb 1
355\& # ... ord() is affected by the encoding pragma ...
356.Ve
357.PP
358.Vb 1
359\& print "tera\en" if ord(pack("C", 0xdf)) == 0x3af;
360.Ve
361.PP
362.Vb 1
363\& # ... as are eq and cmp ...
364.Ve
365.PP
366.Vb 2
367\& print "peta\en" if "\ex{3af}" eq pack("C", 0xdf);
368\& print "exa\en" if "\ex{3af}" cmp pack("C", 0xdf) == 0;
369.Ve
370.PP
371.Vb 2
372\& # ... but pack/unpack C are not affected, in case you still
373\& # want to go back to your native encoding
374.Ve
375.PP
376.Vb 1
377\& print "zetta\en" if unpack("C", (pack("C", 0xdf))) == 0xdf;
378.Ve
379.SH "KNOWN PROBLEMS"
380.IX Header "KNOWN PROBLEMS"
381For native multibyte encodings (either fixed or variable length),
382the current implementation of the regular expressions may introduce
383recoding errors for regular expression literals longer than 127 bytes.
384.PP
385The encoding pragma is not supported on \s-1EBCDIC\s0 platforms.
386(Porters who are willing and able to remove this limitation are
387welcome.)
388.SH "SEE ALSO"
389.IX Header "SEE ALSO"
390perlunicode, Encode, open, Filter::Util::Call
391.PP
392Ch. 15 of \f(CW\*(C`Programming Perl (3rd Edition)\*(C'\fR
393by Larry Wall, Tom Christiansen, Jon Orwant;
394O'Reilly & Associates; \s-1ISBN\s0 0\-596\-00027\-8