Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "ENC2XS 1" | |
132 | .TH ENC2XS 1 "2007-06-19" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | enc2xs \-\- Perl Encode Module Generator | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 3 | |
138 | \& enc2xs -[options] | |
139 | \& enc2xs -M ModName mapfiles... | |
140 | \& enc2xs -C | |
141 | .Ve | |
142 | .SH "DESCRIPTION" | |
143 | .IX Header "DESCRIPTION" | |
144 | \&\fIenc2xs\fR builds a Perl extension for use by Encode from either | |
145 | Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). | |
146 | Besides being used internally during the build process of the Encode | |
147 | module, you can use \fIenc2xs\fR to add your own encoding to perl. | |
148 | No knowledge of \s-1XS\s0 is necessary. | |
149 | .SH "Quick Guide" | |
150 | .IX Header "Quick Guide" | |
151 | If you want to know as little about Perl as possible but need to | |
152 | add a new encoding, just read this chapter and forget the rest. | |
153 | .IP "0." 4 | |
154 | Have a .ucm file ready. You can get it from somewhere or you can write | |
155 | your own from scratch or you can grab one from the Encode distribution | |
156 | and customize it. For the \s-1UCM\s0 format, see the next Chapter. In the | |
157 | example below, I'll call my theoretical encoding myascii, defined | |
158 | in \fImy.ucm\fR. \f(CW\*(C`$\*(C'\fR is a shell prompt. | |
159 | .Sp | |
160 | .Vb 2 | |
161 | \& $ ls -F | |
162 | \& my.ucm | |
163 | .Ve | |
164 | .IP "1." 4 | |
165 | Issue a command as follows: | |
166 | .Sp | |
167 | .Vb 5 | |
168 | \& $ enc2xs -M My my.ucm | |
169 | \& generating Makefile.PL | |
170 | \& generating My.pm | |
171 | \& generating README | |
172 | \& generating Changes | |
173 | .Ve | |
174 | .Sp | |
175 | Now take a look at your current directory. It should look like this. | |
176 | .Sp | |
177 | .Vb 2 | |
178 | \& $ ls -F | |
179 | \& Makefile.PL My.pm my.ucm t/ | |
180 | .Ve | |
181 | .Sp | |
182 | The following files were created. | |
183 | .Sp | |
184 | .Vb 3 | |
185 | \& Makefile.PL - MakeMaker script | |
186 | \& My.pm - Encode submodule | |
187 | \& t/My.t - test file | |
188 | .Ve | |
189 | .RS 4 | |
190 | .IP "1.1." 4 | |
191 | .IX Item "1.1." | |
192 | If you want *.ucm installed together with the modules, do as follows: | |
193 | .Sp | |
194 | .Vb 3 | |
195 | \& $ mkdir Encode | |
196 | \& $ mv *.ucm Encode | |
197 | \& $ enc2xs -M My Encode/*ucm | |
198 | .Ve | |
199 | .RE | |
200 | .RS 4 | |
201 | .RE | |
202 | .IP "2." 4 | |
203 | Edit the files generated. You don't have to if you have no time \s-1AND\s0 no | |
204 | intention to give it to someone else. But it is a good idea to edit | |
205 | the pod and to add more tests. | |
206 | .IP "3." 4 | |
207 | Now issue a command all Perl Mongers love: | |
208 | .Sp | |
209 | .Vb 2 | |
210 | \& $ perl Makefile.PL | |
211 | \& Writing Makefile for Encode::My | |
212 | .Ve | |
213 | .IP "4." 4 | |
214 | Now all you have to do is make. | |
215 | .Sp | |
216 | .Vb 12 | |
217 | \& $ make | |
218 | \& cp My.pm blib/lib/Encode/My.pm | |
219 | \& /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \e | |
220 | \& -o encode_t.c -f encode_t.fnm | |
221 | \& Reading myascii (myascii) | |
222 | \& Writing compiled form | |
223 | \& 128 bytes in string tables | |
224 | \& 384 bytes (75%) saved spotting duplicates | |
225 | \& 1 bytes (0.775%) saved using substrings | |
226 | \& .... | |
227 | \& chmod 644 blib/arch/auto/Encode/My/My.bs | |
228 | \& $ | |
229 | .Ve | |
230 | .Sp | |
231 | The time it takes varies depending on how fast your machine is and | |
232 | how large your encoding is. Unless you are working on something big | |
233 | like euc\-tw, it won't take too long. | |
234 | .IP "5." 4 | |
235 | You can \*(L"make install\*(R" already but you should test first. | |
236 | .Sp | |
237 | .Vb 8 | |
238 | \& $ make test | |
239 | \& PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \e | |
240 | \& -e 'use Test::Harness qw(&runtests $verbose); \e | |
241 | \& $verbose=0; runtests @ARGV;' t/*.t | |
242 | \& t/My....ok | |
243 | \& All tests successful. | |
244 | \& Files=1, Tests=2, 0 wallclock secs | |
245 | \& ( 0.09 cusr + 0.01 csys = 0.09 CPU) | |
246 | .Ve | |
247 | .IP "6." 4 | |
248 | If you are content with the test result, just \*(L"make install\*(R". | |
249 | .IP "7." 4 | |
250 | If you want to add your encoding to Encode's demand-loading list | |
251 | (so you don't have to \*(L"use Encode::YourEncoding\*(R"), run | |
252 | .Sp | |
253 | .Vb 1 | |
254 | \& enc2xs -C | |
255 | .Ve | |
256 | .Sp | |
257 | to update Encode::ConfigLocal, a module that controls local settings. | |
258 | After that, \*(L"use Encode;\*(R" is enough to load your encodings on demand. | |
259 | .SH "The Unicode Character Map" | |
260 | .IX Header "The Unicode Character Map" | |
261 | Encode uses the Unicode Character Map (\s-1UCM\s0) format for source character | |
262 | mappings. This format is used by \s-1IBM\s0's \s-1ICU\s0 package and was adopted | |
263 | by Nick Ing-Simmons for use with the Encode module. Since \s-1UCM\s0 is | |
264 | more flexible than Tcl's Encoding Map and far more user\-friendly, | |
265 | this is the recommended format for Encode now. | |
266 | .PP | |
267 | A \s-1UCM\s0 file looks like this. | |
268 | .PP | |
269 | .Vb 19 | |
270 | \& # | |
271 | \& # Comments | |
272 | \& # | |
273 | \& <code_set_name> "US-ascii" # Required | |
274 | \& <code_set_alias> "ascii" # Optional | |
275 | \& <mb_cur_min> 1 # Required; usually 1 | |
276 | \& <mb_cur_max> 1 # Max. # of bytes/char | |
277 | \& <subchar> \ex3F # Substitution char | |
278 | \& # | |
279 | \& CHARMAP | |
280 | \& <U0000> \ex00 |0 # <control> | |
281 | \& <U0001> \ex01 |0 # <control> | |
282 | \& <U0002> \ex02 |0 # <control> | |
283 | \& .... | |
284 | \& <U007C> \ex7C |0 # VERTICAL LINE | |
285 | \& <U007D> \ex7D |0 # RIGHT CURLY BRACKET | |
286 | \& <U007E> \ex7E |0 # TILDE | |
287 | \& <U007F> \ex7F |0 # <control> | |
288 | \& END CHARMAP | |
289 | .Ve | |
290 | .IP "\(bu" 4 | |
291 | Anything that follows \f(CW\*(C`#\*(C'\fR is treated as a comment. | |
292 | .IP "\(bu" 4 | |
293 | The header section continues until a line containing the word | |
294 | \&\s-1CHARMAP\s0. This section has a form of \fI<keyword> value\fR, one | |
295 | pair per line. Strings used as values must be quoted. Barewords are | |
296 | treated as numbers. \fI\exXX\fR represents a byte. | |
297 | .Sp | |
298 | Most of the keywords are self\-explanatory. \fIsubchar\fR means | |
299 | substitution character, not subcharacter. When you encode a Unicode | |
300 | sequence to this encoding but no matching character is found, the byte | |
301 | sequence defined here will be used. For most cases, the value here is | |
302 | \&\ex3F; in \s-1ASCII\s0, this is a question mark. | |
303 | .IP "\(bu" 4 | |
304 | \&\s-1CHARMAP\s0 starts the character map section. Each line has a form as | |
305 | follows: | |
306 | .Sp | |
307 | .Vb 5 | |
308 | \& <UXXXX> \exXX.. |0 # comment | |
309 | \& ^ ^ ^ | |
310 | \& | | +- Fallback flag | |
311 | \& | +-------- Encoded byte sequence | |
312 | \& +-------------- Unicode Character ID in hex | |
313 | .Ve | |
314 | .Sp | |
315 | The format is roughly the same as a header section except for the | |
316 | fallback flag: | followed by 0..3. The meaning of the possible | |
317 | values is as follows: | |
318 | .RS 4 | |
319 | .IP "|0" 4 | |
320 | .IX Item "|0" | |
321 | Round trip safe. A character decoded to Unicode encodes back to the | |
322 | same byte sequence. Most characters have this flag. | |
323 | .IP "|1" 4 | |
324 | .IX Item "|1" | |
325 | Fallback for unicode \-> encoding. When seen, enc2xs adds this | |
326 | character for the encode map only. | |
327 | .IP "|2" 4 | |
328 | .IX Item "|2" | |
329 | Skip sub-char mapping should there be no code point. | |
330 | .IP "|3" 4 | |
331 | .IX Item "|3" | |
332 | Fallback for encoding \-> unicode. When seen, enc2xs adds this | |
333 | character for the decode map only. | |
334 | .RE | |
335 | .RS 4 | |
336 | .RE | |
337 | .IP "\(bu" 4 | |
338 | And finally, \s-1END\s0 \s-1CHARMAP\s0 ends the section. | |
339 | .PP | |
340 | When you are manually creating a \s-1UCM\s0 file, you should copy ascii.ucm | |
341 | or an existing encoding which is close to yours, rather than write | |
342 | your own from scratch. | |
343 | .PP | |
344 | When you do so, make sure you leave at least \fBU0000\fR to \fBU0020\fR as | |
345 | is, unless your environment is \s-1EBCDIC\s0. | |
346 | .PP | |
347 | \&\fB\s-1CAVEAT\s0\fR: not all features in \s-1UCM\s0 are implemented. For example, | |
348 | icu:state is not used. Because of that, you need to write a perl | |
349 | module if you want to support algorithmic encodings, notably | |
350 | the \s-1ISO\-2022\s0 series. Such modules include Encode::JP::2022_JP, | |
351 | Encode::KR::2022_KR, and Encode::TW::HZ. | |
352 | .Sh "Coping with duplicate mappings" | |
353 | .IX Subsection "Coping with duplicate mappings" | |
354 | When you create a map, you \s-1SHOULD\s0 make your mappings round-trip safe. | |
355 | That is, \f(CW\*(C`encode('your\-encoding', decode('your\-encoding', $data)) eq | |
356 | $data\*(C'\fR holds for all characters that are marked as \f(CW\*(C`|0\*(C'\fR. Here is | |
357 | how to make sure: | |
358 | .IP "\(bu" 4 | |
359 | Sort your map in Unicode order. | |
360 | .IP "\(bu" 4 | |
361 | When you have a duplicate entry, mark either one with '|1' or '|3'. | |
362 | .IP "\(bu" 4 | |
363 | And make sure the '|1' or '|3' entry \s-1FOLLOWS\s0 the '|0' entry. | |
364 | .PP | |
365 | Here is an example from big5\-eten. | |
366 | .PP | |
367 | .Vb 2 | |
368 | \& <U2550> \exF9\exF9 |0 | |
369 | \& <U2550> \exA2\exA4 |3 | |
370 | .Ve | |
371 | .PP | |
372 | Internally, the Encoding \-> Unicode and Unicode \-> Encoding maps look like | |
373 | this: | |
374 | .PP | |
375 | .Vb 4 | |
376 | \& E to U U to E | |
377 | \& -------------------------------------- | |
378 | \& \exF9\exF9 => U2550 U2550 => \exF9\exF9 | |
379 | \& \exA2\exA4 => U2550 | |
380 | .Ve | |
381 | .PP | |
382 | So it is round-trip safe for \exF9\exF9. But if the two lines above are in | |
383 | the reverse order, here is what happens. | |
384 | .PP | |
385 | .Vb 4 | |
386 | \& E to U U to E | |
387 | \& -------------------------------------- | |
388 | \& \exA2\exA4 => U2550 U2550 => \exF9\exF9 | |
389 | \& (\exF9\exF9 => U2550 is now overwritten!) | |
390 | .Ve | |
391 | .PP | |
392 | The Encode package comes with \fIucmlint\fR, a crude but sufficient | |
393 | utility to check the integrity of a \s-1UCM\s0 file. Check under the | |
394 | Encode/bin directory for this. | |
395 | .PP | |
396 | When in doubt, you can use \fIucmsort\fR, yet another utility under the | |
397 | Encode/bin directory. | |
398 | .SH "Bookmarks" | |
399 | .IX Header "Bookmarks" | |
400 | .IP "\(bu" 4 | |
401 | \&\s-1ICU\s0 Home Page | |
402 | <http://oss.software.ibm.com/icu/> | |
403 | .IP "\(bu" 4 | |
404 | \&\s-1ICU\s0 Character Mapping Tables | |
405 | <http://oss.software.ibm.com/icu/charset/> | |
406 | .IP "\(bu" 4 | |
407 | ICU:Conversion Data | |
408 | <http://oss.software.ibm.com/icu/userguide/conversion\-data.html> | |
409 | .SH "SEE ALSO" | |
410 | .IX Header "SEE ALSO" | |
411 | Encode, | |
412 | perlmod, | |
413 | perlpod |