Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLPACKTUT 1" | |
132 | .TH PERLPACKTUT 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perlpacktut \- tutorial on \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | \&\f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR are two functions for transforming data according | |
138 | to a user-defined template, between the guarded way Perl stores values | |
139 | and some well-defined representation as might be required in the | |
140 | environment of a Perl program. Unfortunately, they're also two of | |
141 | the most misunderstood and most often overlooked functions that Perl | |
142 | provides. This tutorial will demystify them for you. | |
143 | .SH "The Basic Principle" | |
144 | .IX Header "The Basic Principle" | |
145 | Most programming languages don't shelter the memory where variables are | |
146 | stored. In C, for instance, you can take the address of some variable, | |
147 | and the \f(CW\*(C`sizeof\*(C'\fR operator tells you how many bytes are allocated to | |
148 | the variable. Using the address and the size, you may access the storage | |
149 | to your heart's content. | |
150 | .PP | |
151 | In Perl, you just can't access memory at random, but the structural and | |
152 | representational conversion provided by \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR is an | |
153 | excellent alternative. The \f(CW\*(C`pack\*(C'\fR function converts values to a byte | |
154 | sequence containing representations according to a given specification, | |
155 | the so-called \*(L"template\*(R" argument. \f(CW\*(C`unpack\*(C'\fR is the reverse process, | |
156 | deriving some values from the contents of a string of bytes. (Be cautioned, | |
157 | however, that not all that has been packed together can be neatly unpacked \- | |
158 | a very common experience as seasoned travellers are likely to confirm.) | |
159 | .PP | |
160 | Why, you may ask, would you need a chunk of memory containing some values | |
161 | in binary representation? One good reason is input and output accessing | |
162 | some file, a device, or a network connection, whereby this binary | |
163 | representation is either forced on you or will give you some benefit | |
164 | in processing. Another cause is passing data to some system call that | |
165 | is not available as a Perl function: \f(CW\*(C`syscall\*(C'\fR requires you to provide | |
166 | parameters stored in the way it happens in a C program. Even text processing | |
167 | (as shown in the next section) may be simplified with judicious usage | |
168 | of these two functions. | |
169 | .PP | |
170 | To see how (un)packing works, we'll start with a simple template | |
171 | code where the conversion is in low gear: between the contents of a byte | |
172 | sequence and a string of hexadecimal digits. Let's use \f(CW\*(C`unpack\*(C'\fR, since | |
173 | this is likely to remind you of a dump program, or some desperate last | |
174 | message unfortunate programs are wont to throw at you before they expire | |
175 | into the wild blue yonder. Assuming that the variable \f(CW$mem\fR holds a | |
176 | sequence of bytes that we'd like to inspect without assuming anything | |
177 | about its meaning, we can write | |
178 | .PP | |
179 | .Vb 2 | |
180 | \& my( $hex ) = unpack( 'H*', $mem ); | |
181 | \& print "$hex\en"; | |
182 | .Ve | |
183 | .PP | |
184 | whereupon we might see something like this, with each pair of hex digits | |
185 | corresponding to a byte: | |
186 | .PP | |
187 | .Vb 1 | |
188 | \& 41204d414e204120504c414e20412043414e414c2050414e414d41 | |
189 | .Ve | |
190 | .PP | |
191 | What was in this chunk of memory? Numbers, characters, or a mixture of | |
192 | both? Assuming that we're on a computer where \s-1ASCII\s0 (or some similar) | |
193 | encoding is used: hexadecimal values in the range \f(CW0x41\fR \- \f(CW0x5A\fR |
194 | indicate an uppercase letter, and \f(CW0x20\fR encodes a space. So we might | |
195 | assume it is a piece of text, which some are able to read like a tabloid; | |
196 | but others will have to get hold of an \s-1ASCII\s0 table and relive that | |
197 | first-grader feeling. Not caring too much about which way to read this, |
198 | we note that \f(CW\*(C`unpack\*(C'\fR with the template code \f(CW\*(C`H\*(C'\fR converts the contents | |
199 | of a sequence of bytes into the customary hexadecimal notation. Since | |
200 | \&\*(L"a sequence of\*(R" is a pretty vague indication of quantity, \f(CW\*(C`H\*(C'\fR has been | |
201 | defined to convert just a single hexadecimal digit unless it is followed | |
202 | by a repeat count. An asterisk for the repeat count means to use whatever | |
203 | remains. | |
204 | .PP | |
205 | The inverse operation \- packing byte contents from a string of hexadecimal | |
206 | digits \- is just as easily written. For instance: | |
207 | .PP | |
208 | .Vb 2 | |
209 | \& my $s = pack( 'H2' x 10, map { "3$_" } ( 0..9 ) ); | |
210 | \& print "$s\en"; | |
211 | .Ve | |
212 | .PP | |
213 | Since we feed a list of ten 2\-digit hexadecimal strings to \f(CW\*(C`pack\*(C'\fR, the | |
214 | pack template should contain ten pack codes. If this is run on a computer | |
215 | with \s-1ASCII\s0 character coding, it will print \f(CW0123456789\fR. | |
216 | .SH "Packing Text" | |
217 | .IX Header "Packing Text" | |
218 | Let's suppose you've got to read in a data file like this: | |
219 | .PP | |
220 | .Vb 4 | |
221 | \& Date |Description | Income|Expenditure | |
222 | \& 01/24/2001 Ahmed's Camel Emporium 1147.99 | |
223 | \& 01/28/2001 Flea spray 24.99 | |
224 | \& 01/29/2001 Camel rides to tourists 1235.00 |
225 | .Ve | |
226 | .PP | |
227 | How do we do it? You might think first to use \f(CW\*(C`split\*(C'\fR; however, since | |
228 | \&\f(CW\*(C`split\*(C'\fR collapses blank fields, you'll never know whether a record was | |
229 | income or expenditure. Oops. Well, you could always use \f(CW\*(C`substr\*(C'\fR: | |
230 | .PP | |
231 | .Vb 7 | |
232 | \& while (<>) { | |
233 | \& my $date = substr($_, 0, 11); | |
234 | \& my $desc = substr($_, 12, 27); | |
235 | \& my $income = substr($_, 40, 7); | |
236 | \& my $expend = substr($_, 52, 7); | |
237 | \& ... | |
238 | \& } | |
239 | .Ve | |
240 | .PP | |
241 | It's not really a barrel of laughs, is it? In fact, it's worse than it | |
242 | may seem; the eagle-eyed may notice that the first field should only be | |
243 | 10 characters wide, and the error has propagated right through the other | |
244 | numbers \- which we've had to count by hand. So it's error-prone as well | |
245 | as horribly unfriendly. | |
246 | .PP | |
247 | Or maybe we could use regular expressions: | |
248 | .PP | |
249 | .Vb 5 | |
250 | \& while (<>) { | |
251 | \& my($date, $desc, $income, $expend) = | |
252 | \& m|(\ed\ed/\ed\ed/\ed{4}) (.{27}) (.{7})(.*)|; | |
253 | \& ... | |
254 | \& } | |
255 | .Ve | |
256 | .PP | |
257 | Urgh. Well, it's a bit better, but \- well, would you want to maintain | |
258 | that? | |
259 | .PP | |
260 | Hey, isn't Perl supposed to make this sort of thing easy? Well, it does, | |
261 | if you use the right tools. \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR are designed to help | |
262 | you out when dealing with fixed-width data like the above. Let's have a | |
263 | look at a solution with \f(CW\*(C`unpack\*(C'\fR: | |
264 | .PP | |
265 | .Vb 4 | |
266 | \& while (<>) { | |
267 | \& my($date, $desc, $income, $expend) = unpack("A10xA27xA7xA*", $_); |
268 | \& ... | |
269 | \& } | |
270 | .Ve | |
271 | .PP | |
272 | That looks a bit nicer; but we've got to take apart that weird template. | |
273 | Where did I pull that out of? | |
274 | .PP | |
275 | \&\s-1OK\s0, let's have a look at some of our data again; in fact, we'll include | |
276 | the headers, and a handy ruler so we can keep track of where we are. | |
277 | .PP | |
278 | .Vb 5 | |
279 | \& 1 2 3 4 5 | |
280 | \& 1234567890123456789012345678901234567890123456789012345678 | |
281 | \& Date |Description | Income|Expenditure | |
282 | \& 01/28/2001 Flea spray 24.99 | |
283 | \& 01/29/2001 Camel rides to tourists 1235.00 |
284 | .Ve | |
285 | .PP | |
286 | From this, we can see that the date column stretches from column 1 to | |
287 | column 10 \- ten characters wide. The \f(CW\*(C`pack\*(C'\fR\-ese for \*(L"character\*(R" is | |
288 | \&\f(CW\*(C`A\*(C'\fR, and ten of them are \f(CW\*(C`A10\*(C'\fR. So if we just wanted to extract the | |
289 | dates, we could say this: | |
290 | .PP | |
291 | .Vb 1 | |
292 | \& my($date) = unpack("A10", $_); | |
293 | .Ve | |
294 | .PP | |
295 | \&\s-1OK\s0, what's next? Between the date and the description is a blank column; | |
296 | we want to skip over that. The \f(CW\*(C`x\*(C'\fR template means \*(L"skip forward\*(R", so we | |
297 | want one of those. Next, we have another batch of characters, from 12 to | |
298 | 38. That's 27 more characters, hence \f(CW\*(C`A27\*(C'\fR. (Don't make the fencepost | |
299 | error \- there are 27 characters between 12 and 38, not 26. Count 'em!) | |
300 | .PP | |
301 | Now we skip another character and pick up the next 7 characters: | |
302 | .PP | |
303 | .Vb 1 | |
304 | \& my($date,$description,$income) = unpack("A10xA27xA7", $_); | |
305 | .Ve | |
306 | .PP | |
307 | Now comes the clever bit. Lines in our ledger which are just income and | |
308 | not expenditure might end at column 46. Hence, we don't want to tell our | |
309 | \&\f(CW\*(C`unpack\*(C'\fR pattern that we \fBneed\fR to find another 12 characters; we'll | |
310 | just say \*(L"if there's anything left, take it\*(R". As you might guess from | |
311 | regular expressions, that's what the \f(CW\*(C`*\*(C'\fR means: \*(L"use everything | |
312 | remaining\*(R". | |
313 | .IP "\(bu" 3 | |
314 | Be warned, though, that unlike regular expressions, if the \f(CW\*(C`unpack\*(C'\fR | |
315 | template doesn't match the incoming data, Perl will scream and die. | |
316 | .PP | |
317 | Hence, putting it all together: | |
318 | .PP | |
319 | .Vb 1 | |
320 | \& my($date,$description,$income,$expend) = unpack("A10xA27xA7xA*", $_); | |
321 | .Ve | |
322 | .PP | |
323 | Now, that's our data parsed. I suppose what we might want to do now is | |
324 | total up our income and expenditure, and add another line to the end of | |
325 | our ledger \- in the same format \- saying how much we've brought in and | |
326 | how much we've spent: | |
327 | .PP | |
328 | .Vb 5 | |
329 | \& while (<>) { | |
330 | \& my($date, $desc, $income, $expend) = unpack("A10xA27xA7xA*", $_); | |
331 | \& $tot_income += $income; | |
332 | \& $tot_expend += $expend; | |
333 | \& } | |
334 | .Ve | |
335 | .PP | |
336 | .Vb 2 | |
337 | \& $tot_income = sprintf("%.2f", $tot_income); # Get them into | |
338 | \& $tot_expend = sprintf("%.2f", $tot_expend); # "financial" format | |
339 | .Ve | |
340 | .PP | |
341 | .Vb 1 | |
342 | \& $date = POSIX::strftime("%m/%d/%Y", localtime); | |
343 | .Ve | |
344 | .PP | |
345 | .Vb 1 | |
346 | \& # OK, let's go: | |
347 | .Ve | |
348 | .PP | |
349 | .Vb 1 | |
350 | \& print pack("A10xA27xA7xA*", $date, "Totals", $tot_income, $tot_expend); | |
351 | .Ve | |
352 | .PP | |
353 | Oh, hmm. That didn't quite work. Let's see what happened: | |
354 | .PP | |
355 | .Vb 4 | |
356 | \& 01/24/2001 Ahmed's Camel Emporium 1147.99 | |
357 | \& 01/28/2001 Flea spray 24.99 | |
358 | \& 01/29/2001 Camel rides to tourists 1235.00 | |
359 | \& 03/23/2001Totals 1235.001172.98 | |
360 | .Ve | |
361 | .PP | |
362 | \&\s-1OK\s0, it's a start, but what happened to the spaces? We put \f(CW\*(C`x\*(C'\fR, didn't | |
363 | we? Shouldn't it skip forward? Let's look at what \*(L"pack\*(R" in perlfunc says: | |
364 | .PP | |
365 | .Vb 1 | |
366 | \& x A null byte. | |
367 | .Ve | |
368 | .PP | |
369 | Urgh. No wonder. There's a big difference between \*(L"a null byte\*(R", | |
370 | character zero, and \*(L"a space\*(R", character 32. Perl's put something | |
371 | between the date and the description \- but unfortunately, we can't see | |
372 | it! | |
373 | .PP | |
374 | What we actually need to do is expand the width of the fields. The \f(CW\*(C`A\*(C'\fR | |
375 | format pads any non-existent characters with spaces, so we can use the | |
376 | additional spaces to line up our fields, like this: | |
377 | .PP | |
378 | .Vb 1 | |
379 | \& print pack("A11 A28 A8 A*", $date, "Totals", $tot_income, $tot_expend); | |
380 | .Ve | |
381 | .PP | |
382 | (Note that you can put spaces in the template to make it more readable, | |
383 | but they don't translate to spaces in the output.) Here's what we got | |
384 | this time: | |
385 | .PP | |
386 | .Vb 4 | |
387 | \& 01/24/2001 Ahmed's Camel Emporium 1147.99 | |
388 | \& 01/28/2001 Flea spray 24.99 | |
389 | \& 01/29/2001 Camel rides to tourists 1235.00 | |
390 | \& 03/23/2001 Totals 1235.00 1172.98 | |
391 | .Ve | |
392 | .PP | |
393 | That's a bit better, but we still have that last column which needs to | |
394 | be moved further over. There's an easy way to fix this up: | |
395 | unfortunately, we can't get \f(CW\*(C`pack\*(C'\fR to right-justify our fields, but we | |
396 | can get \f(CW\*(C`sprintf\*(C'\fR to do it: | |
397 | .PP | |
398 | .Vb 4 | |
399 | \& $tot_income = sprintf("%.2f", $tot_income); | |
400 | \& $tot_expend = sprintf("%12.2f", $tot_expend); | |
401 | \& $date = POSIX::strftime("%m/%d/%Y", localtime); | |
402 | \& print pack("A11 A28 A8 A*", $date, "Totals", $tot_income, $tot_expend); | |
403 | .Ve | |
404 | .PP | |
405 | This time we get the right answer: | |
406 | .PP | |
407 | .Vb 3 | |
408 | \& 01/28/2001 Flea spray 24.99 | |
409 | \& 01/29/2001 Camel rides to tourists 1235.00 | |
410 | \& 03/23/2001 Totals 1235.00 1172.98 | |
411 | .Ve | |
412 | .PP | |
413 | So that's how we consume and produce fixed-width data. Let's recap what | |
414 | we've seen of \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR so far: | |
415 | .IP "\(bu" 3 | |
416 | Use \f(CW\*(C`pack\*(C'\fR to go from several pieces of data to one fixed-width | |
417 | version; use \f(CW\*(C`unpack\*(C'\fR to turn a fixed-width-format string into several | |
418 | pieces of data. | |
419 | .IP "\(bu" 3 | |
420 | The pack format \f(CW\*(C`A\*(C'\fR means \*(L"any character\*(R"; if you're \f(CW\*(C`pack\*(C'\fRing and | |
421 | you've run out of things to pack, \f(CW\*(C`pack\*(C'\fR will fill the rest up with | |
422 | spaces. | |
423 | .IP "\(bu" 3 | |
424 | \&\f(CW\*(C`x\*(C'\fR means \*(L"skip a byte\*(R" when \f(CW\*(C`unpack\*(C'\fRing; when \f(CW\*(C`pack\*(C'\fRing, it means | |
425 | \&\*(L"introduce a null byte\*(R" \- that's probably not what you mean if you're | |
426 | dealing with plain text. | |
427 | .IP "\(bu" 3 | |
428 | You can follow the formats with numbers to say how many characters | |
429 | should be affected by that format: \f(CW\*(C`A12\*(C'\fR means \*(L"take 12 characters\*(R"; | |
430 | \&\f(CW\*(C`x6\*(C'\fR means \*(L"skip 6 bytes\*(R" or \*(L"character 0, 6 times\*(R". | |
431 | .IP "\(bu" 3 | |
432 | Instead of a number, you can use \f(CW\*(C`*\*(C'\fR to mean \*(L"consume everything else | |
433 | left\*(R". | |
434 | .Sp | |
435 | \&\fBWarning\fR: when packing multiple pieces of data, \f(CW\*(C`*\*(C'\fR only means | |
436 | \&\*(L"consume all of the current piece of data\*(R". That's to say | |
437 | .Sp | |
438 | .Vb 1 | |
439 | \& pack("A*A*", $one, $two) | |
440 | .Ve | |
441 | .Sp | |
442 | packs all of \f(CW$one\fR into the first \f(CW\*(C`A*\*(C'\fR and then all of \f(CW$two\fR into | |
443 | the second. This is a general principle: each format character | |
444 | corresponds to one piece of data to be \f(CW\*(C`pack\*(C'\fRed. | |
445 | .SH "Packing Numbers" | |
446 | .IX Header "Packing Numbers" | |
447 | So much for textual data. Let's get onto the meaty stuff that \f(CW\*(C`pack\*(C'\fR | |
448 | and \f(CW\*(C`unpack\*(C'\fR are best at: handling binary formats for numbers. There is, | |
449 | of course, not just one binary format \- life would be too simple \- but | |
450 | Perl will do all the finicky labor for you. | |
451 | .Sh "Integers" | |
452 | .IX Subsection "Integers" | |
453 | Packing and unpacking numbers implies conversion to and from some | |
454 | \&\fIspecific\fR binary representation. Leaving floating point numbers | |
455 | aside for the moment, the salient properties of any such representation | |
456 | are: | |
457 | .IP "\(bu" 4 | |
458 | the number of bytes used for storing the integer, | |
459 | .IP "\(bu" 4 | |
460 | whether the contents are interpreted as a signed or unsigned number, | |
461 | .IP "\(bu" 4 | |
462 | the byte ordering: whether the first byte is the least or most | |
463 | significant byte (or: little-endian or big\-endian, respectively). | |
464 | .PP | |
465 | So, for instance, to pack 20302 to a signed 16 bit integer in your | |
466 | computer's representation you write | |
467 | .PP | |
468 | .Vb 1 | |
469 | \& my $ps = pack( 's', 20302 ); | |
470 | .Ve | |
471 | .PP | |
472 | Again, the result is a string, now containing 2 bytes. If you print | |
473 | this string (which is, generally, not recommended) you might see | |
474 | \&\f(CW\*(C`ON\*(C'\fR or \f(CW\*(C`NO\*(C'\fR (depending on your system's byte ordering) \- or something | |
475 | entirely different if your computer doesn't use \s-1ASCII\s0 character encoding. | |
476 | Unpacking \f(CW$ps\fR with the same template returns the original integer value: | |
477 | .PP | |
478 | .Vb 1 | |
479 | \& my( $s ) = unpack( 's', $ps ); | |
480 | .Ve | |
481 | .PP | |
482 | This is true for all numeric template codes. But don't expect miracles: | |
483 | if the packed value exceeds the allotted byte capacity, high order bits | |
484 | are silently discarded, and unpack certainly won't be able to pull them | |
485 | back out of some magic hat. And, when you pack using a signed template | |
486 | code such as \f(CW\*(C`s\*(C'\fR, an excess value may result in the sign bit | |
487 | getting set, and unpacking this will smartly return a negative value. | |
488 | .PP | |
489 | 16 bits won't get you too far with integers, but there are \f(CW\*(C`l\*(C'\fR and \f(CW\*(C`L\*(C'\fR |
490 | for signed and unsigned 32\-bit integers. And if this is not enough and | |
491 | your system supports 64 bit integers you can push the limits much closer | |
492 | to infinity with pack codes \f(CW\*(C`q\*(C'\fR and \f(CW\*(C`Q\*(C'\fR. A notable exception is provided | |
493 | by pack codes \f(CW\*(C`i\*(C'\fR and \f(CW\*(C`I\*(C'\fR for signed and unsigned integers of the | |
494 | \&\*(L"local custom\*(R" variety: Such an integer will take up as many bytes as | |
495 | a local C compiler returns for \f(CW\*(C`sizeof(int)\*(C'\fR, but it'll use \fIat least\fR | |
496 | 32 bits. | |
497 | .PP | |
498 | Each of the integer pack codes \f(CW\*(C`sSlLqQ\*(C'\fR results in a fixed number of bytes, | |
499 | no matter where you execute your program. This may be useful for some | |
500 | applications, but it does not provide for a portable way to pass data | |
501 | structures between Perl and C programs (bound to happen when you call | |
502 | \&\s-1XS\s0 extensions or the Perl function \f(CW\*(C`syscall\*(C'\fR), or when you read or | |
503 | write binary files. What you'll need in this case are template codes that | |
504 | depend on what your local C compiler compiles when you code \f(CW\*(C`short\*(C'\fR or | |
505 | \&\f(CW\*(C`unsigned long\*(C'\fR, for instance. These codes and their corresponding | |
506 | byte lengths are shown in the table below. Since the C standard leaves | |
507 | much leeway with respect to the relative sizes of these data types, actual | |
508 | values may vary, and that's why the values are given as expressions in | |
509 | C and Perl. (If you'd like to use values from \f(CW%Config\fR in your program | |
510 | you have to import it with \f(CW\*(C`use Config\*(C'\fR.) | |
511 | .PP | |
512 | .Vb 5 | |
513 | \& signed unsigned byte length in C byte length in Perl | |
514 | \& s! S! sizeof(short) $Config{shortsize} | |
515 | \& i! I! sizeof(int) $Config{intsize} | |
516 | \& l! L! sizeof(long) $Config{longsize} | |
517 | \& q! Q! sizeof(long long) $Config{longlongsize} | |
518 | .Ve | |
519 | .PP | |
520 | The \f(CW\*(C`i!\*(C'\fR and \f(CW\*(C`I!\*(C'\fR codes aren't different from \f(CW\*(C`i\*(C'\fR and \f(CW\*(C`I\*(C'\fR; they are | |
521 | tolerated for completeness' sake. | |
522 | .Sh "Unpacking a Stack Frame" | |
523 | .IX Subsection "Unpacking a Stack Frame" | |
524 | Requesting a particular byte ordering may be necessary when you work with | |
525 | binary data coming from some specific architecture whereas your program could | |
526 | run on a totally different system. As an example, assume you have 24 bytes | |
527 | containing a stack frame as it happens on an Intel 8086: | |
528 | .PP | |
529 | .Vb 11 | |
530 | \& +---------+ +----+----+ +---------+ | |
531 | \& TOS: | IP | TOS+4:| FL | FH | FLAGS TOS+14:| SI | | |
532 | \& +---------+ +----+----+ +---------+ | |
533 | \& | CS | | AL | AH | AX | DI | | |
534 | \& +---------+ +----+----+ +---------+ | |
535 | \& | BL | BH | BX | BP | | |
536 | \& +----+----+ +---------+ | |
537 | \& | CL | CH | CX | DS | | |
538 | \& +----+----+ +---------+ | |
539 | \& | DL | DH | DX | ES | | |
540 | \& +----+----+ +---------+ | |
541 | .Ve | |
542 | .PP | |
543 | First, we note that this time-honored 16\-bit \s-1CPU\s0 uses little-endian order, | |
544 | and that's why the low order byte is stored at the lower address. To | |
545 | unpack such an (unsigned) short we'll have to use code \f(CW\*(C`v\*(C'\fR. A repeat |
546 | count unpacks all 12 shorts: | |
547 | .PP | |
548 | .Vb 2 | |
549 | \& my( $ip, $cs, $flags, $ax, $bx, $cx, $dx, $si, $di, $bp, $ds, $es ) = |
550 | \& unpack( 'v12', $frame ); | |
551 | .Ve | |
552 | .PP | |
553 | Alternatively, we could have used \f(CW\*(C`C\*(C'\fR to unpack the individually | |
554 | accessible byte registers \s-1FL\s0, \s-1FH\s0, \s-1AL\s0, \s-1AH\s0, etc.: | |
555 | .PP | |
556 | .Vb 2 | |
557 | \& my( $fl, $fh, $al, $ah, $bl, $bh, $cl, $ch, $dl, $dh ) = | |
558 | \& unpack( 'C10', substr( $frame, 4, 10 ) ); | |
559 | .Ve | |
560 | .PP | |
561 | It would be nice if we could do this in one fell swoop: unpack a short, | |
562 | back up a little, and then unpack 2 bytes. Since Perl \fIis\fR nice, it | |
563 | proffers the template code \f(CW\*(C`X\*(C'\fR to back up one byte. Putting this all | |
564 | together, we may now write: | |
565 | .PP | |
566 | .Vb 5 | |
567 | \& my( $ip, $cs, | |
568 | \& $flags,$fl,$fh, | |
569 | \& $ax,$al,$ah, $bx,$bl,$bh, $cx,$cl,$ch, $dx,$dl,$dh, | |
570 | \& $si, $di, $bp, $ds, $es ) = | |
571 | \& unpack( 'v2' . ('vXXCC' x 5) . 'v5', $frame ); | |
572 | .Ve | |
573 | .PP | |
574 | (The clumsy construction of the template can be avoided \- just read on!) | |
575 | .PP | |
576 | We've taken some pains to construct the template so that it matches | |
577 | the contents of our frame buffer. Otherwise we'd either get undefined values, | |
578 | or \f(CW\*(C`unpack\*(C'\fR could not unpack all. If \f(CW\*(C`pack\*(C'\fR runs out of items, it will | |
579 | supply null strings (which are coerced into zeroes whenever the pack code | |
580 | says so). | |
581 | .Sh "How to Eat an Egg on a Net" | |
582 | .IX Subsection "How to Eat an Egg on a Net" | |
583 | The pack code for big-endian (high order byte at the lowest address) is | |
584 | \&\f(CW\*(C`n\*(C'\fR for 16 bit and \f(CW\*(C`N\*(C'\fR for 32 bit integers. You use these codes | |
585 | if you know that your data comes from a compliant architecture, but, | |
586 | surprisingly enough, you should also use these pack codes if you | |
587 | exchange binary data, across the network, with some system that you | |
588 | know next to nothing about. The simple reason is that this | |
589 | order has been chosen as the \fInetwork order\fR, and all standard-fearing | |
590 | programs ought to follow this convention. (This is, of course, a stern | |
591 | backing for one of the Lilliputian parties and may well influence the | |
592 | political development there.) So, if the protocol expects you to send | |
593 | a message by sending the length first, followed by just so many bytes, | |
594 | you could write: | |
595 | .PP | |
596 | .Vb 1 | |
597 | \& my $buf = pack( 'N', length( $msg ) ) . $msg; | |
598 | .Ve | |
599 | .PP | |
600 | or even: | |
601 | .PP | |
602 | .Vb 1 | |
603 | \& my $buf = pack( 'NA*', length( $msg ), $msg ); | |
604 | .Ve | |
605 | .PP | |
606 | and pass \f(CW$buf\fR to your send routine. Some protocols demand that the | |
607 | count should include the length of the count itself: then just add 4 | |
608 | to the data length. (But make sure to read \*(L"Lengths and Widths\*(R" before | |
609 | you really code this!) | |
610 | .Sh "Floating point Numbers" | |
611 | .IX Subsection "Floating point Numbers" | |
612 | For packing floating point numbers you have the choice between the | |
613 | pack codes \f(CW\*(C`f\*(C'\fR and \f(CW\*(C`d\*(C'\fR which pack into (or unpack from) single-precision or | |
614 | double-precision representation as it is provided by your system. (There | |
615 | is no such thing as a network representation for reals, so if you want | |
616 | to send your real numbers across computer boundaries, you'd better stick | |
617 | to \s-1ASCII\s0 representation, unless you're absolutely sure what's on the other | |
618 | end of the line.) | |
619 | .SH "Exotic Templates" | |
620 | .IX Header "Exotic Templates" | |
621 | .Sh "Bit Strings" | |
622 | .IX Subsection "Bit Strings" | |
623 | Bits are the atoms in the memory world. Access to individual bits may | |
624 | have to be used either as a last resort or because it is the most | |
625 | convenient way to handle your data. Bit string (un)packing converts | |
626 | between strings containing a series of \f(CW0\fR and \f(CW1\fR characters and | |
627 | a sequence of bytes each containing a group of 8 bits. This is almost | |
628 | as simple as it sounds, except that there are two ways the contents of | |
629 | a byte may be written as a bit string. Let's have a look at an annotated | |
630 | byte: | |
631 | .PP | |
632 | .Vb 5 | |
633 | \& 7 6 5 4 3 2 1 0 | |
634 | \& +-----------------+ | |
635 | \& | 1 0 0 0 1 1 0 0 | | |
636 | \& +-----------------+ | |
637 | \& MSB LSB | |
638 | .Ve | |
639 | .PP | |
640 | It's egg-eating all over again: Some think that as a bit string this should | |
641 | be written \*(L"10001100\*(R" i.e. beginning with the most significant bit, others | |
642 | insist on \*(L"00110001\*(R". Well, Perl isn't biased, so that's why we have two bit | |
643 | string codes: | |
644 | .PP | |
645 | .Vb 2 | |
646 | \& $byte = pack( 'B8', '10001100' ); # start with MSB | |
647 | \& $byte = pack( 'b8', '00110001' ); # start with LSB | |
648 | .Ve | |
649 | .PP | |
650 | It is not possible to pack or unpack bit fields \- just integral bytes. | |
651 | \&\f(CW\*(C`pack\*(C'\fR always starts at the next byte boundary and \*(L"rounds up\*(R" to the | |
652 | next multiple of 8 by adding zero bits as required. (If you do want bit | |
653 | fields, there is \*(L"vec\*(R" in perlfunc. Or you could implement bit field | |
654 | handling at the character string level, using split, substr, and | |
655 | concatenation on unpacked bit strings.) | |
656 | .PP | |
657 | To illustrate unpacking for bit strings, we'll decompose a simple | |
658 | status register (a \*(L"\-\*(R" stands for a \*(L"reserved\*(R" bit): | |
659 | .PP | |
660 | .Vb 4 | |
661 | \& +-----------------+-----------------+ | |
662 | \& | S Z - A - P - C | - - - - O D I T | | |
663 | \& +-----------------+-----------------+ | |
664 | \& MSB LSB MSB LSB | |
665 | .Ve | |
666 | .PP | |
667 | Converting these two bytes to a string can be done with the unpack | |
668 | template \f(CW'b16'\fR. To obtain the individual bit values from the bit | |
669 | string we use \f(CW\*(C`split\*(C'\fR with the \*(L"empty\*(R" separator pattern which dissects | |
670 | into individual characters. Bit values from the \*(L"reserved\*(R" positions are | |
671 | simply assigned to \f(CW\*(C`undef\*(C'\fR, a convenient notation for \*(L"I don't care where | |
672 | this goes\*(R". | |
673 | .PP | |
674 | .Vb 3 | |
675 | \& ($carry, undef, $parity, undef, $auxcarry, undef, $zero, $sign, | |
676 | \& $trace, $interrupt, $direction, $overflow) = | |
677 | \& split( //, unpack( 'b16', $status ) ); | |
678 | .Ve | |
679 | .PP | |
680 | We could have used an unpack template \f(CW'b12'\fR just as well, since the | |
681 | last 4 bits can be ignored anyway. | |
682 | .Sh "Uuencoding" | |
683 | .IX Subsection "Uuencoding" | |
684 | Another odd-man-out in the template alphabet is \f(CW\*(C`u\*(C'\fR, which packs an | |
685 | \&\*(L"uuencoded string\*(R". (\*(L"uu\*(R" is short for Unix\-to\-Unix.) Chances are that | |
686 | you won't ever need this encoding technique which was invented to overcome | |
687 | the shortcomings of old-fashioned transmission mediums that do not support | |
688 | other than simple \s-1ASCII\s0 data. The essential recipe is simple: Take three | |
689 | bytes, or 24 bits. Split them into 4 six\-packs, adding a space (0x20) to | |
690 | each. Repeat until all of the data is blended. Fold groups of 4 bytes into | |
691 | lines no longer than 60 and garnish them in front with the original byte count | |
692 | (incremented by 0x20) and a \f(CW"\en"\fR at the end. \- The \f(CW\*(C`pack\*(C'\fR chef will | |
693 | prepare this for you, a la minute, when you select pack code \f(CW\*(C`u\*(C'\fR on the menu: | |
694 | .PP | |
695 | .Vb 1 | |
696 | \& my $uubuf = pack( 'u', $bindat ); | |
697 | .Ve | |
698 | .PP | |
699 | A repeat count after \f(CW\*(C`u\*(C'\fR sets the number of bytes to put into an | |
700 | uuencoded line, which is the maximum of 45 by default, but could be | |
701 | set to some (smaller) integer multiple of three. \f(CW\*(C`unpack\*(C'\fR simply ignores | |
702 | the repeat count. | |
703 | .Sh "Doing Sums" | |
704 | .IX Subsection "Doing Sums" | |
705 | An even stranger template code is \f(CW\*(C`%\*(C'\fR<\fInumber\fR>. First, because | |
706 | it's used as a prefix to some other template code. Second, because it | |
707 | cannot be used in \f(CW\*(C`pack\*(C'\fR at all, and third, in \f(CW\*(C`unpack\*(C'\fR, doesn't return the | |
708 | data as defined by the template code it precedes. Instead it'll give you an | |
709 | integer of \fInumber\fR bits that is computed from the data value by | |
710 | doing sums. For numeric unpack codes, no big feat is achieved: | |
711 | .PP | |
712 | .Vb 2 | |
713 | \& my $buf = pack( 'iii', 100, 20, 3 ); | |
714 | \& print unpack( '%32i3', $buf ), "\en"; # prints 123 | |
715 | .Ve | |
716 | .PP | |
717 | For string values, \f(CW\*(C`%\*(C'\fR returns the sum of the byte values saving | |
718 | you the trouble of a sum loop with \f(CW\*(C`substr\*(C'\fR and \f(CW\*(C`ord\*(C'\fR: | |
719 | .PP | |
720 | .Vb 1 | |
721 | \& print unpack( '%32A*', "\ex01\ex10" ), "\en"; # prints 17 | |
722 | .Ve | |
723 | .PP | |
724 | Although the \f(CW\*(C`%\*(C'\fR code is documented as returning a \*(L"checksum\*(R": | |
725 | don't put your trust in such values! Even when applied to a small number | |
726 | of bytes, they won't guarantee a noticeable Hamming distance. | |
727 | .PP | |
728 | In connection with \f(CW\*(C`b\*(C'\fR or \f(CW\*(C`B\*(C'\fR, \f(CW\*(C`%\*(C'\fR simply adds bits, and this can be put | |
729 | to good use to count set bits efficiently: | |
730 | .PP | |
731 | .Vb 1 | |
732 | \& my $bitcount = unpack( '%32b*', $mask ); | |
733 | .Ve | |
734 | .PP | |
735 | And an even parity bit can be determined like this: | |
736 | .PP | |
737 | .Vb 1 | |
738 | \& my $evenparity = unpack( '%1b*', $mask ); | |
739 | .Ve | |
740 | .Sh "Unicode" | |
741 | .IX Subsection "Unicode" | |
742 | Unicode is a character set that can represent most characters in most of | |
743 | the world's languages, providing room for over one million different | |
744 | characters. Unicode 3.1 specifies 94,140 characters: The Basic Latin | |
745 | characters are assigned to the numbers 0 \- 127. The Latin\-1 Supplement with | |
746 | characters that are used in several European languages is in the next | |
747 | range, up to 255. After some more Latin extensions we find the character | |
748 | sets from languages using non-Roman alphabets, interspersed with a | |
749 | variety of symbol sets such as currency symbols, Zapf Dingbats or Braille. | |
750 | (You might want to visit www.unicode.org for a look at some of | |
751 | them \- my personal favourites are Telugu and Kannada.) | |
752 | .PP | |
753 | The Unicode character set associates characters with integers. Encoding | |
754 | these numbers in an equal number of bytes would more than double the | |
755 | requirements for storing texts written in Latin alphabets. | |
756 | The \s-1UTF\-8\s0 encoding avoids this by storing the most common (from a western | |
757 | point of view) characters in a single byte while encoding the rarer | |
758 | ones in two or more bytes. | |
759 | .PP | |
760 | So what has this got to do with \f(CW\*(C`pack\*(C'\fR? Well, if you want to convert | |
761 | between a Unicode number and its \s-1UTF\-8\s0 representation you can do so by | |
762 | using template code \f(CW\*(C`U\*(C'\fR. As an example, let's produce the \s-1UTF\-8\s0 | |
763 | representation of the Euro currency symbol (code number 0x20AC): | |
764 | .PP | |
765 | .Vb 1 | |
766 | \& $UTF8{Euro} = pack( 'U', 0x20AC ); | |
767 | .Ve | |
768 | .PP | |
769 | Inspecting \f(CW$UTF8{Euro}\fR shows that it contains 3 bytes: \*(L"\exe2\ex82\exac\*(R". The | |
770 | round trip can be completed with \f(CW\*(C`unpack\*(C'\fR: | |
771 | .PP | |
772 | .Vb 1 | |
773 | \& $Unicode{Euro} = unpack( 'U', $UTF8{Euro} ); | |
774 | .Ve | |
775 | .PP | |
776 | Usually you'll want to pack or unpack \s-1UTF\-8\s0 strings: | |
777 | .PP | |
778 | .Vb 3 | |
779 | \& # pack and unpack the Hebrew alphabet | |
780 | \& my $alefbet = pack( 'U*', 0x05d0..0x05ea ); | |
781 | \& my @hebrew = unpack( 'U*', $alefbet ); | |
782 | .Ve | |
783 | .Sh "Another Portable Binary Encoding" | |
784 | .IX Subsection "Another Portable Binary Encoding" | |
785 | The pack code \f(CW\*(C`w\*(C'\fR has been added to support a portable binary data | |
786 | encoding scheme that goes way beyond simple integers. (Details can | |
787 | be found at Casbah.org, the Scarab project.) A \s-1BER\s0 (Binary Encoded | |
788 | Representation) compressed unsigned integer stores base 128 | |
789 | digits, most significant digit first, with as few digits as possible. | |
790 | Bit eight (the high bit) is set on each byte except the last. There | |
791 | is no size limit to \s-1BER\s0 encoding, but Perl won't go to extremes. | |
792 | .PP | |
793 | .Vb 1 | |
794 | \& my $berbuf = pack( 'w*', 1, 128, 128+1, 128*128+127 ); | |
795 | .Ve | |
796 | .PP | |
797 | A hex dump of \f(CW$berbuf\fR, with spaces inserted at the right places, | |
798 | shows 01 8100 8101 81807F. Since the last byte is always less than | |
799 | 128, \f(CW\*(C`unpack\*(C'\fR knows where to stop. | |
800 | .SH "Template Grouping" | |
801 | .IX Header "Template Grouping" | |
802 | Prior to Perl 5.8, repetitions of templates had to be made by | |
803 | \&\f(CW\*(C`x\*(C'\fR\-multiplication of template strings. Now there is a better way as | |
804 | we may use the pack codes \f(CW\*(C`(\*(C'\fR and \f(CW\*(C`)\*(C'\fR combined with a repeat count. | |
805 | The \f(CW\*(C`unpack\*(C'\fR template from the Stack Frame example can simply | |
806 | be written like this: | |
807 | .PP | |
808 | .Vb 1 | |
809 | \& unpack( 'v2 (vXXCC)5 v5', $frame ) | |
810 | .Ve | |
811 | .PP | |
812 | Let's explore this feature a little more. We'll begin with the equivalent of | |
813 | .PP | |
814 | .Vb 1 | |
815 | \& join( '', map( substr( $_, 0, 1 ), @str ) ) | |
816 | .Ve | |
817 | .PP | |
818 | which returns a string consisting of the first character from each string. | |
819 | Using pack, we can write | |
820 | .PP | |
821 | .Vb 1 | |
822 | \& pack( '(A)'.@str, @str ) | |
823 | .Ve | |
824 | .PP | |
825 | or, because a repeat count \f(CW\*(C`*\*(C'\fR means \*(L"repeat as often as required\*(R", | |
826 | simply | |
827 | .PP | |
828 | .Vb 1 | |
829 | \& pack( '(A)*', @str ) | |
830 | .Ve | |
831 | .PP | |
832 | (Note that the template \f(CW\*(C`A*\*(C'\fR would only have packed \f(CW$str[0]\fR in full | |
833 | length.) | |
834 | .PP | |
835 | To pack dates stored as triplets ( day, month, year ) in an array \f(CW@dates\fR | |
836 | into a sequence of byte, byte, short integer we can write | |
837 | .PP | |
838 | .Vb 1 | |
839 | \& $pd = pack( '(CCS)*', map( @$_, @dates ) ); | |
840 | .Ve | |
841 | .PP | |
842 | To swap pairs of characters in a string (with even length) one could use | |
843 | several techniques. First, let's use \f(CW\*(C`x\*(C'\fR and \f(CW\*(C`X\*(C'\fR to skip forward and back: | |
844 | .PP | |
845 | .Vb 1 | |
846 | \& $s = pack( '(A)*', unpack( '(xAXXAx)*', $s ) ); | |
847 | .Ve | |
848 | .PP | |
849 | We can also use \f(CW\*(C`@\*(C'\fR to jump to an offset, with 0 being the position where | |
850 | we were when the last \f(CW\*(C`(\*(C'\fR was encountered: | |
851 | .PP | |
852 | .Vb 1 | |
853 | \& $s = pack( '(A)*', unpack( '(@1A @0A @2)*', $s ) ); | |
854 | .Ve | |
855 | .PP | |
856 | Finally, there is also an entirely different approach by unpacking big | |
857 | endian shorts and packing them in the reverse byte order: | |
858 | .PP | |
859 | .Vb 1 | |
860 | \& $s = pack( '(v)*', unpack( '(n)*', $s ) ); | |
861 | .Ve | |
862 | .SH "Lengths and Widths" | |
863 | .IX Header "Lengths and Widths" | |
864 | .Sh "String Lengths" | |
865 | .IX Subsection "String Lengths" | |
866 | In the previous section we've seen a network message that was constructed | |
867 | by prefixing the binary message length to the actual message. You'll find | |
868 | that packing a length followed by so many bytes of data is a | |
869 | frequently used recipe since appending a null byte won't work | |
870 | if a null byte may be part of the data. Here is an example where both | |
871 | techniques are used: after two null terminated strings with source and | |
872 | destination address, a Short Message (to a mobile phone) is sent after | |
873 | a length byte: | |
874 | .PP | |
875 | .Vb 1 | |
876 | \& my $msg = pack( 'Z*Z*CA*', $src, $dst, length( $sm ), $sm ); | |
877 | .Ve | |
878 | .PP | |
879 | Unpacking this message can be done with the same template: | |
880 | .PP | |
881 | .Vb 1 | |
882 | \& ( $src, $dst, $len, $sm ) = unpack( 'Z*Z*CA*', $msg ); | |
883 | .Ve | |
884 | .PP | |
885 | There's a subtle trap lurking in the offing: Adding another field after | |
886 | the Short Message (in variable \f(CW$sm\fR) is all right when packing, but this | |
887 | cannot be unpacked naively: | |
888 | .PP | |
889 | .Vb 2 | |
890 | \& # pack a message | |
891 | \& my $msg = pack( 'Z*Z*CA*C', $src, $dst, length( $sm ), $sm, $prio ); | |
892 | .Ve | |
893 | .PP | |
894 | .Vb 2 | |
895 | \& # unpack fails - $prio remains undefined! | |
896 | \& ( $src, $dst, $len, $sm, $prio ) = unpack( 'Z*Z*CA*C', $msg ); | |
897 | .Ve | |
898 | .PP | |
899 | The pack code \f(CW\*(C`A*\*(C'\fR gobbles up all remaining bytes, and \f(CW$prio\fR remains | |
900 | undefined! Before we let disappointment dampen the morale: Perl's got | |
901 | the trump card to make this trick too, just a little further up the sleeve. | |
902 | Watch this: | |
903 | .PP | |
904 | .Vb 2 | |
905 | \& # pack a message: ASCIIZ, ASCIIZ, length/string, byte | |
906 | \& my $msg = pack( 'Z* Z* C/A* C', $src, $dst, $sm, $prio ); | |
907 | .Ve | |
908 | .PP | |
909 | .Vb 2 | |
910 | \& # unpack | |
911 | \& ( $src, $dst, $sm, $prio ) = unpack( 'Z* Z* C/A* C', $msg ); | |
912 | .Ve | |
913 | .PP | |
914 | Combining two pack codes with a slash (\f(CW\*(C`/\*(C'\fR) associates them with a single | |
915 | value from the argument list. In \f(CW\*(C`pack\*(C'\fR, the length of the argument is | |
916 | taken and packed according to the first code while the argument itself | |
917 | is added after being converted with the template code after the slash. | |
918 | This saves us the trouble of inserting the \f(CW\*(C`length\*(C'\fR call, but it is | |
919 | in \f(CW\*(C`unpack\*(C'\fR where we really score: The value of the length byte marks the | |
920 | end of the string to be taken from the buffer. Since this combination | |
921 | doesn't make sense except when the second pack code is \f(CW\*(C`a*\*(C'\fR, \f(CW\*(C`A*\*(C'\fR | |
922 | or \f(CW\*(C`Z*\*(C'\fR, Perl won't let you. | |
923 | .PP | |
924 | The pack code preceding \f(CW\*(C`/\*(C'\fR may be anything that's fit to represent a | |
925 | number: All the numeric binary pack codes, and even text codes such as | |
926 | \&\f(CW\*(C`A4\*(C'\fR or \f(CW\*(C`Z*\*(C'\fR: | |
927 | .PP | |
928 | .Vb 4 | |
929 | \& # pack/unpack a string preceded by its length in ASCII | |
930 | \& my $buf = pack( 'A4/A*', "Humpty-Dumpty" ); | |
931 | \& # unpack $buf: '13 Humpty-Dumpty' | |
932 | \& my $txt = unpack( 'A4/A*', $buf ); | |
933 | .Ve | |
934 | .PP | |
935 | \&\f(CW\*(C`/\*(C'\fR is not implemented in Perls before 5.6, so if your code is required to | |
936 | work on older Perls you'll need to \f(CW\*(C`unpack( 'Z* Z* C')\*(C'\fR to get the length, | |
937 | then use it to make a new unpack string. For example | |
938 | .PP | |
939 | .Vb 2 | |
940 | \& # pack a message: ASCIIZ, ASCIIZ, length, string, byte (5.005 compatible) | |
941 | \& my $msg = pack( 'Z* Z* C A* C', $src, $dst, length $sm, $sm, $prio ); | |
942 | .Ve | |
943 | .PP | |
944 | .Vb 3 | |
945 | \& # unpack | |
946 | \& ( undef, undef, $len) = unpack( 'Z* Z* C', $msg ); | |
947 | \& ($src, $dst, $sm, $prio) = unpack ( "Z* Z* x A$len C", $msg ); | |
948 | .Ve | |
949 | .PP | |
950 | But that second \f(CW\*(C`unpack\*(C'\fR is rushing ahead. It isn't using a simple literal | |
951 | string for the template. So maybe we should introduce... | |
952 | .Sh "Dynamic Templates" | |
953 | .IX Subsection "Dynamic Templates" | |
954 | So far, we've seen literals used as templates. If the list of pack | |
955 | items doesn't have fixed length, an expression constructing the | |
956 | template is required (whenever, for some reason, \f(CW\*(C`()*\*(C'\fR cannot be used). | |
957 | Here's an example: To store named string values in a way that can be | |
958 | conveniently parsed by a C program, we create a sequence of names and | |
959 | null terminated \s-1ASCII\s0 strings, with \f(CW\*(C`=\*(C'\fR between the name and the value, | |
960 | followed by an additional delimiting null byte. Here's how: | |
961 | .PP | |
962 | .Vb 2 | |
963 | \& my $env = pack( '(A*A*Z*)' . keys( %Env ) . 'C', | |
964 | \& map( { ( $_, '=', $Env{$_} ) } keys( %Env ) ), 0 ); | |
965 | .Ve | |
966 | .PP | |
967 | Let's examine the cogs of this byte mill, one by one. There's the \f(CW\*(C`map\*(C'\fR | |
968 | call, creating the items we intend to stuff into the \f(CW$env\fR buffer: | |
969 | to each key (in \f(CW$_\fR) it adds the \f(CW\*(C`=\*(C'\fR separator and the hash entry value. | |
970 | Each triplet is packed with the template code sequence \f(CW\*(C`A*A*Z*\*(C'\fR that | |
971 | is repeated according to the number of keys. (Yes, that's what the \f(CW\*(C`keys\*(C'\fR | |
972 | function returns in scalar context.) To get the very last null byte, | |
973 | we add a \f(CW0\fR at the end of the \f(CW\*(C`pack\*(C'\fR list, to be packed with \f(CW\*(C`C\*(C'\fR. | |
974 | (Attentive readers may have noticed that we could have omitted the 0.) | |
975 | .PP | |
976 | For the reverse operation, we'll have to determine the number of items | |
977 | in the buffer before we can let \f(CW\*(C`unpack\*(C'\fR rip it apart: | |
978 | .PP | |
979 | .Vb 2 | |
980 | \& my $n = $env =~ tr/\e0// - 1; | |
981 | \& my %env = map( split( /=/, $_ ), unpack( "(Z*)$n", $env ) ); | |
982 | .Ve | |
983 | .PP | |
984 | The \f(CW\*(C`tr\*(C'\fR counts the null bytes. The \f(CW\*(C`unpack\*(C'\fR call returns a list of | |
985 | name-value pairs each of which is taken apart in the \f(CW\*(C`map\*(C'\fR block. | |
986 | .Sh "Counting Repetitions" | |
987 | .IX Subsection "Counting Repetitions" | |
988 | Rather than storing a sentinel at the end of a data item (or a list of items), | |
989 | we could precede the data with a count. Again, we pack keys and values of | |
990 | a hash, preceding each with an unsigned short length count, and up front | |
991 | we store the number of pairs: | |
992 | .PP | |
993 | .Vb 1 | |
994 | \& my $env = pack( 'S(S/A* S/A*)*', scalar keys( %Env ), %Env ); | |
995 | .Ve | |
996 | .PP | |
997 | This simplifies the reverse operation as the number of repetitions can be | |
998 | unpacked with the \f(CW\*(C`/\*(C'\fR code: | |
999 | .PP | |
1000 | .Vb 1 | |
1001 | \& my %env = unpack( 'S/(S/A* S/A*)', $env ); | |
1002 | .Ve | |
1003 | .PP | |
1004 | Note that this is one of the rare cases where you cannot use the same | |
1005 | template for \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR because \f(CW\*(C`pack\*(C'\fR can't determine | |
1006 | a repeat count for a \f(CW\*(C`()\*(C'\fR\-group. | |
1007 | .SH "Packing and Unpacking C Structures" | |
1008 | .IX Header "Packing and Unpacking C Structures" | |
1009 | In previous sections we have seen how to pack numbers and character | |
1010 | strings. If it were not for a couple of snags we could conclude this | |
1011 | section right away with the terse remark that C structures don't | |
1012 | contain anything else, and therefore you already know all there is to it. | |
1013 | Sorry, no: read on, please. | |
1014 | .Sh "The Alignment Pit" | |
1015 | .IX Subsection "The Alignment Pit" | |
1016 | In the consideration of speed against memory requirements the balance | |
1017 | has been tilted in favor of faster execution. This has influenced the | |
1018 | way C compilers allocate memory for structures: On architectures | |
1019 | where a 16\-bit or 32\-bit operand can be moved faster between places in | |
1020 | memory, or to or from a \s-1CPU\s0 register, if it is aligned at an even or | |
1021 | multiple-of-four or even at a multiple-of-eight address, a C compiler | |
1022 | will give you this speed benefit by stuffing extra bytes into structures. | |
1023 | If you don't cross the C shoreline this is not likely to cause you any | |
1024 | grief (although you should care when you design large data structures, | |
1025 | or you want your code to be portable between architectures (you do want | |
1026 | that, don't you?)). | |
1027 | .PP | |
1028 | To see how this affects \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR, we'll compare these two | |
1029 | C structures: | |
1030 | .PP | |
1031 | .Vb 6 | |
1032 | \& typedef struct { | |
1033 | \& char c1; | |
1034 | \& short s; | |
1035 | \& char c2; | |
1036 | \& long l; | |
1037 | \& } gappy_t; | |
1038 | .Ve | |
1039 | .PP | |
1040 | .Vb 6 | |
1041 | \& typedef struct { | |
1042 | \& long l; | |
1043 | \& short s; | |
1044 | \& char c1; | |
1045 | \& char c2; | |
1046 | \& } dense_t; | |
1047 | .Ve | |
1048 | .PP | |
1049 | Typically, a C compiler allocates 12 bytes to a \f(CW\*(C`gappy_t\*(C'\fR variable, but | |
1050 | requires only 8 bytes for a \f(CW\*(C`dense_t\*(C'\fR. After investigating this further, | |
1051 | we can draw memory maps, showing where the extra 4 bytes are hidden: | |
1052 | .PP | |
1053 | .Vb 5 | |
1054 | \& 0 +4 +8 +12 | |
1055 | \& +--+--+--+--+--+--+--+--+--+--+--+--+ | |
1056 | \& |c1|xx| s |c2|xx|xx|xx| l | xx = fill byte | |
1057 | \& +--+--+--+--+--+--+--+--+--+--+--+--+ | |
1058 | \& gappy_t | |
1059 | .Ve | |
1060 | .PP | |
1061 | .Vb 5 | |
1062 | \& 0 +4 +8 | |
1063 | \& +--+--+--+--+--+--+--+--+ | |
1064 | \& | l | s |c1|c2| | |
1065 | \& +--+--+--+--+--+--+--+--+ | |
1066 | \& dense_t | |
1067 | .Ve | |
1068 | .PP | |
1069 | And that's where the first quirk strikes: \f(CW\*(C`pack\*(C'\fR and \f(CW\*(C`unpack\*(C'\fR | |
1070 | templates have to be stuffed with \f(CW\*(C`x\*(C'\fR codes to get those extra fill bytes. | |
1071 | .PP | |
1072 | The natural question: \*(L"Why can't Perl compensate for the gaps?\*(R" warrants | |
1073 | an answer. One good reason is that C compilers might provide (non\-ANSI) | |
1074 | extensions permitting all sorts of fancy control over the way structures | |
1075 | are aligned, even at the level of an individual structure field. And, if | |
1076 | this were not enough, there is an insidious thing called \f(CW\*(C`union\*(C'\fR where | |
1077 | the amount of fill bytes cannot be derived from the alignment of the next | |
1078 | item alone. | |
1079 | .PP | |
1080 | \&\s-1OK\s0, so let's bite the bullet. Here's one way to get the alignment right | |
1081 | by inserting template codes \f(CW\*(C`x\*(C'\fR, which don't take a corresponding item | |
1082 | from the list: | |
1083 | .PP | |
1084 | .Vb 1 | |
1085 | \& my $gappy = pack( 'cxs cxxx l!', $c1, $s, $c2, $l ); | |
1086 | .Ve | |
1087 | .PP | |
1088 | Note the \f(CW\*(C`!\*(C'\fR after \f(CW\*(C`l\*(C'\fR: We want to make sure that we pack a long | |
1089 | integer as it is compiled by our C compiler. And even now, it will only | |
1090 | work for the platforms where the compiler aligns things as above. | |
1091 | And somebody somewhere has a platform where it doesn't. | |
1092 | [Probably a Cray, where \f(CW\*(C`short\*(C'\fRs, \f(CW\*(C`int\*(C'\fRs and \f(CW\*(C`long\*(C'\fRs are all 8 bytes. :\-)] | |
1093 | .PP | |
1094 | Counting bytes and watching alignments in lengthy structures is bound to | |
1095 | be a drag. Isn't there a way we can create the template with a simple | |
1096 | program? Here's a C program that does the trick: | |
1097 | .PP | |
1098 | .Vb 2 | |
1099 | \& #include <stdio.h> | |
1100 | \& #include <stddef.h> | |
1101 | .Ve | |
1102 | .PP | |
1103 | .Vb 6 | |
1104 | \& typedef struct { | |
1105 | \& char fc1; | |
1106 | \& short fs; | |
1107 | \& char fc2; | |
1108 | \& long fl; | |
1109 | \& } gappy_t; | |
1110 | .Ve | |
1111 | .PP | |
1112 | .Vb 2 | |
1113 | \& #define Pt(struct,field,tchar) \e | |
1114 | \& printf( "@%d%s ", offsetof(struct,field), # tchar ); | |
1115 | .Ve | |
1116 | .PP | |
1117 | .Vb 7 | |
1118 | \& int main() { | |
1119 | \& Pt( gappy_t, fc1, c ); | |
1120 | \& Pt( gappy_t, fs, s! ); | |
1121 | \& Pt( gappy_t, fc2, c ); | |
1122 | \& Pt( gappy_t, fl, l! ); | |
1123 | \& printf( "\en" ); | |
1124 | \& } | |
1125 | .Ve | |
1126 | .PP | |
1127 | The output line can be used as a template in a \f(CW\*(C`pack\*(C'\fR or \f(CW\*(C`unpack\*(C'\fR call: | |
1128 | .PP | |
1129 | .Vb 1 | |
1130 | \& my $gappy = pack( '@0c @2s! @4c @8l!', $c1, $s, $c2, $l ); | |
1131 | .Ve | |
1132 | .PP | |
1133 | Gee, yet another template code \- as if we hadn't plenty. But | |
1134 | \&\f(CW\*(C`@\*(C'\fR saves our day by enabling us to specify the offset from the beginning | |
1135 | of the pack buffer to the next item: This is just the value | |
1136 | the \f(CW\*(C`offsetof\*(C'\fR macro (defined in \f(CW\*(C`<stddef.h>\*(C'\fR) returns when | |
1137 | given a \f(CW\*(C`struct\*(C'\fR type and one of its field names (\*(L"member\-designator\*(R" in | |
1138 | C standardese). | |
1139 | .PP | |
1140 | Neither using offsets nor adding \f(CW\*(C`x\*(C'\fR's to bridge the gaps is satisfactory. | |
1141 | (Just imagine what happens if the structure changes.) What we really need | |
1142 | is a way of saying \*(L"skip as many bytes as required to the next multiple of N\*(R". | |
1143 | In fluent Templatese, you say this with \f(CW\*(C`x!N\*(C'\fR where N is replaced by the | |
1144 | appropriate value. Here's the next version of our struct packaging: | |
1145 | .PP | |
1146 | .Vb 1 | |
1147 | \& my $gappy = pack( 'c x!2 s c x!4 l!', $c1, $s, $c2, $l ); | |
1148 | .Ve | |
1149 | .PP | |
1150 | That's certainly better, but we still have to know how long all the | |
1151 | integers are, and portability is far away. Rather than \f(CW2\fR, | |
1152 | for instance, we want to say \*(L"however long a short is\*(R". But this can be | |
1153 | done by enclosing the appropriate pack code in brackets: \f(CW\*(C`[s]\*(C'\fR. So, here's | |
1154 | the very best we can do: | |
1155 | .PP | |
1156 | .Vb 1 | |
1157 | \& my $gappy = pack( 'c x![s] s c x![l!] l!', $c1, $s, $c2, $l ); | |
1158 | .Ve | |
1159 | .Sh "Alignment, Take 2" | |
1160 | .IX Subsection "Alignment, Take 2" | |
1161 | I'm afraid that we're not quite through with the alignment catch yet. The | |
1162 | hydra raises another ugly head when you pack arrays of structures: | |
1163 | .PP | |
1164 | .Vb 4 | |
1165 | \& typedef struct { | |
1166 | \& short count; | |
1167 | \& char glyph; | |
1168 | \& } cell_t; | |
1169 | .Ve | |
1170 | .PP | |
1171 | .Vb 1 | |
1172 | \& typedef cell_t buffer_t[BUFLEN]; | |
1173 | .Ve | |
1174 | .PP | |
1175 | Where's the catch? Padding is neither required before the first field \f(CW\*(C`count\*(C'\fR, | |
1176 | nor between this and the next field \f(CW\*(C`glyph\*(C'\fR, so why can't we simply pack | |
1177 | like this: | |
1178 | .PP | |
1179 | .Vb 3 | |
1180 | \& # something goes wrong here: | |
1181 | \& pack( 's!a' x @buffer, | |
1182 | \& map{ ( $_->{count}, $_->{glyph} ) } @buffer ); | |
1183 | .Ve | |
1184 | .PP | |
1185 | This packs \f(CW\*(C`3*@buffer\*(C'\fR bytes, but it turns out that the size of | |
1186 | \&\f(CW\*(C`buffer_t\*(C'\fR is four times \f(CW\*(C`BUFLEN\*(C'\fR! The moral of the story is that | |
1187 | the required alignment of a structure or array is propagated to the | |
1188 | next higher level where we have to consider padding \fIat the end\fR | |
1189 | of each component as well. Thus the correct template is: | |
1190 | .PP | |
1191 | .Vb 2 | |
1192 | \& pack( 's!ax' x @buffer, | |
1193 | \& map{ ( $_->{count}, $_->{glyph} ) } @buffer ); | |
1194 | .Ve | |
1195 | .Sh "Alignment, Take 3" | |
1196 | .IX Subsection "Alignment, Take 3" | |
1197 | And even if you take all the above into account, \s-1ANSI\s0 still lets this: | |
1198 | .PP | |
1199 | .Vb 3 | |
1200 | \& typedef struct { | |
1201 | \& char foo[2]; | |
1202 | \& } foo_t; | |
1203 | .Ve | |
1204 | .PP | |
1205 | vary in size. The alignment constraint of the structure can be greater than | |
1206 | any of its elements. [And if you think that this doesn't affect anything | |
1207 | common, dismember the next cellphone that you see. Many have \s-1ARM\s0 cores, and | |
1208 | the \s-1ARM\s0 structure rules make \f(CW\*(C`sizeof (foo_t)\*(C'\fR == 4] | |
1209 | .Sh "Pointers for How to Use Them" | |
1210 | .IX Subsection "Pointers for How to Use Them" | |
1211 | The title of this section indicates the second problem you may run into | |
1212 | sooner or later when you pack C structures. If the function you intend | |
1213 | to call expects a, say, \f(CW\*(C`void *\*(C'\fR value, you \fIcannot\fR simply take | |
1214 | a reference to a Perl variable. (Although that value certainly is a | |
1215 | memory address, it's not the address where the variable's contents are | |
1216 | stored.) | |
1217 | .PP | |
1218 | Template code \f(CW\*(C`P\*(C'\fR promises to pack a \*(L"pointer to a fixed length string\*(R". | |
1219 | Isn't this what we want? Let's try: | |
1220 | .PP | |
1221 | .Vb 3 | |
1222 | \& # allocate some storage and pack a pointer to it | |
1223 | \& my $memory = "\ex00" x $size; | |
1224 | \& my $memptr = pack( 'P', $memory ); | |
1225 | .Ve | |
1226 | .PP | |
1227 | But wait: doesn't \f(CW\*(C`pack\*(C'\fR just return a sequence of bytes? How can we pass this | |
1228 | string of bytes to some C code expecting a pointer which is, after all, | |
1229 | nothing but a number? The answer is simple: We have to obtain the numeric | |
1230 | address from the bytes returned by \f(CW\*(C`pack\*(C'\fR. | |
1231 | .PP | |
1232 | .Vb 1 | |
1233 | \& my $ptr = unpack( 'L!', $memptr ); | |
1234 | .Ve | |
1235 | .PP | |
1236 | Obviously this assumes that it is possible to typecast a pointer | |
1237 | to an unsigned long and vice versa, which frequently works but should not | |
1238 | be taken as a universal law. \- Now that we have this pointer the next question | |
1239 | is: How can we put it to good use? We need a call to some C function | |
1240 | where a pointer is expected. The \fIread\fR\|(2) system call comes to mind: | |
1241 | .PP | |
1242 | .Vb 1 | |
1243 | \& ssize_t read(int fd, void *buf, size_t count); | |
1244 | .Ve | |
1245 | .PP | |
1246 | After reading perlfunc explaining how to use \f(CW\*(C`syscall\*(C'\fR we can write | |
1247 | this Perl function copying a file to standard output: | |
1248 | .PP | |
1249 | .Vb 12 | |
1250 | \& require 'syscall.ph'; | |
1251 | \& sub cat($){ | |
1252 | \& my $path = shift(); | |
1253 | \& my $size = -s $path; | |
1254 | \& my $memory = "\ex00" x $size; # allocate some memory | |
1255 | \& my $ptr = unpack( 'L!', pack( 'P', $memory ) ); | |
1256 | \& open( F, $path ) || die( "$path: cannot open ($!)\en" ); | |
1257 | \& my $fd = fileno(F); | |
1258 | \& my $res = syscall( &SYS_read, fileno(F), $ptr, $size ); | |
1259 | \& print $memory; | |
1260 | \& close( F ); | |
1261 | \& } | |
1262 | .Ve | |
1263 | .PP | |
1264 | This is neither a specimen of simplicity nor a paragon of portability but | |
1265 | it illustrates the point: We are able to sneak behind the scenes and | |
1266 | access Perl's otherwise well-guarded memory! (Important note: Perl's | |
1267 | \&\f(CW\*(C`syscall\*(C'\fR does \fInot\fR require you to construct pointers in this roundabout | |
1268 | way. You simply pass a string variable, and Perl forwards the address.) | |
1269 | .PP | |
1270 | How does \f(CW\*(C`unpack\*(C'\fR with \f(CW\*(C`P\*(C'\fR work? Imagine some pointer in the buffer | |
1271 | about to be unpacked: If it isn't the null pointer (which will smartly | |
1272 | produce the \f(CW\*(C`undef\*(C'\fR value) we have a start address \- but then what? | |
1273 | Perl has no way of knowing how long this \*(L"fixed length string\*(R" is, so | |
1274 | it's up to you to specify the actual size as an explicit length after \f(CW\*(C`P\*(C'\fR. | |
1275 | .PP | |
1276 | .Vb 2 | |
1277 | \& my $mem = "abcdefghijklmn"; | |
1278 | \& print unpack( 'P5', pack( 'P', $mem ) ); # prints "abcde" | |
1279 | .Ve | |
1280 | .PP | |
1281 | As a consequence, \f(CW\*(C`pack\*(C'\fR ignores any number or \f(CW\*(C`*\*(C'\fR after \f(CW\*(C`P\*(C'\fR. | |
1282 | .PP | |
1283 | Now that we have seen \f(CW\*(C`P\*(C'\fR at work, we might as well give \f(CW\*(C`p\*(C'\fR a whirl. | |
1284 | Why do we need a second template code for packing pointers at all? The | |
1285 | answer lies behind the simple fact that an \f(CW\*(C`unpack\*(C'\fR with \f(CW\*(C`p\*(C'\fR promises | |
1286 | a null-terminated string starting at the address taken from the buffer, | |
1287 | and that implies a length for the data item to be returned: | |
1288 | .PP | |
1289 | .Vb 2 | |
1290 | \& my $buf = pack( 'p', "abc\ex00efhijklmn" ); | |
1291 | \& print unpack( 'p', $buf ); # prints "abc" | |
1292 | .Ve | |
1293 | .PP | |
1294 | This is apt to be confusing, though: Because the length is implied by | |
1295 | the string's terminating null, a number after pack code \f(CW\*(C`p\*(C'\fR is a repeat | |
1296 | count, not a length as after \f(CW\*(C`P\*(C'\fR. | |
1297 | .PP | |
1298 | Using \f(CW\*(C`pack(..., $x)\*(C'\fR with \f(CW\*(C`P\*(C'\fR or \f(CW\*(C`p\*(C'\fR to get the address where \f(CW$x\fR is | |
1299 | actually stored must be used with circumspection. Perl's internal machinery | |
1300 | considers the relation between a variable and that address as its very own | |
1301 | private matter and doesn't really care that we have obtained a copy. Therefore: | |
1302 | .IP "\(bu" 4 | |
1303 | Do not use \f(CW\*(C`pack\*(C'\fR with \f(CW\*(C`p\*(C'\fR or \f(CW\*(C`P\*(C'\fR to obtain the address of a variable | |
1304 | that's bound to go out of scope (thereby freeing its memory) before you | |
1305 | are done using the memory at that address. | |
1306 | .IP "\(bu" 4 | |
1307 | Be very careful with Perl operations that change the value of the | |
1308 | variable. Appending something to the variable, for instance, might require | |
1309 | reallocation of its storage, leaving you with a pointer into no\-man's land. | |
1310 | .IP "\(bu" 4 | |
1311 | Don't think that you can get the address of a Perl variable | |
1312 | when it is stored as an integer or double number! \f(CW\*(C`pack('P', $x)\*(C'\fR will | |
1313 | force the variable's internal representation to string, just as if you | |
1314 | had written something like \f(CW\*(C`$x .= ''\*(C'\fR. | |
1315 | .PP | |
1316 | It's safe, however, to P\- or p\-pack a string literal, because Perl simply | |
1317 | allocates an anonymous variable. | |
1318 | .SH "Pack Recipes" | |
1319 | .IX Header "Pack Recipes" | |
1320 | Here is a collection of (possibly) useful canned recipes for \f(CW\*(C`pack\*(C'\fR | |
1321 | and \f(CW\*(C`unpack\*(C'\fR: | |
1322 | .PP | |
1323 | .Vb 2 | |
1324 | \& # Convert IP address for socket functions | |
1325 | \& pack( "C4", split /\e./, "123.4.5.6" ); | |
1326 | .Ve | |
1327 | .PP | |
1328 | .Vb 2 | |
1329 | \& # Count the bits in a chunk of memory (e.g. a select vector) | |
1330 | \& unpack( '%32b*', $mask ); | |
1331 | .Ve | |
1332 | .PP | |
1333 | .Vb 3 | |
1334 | \& # Determine the endianness of your system | |
1335 | \& $is_little_endian = unpack( 'c', pack( 's', 1 ) ); | |
1336 | \& $is_big_endian = unpack( 'xc', pack( 's', 1 ) ); | |
1337 | .Ve | |
1338 | .PP | |
1339 | .Vb 2 | |
1340 | \& # Determine the number of bits in a native integer | |
1341 | \& $bits = unpack( '%32I!', ~0 ); | |
1342 | .Ve | |
1343 | .PP | |
1344 | .Vb 2 | |
1345 | \& # Prepare argument for the nanosleep system call | |
1346 | \& my $timespec = pack( 'L!L!', $secs, $nanosecs ); | |
1347 | .Ve | |
1348 | .PP | |
1349 | For a simple memory dump we unpack some bytes into just as | |
1350 | many pairs of hex digits, and use \f(CW\*(C`map\*(C'\fR to handle the traditional | |
1351 | spacing \- 16 bytes to a line: | |
1352 | .PP | |
1353 | .Vb 4 | |
1354 | \& my $i; | |
1355 | \& print map( ++$i % 16 ? "$_ " : "$_\en", | |
1356 | \& unpack( 'H2' x length( $mem ), $mem ) ), | |
1357 | \& length( $mem ) % 16 ? "\en" : ''; | |
1358 | .Ve | |
1359 | .SH "Funnies Section" | |
1360 | .IX Header "Funnies Section" | |
1361 | .Vb 5 | |
1362 | \& # Pulling digits out of nowhere... | |
1363 | \& print unpack( 'C', pack( 'x' ) ), | |
1364 | \& unpack( '%B*', pack( 'A' ) ), | |
1365 | \& unpack( 'H', pack( 'A' ) ), | |
1366 | \& unpack( 'A', unpack( 'C', pack( 'A' ) ) ), "\en"; | |
1367 | .Ve | |
1368 | .PP | |
1369 | .Vb 2 | |
1370 | \& # One for the road ;-) | |
1371 | \& my $advice = pack( 'all u can in a van' ); | |
1372 | .Ve | |
1373 | .SH "Authors" | |
1374 | .IX Header "Authors" | |
1375 | Simon Cozens and Wolfgang Laun. |