Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "BALANCED 1" | |
132 | .TH BALANCED 1 "2000-08-20" "perl v5.8.0" "User Contributed Perl Documentation" | |
133 | .SH "NAME" | |
134 | Text::Balanced \- Extract delimited text sequences from strings. | |
135 | .SH "SYNOPSIS" | |
136 | .IX Header "SYNOPSIS" | |
137 | .Vb 8 | |
138 | \& use Text::Balanced qw ( | |
139 | \& extract_delimited | |
140 | \& extract_bracketed | |
141 | \& extract_quotelike | |
142 | \& extract_codeblock | |
143 | \& extract_variable | |
144 | \& extract_tagged | |
145 | \& extract_multiple | |
146 | .Ve | |
147 | .PP | |
148 | .Vb 3 | |
149 | \& gen_delimited_pat | |
150 | \& gen_extract_tagged | |
151 | \& ); | |
152 | .Ve | |
153 | .PP | |
154 | .Vb 2 | |
155 | \& # Extract the initial substring of $text that is delimited by | |
156 | \& # two (unescaped) instances of the first character in $delim. | |
157 | .Ve | |
158 | .PP | |
159 | .Vb 1 | |
160 | \& ($extracted, $remainder) = extract_delimited($text,$delim); | |
161 | .Ve | |
162 | .PP | |
163 | .Vb 3 | |
164 | \& # Extract the initial substring of $text that is bracketed | |
165 | \& # with a delimiter(s) specified by $delim (where the string | |
166 | \& # in $delim contains one or more of '(){}[]<>'). | |
167 | .Ve | |
168 | .PP | |
169 | .Vb 1 | |
170 | \& ($extracted, $remainder) = extract_bracketed($text,$delim); | |
171 | .Ve | |
172 | .PP | |
173 | .Vb 2 | |
174 | \& # Extract the initial substring of $text that is bounded by | |
175 | \& # an HTML/XML tag. | |
176 | .Ve | |
177 | .PP | |
178 | .Vb 1 | |
179 | \& ($extracted, $remainder) = extract_tagged($text); | |
180 | .Ve | |
181 | .PP | |
182 | .Vb 2 | |
183 | \& # Extract the initial substring of $text that is bounded by | |
184 | \& # a BEGIN...END pair. Don't allow nested BEGIN tags | |
185 | .Ve | |
186 | .PP | |
187 | .Vb 2 | |
188 | \& ($extracted, $remainder) = | |
189 | \& extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]}); | |
190 | .Ve | |
191 | .PP | |
192 | .Vb 2 | |
193 | \& # Extract the initial substring of $text that represents a | |
194 | \& # Perl "quote or quote-like operation" | |
195 | .Ve | |
196 | .PP | |
197 | .Vb 1 | |
198 | \& ($extracted, $remainder) = extract_quotelike($text); | |
199 | .Ve | |
200 | .PP | |
201 | .Vb 3 | |
202 | \& # Extract the initial substring of $text that represents a block | |
203 | \& # of Perl code, bracketed by any of the character(s) specified by $delim | |
204 | \& # (where the string $delim contains one or more of '(){}[]<>'). | |
205 | .Ve | |
206 | .PP | |
207 | .Vb 1 | |
208 | \& ($extracted, $remainder) = extract_codeblock($text,$delim); | |
209 | .Ve | |
210 | .PP | |
211 | .Vb 3 | |
212 | \& # Extract the initial substrings of $text that would be extracted by | |
213 | \& # one or more sequential applications of the specified functions | |
214 | \& # or regular expressions | |
215 | .Ve | |
216 | .PP | |
217 | .Vb 7 | |
218 | \& @extracted = extract_multiple($text, | |
219 | \& [ \e&extract_bracketed, | |
220 | \& \e&extract_quotelike, | |
221 | \& \e&some_other_extractor_sub, | |
222 | \& qr/[xyz]*/, | |
223 | \& 'literal', | |
224 | \& ]); | |
225 | .Ve | |
226 | .PP | |
227 | # Create a string representing an optimized pattern (a la Friedl) | |
228 | # that matches a substring delimited by any of the specified characters | |
229 | # (in this case: any type of quote or a slash) | |
230 | .PP | |
231 | .Vb 1 | |
232 | \& $patstring = gen_delimited_pat(q{'"`/}); | |
233 | .Ve | |
234 | .PP | |
235 | # Generate a reference to an anonymous sub that is just like extract_tagged | |
236 | # but pre-compiled and optimized for a specific pair of tags, and consequently | |
237 | # much faster (i.e. 3 times faster). It uses qr// for better performance on | |
238 | # repeated calls, so it only works under Perl 5.005 or later. | |
239 | .PP | |
240 | .Vb 1 | |
241 | \& $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); | |
242 | .Ve | |
243 | .PP | |
244 | .Vb 1 | |
245 | \& ($extracted, $remainder) = $extract_head->($text); | |
246 | .Ve | |
247 | .SH "DESCRIPTION" | |
248 | .IX Header "DESCRIPTION" | |
249 | The various \f(CW\*(C`extract_...\*(C'\fR subroutines may be used to extract a | |
250 | delimited string (possibly after skipping a specified prefix string). | |
251 | The search for the string always begins at the current \f(CW\*(C`pos\*(C'\fR | |
252 | location of the string's variable (or at index zero, if no \f(CW\*(C`pos\*(C'\fR | |
253 | position is defined). | |
254 | .Sh "General behaviour in list contexts" | |
255 | .IX Subsection "General behaviour in list contexts" | |
256 | In a list context, all the subroutines return a list, the first three | |
257 | elements of which are always: | |
258 | .IP "[0]" 4 | |
259 | .IX Item "[0]" | |
260 | The extracted string, including the specified delimiters. | |
261 | If the extraction fails an empty string is returned. | |
262 | .IP "[1]" 4 | |
263 | .IX Item "[1]" | |
264 | The remainder of the input string (i.e. the characters after the | |
265 | extracted string). On failure, the entire string is returned. | |
266 | .IP "[2]" 4 | |
267 | .IX Item "[2]" | |
268 | The skipped prefix (i.e. the characters before the extracted string). | |
269 | On failure, the empty string is returned. | |
270 | .PP | |
271 | Note that in a list context, the contents of the original input text (the first | |
272 | argument) are not modified in any way. | |
273 | .PP | |
274 | However, if the input text was passed in a variable, that variable's | |
275 | \&\f(CW\*(C`pos\*(C'\fR value is updated to point at the first character after the | |
276 | extracted text. That means that in a list context the various | |
277 | subroutines can be used much like regular expressions. For example: | |
278 | .PP | |
279 | .Vb 4 | |
280 | \& while ( $next = (extract_quotelike($text))[0] ) | |
281 | \& { | |
282 | \& # process next quote-like (in $next) | |
283 | \& } | |
284 | .Ve | |
285 | .Sh "General behaviour in scalar and void contexts" | |
286 | .IX Subsection "General behaviour in scalar and void contexts" | |
287 | In a scalar context, the extracted string is returned, having first been | |
288 | removed from the input text. Thus, the following code also processes | |
289 | each quote-like operation, but actually removes them from \f(CW$text:\fR | |
290 | .PP | |
291 | .Vb 4 | |
292 | \& while ( $next = extract_quotelike($text) ) | |
293 | \& { | |
294 | \& # process next quote-like (in $next) | |
295 | \& } | |
296 | .Ve | |
297 | .PP | |
298 | Note that if the input text is a read-only string (i.e. a literal), | |
299 | no attempt is made to remove the extracted text. | |
300 | .PP | |
301 | In a void context the behaviour of the extraction subroutines is | |
302 | exactly the same as in a scalar context, except (of course) that the | |
303 | extracted substring is not returned. | |
304 | .Sh "A note about prefixes" | |
305 | .IX Subsection "A note about prefixes" | |
306 | Prefix patterns are matched without any trailing modifiers (\f(CW\*(C`/gimsox\*(C'\fR etc.) | |
307 | This can bite you if you're expecting a prefix specification like | |
308 | \&'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix | |
309 | pattern will only succeed if the <H1> tag is on the current line, since | |
310 | \&. normally doesn't match newlines. | |
311 | .PP | |
312 | To overcome this limitation, you need to turn on /s matching within | |
313 | the prefix pattern, using the \f(CW\*(C`(?s)\*(C'\fR directive: '(?s).*?(?=<H1>)' | |
314 | .ie n .Sh """extract_delimited""" | |
315 | .el .Sh "\f(CWextract_delimited\fP" | |
316 | .IX Subsection "extract_delimited" | |
317 | The \f(CW\*(C`extract_delimited\*(C'\fR function formalizes the common idiom | |
318 | of extracting a single-character-delimited substring from the start of | |
319 | a string. For example, to extract a single-quote delimited string, the | |
320 | following code is typically used: | |
321 | .PP | |
322 | .Vb 2 | |
323 | \& ($remainder = $text) =~ s/\eA('(\e\e.|[^'])*')//s; | |
324 | \& $extracted = $1; | |
325 | .Ve | |
326 | .PP | |
327 | but with \f(CW\*(C`extract_delimited\*(C'\fR it can be simplified to: | |
328 | .PP | |
329 | .Vb 1 | |
330 | \& ($extracted,$remainder) = extract_delimited($text, "'"); | |
331 | .Ve | |
332 | .PP | |
333 | \&\f(CW\*(C`extract_delimited\*(C'\fR takes up to four scalars (the input text, the | |
334 | delimiters, a prefix pattern to be skipped, and any escape characters) | |
335 | and extracts the initial substring of the text that | |
336 | is appropriately delimited. If the delimiter string has multiple | |
337 | characters, the first one encountered in the text is taken to delimit | |
338 | the substring. | |
339 | The third argument specifies a prefix pattern that is to be skipped | |
340 | (but must be present!) before the substring is extracted. | |
341 | The final argument specifies the escape character to be used for each | |
342 | delimiter. | |
343 | .PP | |
344 | All arguments are optional. If the escape characters are not specified, | |
345 | every delimiter is escaped with a backslash (\f(CW\*(C`\e\*(C'\fR). | |
346 | If the prefix is not specified, the | |
347 | pattern \f(CW'\es*'\fR \- optional whitespace \- is used. If the delimiter set | |
348 | is also not specified, the set \f(CW\*(C`/["'`]/\*(C'\fR is used. If the text to be processed | |
349 | is not specified either, \f(CW$_\fR is used. | |
350 | .PP | |
351 | In list context, \f(CW\*(C`extract_delimited\*(C'\fR returns an array of three | |
352 | elements, the extracted substring (\fIincluding the surrounding | |
353 | delimiters\fR), the remainder of the text, and the skipped prefix (if | |
354 | any). If a suitable delimited substring is not found, the first | |
355 | element of the array is the empty string, the second is the complete | |
356 | original text, and the prefix returned in the third element is an | |
357 | empty string. | |
358 | .PP | |
359 | In a scalar context, just the extracted substring is returned. In | |
360 | a void context, the extracted substring (and any prefix) are simply | |
361 | removed from the beginning of the first argument. | |
362 | .PP | |
363 | Examples: | |
364 | .PP | |
365 | .Vb 1 | |
366 | \& # Remove a single-quoted substring from the very beginning of $text: | |
367 | .Ve | |
368 | .PP | |
369 | .Vb 1 | |
370 | \& $substring = extract_delimited($text, "'", ''); | |
371 | .Ve | |
372 | .PP | |
373 | .Vb 3 | |
374 | \& # Remove a single-quoted Pascalish substring (i.e. one in which | |
375 | \& # doubling the quote character escapes it) from the very | |
376 | \& # beginning of $text: | |
377 | .Ve | |
378 | .PP | |
379 | .Vb 1 | |
380 | \& $substring = extract_delimited($text, "'", '', "'"); | |
381 | .Ve | |
382 | .PP | |
383 | .Vb 3 | |
384 | \& # Extract a single- or double- quoted substring from the | |
385 | \& # beginning of $text, optionally after some whitespace | |
386 | \& # (note the list context to protect $text from modification): | |
387 | .Ve | |
388 | .PP | |
389 | .Vb 1 | |
390 | \& ($substring) = extract_delimited $text, q{"'}; | |
391 | .Ve | |
392 | .PP | |
393 | .Vb 1 | |
394 | \& # Delete the substring delimited by the first '/' in $text: | |
395 | .Ve | |
396 | .PP | |
397 | .Vb 1 | |
398 | \& $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1]; | |
399 | .Ve | |
400 | .PP | |
401 | Note that this last example is \fInot\fR the same as deleting the first | |
402 | quote-like pattern. For instance, if \f(CW$text\fR contained the string: | |
403 | .PP | |
404 | .Vb 1 | |
405 | \& "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" | |
406 | .Ve | |
407 | .PP | |
408 | then after the deletion it would contain: | |
409 | .PP | |
410 | .Vb 1 | |
411 | \& "if ('.$UNIXCMD/s) { $cmd = $1; }" | |
412 | .Ve | |
413 | .PP | |
414 | not: | |
415 | .PP | |
416 | .Vb 1 | |
417 | \& "if ('./cmd' =~ ms) { $cmd = $1; }" | |
418 | .Ve | |
419 | .PP | |
420 | See \*(L"extract_quotelike\*(R" for a (partial) solution to this problem. | |
421 | .ie n .Sh """extract_bracketed""" | |
422 | .el .Sh "\f(CWextract_bracketed\fP" | |
423 | .IX Subsection "extract_bracketed" | |
424 | Like \f(CW"extract_delimited"\fR, the \f(CW\*(C`extract_bracketed\*(C'\fR function takes | |
425 | up to three optional scalar arguments: a string to extract from, a delimiter | |
426 | specifier, and a prefix pattern. As before, a missing prefix defaults to | |
427 | optional whitespace and a missing text defaults to \f(CW$_\fR. However, a missing | |
428 | delimiter specifier defaults to \f(CW'{}()[]<>'\fR (see below). | |
429 | .PP | |
430 | \&\f(CW\*(C`extract_bracketed\*(C'\fR extracts a balanced-bracket-delimited | |
431 | substring (using any one (or more) of the user-specified delimiter | |
432 | brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also | |
433 | respect quoted unbalanced brackets (see below). | |
434 | .PP | |
435 | A \*(L"delimiter bracket\*(R" is a bracket in the list of delimiters passed as | |
436 | \&\f(CW\*(C`extract_bracketed\*(C'\fR's second argument. Delimiter brackets are | |
437 | specified by giving either the left or right (or both!) versions | |
438 | of the required bracket(s). Note that the order in which | |
439 | two or more delimiter brackets are specified is not significant. | |
440 | .PP | |
441 | A \*(L"balanced\-bracket\-delimited substring\*(R" is a substring bounded by | |
442 | matched brackets, such that any other (left or right) delimiter | |
443 | bracket \fIwithin\fR the substring is also matched by an opposite | |
444 | (right or left) delimiter bracket \fIat the same level of nesting\fR. Any | |
445 | type of bracket not in the delimiter list is treated as an ordinary | |
446 | character. | |
447 | .PP | |
448 | In other words, each type of bracket specified as a delimiter must be | |
449 | balanced and correctly nested within the substring, and any other kind of | |
450 | (\*(L"non\-delimiter\*(R") bracket in the substring is ignored. | |
451 | .PP | |
452 | For example, given the string: | |
453 | .PP | |
454 | .Vb 1 | |
455 | \& $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; | |
456 | .Ve | |
457 | .PP | |
458 | then a call to \f(CW\*(C`extract_bracketed\*(C'\fR in a list context: | |
459 | .PP | |
460 | .Vb 1 | |
461 | \& @result = extract_bracketed( $text, '{}' ); | |
462 | .Ve | |
463 | .PP | |
464 | would return: | |
465 | .PP | |
466 | .Vb 1 | |
467 | \& ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) | |
468 | .Ve | |
469 | .PP | |
470 | since both sets of \f(CW'{..}'\fR brackets are properly nested and evenly balanced. | |
471 | (In a scalar context just the first element of the array would be returned. In | |
472 | a void context, \f(CW$text\fR would be replaced by an empty string.) | |
473 | .PP | |
474 | Likewise the call in: | |
475 | .PP | |
476 | .Vb 1 | |
477 | \& @result = extract_bracketed( $text, '{[' ); | |
478 | .Ve | |
479 | .PP | |
480 | would return the same result, since all sets of both types of specified | |
481 | delimiter brackets are correctly nested and balanced. | |
482 | .PP | |
483 | However, the call in: | |
484 | .PP | |
485 | .Vb 1 | |
486 | \& @result = extract_bracketed( $text, '{([<' ); | |
487 | .Ve | |
488 | .PP | |
489 | would fail, returning: | |
490 | .PP | |
491 | .Vb 1 | |
492 | \& ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); | |
493 | .Ve | |
494 | .PP | |
495 | because the embedded pairs of \f(CW'(..)'\fRs and \f(CW'[..]'\fRs are \*(L"cross\-nested\*(R" and | |
496 | the embedded \f(CW'>'\fR is unbalanced. (In a scalar context, this call would | |
497 | return an empty string. In a void context, \f(CW$text\fR would be unchanged.) | |
498 | .PP | |
499 | Note that the embedded single-quotes in the string don't help in this | |
500 | case, since they have not been specified as acceptable delimiters and are | |
501 | therefore treated as non-delimiter characters (and ignored). | |
502 | .PP | |
503 | However, if a particular species of quote character is included in the | |
504 | delimiter specification, then that type of quote will be correctly handled. | |
505 | For example, if \f(CW$text\fR is: | |
506 | .PP | |
507 | .Vb 1 | |
508 | \& $text = '<A HREF=">>>>">link</A>'; | |
509 | .Ve | |
510 | .PP | |
511 | then | |
512 | .PP | |
513 | .Vb 1 | |
514 | \& @result = extract_bracketed( $text, '<">' ); | |
515 | .Ve | |
516 | .PP | |
517 | returns: | |
518 | .PP | |
519 | .Vb 1 | |
520 | \& ( '<A HREF=">>>>">', 'link</A>', "" ) | |
521 | .Ve | |
522 | .PP | |
523 | as expected. Without the specification of \f(CW\*(C`"\*(C'\fR as an embedded quoter: | |
524 | .PP | |
525 | .Vb 1 | |
526 | \& @result = extract_bracketed( $text, '<>' ); | |
527 | .Ve | |
528 | .PP | |
529 | the result would be: | |
530 | .PP | |
531 | .Vb 1 | |
532 | \& ( '<A HREF=">', '>>>">link</A>', "" ) | |
533 | .Ve | |
534 | .PP | |
535 | In addition to the quote delimiters \f(CW\*(C`'\*(C'\fR, \f(CW\*(C`"\*(C'\fR, and \f(CW\*(C``\*(C'\fR, full Perl quote-like | |
536 | quoting (i.e. q{string}, qq{string}, etc) can be specified by including the | |
537 | letter 'q' as a delimiter. Hence: | |
538 | .PP | |
539 | .Vb 1 | |
540 | \& @result = extract_bracketed( $text, '<q>' ); | |
541 | .Ve | |
542 | .PP | |
543 | would correctly match something like this: | |
544 | .PP | |
545 | .Vb 1 | |
546 | \& $text = '<leftop: conj /and/ conj>'; | |
547 | .Ve | |
548 | .PP | |
549 | See also: \f(CW"extract_quotelike"\fR and \f(CW"extract_codeblock"\fR. | |
550 | .ie n .Sh """extract_tagged""" | |
551 | .el .Sh "\f(CWextract_tagged\fP" | |
552 | .IX Subsection "extract_tagged" | |
553 | \&\f(CW\*(C`extract_tagged\*(C'\fR extracts and segments text between (balanced) | |
554 | specified tags. | |
555 | .PP | |
556 | The subroutine takes up to five optional arguments: | |
557 | .IP "1." 4 | |
558 | A string to be processed (\f(CW$_\fR if the string is omitted or \f(CW\*(C`undef\*(C'\fR) | |
559 | .IP "2." 4 | |
560 | A string specifying a pattern to be matched as the opening tag. | |
561 | If the pattern string is omitted (or \f(CW\*(C`undef\*(C'\fR) then a pattern | |
562 | that matches any standard \s-1HTML/XML\s0 tag is used. | |
563 | .IP "3." 4 | |
564 | A string specifying a pattern to be matched at the closing tag. | |
565 | If the pattern string is omitted (or \f(CW\*(C`undef\*(C'\fR) then the closing | |
566 | tag is constructed by inserting a \f(CW\*(C`/\*(C'\fR after any leading bracket | |
567 | characters in the actual opening tag that was matched (\fInot\fR the pattern | |
568 | that matched the tag). For example, if the opening tag pattern | |
569 | is specified as \f(CW'{{\ew+}}'\fR and actually matched the opening tag | |
570 | \&\f(CW"{{DATA}}"\fR, then the constructed closing tag would be \f(CW"{{/DATA}}"\fR. | |
571 | .IP "4." 4 | |
572 | A string specifying a pattern to be matched as a prefix (which is to be | |
573 | skipped). If omitted, optional whitespace is skipped. | |
574 | .IP "5." 4 | |
575 | A hash reference containing various parsing options (see below) | |
576 | .PP | |
577 | The various options that can be specified are: | |
578 | .ie n .IP """reject => $listref""" 4 | |
579 | .el .IP "\f(CWreject => $listref\fR" 4 | |
580 | .IX Item "reject => $listref" | |
581 | The list reference contains one or more strings specifying patterns | |
582 | that must \fInot\fR appear within the tagged text. | |
583 | .Sp | |
584 | For example, to extract | |
585 | an \s-1HTML\s0 link (which should not contain nested links) use: | |
586 | .Sp | |
587 | .Vb 1 | |
588 | \& extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); | |
589 | .Ve | |
590 | .ie n .IP """ignore => $listref""" 4 | |
591 | .el .IP "\f(CWignore => $listref\fR" 4 | |
592 | .IX Item "ignore => $listref" | |
593 | The list reference contains one or more strings specifying patterns | |
594 | that are \fInot\fR to be treated as nested tags within the tagged text | |
595 | (even if they would match the start tag pattern). | |
596 | .Sp | |
597 | For example, to extract an arbitrary \s-1XML\s0 tag, but ignore \*(L"empty\*(R" elements: | |
598 | .Sp | |
599 | .Vb 1 | |
600 | \& extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); | |
601 | .Ve | |
602 | .Sp | |
603 | (also see \*(L"gen_delimited_pat\*(R" below). | |
604 | .ie n .IP """fail => $str""" 4 | |
605 | .el .IP "\f(CWfail => $str\fR" 4 | |
606 | .IX Item "fail => $str" | |
607 | The \f(CW\*(C`fail\*(C'\fR option indicates the action to be taken if a matching end | |
608 | tag is not encountered (i.e. before the end of the string or some | |
609 | \&\f(CW\*(C`reject\*(C'\fR pattern matches). By default, a failure to match a closing | |
610 | tag causes \f(CW\*(C`extract_tagged\*(C'\fR to immediately fail. | |
611 | .Sp | |
612 | However, if the string value associated with the \f(CW\*(C`fail\*(C'\fR option is \*(L"\s-1MAX\s0\*(R", then | |
613 | \&\f(CW\*(C`extract_tagged\*(C'\fR returns the complete text up to the point of failure. | |
614 | If the string is \*(L"\s-1PARA\s0\*(R", \f(CW\*(C`extract_tagged\*(C'\fR returns only the first paragraph | |
615 | after the tag (up to the first line that is either empty or contains | |
616 | only whitespace characters). | |
617 | If the string is "", then the default behaviour (i.e. failure) is reinstated. | |
618 | .Sp | |
619 | For example, suppose the start tag \*(L"/para\*(R" introduces a paragraph, which then | |
620 | continues until the next \*(L"/endpara\*(R" tag or until another \*(L"/para\*(R" tag is | |
621 | encountered: | |
622 | .Sp | |
623 | .Vb 1 | |
624 | \& $text = "/para line 1\en\enline 3\en/para line 4"; | |
625 | .Ve | |
626 | .Sp | |
627 | .Vb 2 | |
628 | \& extract_tagged($text, '/para', '/endpara', undef, | |
629 | \& {reject => '/para', fail => MAX} ); | |
630 | .Ve | |
631 | .Sp | |
632 | .Vb 1 | |
633 | \& # EXTRACTED: "/para line 1\en\enline 3\en" | |
634 | .Ve | |
635 | .Sp | |
636 | Suppose instead, that if no matching \*(L"/endpara\*(R" tag is found, the \*(L"/para\*(R" | |
637 | tag refers only to the immediately following paragraph: | |
638 | .Sp | |
639 | .Vb 1 | |
640 | \& $text = "/para line 1\en\enline 3\en/para line 4"; | |
641 | .Ve | |
642 | .Sp | |
643 | .Vb 2 | |
644 | \& extract_tagged($text, '/para', '/endpara', undef, | |
645 | \& {reject => '/para', fail => PARA} ); | |
646 | .Ve | |
647 | .Sp | |
648 | .Vb 1 | |
649 | \& # EXTRACTED: "/para line 1\en" | |
650 | .Ve | |
651 | .Sp | |
652 | Note that the specified \f(CW\*(C`fail\*(C'\fR behaviour applies to nested tags as well. | |
653 | .PP | |
654 | On success in a list context, an array of 6 elements is returned. The elements are: | |
655 | .IP "[0]" 4 | |
656 | .IX Item "[0]" | |
657 | the extracted tagged substring (including the outermost tags), | |
658 | .IP "[1]" 4 | |
659 | .IX Item "[1]" | |
660 | the remainder of the input text, | |
661 | .IP "[2]" 4 | |
662 | .IX Item "[2]" | |
663 | the prefix substring (if any), | |
664 | .IP "[3]" 4 | |
665 | .IX Item "[3]" | |
666 | the opening tag | |
667 | .IP "[4]" 4 | |
668 | .IX Item "[4]" | |
669 | the text between the opening and closing tags | |
670 | .IP "[5]" 4 | |
671 | .IX Item "[5]" | |
672 | the closing tag (or "" if no closing tag was found) | |
673 | .PP | |
674 | On failure, all of these values (except the remaining text) are \f(CW\*(C`undef\*(C'\fR. | |
675 | .PP | |
676 | In a scalar context, \f(CW\*(C`extract_tagged\*(C'\fR returns just the complete | |
677 | substring that matched a tagged text (including the start and end | |
678 | tags). \f(CW\*(C`undef\*(C'\fR is returned on failure. In addition, the original input | |
679 | text has the returned substring (and any prefix) removed from it. | |
680 | .PP | |
681 | In a void context, the input text just has the matched substring (and | |
682 | any specified prefix) removed. | |
683 | .ie n .Sh """gen_extract_tagged""" | |
684 | .el .Sh "\f(CWgen_extract_tagged\fP" | |
685 | .IX Subsection "gen_extract_tagged" | |
686 | (Note: This subroutine is only available under Perl 5.005 or later.) | |
687 | .PP | |
688 | \&\f(CW\*(C`gen_extract_tagged\*(C'\fR generates a new anonymous subroutine which | |
689 | extracts text between (balanced) specified tags. In other words, | |
690 | it generates a function identical in function to \f(CW\*(C`extract_tagged\*(C'\fR. | |
691 | .PP | |
692 | The difference between \f(CW\*(C`extract_tagged\*(C'\fR and the anonymous | |
693 | subroutines generated by | |
694 | \&\f(CW\*(C`gen_extract_tagged\*(C'\fR, is that those generated subroutines: | |
695 | .IP "\(bu" 4 | |
696 | do not have to reparse tag specification or parsing options every time | |
697 | they are called (whereas \f(CW\*(C`extract_tagged\*(C'\fR has to effectively rebuild | |
698 | its tag parser on every call); | |
699 | .IP "\(bu" 4 | |
700 | make use of the new qr// construct to pre-compile the regexes they use | |
701 | (whereas \f(CW\*(C`extract_tagged\*(C'\fR uses standard string variable interpolation | |
702 | to create tag-matching patterns). | |
703 | .PP | |
704 | The subroutine takes up to four optional arguments (the same set as | |
705 | \&\f(CW\*(C`extract_tagged\*(C'\fR except for the string to be processed). It returns | |
706 | a reference to a subroutine which in turn takes a single argument (the text to | |
707 | be extracted from). | |
708 | .PP | |
709 | In other words, the implementation of \f(CW\*(C`extract_tagged\*(C'\fR is exactly | |
710 | equivalent to: | |
711 | .PP | |
712 | .Vb 6 | |
713 | \& sub extract_tagged | |
714 | \& { | |
715 | \& my $text = shift; | |
716 | \& $extractor = gen_extract_tagged(@_); | |
717 | \& return $extractor->($text); | |
718 | \& } | |
719 | .Ve | |
720 | .PP | |
721 | (although \f(CW\*(C`extract_tagged\*(C'\fR is not currently implemented that way, in order | |
722 | to preserve pre\-5.005 compatibility). | |
723 | .PP | |
724 | Using \f(CW\*(C`gen_extract_tagged\*(C'\fR to create extraction functions for specific tags | |
725 | is a good idea if those functions are going to be called more than once, since | |
726 | their performance is typically twice as good as the more general-purpose | |
727 | \&\f(CW\*(C`extract_tagged\*(C'\fR. | |
728 | .ie n .Sh """extract_quotelike""" | |
729 | .el .Sh "\f(CWextract_quotelike\fP" | |
730 | .IX Subsection "extract_quotelike" | |
731 | \&\f(CW\*(C`extract_quotelike\*(C'\fR attempts to recognize, extract, and segment any | |
732 | one of the various Perl quotes and quotelike operators (see | |
733 | \&\fIperlop\fR\|(3)). Nested backslashed delimiters, embedded balanced bracket | |
734 | delimiters (for the quotelike operators), and trailing modifiers are | |
735 | all caught. For example, in: | |
736 | .PP | |
737 | .Vb 1 | |
738 | \& extract_quotelike 'q # an octothorpe: \e# (not the end of the q!) #' | |
739 | .Ve | |
740 | .PP | |
741 | .Vb 1 | |
742 | \& extract_quotelike ' "You said, \e"Use sed\e"." ' | |
743 | .Ve | |
744 | .PP | |
745 | .Vb 1 | |
746 | \& extract_quotelike ' s{([A-Z]{1,8}\e.[A-Z]{3})} /\eL$1\eE/; ' | |
747 | .Ve | |
748 | .PP | |
749 | .Vb 1 | |
750 | \& extract_quotelike ' tr/\e\e\e/\e\e\e\e/\e\e\e//ds; ' | |
751 | .Ve | |
752 | .PP | |
753 | the full Perl quotelike operations are all extracted correctly. | |
754 | .PP | |
755 | Note too that, when using the /x modifier on a regex, any comment | |
756 | containing the current pattern delimiter will cause the regex to be | |
757 | immediately terminated. In other words: | |
758 | .PP | |
759 | .Vb 5 | |
760 | \& 'm / | |
761 | \& (?i) # CASE INSENSITIVE | |
762 | \& [a-z_] # LEADING ALPHABETIC/UNDERSCORE | |
763 | \& [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS | |
764 | \& /x' | |
765 | .Ve | |
766 | .PP | |
767 | will be extracted as if it were: | |
768 | .PP | |
769 | .Vb 3 | |
770 | \& 'm / | |
771 | \& (?i) # CASE INSENSITIVE | |
772 | \& [a-z_] # LEADING ALPHABETIC/' | |
773 | .Ve | |
774 | .PP | |
775 | This behaviour is identical to that of the Perl 5.004 interpreter. | |
776 | .PP | |
777 | \&\f(CW\*(C`extract_quotelike\*(C'\fR takes two arguments: the text to be processed and | |
778 | a prefix to be matched at the very beginning of the text. If no prefix | |
779 | is specified, optional whitespace is the default. If no text is given, | |
780 | \&\f(CW$_\fR is used. | |
781 | .PP | |
782 | In a list context, an array of 11 elements is returned. The elements are: | |
783 | .IP "[0]" 4 | |
784 | .IX Item "[0]" | |
785 | the extracted quotelike substring (including trailing modifiers), | |
786 | .IP "[1]" 4 | |
787 | .IX Item "[1]" | |
788 | the remainder of the input text, | |
789 | .IP "[2]" 4 | |
790 | .IX Item "[2]" | |
791 | the prefix substring (if any), | |
792 | .IP "[3]" 4 | |
793 | .IX Item "[3]" | |
794 | the name of the quotelike operator (if any), | |
795 | .IP "[4]" 4 | |
796 | .IX Item "[4]" | |
797 | the left delimiter of the first block of the operation, | |
798 | .IP "[5]" 4 | |
799 | .IX Item "[5]" | |
800 | the text of the first block of the operation | |
801 | (that is, the contents of | |
802 | a quote, the regex of a match or substitution or the target list of a | |
803 | translation), | |
804 | .IP "[6]" 4 | |
805 | .IX Item "[6]" | |
806 | the right delimiter of the first block of the operation, | |
807 | .IP "[7]" 4 | |
808 | .IX Item "[7]" | |
809 | the left delimiter of the second block of the operation | |
810 | (that is, if it is a \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, or \f(CW\*(C`y\*(C'\fR), | |
811 | .IP "[8]" 4 | |
812 | .IX Item "[8]" | |
813 | the text of the second block of the operation | |
814 | (that is, the replacement of a substitution or the translation list | |
815 | of a translation), | |
816 | .IP "[9]" 4 | |
817 | .IX Item "[9]" | |
818 | the right delimiter of the second block of the operation (if any), | |
819 | .IP "[10]" 4 | |
820 | .IX Item "[10]" | |
821 | the trailing modifiers on the operation (if any). | |
822 | .PP | |
823 | For each of the fields marked \*(L"(if any)\*(R" the default value on success is | |
824 | an empty string. | |
825 | On failure, all of these values (except the remaining text) are \f(CW\*(C`undef\*(C'\fR. | |
826 | .PP | |
827 | In a scalar context, \f(CW\*(C`extract_quotelike\*(C'\fR returns just the complete substring | |
828 | that matched a quotelike operation (or \f(CW\*(C`undef\*(C'\fR on failure). In a scalar or | |
829 | void context, the input text has the same substring (and any specified | |
830 | prefix) removed. | |
831 | .PP | |
832 | Examples: | |
833 | .PP | |
834 | .Vb 1 | |
835 | \& # Remove the first quotelike literal that appears in text | |
836 | .Ve | |
837 | .PP | |
838 | .Vb 1 | |
839 | \& $quotelike = extract_quotelike($text,'.*?'); | |
840 | .Ve | |
841 | .PP | |
842 | .Vb 2 | |
843 | \& # Replace one or more leading whitespace-separated quotelike | |
844 | \& # literals in $_ with "<QLL>" | |
845 | .Ve | |
846 | .PP | |
847 | .Vb 1 | |
848 | \& do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; | |
849 | .Ve | |
850 | .PP | |
851 | .Vb 1 | |
852 | \& # Isolate the search pattern in a quotelike operation from $text | |
853 | .Ve | |
854 | .PP | |
855 | .Vb 9 | |
856 | \& ($op,$pat) = (extract_quotelike $text)[3,5]; | |
857 | \& if ($op =~ /[ms]/) | |
858 | \& { | |
859 | \& print "search pattern: $pat\en"; | |
860 | \& } | |
861 | \& else | |
862 | \& { | |
863 | \& print "$op is not a pattern matching operation\en"; | |
864 | \& } | |
865 | .Ve | |
866 | .ie n .Sh """extract_codeblock""" | |
867 | .el .Sh "\f(CWextract_codeblock\fP" | |
868 | .IX Subsection "extract_codeblock" | |
869 | \&\f(CW\*(C`extract_codeblock\*(C'\fR attempts to recognize and extract a balanced | |
870 | bracket delimited substring that may contain unbalanced brackets | |
871 | inside Perl quotes or quotelike operations. That is, \f(CW\*(C`extract_codeblock\*(C'\fR | |
872 | is like a combination of \f(CW"extract_bracketed"\fR and | |
873 | \&\f(CW"extract_quotelike"\fR. | |
874 | .PP | |
875 | \&\f(CW\*(C`extract_codeblock\*(C'\fR takes the same initial three parameters as \f(CW\*(C`extract_bracketed\*(C'\fR: | |
876 | a text to process, a set of delimiter brackets to look for, and a prefix to | |
877 | match first. It also takes an optional fourth parameter, which allows the | |
878 | outermost delimiter brackets to be specified separately (see below). | |
879 | .PP | |
880 | Omitting the first argument (input text) means process \f(CW$_\fR instead. | |
881 | Omitting the second argument (delimiter brackets) indicates that only \f(CW'{'\fR is to be used. | |
882 | Omitting the third argument (prefix argument) implies optional whitespace at the start. | |
883 | Omitting the fourth argument (outermost delimiter brackets) indicates that the | |
884 | value of the second argument is to be used for the outermost delimiters. | |
885 | .PP | |
886 | Once the prefix and the outermost opening delimiter bracket have been | |
887 | recognized, code blocks are extracted by stepping through the input text and | |
888 | trying the following alternatives in sequence: | |
889 | .IP "1." 4 | |
890 | Try and match a closing delimiter bracket. If the bracket was the same | |
891 | species as the last opening bracket, return the substring to that | |
892 | point. If the bracket was mismatched, return an error. | |
893 | .IP "2." 4 | |
894 | Try to match a quote or quotelike operator. If found, call | |
895 | \&\f(CW\*(C`extract_quotelike\*(C'\fR to eat it. If \f(CW\*(C`extract_quotelike\*(C'\fR fails, return | |
896 | the error it returned. Otherwise go back to step 1. | |
897 | .IP "3." 4 | |
898 | Try to match an opening delimiter bracket. If found, call | |
899 | \&\f(CW\*(C`extract_codeblock\*(C'\fR recursively to eat the embedded block. If the | |
900 | recursive call fails, return an error. Otherwise, go back to step 1. | |
901 | .IP "4." 4 | |
902 | Unconditionally match a bareword or any other single character, and | |
903 | then go back to step 1. | |
904 | .PP | |
905 | Examples: | |
906 | .PP | |
907 | .Vb 1 | |
908 | \& # Find a while loop in the text | |
909 | .Ve | |
910 | .PP | |
911 | .Vb 4 | |
912 | \& if ($text =~ s/.*?while\es*\e{/{/) | |
913 | \& { | |
914 | \& $loop = "while " . extract_codeblock($text); | |
915 | \& } | |
916 | .Ve | |
917 | .PP | |
918 | .Vb 2 | |
919 | \& # Remove the first round-bracketed list (which may include | |
920 | \& # round- or curly-bracketed code blocks or quotelike operators) | |
921 | .Ve | |
922 | .PP | |
923 | .Vb 1 | |
924 | \& extract_codeblock $text, "(){}", '[^(]*'; | |
925 | .Ve | |
926 | .PP | |
927 | The ability to specify a different outermost delimiter bracket is useful | |
928 | in some circumstances. For example, in the Parse::RecDescent module, | |
929 | parser actions which are to be performed only on a successful parse | |
930 | are specified using a \f(CW\*(C`<defer:...>\*(C'\fR directive. For example: | |
931 | .PP | |
932 | .Vb 2 | |
933 | \& sentence: subject verb object | |
934 | \& <defer: {$::theVerb = $item{verb}} > | |
935 | .Ve | |
936 | .PP | |
937 | Parse::RecDescent uses \f(CW\*(C`extract_codeblock($text, '{}<>')\*(C'\fR to extract the code | |
938 | within the \f(CW\*(C`<defer:...>\*(C'\fR directive, but there's a problem. | |
939 | .PP | |
940 | A deferred action like this: | |
941 | .PP | |
942 | .Vb 1 | |
943 | \& <defer: {if ($count>10) {$count--}} > | |
944 | .Ve | |
945 | .PP | |
946 | will be incorrectly parsed as: | |
947 | .PP | |
948 | .Vb 1 | |
949 | \& <defer: {if ($count> | |
950 | .Ve | |
951 | .PP | |
952 | because the \*(L"greater than\*(R" operator is interpreted as a closing delimiter. | |
953 | .PP | |
954 | But, by extracting the directive using | |
955 | \&\f(CW\*(C`extract_codeblock($text,\ '{}',\ undef,\ '<>')\*(C'\fR | |
956 | the '>' character is only treated as a delimiter at the outermost | |
957 | level of the code block, so the directive is parsed correctly. | |
958 | .ie n .Sh """extract_multiple""" | |
959 | .el .Sh "\f(CWextract_multiple\fP" | |
960 | .IX Subsection "extract_multiple" | |
961 | The \f(CW\*(C`extract_multiple\*(C'\fR subroutine takes a string to be processed and a | |
962 | list of extractors (subroutines or regular expressions) to apply to that string. | |
963 | .PP | |
964 | In an array context \f(CW\*(C`extract_multiple\*(C'\fR returns an array of substrings | |
965 | of the original string, as extracted by the specified extractors. | |
966 | In a scalar context, \f(CW\*(C`extract_multiple\*(C'\fR returns the first | |
967 | substring successfully extracted from the original string. In both | |
968 | scalar and void contexts the original string has the first successfully | |
969 | extracted substring removed from it. In all contexts | |
970 | \&\f(CW\*(C`extract_multiple\*(C'\fR starts at the current \f(CW\*(C`pos\*(C'\fR of the string, and | |
971 | sets that \f(CW\*(C`pos\*(C'\fR appropriately after it matches. | |
972 | .PP | |
973 | Hence, the aim of a call to \f(CW\*(C`extract_multiple\*(C'\fR in a list context | |
974 | is to split the processed string into as many non-overlapping fields as | |
975 | possible, by repeatedly applying each of the specified extractors | |
976 | to the remainder of the string. Thus \f(CW\*(C`extract_multiple\*(C'\fR is | |
977 | a generalized form of Perl's \f(CW\*(C`split\*(C'\fR subroutine. | |
978 | .PP | |
979 | The subroutine takes up to four optional arguments: | |
980 | .IP "1." 4 | |
981 | A string to be processed (\f(CW$_\fR if the string is omitted or \f(CW\*(C`undef\*(C'\fR) | |
982 | .IP "2." 4 | |
983 | A reference to a list of subroutine references and/or qr// objects and/or | |
984 | literal strings and/or hash references, specifying the extractors | |
985 | to be used to split the string. If this argument is omitted (or | |
986 | \&\f(CW\*(C`undef\*(C'\fR) the list: | |
987 | .Sp | |
988 | .Vb 5 | |
989 | \& [ | |
990 | \& sub { extract_variable($_[0], '') }, | |
991 | \& sub { extract_quotelike($_[0],'') }, | |
992 | \& sub { extract_codeblock($_[0],'{}','') }, | |
993 | \& ] | |
994 | .Ve | |
995 | .Sp | |
996 | is used. | |
997 | .IP "3." 4 | |
998 | A number specifying the maximum number of fields to return. If this | |
999 | argument is omitted (or \f(CW\*(C`undef\*(C'\fR), split continues as long as possible. | |
1000 | .Sp | |
1001 | If the third argument is \fIN\fR, then extraction continues until \fIN\fR fields | |
1002 | have been successfully extracted, or until the string has been completely | |
1003 | processed. | |
1004 | .Sp | |
1005 | Note that in scalar and void contexts the value of this argument is | |
1006 | automatically reset to 1 (under \f(CW\*(C`\-w\*(C'\fR, a warning is issued if the argument | |
1007 | has to be reset). | |
1008 | .IP "4." 4 | |
1009 | A value indicating whether unmatched substrings (see below) within the | |
1010 | text should be skipped or returned as fields. If the value is true, | |
1011 | such substrings are skipped. Otherwise, they are returned. | |
1012 | .PP | |
1013 | The extraction process works by applying each extractor in | |
1014 | sequence to the text string. If the extractor is a subroutine it | |
1015 | is called in a list | |
1016 | context and is expected to return a list of a single element, namely | |
1017 | the extracted text. | |
1018 | Note that the value returned by an extractor subroutine need not bear any | |
1019 | relationship to the corresponding substring of the original text (see | |
1020 | examples below). | |
1021 | .PP | |
1022 | If the extractor is a precompiled regular expression or a string, | |
1023 | it is matched against the text in a scalar context with a leading | |
1024 | \&'\eG' and the gc modifiers enabled. The extracted value is either | |
1025 | \&\f(CW$1\fR if that variable is defined after the match, or else the | |
1026 | complete match (i.e. $&). | |
1027 | .PP | |
1028 | If the extractor is a hash reference, it must contain exactly one element. | |
1029 | The value of that element is one of the | |
1030 | above extractor types (subroutine reference, regular expression, or string). | |
1031 | The key of that element is the name of a class into which the successful | |
1032 | return value of the extractor will be blessed. | |
1033 | .PP | |
1034 | If an extractor returns a defined value, that value is immediately | |
1035 | treated as the next extracted field and pushed onto the list of fields. | |
1036 | If the extractor was specified in a hash reference, the field is also | |
1037 | blessed into the appropriate class. | |
1038 | .PP | |
1039 | If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is | |
1040 | assumed to have failed to extract. | |
1041 | If none of the extractor subroutines succeeds, then one | |
1042 | character is extracted from the start of the text and the extraction | |
1043 | subroutines reapplied. Characters which are thus removed are accumulated and | |
1044 | eventually become the next field (unless the fourth argument is true, in which | |
1045 | case they are discarded). | |
1046 | .PP | |
1047 | For example, the following extracts substrings that are valid Perl variables: | |
1048 | .PP | |
1049 | .Vb 3 | |
1050 | \& @fields = extract_multiple($text, | |
1051 | \& [ sub { extract_variable($_[0]) } ], | |
1052 | \& undef, 1); | |
1053 | .Ve | |
1054 | .PP | |
1055 | This example separates a text into fields which are quote delimited, | |
1056 | curly bracketed, and anything else. The delimited and bracketed | |
1057 | parts are also blessed to identify them (the \*(L"anything else\*(R" is unblessed): | |
1058 | .PP | |
1059 | .Vb 5 | |
1060 | \& @fields = extract_multiple($text, | |
1061 | \& [ | |
1062 | \& { Delim => sub { extract_delimited($_[0],q{'"}) } }, | |
1063 | \& { Brack => sub { extract_bracketed($_[0],'{}') } }, | |
1064 | \& ]); | |
1065 | .Ve | |
1066 | .PP | |
1067 | This call extracts the next single substring that is a valid Perl quotelike | |
1068 | operator (and removes it from \f(CW$text\fR): | |
1069 | .PP | |
1070 | .Vb 4 | |
1071 | \& $quotelike = extract_multiple($text, | |
1072 | \& [ | |
1073 | \& sub { extract_quotelike($_[0]) }, | |
1074 | \& ], undef, 1); | |
1075 | .Ve | |
1076 | .PP | |
1077 | Finally, here is yet another way to do comma-separated value parsing: | |
1078 | .PP | |
1079 | .Vb 6 | |
1080 | \& @fields = extract_multiple($csv_text, | |
1081 | \& [ | |
1082 | \& sub { extract_delimited($_[0],q{'"}) }, | |
1083 | \& qr/([^,]+)(.*)/, | |
1084 | \& ], | |
1085 | \& undef,1); | |
1086 | .Ve | |
1087 | .PP | |
1088 | The list in the second argument means: | |
1089 | \&\fI\*(L"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...\*(R"\fR. | |
1090 | The undef third argument means: | |
1091 | \&\fI\*(L"...as many times as possible...\*(R"\fR, | |
1092 | and the true value in the fourth argument means | |
1093 | \&\fI\*(L"...discarding anything else that appears (i.e. the commas)\*(R"\fR. | |
1094 | .PP | |
1095 | If you wanted the commas preserved as separate fields (i.e. like split | |
1096 | does if your split pattern has capturing parentheses), you would | |
1097 | just make the last parameter undefined (or remove it). | |
1098 | .ie n .Sh """gen_delimited_pat""" | |
1099 | .el .Sh "\f(CWgen_delimited_pat\fP" | |
1100 | .IX Subsection "gen_delimited_pat" | |
1101 | The \f(CW\*(C`gen_delimited_pat\*(C'\fR subroutine takes a single (string) argument and | |
1102 | builds a Friedl-style optimized regex that matches a string delimited | |
1103 | by any one of the characters in the single argument. For example: | |
1104 | .PP | |
1105 | .Vb 1 | |
1106 | \& gen_delimited_pat(q{'"}) | |
1107 | .Ve | |
1108 | .PP | |
1109 | returns the regex: | |
1110 | .PP | |
1111 | .Vb 1 | |
1112 | \& (?:\e"(?:\e\e\e"|(?!\e").)*\e"|\e'(?:\e\e\e'|(?!\e').)*\e') | |
1113 | .Ve | |
1114 | .PP | |
1115 | Note that the specified delimiters are automatically quotemeta'd. | |
1116 | .PP | |
1117 | A typical use of \f(CW\*(C`gen_delimited_pat\*(C'\fR would be to build special purpose tags | |
1118 | for \f(CW\*(C`extract_tagged\*(C'\fR. For example, to properly ignore \*(L"empty\*(R" \s-1XML\s0 elements | |
1119 | (which might contain quoted strings): | |
1120 | .PP | |
1121 | .Vb 1 | |
1122 | \& my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; | |
1123 | .Ve | |
1124 | .PP | |
1125 | .Vb 1 | |
1126 | \& extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); | |
1127 | .Ve | |
1128 | .PP | |
1129 | \&\f(CW\*(C`gen_delimited_pat\*(C'\fR may also be called with an optional second argument, | |
1130 | which specifies the \*(L"escape\*(R" character(s) to be used for each delimiter. | |
1131 | For example to match a Pascal-style string (where ' is the delimiter | |
1132 | and '' is a literal ' within the string): | |
1133 | .PP | |
1134 | .Vb 1 | |
1135 | \& gen_delimited_pat(q{'},q{'}); | |
1136 | .Ve | |
1137 | .PP | |
1138 | Different escape characters can be specified for different delimiters. | |
1139 | For example, to specify that '/' is the escape for single quotes | |
1140 | and '%' is the escape for double quotes: | |
1141 | .PP | |
1142 | .Vb 1 | |
1143 | \& gen_delimited_pat(q{'"},q{/%}); | |
1144 | .Ve | |
1145 | .PP | |
1146 | If more delimiters than escape chars are specified, the last escape char | |
1147 | is used for the remaining delimiters. | |
1148 | If no escape char is specified for a given specified delimiter, '\e' is used. | |
1149 | .PP | |
1150 | Note that | |
1151 | \&\f(CW\*(C`gen_delimited_pat\*(C'\fR was previously called | |
1152 | \&\f(CW\*(C`delimited_pat\*(C'\fR. That name may still be used, but is now deprecated. | |
1153 | .SH "DIAGNOSTICS" | |
1154 | .IX Header "DIAGNOSTICS" | |
1155 | In a list context, all the functions return \f(CW\*(C`(undef,$original_text)\*(C'\fR | |
1156 | on failure. In a scalar context, failure is indicated by returning \f(CW\*(C`undef\*(C'\fR | |
1157 | (in this case the input text is not modified in any way). | |
1158 | .PP | |
1159 | In addition, on failure in \fIany\fR context, one of the following explanatory | |
1160 | diagnostic messages is returned in the standard \f(CW$@\fR variable (on success the | |
1161 | \&\f(CW$@\fR variable is guaranteed to be \f(CW\*(C`undef\*(C'\fR): | |
1162 | .ie n .IP """Did not find a suitable bracket: ""%s""""" 4 | |
1163 | .el .IP "\f(CWDid not find a suitable bracket: ``%s''\fR" 4 | |
1164 | .IX Item "Did not find a suitable bracket: ""%s""" | |
1165 | The delimiter provided to \f(CW\*(C`extract_bracketed\*(C'\fR was not one of | |
1166 | \&\f(CW'()[]<>{}'\fR. | |
1167 | .ie n .IP """Did not find prefix: /%s/""" 4 | |
1168 | .el .IP "\f(CWDid not find prefix: /%s/\fR" 4 | |
1169 | .IX Item "Did not find prefix: /%s/" | |
1170 | A non-optional prefix was specified but wasn't found at the start of the text. | |
1171 | .ie n .IP """Did not find opening bracket after prefix: ""%s""""" 4 | |
1172 | .el .IP "\f(CWDid not find opening bracket after prefix: ``%s''\fR" 4 | |
1173 | .IX Item "Did not find opening bracket after prefix: ""%s""" | |
1174 | \&\f(CW\*(C`extract_bracketed\*(C'\fR or \f(CW\*(C`extract_codeblock\*(C'\fR was expecting a | |
1175 | particular kind of bracket at the start of the text, and didn't find it. | |
1176 | .ie n .IP """No quotelike operator found after prefix: ""%s""""" 4 | |
1177 | .el .IP "\f(CWNo quotelike operator found after prefix: ``%s''\fR" 4 | |
1178 | .IX Item "No quotelike operator found after prefix: ""%s""" | |
1179 | \&\f(CW\*(C`extract_quotelike\*(C'\fR didn't find one of the quotelike operators \f(CW\*(C`q\*(C'\fR, | |
1180 | \&\f(CW\*(C`qq\*(C'\fR, \f(CW\*(C`qw\*(C'\fR, \f(CW\*(C`qx\*(C'\fR, \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR or \f(CW\*(C`y\*(C'\fR at the start of the substring | |
1181 | it was extracting. | |
1182 | .ie n .IP """Unmatched closing bracket: ""%c""""" 4 | |
1183 | .el .IP "\f(CWUnmatched closing bracket: ``%c''\fR" 4 | |
1184 | .IX Item "Unmatched closing bracket: ""%c""" | |
1185 | \&\f(CW\*(C`extract_bracketed\*(C'\fR, \f(CW\*(C`extract_quotelike\*(C'\fR or \f(CW\*(C`extract_codeblock\*(C'\fR encountered | |
1186 | a closing bracket where none was expected. | |
1187 | .ie n .IP """Unmatched opening bracket(s): ""%s""""" 4 | |
1188 | .el .IP "\f(CWUnmatched opening bracket(s): ``%s''\fR" 4 | |
1189 | .IX Item "Unmatched opening bracket(s): ""%s""" | |
1190 | \&\f(CW\*(C`extract_bracketed\*(C'\fR, \f(CW\*(C`extract_quotelike\*(C'\fR or \f(CW\*(C`extract_codeblock\*(C'\fR ran | |
1191 | out of characters in the text before closing one or more levels of nested | |
1192 | brackets. | |
1193 | .ie n .IP """Unmatched embedded quote (%s)""" 4 | |
1194 | .el .IP "\f(CWUnmatched embedded quote (%s)\fR" 4 | |
1195 | .IX Item "Unmatched embedded quote (%s)" | |
1196 | \&\f(CW\*(C`extract_bracketed\*(C'\fR attempted to match an embedded quoted substring, but | |
1197 | failed to find a closing quote to match it. | |
1198 | .ie n .IP """Did not find closing delimiter to match '%s'""" 4 | |
1199 | .el .IP "\f(CWDid not find closing delimiter to match '%s'\fR" 4 | |
1200 | .IX Item "Did not find closing delimiter to match '%s'" | |
1201 | \&\f(CW\*(C`extract_quotelike\*(C'\fR was unable to find a closing delimiter to match the | |
1202 | one that opened the quote-like operation. | |
1203 | .ie n .IP """Mismatched closing bracket: expected ""%c"" but found ""%s""""" 4 | |
1204 | .el .IP "\f(CWMismatched closing bracket: expected ``%c'' but found ``%s''\fR" 4 | |
1205 | .IX Item "Mismatched closing bracket: expected ""%c"" but found ""%s""" | |
1206 | \&\f(CW\*(C`extract_bracketed\*(C'\fR, \f(CW\*(C`extract_quotelike\*(C'\fR or \f(CW\*(C`extract_codeblock\*(C'\fR found | |
1207 | a valid bracket delimiter, but it was the wrong species. This usually | |
1208 | indicates a nesting error, but may indicate incorrect quoting or escaping. | |
1209 | .ie n .IP """No block delimiter found after quotelike ""%s""""" 4 | |
1210 | .el .IP "\f(CWNo block delimiter found after quotelike ``%s''\fR" 4 | |
1211 | .IX Item "No block delimiter found after quotelike ""%s""" | |
1212 | \&\f(CW\*(C`extract_quotelike\*(C'\fR or \f(CW\*(C`extract_codeblock\*(C'\fR found one of the | |
1213 | quotelike operators \f(CW\*(C`q\*(C'\fR, \f(CW\*(C`qq\*(C'\fR, \f(CW\*(C`qw\*(C'\fR, \f(CW\*(C`qx\*(C'\fR, \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR or \f(CW\*(C`y\*(C'\fR | |
1214 | without a suitable block after it. | |
1215 | .ie n .IP """Did not find leading dereferencer""" 4 | |
1216 | .el .IP "\f(CWDid not find leading dereferencer\fR" 4 | |
1217 | .IX Item "Did not find leading dereferencer" | |
1218 | \&\f(CW\*(C`extract_variable\*(C'\fR was expecting one of '$', '@', or '%' at the start of | |
1219 | a variable, but didn't find any of them. | |
1220 | .ie n .IP """Bad identifier after dereferencer""" 4 | |
1221 | .el .IP "\f(CWBad identifier after dereferencer\fR" 4 | |
1222 | .IX Item "Bad identifier after dereferencer" | |
1223 | \&\f(CW\*(C`extract_variable\*(C'\fR found a '$', '@', or '%' indicating a variable, but that | |
1224 | character was not followed by a legal Perl identifier. | |
1225 | .ie n .IP """Did not find expected opening bracket at %s""" 4 | |
1226 | .el .IP "\f(CWDid not find expected opening bracket at %s\fR" 4 | |
1227 | .IX Item "Did not find expected opening bracket at %s" | |
1228 | \&\f(CW\*(C`extract_codeblock\*(C'\fR failed to find any of the outermost opening brackets | |
1229 | that were specified. | |
1230 | .ie n .IP """Improperly nested codeblock at %s""" 4 | |
1231 | .el .IP "\f(CWImproperly nested codeblock at %s\fR" 4 | |
1232 | .IX Item "Improperly nested codeblock at %s" | |
1233 | A nested code block was found that started with a delimiter that was specified | |
1234 | as being only to be used as an outermost bracket. | |
1235 | .ie n .IP """Missing second block for quotelike ""%s""""" 4 | |
1236 | .el .IP "\f(CWMissing second block for quotelike ``%s''\fR" 4 | |
1237 | .IX Item "Missing second block for quotelike ""%s""" | |
1238 | \&\f(CW\*(C`extract_codeblock\*(C'\fR or \f(CW\*(C`extract_quotelike\*(C'\fR found one of the | |
1239 | quotelike operators \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR or \f(CW\*(C`y\*(C'\fR followed by only one block. | |
1240 | .ie n .IP """No match found for opening bracket""" 4 | |
1241 | .el .IP "\f(CWNo match found for opening bracket\fR" 4 | |
1242 | .IX Item "No match found for opening bracket" | |
1243 | \&\f(CW\*(C`extract_codeblock\*(C'\fR failed to find a closing bracket to match the outermost | |
1244 | opening bracket. | |
1245 | .ie n .IP """Did not find opening tag: /%s/""" 4 | |
1246 | .el .IP "\f(CWDid not find opening tag: /%s/\fR" 4 | |
1247 | .IX Item "Did not find opening tag: /%s/" | |
1248 | \&\f(CW\*(C`extract_tagged\*(C'\fR did not find a suitable opening tag (after any specified | |
1249 | prefix was removed). | |
1250 | .ie n .IP """Unable to construct closing tag to match: /%s/""" 4 | |
1251 | .el .IP "\f(CWUnable to construct closing tag to match: /%s/\fR" 4 | |
1252 | .IX Item "Unable to construct closing tag to match: /%s/" | |
1253 | \&\f(CW\*(C`extract_tagged\*(C'\fR matched the specified opening tag and tried to | |
1254 | modify the matched text to produce a matching closing tag (because | |
1255 | none was specified). It failed to generate the closing tag, almost | |
1256 | certainly because the opening tag did not start with a | |
1257 | bracket of some kind. | |
1258 | .ie n .IP """Found invalid nested tag: %s""" 4 | |
1259 | .el .IP "\f(CWFound invalid nested tag: %s\fR" 4 | |
1260 | .IX Item "Found invalid nested tag: %s" | |
1261 | \&\f(CW\*(C`extract_tagged\*(C'\fR found a nested tag that appeared in the \*(L"reject\*(R" list | |
1262 | (and the failure mode was not \*(L"\s-1MAX\s0\*(R" or \*(L"\s-1PARA\s0\*(R"). | |
1263 | .ie n .IP """Found unbalanced nested tag: %s""" 4 | |
1264 | .el .IP "\f(CWFound unbalanced nested tag: %s\fR" 4 | |
1265 | .IX Item "Found unbalanced nested tag: %s" | |
1266 | \&\f(CW\*(C`extract_tagged\*(C'\fR found a nested opening tag that was not matched by a | |
1267 | corresponding nested closing tag (and the failure mode was not \*(L"\s-1MAX\s0\*(R" or \*(L"\s-1PARA\s0\*(R"). | |
1268 | .ie n .IP """Did not find closing tag""" 4 | |
1269 | .el .IP "\f(CWDid not find closing tag\fR" 4 | |
1270 | .IX Item "Did not find closing tag" | |
1271 | \&\f(CW\*(C`extract_tagged\*(C'\fR reached the end of the text without finding a closing tag | |
1272 | to match the original opening tag (and the failure mode was not | |
1273 | \&\*(L"\s-1MAX\s0\*(R" or \*(L"\s-1PARA\s0\*(R"). | |
1274 | .SH "AUTHOR" | |
1275 | .IX Header "AUTHOR" | |
1276 | Damian Conway (damian@conway.org) | |
1277 | .SH "BUGS AND IRRITATIONS" | |
1278 | .IX Header "BUGS AND IRRITATIONS" | |
1279 | There are undoubtedly serious bugs lurking somewhere in this code, if | |
1280 | only because parts of it give the impression of understanding a great deal | |
1281 | more about Perl than they really do. | |
1282 | .PP | |
1283 | Bug reports and other feedback are most welcome. | |
1284 | .SH "COPYRIGHT" | |
1285 | .IX Header "COPYRIGHT" | |
1286 | .Vb 4 | |
1287 | \& Copyright (c) 1997-2000, Damian Conway. All Rights Reserved. | |
1288 | \& This module is free software. It may be used, redistributed | |
1289 | \&and/or modified under the terms of the Perl Artistic License | |
1290 | \& (see http://www.perl.com/perl/misc/Artistic.html) | |
1291 | .Ve |