Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / lib / site_perl / 5.8.0 / Text / Balanced.pod
CommitLineData
86530b38
AT
1=head1 NAME
2
3Text::Balanced - Extract delimited text sequences from strings.
4
5
6=head1 SYNOPSIS
7
8 use Text::Balanced qw (
9 extract_delimited
10 extract_bracketed
11 extract_quotelike
12 extract_codeblock
13 extract_variable
14 extract_tagged
15 extract_multiple
16
17 gen_delimited_pat
18 gen_extract_tagged
19 );
20
21 # Extract the initial substring of $text that is delimited by
22 # two (unescaped) instances of the first character in $delim.
23
24 ($extracted, $remainder) = extract_delimited($text,$delim);
25
26
27 # Extract the initial substring of $text that is bracketed
28 # with a delimiter(s) specified by $delim (where the string
29 # in $delim contains one or more of '(){}[]<>').
30
31 ($extracted, $remainder) = extract_bracketed($text,$delim);
32
33
34 # Extract the initial substring of $text that is bounded by
35 # an HTML/XML tag.
36
37 ($extracted, $remainder) = extract_tagged($text);
38
39
40 # Extract the initial substring of $text that is bounded by
41 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
42
43 ($extracted, $remainder) =
44 extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
45
46
47 # Extract the initial substring of $text that represents a
48 # Perl "quote or quote-like operation"
49
50 ($extracted, $remainder) = extract_quotelike($text);
51
52
53 # Extract the initial substring of $text that represents a block
54 # of Perl code, bracketed by any of character(s) specified by $delim
55 # (where the string $delim contains one or more of '(){}[]<>').
56
57 ($extracted, $remainder) = extract_codeblock($text,$delim);
58
59
60 # Extract the initial substrings of $text that would be extracted by
61 # one or more sequential applications of the specified functions
62 # or regular expressions
63
64 @extracted = extract_multiple($text,
65 [ \&extract_bracketed,
66 \&extract_quotelike,
67 \&some_other_extractor_sub,
68 qr/[xyz]*/,
69 'literal',
70 ]);
71
72# Create a string representing an optimized pattern (a la Friedl)
73# that matches a substring delimited by any of the specified characters
74# (in this case: any type of quote or a slash)
75
76 $patstring = gen_delimited_pat(q{'"`/});
77
78
79# Generate a reference to an anonymous sub that is just like extract_tagged
80# but pre-compiled and optimized for a specific pair of tags, and consequently
81# much faster (i.e. 3 times faster). It uses qr// for better performance on
82# repeated calls, so it only works under Perl 5.005 or later.
83
84 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
85
86 ($extracted, $remainder) = $extract_head->($text);
87
88
89=head1 DESCRIPTION
90
91The various C<extract_...> subroutines may be used to extract a
92delimited string (possibly after skipping a specified prefix string).
93The search for the string always begins at the current C<pos>
94location of the string's variable (or at index zero, if no C<pos>
95position is defined).
96
97=head2 General behaviour in list contexts
98
99In a list context, all the subroutines return a list, the first three
100elements of which are always:
101
102=over 4
103
104=item [0]
105
106The extracted string, including the specified delimiters.
107If the extraction fails an empty string is returned.
108
109=item [1]
110
111The remainder of the input string (i.e. the characters after the
112extracted string). On failure, the entire string is returned.
113
114=item [2]
115
116The skipped prefix (i.e. the characters before the extracted string).
117On failure, the empty string is returned.
118
119=back
120
121Note that in a list context, the contents of the original input text (the first
122argument) are not modified in any way.
123
124However, if the input text was passed in a variable, that variable's
125C<pos> value is updated to point at the first character after the
126extracted text. That means that in a list context the various
127subroutines can be used much like regular expressions. For example:
128
129 while ( $next = (extract_quotelike($text))[0] )
130 {
131 # process next quote-like (in $next)
132 }
133
134
135=head2 General behaviour in scalar and void contexts
136
137In a scalar context, the extracted string is returned, having first been
138removed from the input text. Thus, the following code also processes
139each quote-like operation, but actually removes them from $text:
140
141 while ( $next = extract_quotelike($text) )
142 {
143 # process next quote-like (in $next)
144 }
145
146Note that if the input text is a read-only string (i.e. a literal),
147no attempt is made to remove the extracted text.
148
149In a void context the behaviour of the extraction subroutines is
150exactly the same as in a scalar context, except (of course) that the
151extracted substring is not returned.
152
153=head2 A note about prefixes
154
155Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.).
156This can bite you if you're expecting a prefix specification like
157'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
158pattern will only succeed if the <H1> tag is on the current line, since
159C<.> normally doesn't match newlines.
160
161To overcome this limitation, you need to turn on /s matching within
162the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
163
164
165=head2 C<extract_delimited>
166
167The C<extract_delimited> function formalizes the common idiom
168of extracting a single-character-delimited substring from the start of
169a string. For example, to extract a single-quote delimited string, the
170following code is typically used:
171
172 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
173 $extracted = $1;
174
175but with C<extract_delimited> it can be simplified to:
176
177 ($extracted,$remainder) = extract_delimited($text, "'");
178
179C<extract_delimited> takes up to four scalars (the input text, the
180delimiters, a prefix pattern to be skipped, and any escape characters)
181and extracts the initial substring of the text that
182is appropriately delimited. If the delimiter string has multiple
183characters, the first one encountered in the text is taken to delimit
184the substring.
185The third argument specifies a prefix pattern that is to be skipped
186(but must be present!) before the substring is extracted.
187The final argument specifies the escape character to be used for each
188delimiter.
189
190All arguments are optional. If the escape characters are not specified,
191every delimiter is escaped with a backslash (C<\>).
192If the prefix is not specified, the
193pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
194is also not specified, the set C</["'`]/> is used. If the text to be processed
195is not specified either, C<$_> is used.
196
197In list context, C<extract_delimited> returns an array of three
198elements, the extracted substring (I<including the surrounding
199delimiters>), the remainder of the text, and the skipped prefix (if
200any). If a suitable delimited substring is not found, the first
201element of the array is the empty string, the second is the complete
202original text, and the prefix returned in the third element is an
203empty string.
204
205In a scalar context, just the extracted substring is returned. In
206a void context, the extracted substring (and any prefix) are simply
207removed from the beginning of the first argument.
208
209Examples:
210
211 # Remove a single-quoted substring from the very beginning of $text:
212
213 $substring = extract_delimited($text, "'", '');
214
215 # Remove a single-quoted Pascalish substring (i.e. one in which
216 # doubling the quote character escapes it) from the very
217 # beginning of $text:
218
219 $substring = extract_delimited($text, "'", '', "'");
220
221 # Extract a single- or double- quoted substring from the
222 # beginning of $text, optionally after some whitespace
223 # (note the list context to protect $text from modification):
224
225 ($substring) = extract_delimited $text, q{"'};
226
227
228 # Delete the substring delimited by the first '/' in $text:
229
230 $text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
231
232Note that this last example is I<not> the same as deleting the first
233quote-like pattern. For instance, if C<$text> contained the string:
234
235 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
236
237then after the deletion it would contain:
238
239 "if ('.$UNIXCMD/s) { $cmd = $1; }"
240
241not:
242
243 "if ('./cmd' =~ ms) { $cmd = $1; }"
244
245
246See L<"extract_quotelike"> for a (partial) solution to this problem.
247
248
249=head2 C<extract_bracketed>
250
251Like C<"extract_delimited">, the C<extract_bracketed> function takes
252up to three optional scalar arguments: a string to extract from, a delimiter
253specifier, and a prefix pattern. As before, a missing prefix defaults to
254optional whitespace and a missing text defaults to C<$_>. However, a missing
255delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
256
257C<extract_bracketed> extracts a balanced-bracket-delimited
258substring (using any one (or more) of the user-specified delimiter
259brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
260respect quoted unbalanced brackets (see below).
261
262A "delimiter bracket" is a bracket in the list of delimiters passed as
263C<extract_bracketed>'s second argument. Delimiter brackets are
264specified by giving either the left or right (or both!) versions
265of the required bracket(s). Note that the order in which
266two or more delimiter brackets are specified is not significant.
267
268A "balanced-bracket-delimited substring" is a substring bounded by
269matched brackets, such that any other (left or right) delimiter
270bracket I<within> the substring is also matched by an opposite
271(right or left) delimiter bracket I<at the same level of nesting>. Any
272type of bracket not in the delimiter list is treated as an ordinary
273character.
274
275In other words, each type of bracket specified as a delimiter must be
276balanced and correctly nested within the substring, and any other kind of
277("non-delimiter") bracket in the substring is ignored.
278
279For example, given the string:
280
281 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
282
283then a call to C<extract_bracketed> in a list context:
284
285 @result = extract_bracketed( $text, '{}' );
286
287would return:
288
289 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
290
291since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
292(In a scalar context just the first element of the array would be returned. In
293a void context, C<$text> would be replaced by an empty string.)
294
295Likewise the call in:
296
297 @result = extract_bracketed( $text, '{[' );
298
299would return the same result, since all sets of both types of specified
300delimiter brackets are correctly nested and balanced.
301
302However, the call in:
303
304 @result = extract_bracketed( $text, '{([<' );
305
306would fail, returning:
307
308 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
309
310because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
311the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
312return an empty string. In a void context, C<$text> would be unchanged.)
313
314Note that the embedded single-quotes in the string don't help in this
315case, since they have not been specified as acceptable delimiters and are
316therefore treated as non-delimiter characters (and ignored).
317
318However, if a particular species of quote character is included in the
319delimiter specification, then that type of quote will be correctly handled.
320For example, if C<$text> is:
321
322 $text = '<A HREF=">>>>">link</A>';
323
324then
325
326 @result = extract_bracketed( $text, '<">' );
327
328returns:
329
330 ( '<A HREF=">>>>">', 'link</A>', "" )
331
332as expected. Without the specification of C<"> as an embedded quoter:
333
334 @result = extract_bracketed( $text, '<>' );
335
336the result would be:
337
338 ( '<A HREF=">', '>>>">link</A>', "" )
339
340In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
341quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
342letter 'q' as a delimiter. Hence:
343
344 @result = extract_bracketed( $text, '<q>' );
345
346would correctly match something like this:
347
348 $text = '<leftop: conj /and/ conj>';
349
350See also: C<"extract_quotelike"> and C<"extract_codeblock">.
351
352
353=head2 C<extract_tagged>
354
355C<extract_tagged> extracts and segments text between (balanced)
356specified tags.
357
358The subroutine takes up to five optional arguments:
359
360=over 4
361
362=item 1.
363
364A string to be processed (C<$_> if the string is omitted or C<undef>)
365
366=item 2.
367
368A string specifying a pattern to be matched as the opening tag.
369If the pattern string is omitted (or C<undef>) then a pattern
370that matches any standard HTML/XML tag is used.
371
372=item 3.
373
374A string specifying a pattern to be matched at the closing tag.
375If the pattern string is omitted (or C<undef>) then the closing
376tag is constructed by inserting a C</> after any leading bracket
377characters in the actual opening tag that was matched (I<not> the pattern
378that matched the tag). For example, if the opening tag pattern
379is specified as C<'{{\w+}}'> and actually matched the opening tag
380C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
381
382=item 4.
383
384A string specifying a pattern to be matched as a prefix (which is to be
385skipped). If omitted, optional whitespace is skipped.
386
387=item 5.
388
389A hash reference containing various parsing options (see below)
390
391=back
392
393The various options that can be specified are:
394
395=over 4
396
397=item C<reject =E<gt> $listref>
398
399The list reference contains one or more strings specifying patterns
400that must I<not> appear within the tagged text.
401
402For example, to extract
403an HTML link (which should not contain nested links) use:
404
405 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
406
407=item C<ignore =E<gt> $listref>
408
409The list reference contains one or more strings specifying patterns
410that are I<not> to be treated as nested tags within the tagged text
411(even if they would match the start tag pattern).
412
413For example, to extract an arbitrary XML tag, but ignore "empty" elements:
414
415 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
416
417(also see L<"gen_delimited_pat"> below).
418
419
420=item C<fail =E<gt> $str>
421
422The C<fail> option indicates the action to be taken if a matching end
423tag is not encountered (i.e. before the end of the string or some
424C<reject> pattern matches). By default, a failure to match a closing
425tag causes C<extract_tagged> to immediately fail.
426
427However, if the string value associated with C<fail> is "MAX", then
428C<extract_tagged> returns the complete text up to the point of failure.
429If the string is "PARA", C<extract_tagged> returns only the first paragraph
430after the tag (up to the first line that is either empty or contains
431only whitespace characters).
432If the string is "", the default behaviour (i.e. failure) is reinstated.
433
434For example, suppose the start tag "/para" introduces a paragraph, which then
435continues until the next "/endpara" tag or until another "/para" tag is
436encountered:
437
438 $text = "/para line 1\n\nline 3\n/para line 4";
439
440 extract_tagged($text, '/para', '/endpara', undef,
441 {reject => ['/para'], fail => 'MAX'} );
442
443 # EXTRACTED: "/para line 1\n\nline 3\n"
444
445Suppose instead, that if no matching "/endpara" tag is found, the "/para"
446tag refers only to the immediately following paragraph:
447
448 $text = "/para line 1\n\nline 3\n/para line 4";
449
450 extract_tagged($text, '/para', '/endpara', undef,
451 {reject => ['/para'], fail => 'PARA'} );
452
453 # EXTRACTED: "/para line 1\n"
454
455Note that the specified C<fail> behaviour applies to nested tags as well.
456
457=back
458
459On success in a list context, an array of 6 elements is returned. The elements are:
460
461=over 4
462
463=item [0]
464
465the extracted tagged substring (including the outermost tags),
466
467=item [1]
468
469the remainder of the input text,
470
471=item [2]
472
473the prefix substring (if any),
474
475=item [3]
476
477the opening tag
478
479=item [4]
480
481the text between the opening and closing tags
482
483=item [5]
484
485the closing tag (or "" if no closing tag was found)
486
487=back
488
489On failure, all of these values (except the remaining text) are C<undef>.
490
491In a scalar context, C<extract_tagged> returns just the complete
492substring that matched a tagged text (including the start and end
493tags). C<undef> is returned on failure. In addition, the original input
494text has the returned substring (and any prefix) removed from it.
495
496In a void context, the input text just has the matched substring (and
497any specified prefix) removed.
498
499
500=head2 C<gen_extract_tagged>
501
502(Note: This subroutine is only available under Perl 5.005 or later.)
503
504C<gen_extract_tagged> generates a new anonymous subroutine which
505extracts text between (balanced) specified tags. In other words,
506it generates a function identical in function to C<extract_tagged>.
507
508The difference between C<extract_tagged> and the anonymous
509subroutines generated by
510C<gen_extract_tagged>, is that those generated subroutines:
511
512=over 4
513
514=item *
515
516do not have to reparse tag specification or parsing options every time
517they are called (whereas C<extract_tagged> has to effectively rebuild
518its tag parser on every call);
519
520=item *
521
522make use of the new qr// construct to pre-compile the regexes they use
523(whereas C<extract_tagged> uses standard string variable interpolation
524to create tag-matching patterns).
525
526=back
527
528The subroutine takes up to four optional arguments (the same set as
529C<extract_tagged> except for the string to be processed). It returns
530a reference to a subroutine which in turn takes a single argument (the text to
531be extracted from).
532
533In other words, the implementation of C<extract_tagged> is exactly
534equivalent to:
535
536 sub extract_tagged
537 {
538 my $text = shift;
539 $extractor = gen_extract_tagged(@_);
540 return $extractor->($text);
541 }
542
543(although C<extract_tagged> is not currently implemented that way, in order
544to preserve pre-5.005 compatibility).
545
546Using C<gen_extract_tagged> to create extraction functions for specific tags
547is a good idea if those functions are going to be called more than once, since
548their performance is typically twice as good as the more general-purpose
549C<extract_tagged>.
550
551
552=head2 C<extract_quotelike>
553
554C<extract_quotelike> attempts to recognize, extract, and segment any
555one of the various Perl quotes and quotelike operators (see
556L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
557delimiters (for the quotelike operators), and trailing modifiers are
558all caught. For example, in:
559
560 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
561
562 extract_quotelike ' "You said, \"Use sed\"." '
563
564 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
565
566 extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
567
568the full Perl quotelike operations are all extracted correctly.
569
570Note too that, when using the /x modifier on a regex, any comment
571containing the current pattern delimiter will cause the regex to be
572immediately terminated. In other words:
573
574 'm /
575 (?i) # CASE INSENSITIVE
576 [a-z_] # LEADING ALPHABETIC/UNDERSCORE
577 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
578 /x'
579
580will be extracted as if it were:
581
582 'm /
583 (?i) # CASE INSENSITIVE
584 [a-z_] # LEADING ALPHABETIC/'
585
586This behaviour is identical to that of the Perl 5.004 interpreter.
587
588C<extract_quotelike> takes two arguments: the text to be processed and
589a prefix to be matched at the very beginning of the text. If no prefix
590is specified, optional whitespace is the default. If no text is given,
591C<$_> is used.
592
593In a list context, an array of 11 elements is returned. The elements are:
594
595=over 4
596
597=item [0]
598
599the extracted quotelike substring (including trailing modifiers),
600
601=item [1]
602
603the remainder of the input text,
604
605=item [2]
606
607the prefix substring (if any),
608
609=item [3]
610
611the name of the quotelike operator (if any),
612
613=item [4]
614
615the left delimiter of the first block of the operation,
616
617=item [5]
618
619the text of the first block of the operation
620(that is, the contents of
621a quote, the regex of a match or substitution or the target list of a
622translation),
623
624=item [6]
625
626the right delimiter of the first block of the operation,
627
628=item [7]
629
630the left delimiter of the second block of the operation
631(that is, if it is a C<s>, C<tr>, or C<y>),
632
633=item [8]
634
635the text of the second block of the operation
636(that is, the replacement of a substitution or the translation list
637of a translation),
638
639=item [9]
640
641the right delimiter of the second block of the operation (if any),
642
643=item [10]
644
645the trailing modifiers on the operation (if any).
646
647=back
648
649For each of the fields marked "(if any)" the default value on success is
650an empty string.
651On failure, all of these values (except the remaining text) are C<undef>.
652
653
654In a scalar context, C<extract_quotelike> returns just the complete substring
655that matched a quotelike operation (or C<undef> on failure). In a scalar or
656void context, the input text has the same substring (and any specified
657prefix) removed.
658
659Examples:
660
661 # Remove the first quotelike literal that appears in text
662
663 $quotelike = extract_quotelike($text,'.*?');
664
665 # Replace one or more leading whitespace-separated quotelike
666 # literals in $_ with "<QLL>"
667
668 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
669
670
671 # Isolate the search pattern in a quotelike operation from $text
672
673 ($op,$pat) = (extract_quotelike $text)[3,5];
674 if ($op =~ /[ms]/)
675 {
676 print "search pattern: $pat\n";
677 }
678 else
679 {
680 print "$op is not a pattern matching operation\n";
681 }
682
683
684=head2 C<extract_codeblock>
685
686C<extract_codeblock> attempts to recognize and extract a balanced
687bracket delimited substring that may contain unbalanced brackets
688inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
689is like a combination of C<"extract_bracketed"> and
690C<"extract_quotelike">.
691
692C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
693a text to process, a set of delimiter brackets to look for, and a prefix to
694match first. It also takes an optional fourth parameter, which allows the
695outermost delimiter brackets to be specified separately (see below).
696
697Omitting the first argument (input text) means process C<$_> instead.
698Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
699Omitting the third argument (prefix argument) implies optional whitespace at the start.
700Omitting the fourth argument (outermost delimiter brackets) indicates that the
701value of the second argument is to be used for the outermost delimiters.
702
703Once the prefix and the outermost opening delimiter bracket have been
704recognized, code blocks are extracted by stepping through the input text and
705trying the following alternatives in sequence:
706
707=over 4
708
709=item 1.
710
711Try and match a closing delimiter bracket. If the bracket was the same
712species as the last opening bracket, return the substring to that
713point. If the bracket was mismatched, return an error.
714
715=item 2.
716
717Try to match a quote or quotelike operator. If found, call
718C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
719the error it returned. Otherwise go back to step 1.
720
721=item 3.
722
723Try to match an opening delimiter bracket. If found, call
724C<extract_codeblock> recursively to eat the embedded block. If the
725recursive call fails, return an error. Otherwise, go back to step 1.
726
727=item 4.
728
729Unconditionally match a bareword or any other single character, and
730then go back to step 1.
731
732=back
733
734
735Examples:
736
737 # Find a while loop in the text
738
739 if ($text =~ s/.*?while\s*\{/{/)
740 {
741 $loop = "while " . extract_codeblock($text);
742 }
743
744 # Remove the first round-bracketed list (which may include
745 # round- or curly-bracketed code blocks or quotelike operators)
746
747 extract_codeblock $text, "(){}", '[^(]*';
748
749
750The ability to specify a different outermost delimiter bracket is useful
751in some circumstances. For example, in the Parse::RecDescent module,
752parser actions which are to be performed only on a successful parse
753are specified using a C<E<lt>defer:...E<gt>> directive. For example:
754
755 sentence: subject verb object
756 <defer: {$::theVerb = $item{verb}} >
757
758Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
759within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
760
761A deferred action like this:
762
763 <defer: {if ($count>10) {$count--}} >
764
765will be incorrectly parsed as:
766
767 <defer: {if ($count>
768
769because the "greater than" operator is interpreted as a closing delimiter.
770
771But, by extracting the directive using
772S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
773the '>' character is only treated as a delimiter at the outermost
774level of the code block, so the directive is parsed correctly.
775
776=head2 C<extract_multiple>
777
778The C<extract_multiple> subroutine takes a string to be processed and a
779list of extractors (subroutines or regular expressions) to apply to that string.
780
781In an array context C<extract_multiple> returns an array of substrings
782of the original string, as extracted by the specified extractors.
783In a scalar context, C<extract_multiple> returns the first
784substring successfully extracted from the original string. In both
785scalar and void contexts the original string has the first successfully
786extracted substring removed from it. In all contexts
787C<extract_multiple> starts at the current C<pos> of the string, and
788sets that C<pos> appropriately after it matches.
789
790Hence, the aim of a call to C<extract_multiple> in a list context
791is to split the processed string into as many non-overlapping fields as
792possible, by repeatedly applying each of the specified extractors
793to the remainder of the string. Thus C<extract_multiple> is
794a generalized form of Perl's C<split> subroutine.
795
796The subroutine takes up to four optional arguments:
797
798=over 4
799
800=item 1.
801
802A string to be processed (C<$_> if the string is omitted or C<undef>)
803
804=item 2.
805
806A reference to a list of subroutine references and/or qr// objects and/or
807literal strings and/or hash references, specifying the extractors
808to be used to split the string. If this argument is omitted (or
809C<undef>) the list:
810
811 [
812 sub { extract_variable($_[0], '') },
813 sub { extract_quotelike($_[0],'') },
814 sub { extract_codeblock($_[0],'{}','') },
815 ]
816
817is used.
818
819
820=item 3.
821
822A number specifying the maximum number of fields to return. If this
823argument is omitted (or C<undef>), split continues as long as possible.
824
825If the third argument is I<N>, then extraction continues until I<N> fields
826have been successfully extracted, or until the string has been completely
827processed.
828
829Note that in scalar and void contexts the value of this argument is
830automatically reset to 1 (under C<-w>, a warning is issued if the argument
831has to be reset).
832
833=item 4.
834
835A value indicating whether unmatched substrings (see below) within the
836text should be skipped or returned as fields. If the value is true,
837such substrings are skipped. Otherwise, they are returned.
838
839=back
840
841The extraction process works by applying each extractor in
842sequence to the text string. If the extractor is a subroutine it
843is called in a list
844context and is expected to return a list of a single element, namely
845the extracted text.
846Note that the value returned by an extractor subroutine need not bear any
847relationship to the corresponding substring of the original text (see
848examples below).
849
850If the extractor is a precompiled regular expression or a string,
851it is matched against the text in a scalar context with a leading
852'\G' and the gc modifiers enabled. The extracted value is either
853$1 if that variable is defined after the match, or else the
854complete match (i.e. $&).
855
856If the extractor is a hash reference, it must contain exactly one element.
857The value of that element is one of the
858above extractor types (subroutine reference, regular expression, or string).
859The key of that element is the name of a class into which the successful
860return value of the extractor will be blessed.
861
862If an extractor returns a defined value, that value is immediately
863treated as the next extracted field and pushed onto the list of fields.
864If the extractor was specified in a hash reference, the field is also
865blessed into the appropriate class.
866
867If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
868assumed to have failed to extract.
869If none of the extractor subroutines succeeds, then one
870character is extracted from the start of the text and the extraction
871subroutines reapplied. Characters which are thus removed are accumulated and
872eventually become the next field (unless the fourth argument is true, in which
873case they are discarded).
874
875For example, the following extracts substrings that are valid Perl variables:
876
877 @fields = extract_multiple($text,
878 [ sub { extract_variable($_[0]) } ],
879 undef, 1);
880
881This example separates a text into fields which are quote delimited,
882curly bracketed, and anything else. The delimited and bracketed
883parts are also blessed to identify them (the "anything else" is unblessed):
884
885 @fields = extract_multiple($text,
886 [
887 { Delim => sub { extract_delimited($_[0],q{'"}) } },
888 { Brack => sub { extract_bracketed($_[0],'{}') } },
889 ]);
890
891This call extracts the next single substring that is a valid Perl quotelike
892operator (and removes it from $text):
893
894 $quotelike = extract_multiple($text,
895 [
896 sub { extract_quotelike($_[0]) },
897 ], undef, 1);
898
899Finally, here is yet another way to do comma-separated value parsing:
900
901 @fields = extract_multiple($csv_text,
902 [
903 sub { extract_delimited($_[0],q{'"}) },
904 qr/([^,]+)(.*)/,
905 ],
906 undef,1);
907
908The list in the second argument means:
909I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
910The undef third argument means:
911I<"...as many times as possible...">,
912and the true value in the fourth argument means
913I<"...discarding anything else that appears (i.e. the commas)">.
914
915If you wanted the commas preserved as separate fields (i.e. like split
916does if your split pattern has capturing parentheses), you would
917just make the last parameter undefined (or remove it).
918
919
920=head2 C<gen_delimited_pat>
921
922The C<gen_delimited_pat> subroutine takes a single (string) argument and
923builds a Friedl-style optimized regex that matches a string delimited
924by any one of the characters in the single argument. For example:
925
926 gen_delimited_pat(q{'"})
927
928returns the regex:
929
930 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
931
932Note that the specified delimiters are automatically quotemeta'd.
933
934A typical use of C<gen_delimited_pat> would be to build special-purpose tags
935for C<extract_tagged>. For example, to properly ignore "empty" XML elements
936(which might contain quoted strings):
937
938 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
939
940 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
941
942
943C<gen_delimited_pat> may also be called with an optional second argument,
944which specifies the "escape" character(s) to be used for each delimiter.
945For example, to match a Pascal-style string (where ' is the delimiter
946and '' is a literal ' within the string):
947
948 gen_delimited_pat(q{'},q{'});
949
950Different escape characters can be specified for different delimiters.
951For example, to specify that '/' is the escape for single quotes
952and '%' is the escape for double quotes:
953
954 gen_delimited_pat(q{'"},q{/%});
955
956If more delimiters than escape chars are specified, the last escape char
957is used for the remaining delimiters.
958If no escape char is specified for a given delimiter, '\' is used.
959
960Note that
961C<gen_delimited_pat> was previously called
962C<delimited_pat>. That name may still be used, but is now deprecated.
963
964
965=head1 DIAGNOSTICS
966
967In a list context, all the functions return C<(undef,$original_text)>
968on failure. In a scalar context, failure is indicated by returning C<undef>
969(in this case the input text is not modified in any way).
970
971In addition, on failure in I<any> context, one of the following explanatory
972diagnostic messages is returned in the standard C<$@> variable (on success the
973C<$@> variable is guaranteed to be C<undef>):
974
975=over 4
976
977=item C<Did not find a suitable bracket: "%s">
978
979The delimiter provided to C<extract_bracketed> was not one of
980C<'()[]E<lt>E<gt>{}'>.
981
982=item C<Did not find prefix: /%s/>
983
984A non-optional prefix was specified but wasn't found at the start of the text.
985
986=item C<Did not find opening bracket after prefix: "%s">
987
988C<extract_bracketed> or C<extract_codeblock> was expecting a
989particular kind of bracket at the start of the text, and didn't find it.
990
991=item C<No quotelike operator found after prefix: "%s">
992
993C<extract_quotelike> didn't find one of the quotelike operators C<q>,
994C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
995it was extracting.
996
997=item C<Unmatched closing bracket: "%c">
998
999C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
1000a closing bracket where none was expected.
1001
1002=item C<Unmatched opening bracket(s): "%s">
1003
1004C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
1005out of characters in the text before closing one or more levels of nested
1006brackets.
1007
1008=item C<Unmatched embedded quote (%s)>
1009
1010C<extract_bracketed> attempted to match an embedded quoted substring, but
1011failed to find a closing quote to match it.
1012
1013=item C<Did not find closing delimiter to match '%s'>
1014
1015C<extract_quotelike> was unable to find a closing delimiter to match the
1016one that opened the quote-like operation.
1017
1018=item C<Mismatched closing bracket: expected "%c" but found "%s">
1019
1020C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
1021a valid bracket delimiter, but it was the wrong species. This usually
1022indicates a nesting error, but may indicate incorrect quoting or escaping.
1023
1024=item C<No block delimiter found after quotelike "%s">
1025
1026C<extract_quotelike> or C<extract_codeblock> found one of the
1027quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
1028without a suitable block after it.
1029
1030=item C<Did not find leading dereferencer>
1031
1032C<extract_variable> was expecting one of '$', '@', or '%' at the start of
1033a variable, but didn't find any of them.
1034
1035=item C<Bad identifier after dereferencer>
1036
1037C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
1038character was not followed by a legal Perl identifier.
1039
1040=item C<Did not find expected opening bracket at %s>
1041
1042C<extract_codeblock> failed to find any of the outermost opening brackets
1043that were specified.
1044
1045=item C<Improperly nested codeblock at %s>
1046
1047A nested code block was found that started with a delimiter that was specified
1048for use only as an outermost bracket.
1049
1050=item C<Missing second block for quotelike "%s">
1051
1052C<extract_codeblock> or C<extract_quotelike> found one of the
1053quotelike operators C<s>, C<tr> or C<y> followed by only one block.
1054
1055=item C<No match found for opening bracket>
1056
1057C<extract_codeblock> failed to find a closing bracket to match the outermost
1058opening bracket.
1059
1060=item C<Did not find opening tag: /%s/>
1061
1062C<extract_tagged> did not find a suitable opening tag (after any specified
1063prefix was removed).
1064
1065=item C<Unable to construct closing tag to match: /%s/>
1066
1067C<extract_tagged> matched the specified opening tag and tried to
1068modify the matched text to produce a matching closing tag (because
1069none was specified). It failed to generate the closing tag, almost
1070certainly because the opening tag did not start with a
1071bracket of some kind.
1072
1073=item C<Found invalid nested tag: %s>
1074
1075C<extract_tagged> found a nested tag that appeared in the "reject" list
1076(and the failure mode was not "MAX" or "PARA").
1077
1078=item C<Found unbalanced nested tag: %s>
1079
1080C<extract_tagged> found a nested opening tag that was not matched by a
1081corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
1082
1083=item C<Did not find closing tag>
1084
1085C<extract_tagged> reached the end of the text without finding a closing tag
1086to match the original opening tag (and the failure mode was not
1087"MAX" or "PARA").
1088
1089
1090
1091
1092=back
1093
1094
1095=head1 AUTHOR
1096
1097Damian Conway (damian@conway.org)
1098
1099
1100=head1 BUGS AND IRRITATIONS
1101
1102There are undoubtedly serious bugs lurking somewhere in this code, if
1103only because parts of it give the impression of understanding a great deal
1104more about Perl than they really do.
1105
1106Bug reports and other feedback are most welcome.
1107
1108
1109=head1 COPYRIGHT
1110
1111 Copyright (c) 1997-2000, Damian Conway. All Rights Reserved.
1112 This module is free software. It may be used, redistributed
1113 and/or modified under the terms of the Perl Artistic License
1114 (see http://www.perl.com/perl/misc/Artistic.html)
1115