Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. |
2 | # FOR FULL DOCUMENTATION SEE Balanced.pod | |
3 | ||
4 | use 5.005; | |
5 | use strict; | |
6 | ||
7 | package Text::Balanced; | |
8 | ||
9 | use Exporter; | |
10 | use SelfLoader; | |
11 | use vars qw { $VERSION @ISA %EXPORT_TAGS }; | |
12 | ||
13 | $VERSION = '1.77'; | |
14 | @ISA = qw ( Exporter ); | |
15 | ||
16 | %EXPORT_TAGS = ( ALL => [ qw( | |
17 | &extract_delimited | |
18 | &extract_bracketed | |
19 | &extract_quotelike | |
20 | &extract_codeblock | |
21 | &extract_variable | |
22 | &extract_tagged | |
23 | &extract_multiple | |
24 | ||
25 | &gen_delimited_pat | |
26 | &gen_extract_tagged | |
27 | ||
28 | &delimited_pat | |
29 | ) ] ); | |
30 | ||
31 | Exporter::export_ok_tags('ALL'); | |
32 | ||
33 | # PROTOTYPES | |
34 | ||
35 | sub _match_bracketed($$$$$$); | |
36 | sub _match_variable($$); | |
37 | sub _match_codeblock($$$$$$$); | |
38 | sub _match_quotelike($$$$); | |
39 | ||
40 | # HANDLE RETURN VALUES IN VARIOUS CONTEXTS | |
41 | ||
42 | sub _fail | |
43 | { | |
44 | my ($wantarray,$textref) = @_; | |
45 | return ("",$$textref,"") if $wantarray; | |
46 | return undef; | |
47 | } | |
48 | ||
49 | sub _succeed | |
50 | { | |
51 | $@ = undef; | |
52 | my ($wantarray,$textref) = splice @_, 0, 2; | |
53 | if ($wantarray) | |
54 | { | |
55 | # print join ("|", @_), "\n"; | |
56 | my @res; | |
57 | pos($$textref) = $_[2]; # RESET \G | |
58 | while (my ($from, $len) = splice @_, 0, 2) | |
59 | { | |
60 | push @res, substr($$textref,$from,$len); | |
61 | } | |
62 | return @res; | |
63 | } | |
64 | else | |
65 | { | |
66 | my $match = substr($$textref,$_[0],$_[1]); | |
67 | eval {substr($$textref,$_[4],$_[1]+$_[5])=""} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE | |
68 | pos($$textref) = $_[4]; # RESET \G | |
69 | return $match; | |
70 | } | |
71 | } | |
72 | ||
73 | # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING | |
74 | ||
75 | sub gen_delimited_pat($;$) # ($delimiters;$escapes) | |
76 | { | |
77 | my ($dels, $escs) = @_; | |
78 | return "" unless $dels =~ /\S/; | |
79 | $escs = '\\' unless $escs; | |
80 | $escs .= substr($escs,-1) x (length($dels)-length($escs)); | |
81 | my @pat = (); | |
82 | my $i; | |
83 | for ($i=0; $i<length $dels; $i++) | |
84 | { | |
85 | my $del = quotemeta substr($dels,$i,1); | |
86 | my $esc = quotemeta substr($escs,$i,1); | |
87 | if ($del eq $esc) | |
88 | { | |
89 | push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; | |
90 | } | |
91 | else | |
92 | { | |
93 | push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; | |
94 | } | |
95 | } | |
96 | my $pat = join '|', @pat; | |
97 | return "(?:$pat)"; | |
98 | } | |
99 | ||
100 | *delimited_pat = \&gen_delimited_pat; | |
101 | ||
102 | ||
103 | # THE EXTRACTION FUNCTIONS | |
104 | ||
105 | sub extract_delimited (;$$$$) | |
106 | { | |
107 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
108 | my $wantarray = wantarray; | |
109 | my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; | |
110 | my $pre = defined $_[2] ? $_[2] : '\s*'; | |
111 | my $esc = defined $_[3] ? $_[3] : qq{\\}; | |
112 | my $pat = gen_delimited_pat($del, $esc); | |
113 | my $startpos = pos $$textref || 0; | |
114 | return _fail($wantarray, $textref) | |
115 | unless $$textref =~ m/\G($pre)($pat)/gc; | |
116 | my $prelen = length($1); | |
117 | my $matchpos = $startpos+$prelen; | |
118 | my $endpos = pos $$textref; | |
119 | return _succeed $wantarray, $textref, | |
120 | $matchpos, $endpos-$matchpos, # MATCH | |
121 | $endpos, length($$textref)-$endpos, # REMAINDER | |
122 | $startpos, $prelen; # PREFIX | |
123 | } | |
124 | ||
125 | sub extract_bracketed (;$$$) | |
126 | { | |
127 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
128 | my $ldel = defined $_[1] ? $_[1] : '{([<'; | |
129 | my $pre = defined $_[2] ? $_[2] : '\s*'; | |
130 | my $wantarray = wantarray; | |
131 | my $qdel = ""; | |
132 | my $quotelike; | |
133 | $ldel =~ s/'//g and $qdel .= q{'}; | |
134 | $ldel =~ s/"//g and $qdel .= q{"}; | |
135 | $ldel =~ s/`//g and $qdel .= q{`}; | |
136 | $ldel =~ s/q//g and $quotelike = 1; | |
137 | $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; | |
138 | my $rdel = $ldel; | |
139 | unless ($rdel =~ tr/[({</])}>/) | |
140 | { | |
141 | $@ = "Did not find a suitable bracket in delimiter: \"$_[1]\""; | |
142 | return _fail $wantarray, $textref; | |
143 | } | |
144 | my $posbug = pos; | |
145 | $ldel = join('|', map { quotemeta $_ } split('', $ldel)); | |
146 | $rdel = join('|', map { quotemeta $_ } split('', $rdel)); | |
147 | pos = $posbug; | |
148 | ||
149 | my $startpos = pos $$textref || 0; | |
150 | my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); | |
151 | ||
152 | return _fail ($wantarray, $textref) unless @match; | |
153 | ||
154 | return _succeed ( $wantarray, $textref, | |
155 | $match[2], $match[5]+2, # MATCH | |
156 | @match[8,9], # REMAINDER | |
157 | @match[0,1], # PREFIX | |
158 | ); | |
159 | } | |
160 | ||
161 | sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel | |
162 | { | |
163 | my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; | |
164 | my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); | |
165 | unless ($$textref =~ m/\G$pre/gc) | |
166 | { | |
167 | $@ = "Did not find prefix: /$pre/"; | |
168 | return; | |
169 | } | |
170 | ||
171 | $ldelpos = pos $$textref; | |
172 | ||
173 | unless ($$textref =~ m/\G($ldel)/gc) | |
174 | { | |
175 | $@ = "Did not find opening bracket after prefix: \"$pre\""; | |
176 | pos $$textref = $startpos; | |
177 | return; | |
178 | } | |
179 | ||
180 | my @nesting = ( $1 ); | |
181 | my $textlen = length $$textref; | |
182 | while (pos $$textref < $textlen) | |
183 | { | |
184 | next if $$textref =~ m/\G\\./gcs; | |
185 | ||
186 | if ($$textref =~ m/\G($ldel)/gc) | |
187 | { | |
188 | push @nesting, $1; | |
189 | } | |
190 | elsif ($$textref =~ m/\G($rdel)/gc) | |
191 | { | |
192 | my ($found, $brackettype) = ($1, $1); | |
193 | if ($#nesting < 0) | |
194 | { | |
195 | $@ = "Unmatched closing bracket: \"$found\""; | |
196 | pos $$textref = $startpos; | |
197 | return; | |
198 | } | |
199 | my $expected = pop(@nesting); | |
200 | $expected =~ tr/({[</)}]>/; | |
201 | if ($expected ne $brackettype) | |
202 | { | |
203 | $@ = qq{Mismatched closing bracket: expected "$expected" but found "$found"}; | |
204 | pos $$textref = $startpos; | |
205 | return; | |
206 | } | |
207 | last if $#nesting < 0; | |
208 | } | |
209 | elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) | |
210 | { | |
211 | $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gc and next; | |
212 | $@ = "Unmatched embedded quote ($1)"; | |
213 | pos $$textref = $startpos; | |
214 | return; | |
215 | } | |
216 | elsif ($quotelike && _match_quotelike($textref,"",1,0)) | |
217 | { | |
218 | next; | |
219 | } | |
220 | ||
221 | else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } | |
222 | } | |
223 | if ($#nesting>=0) | |
224 | { | |
225 | $@ = "Unmatched opening bracket(s): " | |
226 | . join("..",@nesting).".."; | |
227 | pos $$textref = $startpos; | |
228 | return; | |
229 | } | |
230 | ||
231 | $endpos = pos $$textref; | |
232 | ||
233 | return ( | |
234 | $startpos, $ldelpos-$startpos, # PREFIX | |
235 | $ldelpos, 1, # OPENING BRACKET | |
236 | $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS | |
237 | $endpos-1, 1, # CLOSING BRACKET | |
238 | $endpos, length($$textref)-$endpos, # REMAINDER | |
239 | ); | |
240 | } | |
241 | ||
242 | sub revbracket($) | |
243 | { | |
244 | my $brack = reverse $_[0]; | |
245 | $brack =~ tr/[({</])}>/; | |
246 | return $brack; | |
247 | } | |
248 | ||
249 | my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; | |
250 | ||
251 | sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) | |
252 | { | |
253 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
254 | my $ldel = $_[1]; | |
255 | my $rdel = $_[2]; | |
256 | my $pre = defined $_[3] ? $_[3] : '\s*'; | |
257 | my %options = defined $_[4] ? %{$_[4]} : (); | |
258 | my $omode = defined $options{fail} ? $options{fail} : ''; | |
259 | my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) | |
260 | : defined($options{reject}) ? $options{reject} | |
261 | : '' | |
262 | ; | |
263 | my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) | |
264 | : defined($options{ignore}) ? $options{ignore} | |
265 | : '' | |
266 | ; | |
267 | ||
268 | if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } | |
269 | $@ = undef; | |
270 | ||
271 | my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); | |
272 | ||
273 | return _fail(wantarray, $textref) unless @match; | |
274 | return _succeed wantarray, $textref, | |
275 | $match[2], $match[3]+$match[5]+$match[7], # MATCH | |
276 | @match[8..9,0..1,2..7]; # REM, PRE, BITS | |
277 | } | |
278 | ||
279 | sub _match_tagged # ($$$$$$$) | |
280 | { | |
281 | my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; | |
282 | my $rdelspec; | |
283 | ||
284 | my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); | |
285 | ||
286 | unless ($$textref =~ m/\G($pre)/gc) | |
287 | { | |
288 | $@ = "Did not find prefix: /$pre/"; | |
289 | goto failed; | |
290 | } | |
291 | ||
292 | $opentagpos = pos($$textref); | |
293 | ||
294 | unless ($$textref =~ m/\G$ldel/gc) | |
295 | { | |
296 | $@ = "Did not find opening tag: /$ldel/"; | |
297 | goto failed; | |
298 | } | |
299 | ||
300 | $textpos = pos($$textref); | |
301 | ||
302 | if (!defined $rdel) | |
303 | { | |
304 | $rdelspec = $&; | |
305 | unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ "$1\/$2". revbracket($1) /oes) | |
306 | { | |
307 | $@ = "Unable to construct closing tag to match: $rdel"; | |
308 | goto failed; | |
309 | } | |
310 | } | |
311 | else | |
312 | { | |
313 | $rdelspec = eval "qq{$rdel}"; | |
314 | } | |
315 | ||
316 | while (pos($$textref) < length($$textref)) | |
317 | { | |
318 | next if $$textref =~ m/\G\\./gc; | |
319 | ||
320 | if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) | |
321 | { | |
322 | $parapos = pos($$textref) - length($1) | |
323 | unless defined $parapos; | |
324 | } | |
325 | elsif ($$textref =~ m/\G($rdelspec)/gc ) | |
326 | { | |
327 | $closetagpos = pos($$textref)-length($1); | |
328 | goto matched; | |
329 | } | |
330 | elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) | |
331 | { | |
332 | next; | |
333 | } | |
334 | elsif ($bad && $$textref =~ m/\G($bad)/gcs) | |
335 | { | |
336 | pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS | |
337 | goto short if ($omode eq 'PARA' || $omode eq 'MAX'); | |
338 | $@ = "Found invalid nested tag: $1"; | |
339 | goto failed; | |
340 | } | |
341 | elsif ($$textref =~ m/\G($ldel)/gc) | |
342 | { | |
343 | my $tag = $1; | |
344 | pos($$textref) -= length($tag); # REWIND TO NESTED TAG | |
345 | unless (_match_tagged(@_)) # MATCH NESTED TAG | |
346 | { | |
347 | goto short if $omode eq 'PARA' || $omode eq 'MAX'; | |
348 | $@ = "Found unbalanced nested tag: $tag"; | |
349 | goto failed; | |
350 | } | |
351 | } | |
352 | else { $$textref =~ m/./gcs } | |
353 | } | |
354 | ||
355 | short: | |
356 | $closetagpos = pos($$textref); | |
357 | goto matched if $omode eq 'MAX'; | |
358 | goto failed unless $omode eq 'PARA'; | |
359 | ||
360 | if (defined $parapos) { pos($$textref) = $parapos } | |
361 | else { $parapos = pos($$textref) } | |
362 | ||
363 | return ( | |
364 | $startpos, $opentagpos-$startpos, # PREFIX | |
365 | $opentagpos, $textpos-$opentagpos, # OPENING TAG | |
366 | $textpos, $parapos-$textpos, # TEXT | |
367 | $parapos, 0, # NO CLOSING TAG | |
368 | $parapos, length($$textref)-$parapos, # REMAINDER | |
369 | ); | |
370 | ||
371 | matched: | |
372 | $endpos = pos($$textref); | |
373 | return ( | |
374 | $startpos, $opentagpos-$startpos, # PREFIX | |
375 | $opentagpos, $textpos-$opentagpos, # OPENING TAG | |
376 | $textpos, $closetagpos-$textpos, # TEXT | |
377 | $closetagpos, $endpos-$closetagpos, # CLOSING TAG | |
378 | $endpos, length($$textref)-$endpos, # REMAINDER | |
379 | ); | |
380 | ||
381 | failed: | |
382 | $@ = "Did not find closing tag" unless $@; | |
383 | pos($$textref) = $startpos; | |
384 | return; | |
385 | } | |
386 | ||
387 | sub extract_variable (;$$) | |
388 | { | |
389 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
390 | return ("","","") unless defined $$textref; | |
391 | my $pre = defined $_[1] ? $_[1] : '\s*'; | |
392 | ||
393 | my @match = _match_variable($textref,$pre); | |
394 | ||
395 | return _fail wantarray, $textref unless @match; | |
396 | ||
397 | return _succeed wantarray, $textref, | |
398 | @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX | |
399 | } | |
400 | ||
401 | sub _match_variable($$) | |
402 | { | |
403 | my ($textref, $pre) = @_; | |
404 | my $startpos = pos($$textref) = pos($$textref)||0; | |
405 | unless ($$textref =~ m/\G($pre)/gc) | |
406 | { | |
407 | $@ = "Did not find prefix: /$pre/"; | |
408 | return; | |
409 | } | |
410 | my $varpos = pos($$textref); | |
411 | unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc) | |
412 | { | |
413 | $@ = "Did not find leading dereferencer"; | |
414 | pos $$textref = $startpos; | |
415 | return; | |
416 | } | |
417 | ||
418 | unless ($$textref =~ m/\G\s*(?:::)?(?:[_a-z]\w*::)*[_a-z]\w*/gci | |
419 | or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)) | |
420 | { | |
421 | $@ = "Bad identifier after dereferencer"; | |
422 | pos $$textref = $startpos; | |
423 | return; | |
424 | } | |
425 | ||
426 | while (1) | |
427 | { | |
428 | next if _match_codeblock($textref, | |
429 | qr/\s*->\s*(?:[a-zA-Z]\w+\s*)?/, | |
430 | qr/[({[]/, qr/[)}\]]/, | |
431 | qr/[({[]/, qr/[)}\]]/, 0); | |
432 | next if _match_codeblock($textref, | |
433 | qr/\s*/, qr/[{[]/, qr/[}\]]/, | |
434 | qr/[{[]/, qr/[}\]]/, 0); | |
435 | next if _match_variable($textref,'\s*->\s*'); | |
436 | next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; | |
437 | last; | |
438 | } | |
439 | ||
440 | my $endpos = pos($$textref); | |
441 | return ($startpos, $varpos-$startpos, | |
442 | $varpos, $endpos-$varpos, | |
443 | $endpos, length($$textref)-$endpos | |
444 | ); | |
445 | } | |
446 | ||
447 | sub extract_codeblock (;$$$$$) | |
448 | { | |
449 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
450 | my $wantarray = wantarray; | |
451 | my $ldel_inner = defined $_[1] ? $_[1] : '{'; | |
452 | my $pre = defined $_[2] ? $_[2] : '\s*'; | |
453 | my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; | |
454 | my $rd = $_[4]; | |
455 | my $rdel_inner = $ldel_inner; | |
456 | my $rdel_outer = $ldel_outer; | |
457 | my $posbug = pos; | |
458 | for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } | |
459 | for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } | |
460 | for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) | |
461 | { | |
462 | $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' | |
463 | } | |
464 | pos = $posbug; | |
465 | ||
466 | my @match = _match_codeblock($textref, $pre, | |
467 | $ldel_outer, $rdel_outer, | |
468 | $ldel_inner, $rdel_inner, | |
469 | $rd); | |
470 | return _fail($wantarray, $textref) unless @match; | |
471 | return _succeed($wantarray, $textref, | |
472 | @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX | |
473 | ); | |
474 | ||
475 | } | |
476 | ||
477 | sub _match_codeblock($$$$$$$) | |
478 | { | |
479 | my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; | |
480 | my $startpos = pos($$textref) = pos($$textref) || 0; | |
481 | unless ($$textref =~ m/\G($pre)/gc) | |
482 | { | |
483 | $@ = qq{Did not match prefix /$pre/ at"} . | |
484 | substr($$textref,pos($$textref),20) . | |
485 | q{..."}; | |
486 | return; | |
487 | } | |
488 | my $codepos = pos($$textref); | |
489 | unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER | |
490 | { | |
491 | $@ = qq{Did not find expected opening bracket at "} . | |
492 | substr($$textref,pos($$textref),20) . | |
493 | q{..."}; | |
494 | pos $$textref = $startpos; | |
495 | return; | |
496 | } | |
497 | my $closing = $1; | |
498 | $closing =~ tr/([<{/)]>}/; | |
499 | my $matched; | |
500 | my $patvalid = 1; | |
501 | while (pos($$textref) < length($$textref)) | |
502 | { | |
503 | $matched = ''; | |
504 | if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) | |
505 | { | |
506 | $patvalid = 0; | |
507 | next; | |
508 | } | |
509 | ||
510 | if ($$textref =~ m/\G\s*#.*/gc) | |
511 | { | |
512 | next; | |
513 | } | |
514 | ||
515 | if ($$textref =~ m/\G\s*($rdel_outer)/gc) | |
516 | { | |
517 | unless ($matched = ($closing && $1 eq $closing) ) | |
518 | { | |
519 | next if $1 eq '>'; # MIGHT BE A "LESS THAN" | |
520 | $@ = q{Mismatched closing bracket at "} . | |
521 | substr($$textref,pos($$textref),20) . | |
522 | qq{...". Expected '$closing'}; | |
523 | } | |
524 | last; | |
525 | } | |
526 | ||
527 | if (_match_variable($textref,'\s*') || | |
528 | _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) | |
529 | { | |
530 | $patvalid = 0; | |
531 | next; | |
532 | } | |
533 | ||
534 | ||
535 | # NEED TO COVER MANY MORE CASES HERE!!! | |
536 | if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=? | |
537 | | =(?!>) | |
538 | | (\*\*|&&|\|\||<<|>>)=? | |
539 | | [!=][~=] | |
540 | | split|grep|map|return | |
541 | )#gcx) | |
542 | { | |
543 | $patvalid = 1; | |
544 | next; | |
545 | } | |
546 | ||
547 | if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) | |
548 | { | |
549 | $patvalid = 1; | |
550 | next; | |
551 | } | |
552 | ||
553 | if ($$textref =~ m/\G\s*$ldel_outer/gc) | |
554 | { | |
555 | $@ = q{Improperly nested codeblock at "} . | |
556 | substr($$textref,pos($$textref),20) . | |
557 | q{..."}; | |
558 | last; | |
559 | } | |
560 | ||
561 | $patvalid = 0; | |
562 | $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; | |
563 | } | |
564 | ||
565 | unless ($matched) | |
566 | { | |
567 | $@ = 'No match found for opening bracket' unless $@; | |
568 | return; | |
569 | } | |
570 | ||
571 | my $endpos = pos($$textref); | |
572 | return ( $startpos, $codepos-$startpos, | |
573 | $codepos, $endpos-$codepos, | |
574 | $endpos, length($$textref)-$endpos, | |
575 | ); | |
576 | } | |
577 | ||
578 | ||
579 | my %mods = ( | |
580 | 'none' => '[cgimsox]*', | |
581 | 'm' => '[cgimsox]*', | |
582 | 's' => '[cegimsox]*', | |
583 | 'tr' => '[cds]*', | |
584 | 'y' => '[cds]*', | |
585 | 'qq' => '', | |
586 | 'qx' => '', | |
587 | 'qw' => '', | |
588 | 'qr' => '[imsx]*', | |
589 | 'q' => '', | |
590 | ); | |
591 | ||
592 | sub extract_quotelike (;$$) | |
593 | { | |
594 | my $textref = $_[0] ? \$_[0] : \$_; | |
595 | my $wantarray = wantarray; | |
596 | my $pre = defined $_[1] ? $_[1] : '\s*'; | |
597 | ||
598 | my @match = _match_quotelike($textref,$pre,1,0); | |
599 | return _fail($wantarray, $textref) unless @match; | |
600 | return _succeed($wantarray, $textref, | |
601 | $match[2], $match[18]-$match[2], # MATCH | |
602 | @match[18,19], # REMAINDER | |
603 | @match[0,1], # PREFIX | |
604 | @match[2..17], # THE BITS | |
605 | ); | |
606 | }; | |
607 | ||
608 | sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) | |
609 | { | |
610 | my ($textref, $pre, $rawmatch, $qmark) = @_; | |
611 | ||
612 | my ($textlen,$startpos, | |
613 | $oppos, | |
614 | $preld1pos,$ld1pos,$str1pos,$rd1pos, | |
615 | $preld2pos,$ld2pos,$str2pos,$rd2pos, | |
616 | $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); | |
617 | ||
618 | unless ($$textref =~ m/\G($pre)/gc) | |
619 | { | |
620 | $@ = qq{Did not find prefix /$pre/ at "} . | |
621 | substr($$textref, pos($$textref), 20) . | |
622 | q{..."}; | |
623 | return; | |
624 | } | |
625 | $oppos = pos($$textref); | |
626 | ||
627 | my $initial = substr($$textref,$oppos,1); | |
628 | ||
629 | if ($initial && $initial =~ m|^[\"\'\`]| | |
630 | || $rawmatch && $initial =~ m|^/| | |
631 | || $qmark && $initial =~ m|^\?|) | |
632 | { | |
633 | unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcx) | |
634 | { | |
635 | $@ = qq{Did not find closing delimiter to match '$initial' at "} . | |
636 | substr($$textref, $oppos, 20) . | |
637 | q{..."}; | |
638 | pos $$textref = $startpos; | |
639 | return; | |
640 | } | |
641 | $modpos= pos($$textref); | |
642 | $rd1pos = $modpos-1; | |
643 | ||
644 | if ($initial eq '/' || $initial eq '?') | |
645 | { | |
646 | $$textref =~ m/\G$mods{none}/gc | |
647 | } | |
648 | ||
649 | my $endpos = pos($$textref); | |
650 | return ( | |
651 | $startpos, $oppos-$startpos, # PREFIX | |
652 | $oppos, 0, # NO OPERATOR | |
653 | $oppos, 1, # LEFT DEL | |
654 | $oppos+1, $rd1pos-$oppos-1, # STR/PAT | |
655 | $rd1pos, 1, # RIGHT DEL | |
656 | $modpos, 0, # NO 2ND LDEL | |
657 | $modpos, 0, # NO 2ND STR | |
658 | $modpos, 0, # NO 2ND RDEL | |
659 | $modpos, $endpos-$modpos, # MODIFIERS | |
660 | $endpos, $textlen-$endpos, # REMAINDER | |
661 | ); | |
662 | } | |
663 | ||
664 | unless ($$textref =~ m!\G(m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)!gc) | |
665 | { | |
666 | $@ = q{No quotelike operator found after prefix at "} . | |
667 | substr($$textref, pos($$textref), 20) . | |
668 | q{..."}; | |
669 | pos $$textref = $startpos; | |
670 | return; | |
671 | } | |
672 | ||
673 | my $op = $1; | |
674 | $preld1pos = pos($$textref); | |
675 | $$textref =~ m/\G\s*/gc; | |
676 | $ld1pos = pos($$textref); | |
677 | $str1pos = $ld1pos+1; | |
678 | ||
679 | unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD | |
680 | { | |
681 | $@ = "No block delimiter found after quotelike $op"; | |
682 | pos $$textref = $startpos; | |
683 | return; | |
684 | } | |
685 | pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN | |
686 | my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); | |
687 | if ($ldel1 =~ /[[(<{]/) | |
688 | { | |
689 | $rdel1 =~ tr/[({</])}>/; | |
690 | _match_bracketed($textref,"",$ldel1,"","",$rdel1) | |
691 | || do { pos $$textref = $startpos; return }; | |
692 | } | |
693 | else | |
694 | { | |
695 | $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gc | |
696 | || do { pos $$textref = $startpos; return }; | |
697 | } | |
698 | $ld2pos = $rd1pos = pos($$textref)-1; | |
699 | ||
700 | my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; | |
701 | if ($second_arg) | |
702 | { | |
703 | my ($ldel2, $rdel2); | |
704 | if ($ldel1 =~ /[[(<{]/) | |
705 | { | |
706 | unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD | |
707 | { | |
708 | $@ = "Missing second block for quotelike $op"; | |
709 | pos $$textref = $startpos; | |
710 | return; | |
711 | } | |
712 | $ldel2 = $rdel2 = "\Q$1"; | |
713 | $rdel2 =~ tr/[({</])}>/; | |
714 | } | |
715 | else | |
716 | { | |
717 | $ldel2 = $rdel2 = $ldel1; | |
718 | } | |
719 | $str2pos = $ld2pos+1; | |
720 | ||
721 | if ($ldel2 =~ /[[(<{]/) | |
722 | { | |
723 | pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD | |
724 | _match_bracketed($textref,"",$ldel2,"","",$rdel2) | |
725 | || do { pos $$textref = $startpos; return }; | |
726 | } | |
727 | else | |
728 | { | |
729 | $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gc | |
730 | || do { pos $$textref = $startpos; return }; | |
731 | } | |
732 | $rd2pos = pos($$textref)-1; | |
733 | } | |
734 | else | |
735 | { | |
736 | $ld2pos = $str2pos = $rd2pos = $rd1pos; | |
737 | } | |
738 | ||
739 | $modpos = pos $$textref; | |
740 | ||
741 | $$textref =~ m/\G($mods{$op})/gc; | |
742 | my $endpos = pos $$textref; | |
743 | ||
744 | return ( | |
745 | $startpos, $startpos-$oppos, # PREFIX | |
746 | $oppos, length($op), # OPERATOR | |
747 | $ld1pos, 1, # LEFT DEL | |
748 | $str1pos, $rd1pos-$str1pos, # STR/PAT | |
749 | $rd1pos, 1, # RIGHT DEL | |
750 | $ld2pos, $second_arg, # 2ND LDEL (MAYBE) | |
751 | $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) | |
752 | $rd2pos, $second_arg, # 2ND RDEL (MAYBE) | |
753 | $modpos, $endpos-$modpos, # MODIFIERS | |
754 | $endpos, $textlen-$endpos, # REMAINDER | |
755 | ); | |
756 | } | |
757 | ||
758 | my $def_func = | |
759 | [ | |
760 | sub { extract_variable($_[0], '') }, | |
761 | sub { extract_quotelike($_[0],'') }, | |
762 | sub { extract_codeblock($_[0],'{}','') }, | |
763 | ]; | |
764 | ||
765 | sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) | |
766 | { | |
767 | my $textref = defined($_[0]) ? \$_[0] : \$_; | |
768 | my $posbug = pos; | |
769 | my ($lastpos, $firstpos); | |
770 | my @fields = (); | |
771 | ||
772 | for ($$textref) | |
773 | { | |
774 | my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; | |
775 | my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; | |
776 | my $igunk = $_[3]; | |
777 | ||
778 | pos ||= 0; | |
779 | ||
780 | unless (wantarray) | |
781 | { | |
782 | use Carp; | |
783 | carp "extract_multiple reset maximal count to 1 in scalar context" | |
784 | if $^W && defined($_[2]) && $max > 1; | |
785 | $max = 1 | |
786 | } | |
787 | ||
788 | my $unkpos; | |
789 | my $func; | |
790 | my $class; | |
791 | ||
792 | my @class; | |
793 | foreach $func ( @func ) | |
794 | { | |
795 | if (ref($func) eq 'HASH') | |
796 | { | |
797 | push @class, (keys %$func)[0]; | |
798 | $func = (values %$func)[0]; | |
799 | } | |
800 | else | |
801 | { | |
802 | push @class, undef; | |
803 | } | |
804 | } | |
805 | ||
806 | FIELD: while (pos() < length()) | |
807 | { | |
808 | my $field; | |
809 | foreach my $i ( 0..$#func ) | |
810 | { | |
811 | $func = $func[$i]; | |
812 | $class = $class[$i]; | |
813 | $lastpos = pos; | |
814 | if (ref($func) eq 'CODE') | |
815 | { ($field) = $func->($_) } | |
816 | elsif( m/\G$func/gc ) | |
817 | { $field = defined($1) ? $1 : $& } | |
818 | ||
819 | if (defined($field) && length($field)) | |
820 | { | |
821 | if (defined($unkpos) && !$igunk) | |
822 | { | |
823 | push @fields, substr($_, $unkpos, $lastpos-$unkpos); | |
824 | $firstpos = $unkpos unless defined $firstpos; | |
825 | undef $unkpos; | |
826 | last FIELD if @fields == $max; | |
827 | } | |
828 | push @fields, $class | |
829 | ? bless(\$field, $class) | |
830 | : $field; | |
831 | $firstpos = $lastpos unless defined $firstpos; | |
832 | $lastpos = pos; | |
833 | last FIELD if @fields == $max; | |
834 | next FIELD; | |
835 | } | |
836 | } | |
837 | if (/\G(.)/gcs) | |
838 | { | |
839 | $unkpos = pos()-1 | |
840 | unless $igunk || defined $unkpos; | |
841 | } | |
842 | } | |
843 | ||
844 | if (defined $unkpos) | |
845 | { | |
846 | push @fields, substr($_, $unkpos); | |
847 | $firstpos = $unkpos unless defined $firstpos; | |
848 | $lastpos = length; | |
849 | } | |
850 | last; | |
851 | } | |
852 | ||
853 | pos $$textref = $lastpos; | |
854 | return @fields if wantarray; | |
855 | ||
856 | $firstpos ||= 0; | |
857 | eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; | |
858 | pos $$textref = $firstpos }; | |
859 | return $fields[0]; | |
860 | } | |
861 | ||
862 | ||
863 | sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) | |
864 | { | |
865 | my $ldel = $_[0]; | |
866 | my $rdel = $_[1]; | |
867 | my $pre = defined $_[2] ? $_[2] : '\s*'; | |
868 | my %options = defined $_[3] ? %{$_[3]} : (); | |
869 | my $omode = defined $options{fail} ? $options{fail} : ''; | |
870 | my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) | |
871 | : defined($options{reject}) ? $options{reject} | |
872 | : '' | |
873 | ; | |
874 | my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) | |
875 | : defined($options{ignore}) ? $options{ignore} | |
876 | : '' | |
877 | ; | |
878 | ||
879 | if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } | |
880 | ||
881 | my $posbug = pos; | |
882 | for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } | |
883 | pos = $posbug; | |
884 | ||
885 | my $closure = sub | |
886 | { | |
887 | my $textref = defined $_[0] ? \$_[0] : \$_; | |
888 | my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); | |
889 | ||
890 | return _fail(wantarray, $textref) unless @match; | |
891 | return _succeed wantarray, $textref, | |
892 | $match[2], $match[3]+$match[5]+$match[7], # MATCH | |
893 | @match[8..9,0..1,2..7]; # REM, PRE, BITS | |
894 | }; | |
895 | ||
896 | bless $closure, 'Text::Balanced::Extractor'; | |
897 | } | |
898 | ||
899 | package Text::Balanced::Extractor; | |
900 | ||
901 | sub extract($$) # ($self, $text) | |
902 | { | |
903 | &{$_[0]}($_[1]); | |
904 | } | |
905 | ||
906 | 1; |