# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
# FOR FULL DOCUMENTATION SEE Balanced.pod
use vars qw
{ $VERSION @ISA %EXPORT_TAGS };
%EXPORT_TAGS = ( ALL
=> [ qw(
Exporter
::export_ok_tags
('ALL');
sub _match_bracketed
($$$$$$);
sub _match_codeblock
($$$$$$$);
sub _match_quotelike
($$$$);
# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
my ($wantarray,$textref) = @_;
return ("",$$textref,"") if $wantarray;
my ($wantarray,$textref) = splice @_, 0, 2;
# print join ("|", @_), "\n";
pos($$textref) = $_[2]; # RESET \G
while (my ($from, $len) = splice @_, 0, 2)
push @res, substr($$textref,$from,$len);
my $match = substr($$textref,$_[0],$_[1]);
eval {substr($$textref,$_[4],$_[1]+$_[5])=""} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
pos($$textref) = $_[4]; # RESET \G
# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
sub gen_delimited_pat
($;$) # ($delimiters;$escapes)
return "" unless $dels =~ /\S/;
$escs = '\\' unless $escs;
$escs .= substr($escs,-1) x
(length($dels)-length($escs));
for ($i=0; $i<length $dels; $i++)
my $del = quotemeta substr($dels,$i,1);
my $esc = quotemeta substr($escs,$i,1);
push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
my $pat = join '|', @pat;
*delimited_pat
= \
&gen_delimited_pat
;
# THE EXTRACTION FUNCTIONS
sub extract_delimited
(;$$$$)
my $textref = defined $_[0] ? \
$_[0] : \
$_;
my $wantarray = wantarray;
my $del = defined $_[1] ?
$_[1] : qq{\'\"\
`};
my $pre = defined $_[2] ? $_[2] : '\s*';
my $esc = defined $_[3] ? $_[3] : qq{\\};
my $pat = gen_delimited_pat($del, $esc);
my $startpos = pos $$textref || 0;
return _fail($wantarray, $textref)
unless $$textref =~ m/\G($pre)($pat)/gc;
my $matchpos = $startpos+$prelen;
my $endpos = pos $$textref;
return _succeed $wantarray, $textref,
$matchpos, $endpos-$matchpos, # MATCH
$endpos, length($$textref)-$endpos, # REMAINDER
$startpos, $prelen; # PREFIX
sub extract_bracketed (;$$$)
my $textref = defined $_[0] ? \$_[0] : \$_;
my $ldel = defined $_[1] ? $_[1] : '{([<';
my $pre = defined $_[2] ? $_[2] : '\s*';
my $wantarray = wantarray;
$ldel =~ s/'//g and $qdel .= q{'};
$ldel =~ s/"//g and $qdel .= q{"};
$ldel =~ s/`//g and $qdel .= q{`};
$ldel =~ s/q//g and $quotelike = 1;
$ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
unless ($rdel =~ tr/[({</])}>/)
$@
= "Did not find a suitable bracket in delimiter: \"$_[1]\"";
return _fail
$wantarray, $textref;
$ldel = join('|', map { quotemeta $_ } split('', $ldel));
$rdel = join('|', map { quotemeta $_ } split('', $rdel));
my $startpos = pos $$textref || 0;
my @match = _match_bracketed
($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
return _fail
($wantarray, $textref) unless @match;
return _succeed
( $wantarray, $textref,
$match[2], $match[5]+2, # MATCH
sub _match_bracketed
($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
unless ($$textref =~ m/\G$pre/gc)
$@
= "Did not find prefix: /$pre/";
$ldelpos = pos $$textref;
unless ($$textref =~ m/\G($ldel)/gc)
$@
= "Did not find opening bracket after prefix: \"$pre\"";
pos $$textref = $startpos;
my $textlen = length $$textref;
while (pos $$textref < $textlen)
next if $$textref =~ m/\G\\./gcs;
if ($$textref =~ m/\G($ldel)/gc)
elsif ($$textref =~ m/\G($rdel)/gc)
my ($found, $brackettype) = ($1, $1);
$@
= "Unmatched closing bracket: \"$found\"";
pos $$textref = $startpos;
my $expected = pop(@nesting);
$expected =~ tr/({[</)}]>/;
if ($expected ne $brackettype)
$@
= qq{Mismatched closing bracket
: expected
"$expected" but found
"$found"};
pos $$textref = $startpos;
elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
$$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gc and next;
$@
= "Unmatched embedded quote ($1)";
pos $$textref = $startpos;
elsif ($quotelike && _match_quotelike
($textref,"",1,0))
else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
$@
= "Unmatched opening bracket(s): "
. join("..",@nesting)."..";
pos $$textref = $startpos;
$startpos, $ldelpos-$startpos, # PREFIX
$ldelpos, 1, # OPENING BRACKET
$ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
$endpos-1, 1, # CLOSING BRACKET
$endpos, length($$textref)-$endpos, # REMAINDER
my $brack = reverse $_[0];
my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
sub extract_tagged
(;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
my $textref = defined $_[0] ? \
$_[0] : \
$_;
my $pre = defined $_[3] ?
$_[3] : '\s*';
my %options = defined $_[4] ?
%{$_[4]} : ();
my $omode = defined $options{fail
} ?
$options{fail
} : '';
my $bad = ref($options{reject
}) eq 'ARRAY' ?
join('|', @
{$options{reject
}})
: defined($options{reject
}) ?
$options{reject
}
my $ignore = ref($options{ignore
}) eq 'ARRAY' ?
join('|', @
{$options{ignore
}})
: defined($options{ignore
}) ?
$options{ignore
}
if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat
(q{'"}) . '|[^>])*>'; }
my @match = _match_tagged
($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
return _fail
(wantarray, $textref) unless @match;
return _succeed
wantarray, $textref,
$match[2], $match[3]+$match[5]+$match[7], # MATCH
@match[8..9,0..1,2..7]; # REM, PRE, BITS
sub _match_tagged
# ($$$$$$$)
my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
unless ($$textref =~ m/\G($pre)/gc)
$@
= "Did not find prefix: /$pre/";
$opentagpos = pos($$textref);
unless ($$textref =~ m/\G$ldel/gc)
$@
= "Did not find opening tag: /$ldel/";
$textpos = pos($$textref);
unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ "$1\/$2". revbracket($1) /oes)
$@ = "Unable to construct closing tag to match
: $rdel";
$rdelspec = eval "qq{$rdel}";
while (pos($$textref) < length($$textref))
next if $$textref =~ m/\G\\./gc;
if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
$parapos = pos($$textref) - length($1)
elsif ($$textref =~ m/\G($rdelspec)/gc )
$closetagpos = pos($$textref)-length($1);
elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
elsif ($bad && $$textref =~ m/\G($bad)/gcs)
pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
goto short if ($omode eq 'PARA' || $omode eq 'MAX');
$@ = "Found invalid nested tag
: $1";
elsif ($$textref =~ m/\G($ldel)/gc)
pos($$textref) -= length($tag); # REWIND TO NESTED TAG
unless (_match_tagged(@_)) # MATCH NESTED TAG
goto short if $omode eq 'PARA' || $omode eq 'MAX';
$@ = "Found unbalanced nested tag
: $tag";
else { $$textref =~ m/./gcs }
$closetagpos = pos($$textref);
goto matched if $omode eq 'MAX';
goto failed unless $omode eq 'PARA';
if (defined $parapos) { pos($$textref) = $parapos }
else { $parapos = pos($$textref) }
$startpos, $opentagpos-$startpos, # PREFIX
$opentagpos, $textpos-$opentagpos, # OPENING TAG
$textpos, $parapos-$textpos, # TEXT
$parapos, 0, # NO CLOSING TAG
$parapos, length($$textref)-$parapos, # REMAINDER
$endpos = pos($$textref);
$startpos, $opentagpos-$startpos, # PREFIX
$opentagpos, $textpos-$opentagpos, # OPENING TAG
$textpos, $closetagpos-$textpos, # TEXT
$closetagpos, $endpos-$closetagpos, # CLOSING TAG
$endpos, length($$textref)-$endpos, # REMAINDER
$@ = "Did
not find closing tag
" unless $@;
pos($$textref) = $startpos;
sub extract_variable (;$$)
my $textref = defined $_[0] ? \$_[0] : \$_;
return ("","","") unless defined $$textref;
my $pre = defined $_[1] ? $_[1] : '\s*';
my @match = _match_variable($textref,$pre);
return _fail wantarray, $textref unless @match;
return _succeed wantarray, $textref,
@match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
my ($textref, $pre) = @_;
my $startpos = pos($$textref) = pos($$textref)||0;
unless ($$textref =~ m/\G($pre)/gc)
$@ = "Did
not find prefix
: /$pre/";
my $varpos = pos($$textref);
unless ($$textref =~ m/\G(\$#?|[*\@\%]|\\&)+/gc)
$@ = "Did
not find leading dereferencer
";
pos $$textref = $startpos;
unless ($$textref =~ m/\G\s*(?:::)?(?:[_a-z]\w*::)*[_a-z]\w*/gci
or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0))
$@ = "Bad identifier after dereferencer
";
pos $$textref = $startpos;
next if _match_codeblock($textref,
qr/\s*->\s*(?:[a-zA-Z]\w+\s*)?/,
qr/[({[]/, qr/[)}\]]/, 0);
next if _match_codeblock($textref,
qr/\s*/, qr/[{[]/, qr/[}\]]/,
next if _match_variable($textref,'\s*->\s*');
next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
my $endpos = pos($$textref);
return ($startpos, $varpos-$startpos,
$varpos, $endpos-$varpos,
$endpos, length($$textref)-$endpos
sub extract_codeblock (;$$$$$)
my $textref = defined $_[0] ? \$_[0] : \$_;
my $wantarray = wantarray;
my $ldel_inner = defined $_[1] ? $_[1] : '{';
my $pre = defined $_[2] ? $_[2] : '\s*';
my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
my $rdel_inner = $ldel_inner;
my $rdel_outer = $ldel_outer;
for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
$_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
my @match = _match_codeblock($textref, $pre,
$ldel_outer, $rdel_outer,
$ldel_inner, $rdel_inner,
return _fail($wantarray, $textref) unless @match;
return _succeed($wantarray, $textref,
@match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
sub _match_codeblock($$$$$$$)
my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
my $startpos = pos($$textref) = pos($$textref) || 0;
unless ($$textref =~ m/\G($pre)/gc)
$@ = qq{Did not match prefix /$pre/ at"} .
substr($$textref,pos($$textref),20) .
my $codepos = pos($$textref);
unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
$@
= qq{Did
not find expected opening bracket at
"} .
substr($$textref,pos($$textref),20) .
pos $$textref = $startpos;
$closing =~ tr/([<{/)]>}/;
while (pos($$textref) < length($$textref))
if ($rd && $$textref =~ m
#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
if ($$textref =~ m/\G\s*#.*/gc)
if ($$textref =~ m/\G\s*($rdel_outer)/gc)
unless ($matched = ($closing && $1 eq $closing) )
next if $1 eq '>'; # MIGHT BE A "LESS THAN"
$@
= q{Mismatched closing bracket at "} .
substr($$textref,pos($$textref),20) .
qq{...". Expected '$closing'};
if (_match_variable($textref,'\s*') ||
_match_quotelike($textref,'\s*',$patvalid,$patvalid) )
# NEED TO COVER MANY MORE CASES HERE!!!
if ($$textref =~ m#\G\s*( [-+*x/%^&|.]=?
if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
if ($$textref =~ m/\G\s*$ldel_outer/gc)
$@ = q{Improperly nested codeblock at "} .
substr($$textref,pos($$textref),20) .
$$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
$@
= 'No match found for opening bracket' unless $@
;
my $endpos = pos($$textref);
return ( $startpos, $codepos-$startpos,
$codepos, $endpos-$codepos,
$endpos, length($$textref)-$endpos,
sub extract_quotelike
(;$$)
my $textref = $_[0] ? \
$_[0] : \
$_;
my $wantarray = wantarray;
my $pre = defined $_[1] ?
$_[1] : '\s*';
my @match = _match_quotelike
($textref,$pre,1,0);
return _fail
($wantarray, $textref) unless @match;
return _succeed
($wantarray, $textref,
$match[2], $match[18]-$match[2], # MATCH
@match[18,19], # REMAINDER
@match[2..17], # THE BITS
sub _match_quotelike
($$$$) # ($textref, $prepat, $allow_raw_match)
my ($textref, $pre, $rawmatch, $qmark) = @_;
$preld1pos,$ld1pos,$str1pos,$rd1pos,
$preld2pos,$ld2pos,$str2pos,$rd2pos,
$modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
unless ($$textref =~ m/\G($pre)/gc)
$@
= qq{Did
not find prefix
/$pre/ at
"} .
substr($$textref, pos($$textref), 20) .
my $initial = substr($$textref,$oppos,1);
if ($initial && $initial =~ m
|^[\"\'\
`]|
|| $rawmatch && $initial =~ m|^/|
|| $qmark && $initial =~ m|^\?|)
unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcx)
$@ = qq{Did not find closing delimiter to match '$initial' at "} .
substr($$textref, $oppos, 20) .
pos $$textref = $startpos;
if ($initial eq '/' || $initial eq '?')
$$textref =~ m/\G$mods{none}/gc
my $endpos = pos($$textref);
$startpos, $oppos-$startpos, # PREFIX
$oppos+1, $rd1pos-$oppos-1, # STR/PAT
$modpos, 0, # NO 2ND LDEL
$modpos, 0, # NO 2ND RDEL
$modpos, $endpos-$modpos, # MODIFIERS
$endpos, $textlen-$endpos, # REMAINDER
unless ($$textref =~ m!\G(m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)!gc)
$@
= q{No quotelike operator found after prefix at "} .
substr($$textref, pos($$textref), 20) .
pos $$textref = $startpos;
$preld1pos = pos($$textref);
$ld1pos = pos($$textref);
unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
$@
= "No block delimiter found after quotelike $op";
pos $$textref = $startpos;
pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
_match_bracketed
($textref,"",$ldel1,"","",$rdel1)
|| do { pos $$textref = $startpos; return };
$$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gc
|| do { pos $$textref = $startpos; return };
$ld2pos = $rd1pos = pos($$textref)-1;
my $second_arg = $op =~ /s|tr|y/ ?
1 : 0;
unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
$@
= "Missing second block for quotelike $op";
pos $$textref = $startpos;
$ldel2 = $rdel2 = "\Q$1";
$ldel2 = $rdel2 = $ldel1;
pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
_match_bracketed
($textref,"",$ldel2,"","",$rdel2)
|| do { pos $$textref = $startpos; return };
$$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gc
|| do { pos $$textref = $startpos; return };
$rd2pos = pos($$textref)-1;
$ld2pos = $str2pos = $rd2pos = $rd1pos;
$$textref =~ m/\G($mods{$op})/gc;
my $endpos = pos $$textref;
$startpos, $startpos-$oppos, # PREFIX
$oppos, length($op), # OPERATOR
$str1pos, $rd1pos-$str1pos, # STR/PAT
$ld2pos, $second_arg, # 2ND LDEL (MAYBE)
$str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
$rd2pos, $second_arg, # 2ND RDEL (MAYBE)
$modpos, $endpos-$modpos, # MODIFIERS
$endpos, $textlen-$endpos, # REMAINDER
sub { extract_variable
($_[0], '') },
sub { extract_quotelike
($_[0],'') },
sub { extract_codeblock
($_[0],'{}','') },
sub extract_multiple
(;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
my $textref = defined($_[0]) ? \
$_[0] : \
$_;
my ($lastpos, $firstpos);
my @func = defined $_[1] ? @
{$_[1]} : @
{$def_func};
my $max = defined $_[2] && $_[2]>0 ?
$_[2] : 1_000_000_000
;
carp
"extract_multiple reset maximal count to 1 in scalar context"
if $^W
&& defined($_[2]) && $max > 1;
if (ref($func) eq 'HASH')
push @class, (keys %$func)[0];
$func = (values %$func)[0];
FIELD
: while (pos() < length())
foreach my $i ( 0..$#func )
if (ref($func) eq 'CODE')
{ ($field) = $func->($_) }
{ $field = defined($1) ?
$1 : $& }
if (defined($field) && length($field))
if (defined($unkpos) && !$igunk)
push @fields, substr($_, $unkpos, $lastpos-$unkpos);
$firstpos = $unkpos unless defined $firstpos;
last FIELD
if @fields == $max;
$firstpos = $lastpos unless defined $firstpos;
last FIELD
if @fields == $max;
unless $igunk || defined $unkpos;
push @fields, substr($_, $unkpos);
$firstpos = $unkpos unless defined $firstpos;
pos $$textref = $lastpos;
return @fields if wantarray;
eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
pos $$textref = $firstpos };
sub gen_extract_tagged
# ($opentag, $closetag, $pre, \%options)
my $pre = defined $_[2] ?
$_[2] : '\s*';
my %options = defined $_[3] ?
%{$_[3]} : ();
my $omode = defined $options{fail
} ?
$options{fail
} : '';
my $bad = ref($options{reject
}) eq 'ARRAY' ?
join('|', @
{$options{reject
}})
: defined($options{reject
}) ?
$options{reject
}
my $ignore = ref($options{ignore
}) eq 'ARRAY' ?
join('|', @
{$options{ignore
}})
: defined($options{ignore
}) ?
$options{ignore
}
if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat
(q{'"}) . '|[^>])*>'; }
for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
my $textref = defined $_[0] ? \
$_[0] : \
$_;
my @match = Text
::Balanced
::_match_tagged
($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
return _fail
(wantarray, $textref) unless @match;
return _succeed
wantarray, $textref,
$match[2], $match[3]+$match[5]+$match[7], # MATCH
@match[8..9,0..1,2..7]; # REM, PRE, BITS
bless $closure, 'Text::Balanced::Extractor';
package Text
::Balanced
::Extractor
;
sub extract
($$) # ($self, $text)