git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental - sam-t2/devtools/v9/lib/perl5/5.8.8/Text/Balanced.pm

... / ...

Commit	Line	Data
	1	# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
	2	# FOR FULL DOCUMENTATION SEE Balanced.pod
	3
	4	use 5.005;
	5	use strict;
	6
	7	package Text::Balanced;
	8
	9	use Exporter;
	10	use SelfLoader;
	11	use vars qw { $VERSION @ISA %EXPORT_TAGS };
	12
	# Module version and Exporter inheritance.
	$VERSION = '1.95';
	@ISA     = ('Exporter');

	# Every public routine is listed under the :ALL tag; export_ok_tags()
	# then registers the members of that tag in @EXPORT_OK.
	%EXPORT_TAGS = (
		ALL => [
			map { "&$_" } qw(
				extract_delimited
				extract_bracketed
				extract_quotelike
				extract_codeblock
				extract_variable
				extract_tagged
				extract_multiple
				gen_delimited_pat
				gen_extract_tagged
				delimited_pat
			)
		],
	);

	Exporter::export_ok_tags('ALL');

	# PROTOTYPES
	# Forward declarations of the prototyped private matchers that are
	# defined further down the file.

	sub _match_bracketed($$$$$$);
	sub _match_variable($$);
	sub _match_codeblock($$$$$$$);
	sub _match_quotelike($$$$);
	39
	40	# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
	41
	# Record a failure in $@ as a Text::Balanced::ErrorMsg object that
	# carries the message text and the offset at which it was detected.
	#   $message - description of the failure
	#   $pos     - offset into the text being parsed
	sub _failmsg
	{
		my ($message, $pos) = @_;
		my %failure = (error => $message, pos => $pos);
		$@ = bless \%failure, "Text::Balanced::ErrorMsg";
	}
	46
	# Produce the standard failure return value for an extractor.
	#   $list_context - true if the public extractor was called in list context
	#   $source       - reference to the text being parsed
	#   $why, $offset - optional failure message and offset, recorded in $@
	# Returns ("", <whole text>, "") for list-context callers, undef otherwise.
	sub _fail
	{
		my ($list_context, $source, $why, $offset) = @_;
		_failmsg($why, $offset) if $why;
		return $list_context ? ("", $$source, "") : undef;
	}
	54
	# Build the return value for a successful extraction and clear $@.
	#
	# Arguments: ($wantarray, $textref, then (position, length) pairs).  The
	# first three pairs are MATCH, REMAINDER and PREFIX; further pairs are
	# extra fields.  When more than 18 values follow $textref, the final
	# pair is taken off as the "fillet" (the text between a here-doc
	# introducer and the end of its line).
	#
	# List context:   returns the substring for every pair and sets pos()
	#                 of the text to the start of the remainder.
	# Scalar context: returns the match and tries to remove prefix and match
	#                 from the text (silently skipped if the text is
	#                 read-only, hence the eval).
	sub _succeed
	{
		$@ = undef;
		my ($wantarray, $textref) = splice @_, 0, 2;
		my ($extrapos, $extralen) = @_ > 18 ? splice(@_, -2, 2) : (0, 0);
		my ($matchpos, $matchlen, $remainderpos, $prefixpos, $startlen)
			= @_[0, 1, 2, 4, 5];

		unless ($wantarray)
		{
			my $match = substr($$textref, $matchpos, $matchlen);
			# Drop the fillet from the returned here-doc text.
			substr($match, $extrapos-$matchpos-$startlen, $extralen, "") if $extralen;
			my $extra = "";
			$extra = substr($$textref, $extrapos, $extralen)."\n" if $extralen;
			# CHOP OUT PREFIX & MATCH, IF POSSIBLE
			eval { substr($$textref, $prefixpos, $matchlen+$startlen) = $extra };
			pos($$textref) = $prefixpos;			# RESET \G
			return $match;
		}

		my @res;
		for (my $i = 0; $i < @_; $i += 2)
		{
			push @res, substr($$textref, $_[$i], $_[$i+1]);
		}

		if ($extralen)					# CORRECT FILLET
		{
			my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
			$res[1] = "$extra$res[1]";
			# REARRANGE HERE DOC AND FILLET IF POSSIBLE
			eval {
				substr($$textref, $remainderpos, 0) = $extra;
				substr($$textref, $extrapos, $extralen, "\n");
			};
			pos($$textref) = $remainderpos-$extralen+1;	# RESET \G
		}
		else
		{
			pos($$textref) = $remainderpos;			# RESET \G
		}
		return @res;
	}
	93
	94	# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
	95
	# Build a pattern string matching a simple delimited string.
	#   $dels - string of delimiter characters; each one yields one alternative
	#   $escs - optional string of escape characters, paired with $dels by
	#           position (default: backslash); if shorter than $dels its
	#           last character is repeated
	# Returns the pattern as a string of the form "(?:alt1|alt2|...)", or ""
	# when $dels is undefined or contains no non-whitespace character.
	sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
	{
		my ($dels, $escs) = @_;
		return "" unless defined $dels && $dels =~ /\S/;
		$escs = '\\' unless $escs;
		# Pad the escapes so every delimiter has one.
		$escs .= substr($escs,-1) x (length($dels)-length($escs));
		my @pat = ();
		for my $i (0 .. length($dels)-1)
		{
			my $del = quotemeta substr($dels,$i,1);
			my $esc = quotemeta substr($escs,$i,1);
			if ($del eq $esc)
			{
				# Delimiter escapes itself by doubling.
				push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
			}
			else
			{
				# Any character may follow the escape character.
				push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
			}
		}
		my $pat = join '|', @pat;
		return "(?:$pat)";
	}

	# Alternative name for the same routine.
	*delimited_pat = \&gen_delimited_pat;
	122
	123
	124	# THE EXTRACTION FUNCTIONS
	125
	# Extract a delimited string from the current pos() of the text.
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - delimiter characters (default: ' " `)
	#   $_[2] - prefix pattern to skip first (default: '\s*')
	#   $_[3] - escape characters (default: backslash)
	# Returns via _succeed()/_fail(): in list context (match, remainder,
	# prefix); otherwise the match, or undef on failure.
	sub extract_delimited (;$$$$)
	{
		my $textref = defined $_[0] ? \$_[0] : \$_;
		my $wantarray = wantarray;
		my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
		my $pre  = defined $_[2] ? $_[2] : '\s*';
		my $esc  = defined $_[3] ? $_[3] : qq{\\};
		my $pat = gen_delimited_pat($del, $esc);
		my $startpos = pos $$textref || 0;
		return _fail($wantarray, $textref, "Not a delimited pattern", 0)
			unless $$textref =~ m/\G($pre)($pat)/gc;
		my $prelen = length($1);
		my $matchpos = $startpos+$prelen;
		my $endpos = pos $$textref;
		return _succeed($wantarray, $textref,
				$matchpos, $endpos-$matchpos,		# MATCH
				$endpos,   length($$textref)-$endpos,	# REMAINDER
				$startpos, $prelen);			# PREFIX
	}
	145
	# Extract a balanced bracketed substring from the current pos().
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - delimiter specification (default: '{([<'); may also contain
	#           quote characters ' " ` whose quoted contents are skipped, and
	#           'q' to have Perl quotelike operators skipped as well
	#   $_[2] - prefix pattern to skip first (default: '\s*')
	# Returns via _succeed()/_fail(): in list context (match, remainder,
	# prefix); otherwise the match, or undef on failure.
	sub extract_bracketed (;$$$)
	{
		my $textref = defined $_[0] ? \$_[0] : \$_;
		my $ldel = defined $_[1] ? $_[1] : '{([<';
		my $pre = defined $_[2] ? $_[2] : '\s*';
		my $wantarray = wantarray;
		my $qdel = "";
		my $quotelike;
		# Pull the quote and quotelike markers out of the specification.
		$ldel =~ s/'//g and $qdel .= q{'};
		$ldel =~ s/"//g and $qdel .= q{"};
		$ldel =~ s/`//g and $qdel .= q{`};
		$ldel =~ s/q//g and $quotelike = 1;
		# Map every bracket to its opening form, delete all other
		# characters and squeeze repeats.
		$ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
		my $rdel = $ldel;
		unless ($rdel =~ tr/[({</])}>/)
		{
			return _fail($wantarray, $textref,
				     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
				     0);
		}
		# pos($_) is saved and restored around the split/map below.
		# NOTE(review): presumably a workaround for an old perl bug - confirm.
		my $posbug = pos;
		$ldel = join('|', map { quotemeta $_ } split('', $ldel));
		$rdel = join('|', map { quotemeta $_ } split('', $rdel));
		pos = $posbug;

		my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);

		return _fail($wantarray, $textref) unless @match;

		return _succeed($wantarray, $textref,
				$match[2], $match[5]+2,	# MATCH
				@match[8,9],		# REMAINDER
				@match[0,1],		# PREFIX
			       );
	}
	182
	# Match one balanced bracketed sequence at the current pos().
	#   $textref   - reference to the text
	#   $pre       - prefix pattern to skip
	#   $ldel      - alternation of opening brackets
	#   $qdel      - quote characters whose quoted contents are skipped
	#   $quotelike - true to skip Perl quotelike operators as well
	#   $rdel      - alternation of closing brackets
	# Returns five (position, length) pairs: prefix, opening bracket,
	# contents, closing bracket, remainder.  On failure returns the empty
	# list with the reason stored in $@.
	sub _match_bracketed($$$$$$)	# $textref, $pre, $ldel, $qdel, $quotelike, $rdel
	{
		my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
		my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
		unless ($$textref =~ m/\G$pre/gc)
		{
			_failmsg("Did not find prefix: /$pre/", $startpos);
			return;
		}

		$ldelpos = pos $$textref;

		unless ($$textref =~ m/\G($ldel)/gc)
		{
			_failmsg("Did not find opening bracket after prefix: \"$pre\"",
				 pos $$textref);
			pos $$textref = $startpos;
			return;
		}

		# Stack of the brackets that are currently open.
		my @nesting = ( $1 );
		my $textlen = length $$textref;
		while (pos $$textref < $textlen)
		{
			# A backslash-escaped character never counts as a bracket.
			next if $$textref =~ m/\G\\./gcs;

			if ($$textref =~ m/\G($ldel)/gc)
			{
				push @nesting, $1;
			}
			elsif ($$textref =~ m/\G($rdel)/gc)
			{
				my ($found, $brackettype) = ($1, $1);
				if ($#nesting < 0)
				{
					_failmsg("Unmatched closing bracket: \"$found\"",
						 pos $$textref);
					pos $$textref = $startpos;
					return;
				}
				my $expected = pop(@nesting);
				$expected =~ tr/({[</)}]>/;
				if ($expected ne $brackettype)
				{
					_failmsg(qq{Mismatched closing bracket: expected "$expected" but found "$found"},
						 pos $$textref);
					pos $$textref = $startpos;
					return;
				}
				last if $#nesting < 0;
			}
			elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
			{
				# Skip to the matching unescaped quote character.
				$$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
				_failmsg("Unmatched embedded quote ($1)",
					 pos $$textref);
				pos $$textref = $startpos;
				return;
			}
			elsif ($quotelike && _match_quotelike($textref,"",1,0))
			{
				next;
			}

			# Otherwise step over a run of alphanumerics or one character.
			else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
		}
		if ($#nesting>=0)
		{
			_failmsg("Unmatched opening bracket(s): "
					. join("..",@nesting)."..",
				 pos $$textref);
			pos $$textref = $startpos;
			return;
		}

		$endpos = pos $$textref;

		return (
			$startpos,  $ldelpos-$startpos,		# PREFIX
			$ldelpos,   1,				# OPENING BRACKET
			$ldelpos+1, $endpos-$ldelpos-2,		# CONTENTS
			$endpos-1,  1,				# CLOSING BRACKET
			$endpos,    length($$textref)-$endpos,	# REMAINDER
		       );
	}
	268
	# Return the mirror image of a run of opening brackets: the string is
	# reversed and each of [ ( { < is replaced by its closing counterpart.
	sub revbracket($)
	{
		(my $mirror = reverse $_[0]) =~ tr/[({</])}>/;
		return $mirror;
	}
	275
	# Pattern for a tag name, used when a closing tag has to be derived
	# from the opening tag.
	my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};

	# Extract text bounded by an opening and a closing tag.
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - opening tag pattern (default: any <...> tag)
	#   $_[2] - closing tag pattern (default: derived from the opening tag)
	#   $_[3] - prefix pattern to skip first (default: '\s*')
	#   $_[4] - hash ref of options:
	#             reject => pattern or array ref of patterns that must not
	#                       appear inside the tagged text
	#             ignore => pattern or array ref of patterns to skip over
	#             fail   => 'MAX' or 'PARA' to return a partial match
	#                       instead of failing
	# Returns via _succeed()/_fail(): in list context (match, remainder,
	# prefix, opening tag, text, closing tag); otherwise the match, or
	# undef on failure.
	sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
	{
		my $textref = defined $_[0] ? \$_[0] : \$_;
		my $ldel    = $_[1];
		my $rdel    = $_[2];
		my $pre     = defined $_[3] ? $_[3] : '\s*';
		my %options = defined $_[4] ? %{$_[4]} : ();
		my $omode   = defined $options{fail} ? $options{fail} : '';
		my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
			    : defined($options{reject})	     ? $options{reject}
			    :					       ''
			    ;
		my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
			    : defined($options{ignore})	     ? $options{ignore}
			    :					       ''
			    ;

		if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
		$@ = undef;

		my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);

		return _fail(wantarray, $textref) unless @match;
		return _succeed(wantarray, $textref,
				$match[2], $match[3]+$match[5]+$match[7],	# MATCH
				@match[8..9,0..1,2..7]);			# REM, PRE, BITS
	}
	305
	# Match one tagged sequence at the current pos(), recursing for nested
	# occurrences of the opening tag.
	#   $textref - reference to the text
	#   $pre     - prefix pattern to skip
	#   $ldel    - opening tag pattern
	#   $rdel    - closing tag (undef: derive it from the matched opening tag)
	#   $omode   - '' to fail on an unterminated tag, 'MAX' to accept
	#              everything consumed so far, 'PARA' to stop at the first
	#              blank line seen
	#   $bad     - pattern that is not allowed inside the tagged text
	#   $ignore  - pattern that is skipped inside the tagged text
	# Returns five (position, length) pairs: prefix, opening tag, text,
	# closing tag, remainder.  On failure returns the empty list with the
	# reason in $@ and pos() restored.
	sub _match_tagged	# ($$$$$$$)
	{
		my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
		my $rdelspec;

		my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );

		unless ($$textref =~ m/\G($pre)/gc)
		{
			_failmsg("Did not find prefix: /$pre/", pos $$textref);
			goto failed;
		}

		$opentagpos = pos($$textref);

		unless ($$textref =~ m/\G$ldel/gc)
		{
			_failmsg("Did not find opening tag: /$ldel/", pos $$textref);
			goto failed;
		}

		$textpos = pos($$textref);

		if (!defined $rdel)
		{
			# Derive the closing tag from the opening tag just matched:
			# leading brackets + "/" + tag name + mirrored brackets.
			my $opentag = $&;
			$rdelspec = $opentag;
			unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
			{
				# $rdel is undefined in this branch, so report the
				# opening tag that could not be converted.
				_failmsg("Unable to construct closing tag to match: $opentag",
					 pos $$textref);
				goto failed;
			}
		}
		else
		{
			# NOTE(review): the closing tag is interpolated through a
			# string eval, so it is executed as Perl source; callers
			# must not pass untrusted data as the closing tag.
			$rdelspec = eval "qq{$rdel}" || do {
				# qq{} failed; retry with the first delimiter
				# that does not occur in $rdel.
				my $del;
				for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
					{ next if $rdel =~ /\Q$_/; $del = $_; last }
				unless ($del) {
					use Carp;
					croak "Can't interpolate right delimiter $rdel"
				}
				eval "qq$del$rdel$del";
			};
		}

		while (pos($$textref) < length($$textref))
		{
			next if $$textref =~ m/\G\\./gc;

			if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
			{
				# Remember where the first blank line started.
				$parapos = pos($$textref) - length($1)
					unless defined $parapos;
			}
			elsif ($$textref =~ m/\G($rdelspec)/gc )
			{
				$closetagpos = pos($$textref)-length($1);
				goto matched;
			}
			elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
			{
				next;
			}
			elsif ($bad && $$textref =~ m/\G($bad)/gcs)
			{
				pos($$textref) -= length($1);	# CUT OFF WHATEVER CAUSED THE SHORTNESS
				goto short if ($omode eq 'PARA' || $omode eq 'MAX');
				_failmsg("Found invalid nested tag: $1", pos $$textref);
				goto failed;
			}
			elsif ($$textref =~ m/\G($ldel)/gc)
			{
				my $tag = $1;
				pos($$textref) -= length($tag);	# REWIND TO NESTED TAG
				unless (_match_tagged(@_))	# MATCH NESTED TAG
				{
					goto short if $omode eq 'PARA' || $omode eq 'MAX';
					_failmsg("Found unbalanced nested tag: $tag",
						 pos $$textref);
					goto failed;
				}
			}
			else { $$textref =~ m/./gcs }
		}

	short:
		$closetagpos = pos($$textref);
		goto matched if $omode eq 'MAX';
		goto failed unless $omode eq 'PARA';

		if (defined $parapos) { pos($$textref) = $parapos }
		else		      { $parapos = pos($$textref) }

		return (
			$startpos,    $opentagpos-$startpos,		# PREFIX
			$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
			$textpos,     $parapos-$textpos,		# TEXT
			$parapos,     0,				# NO CLOSING TAG
			$parapos,     length($$textref)-$parapos,	# REMAINDER
		       );

	matched:
		$endpos = pos($$textref);
		return (
			$startpos,    $opentagpos-$startpos,		# PREFIX
			$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
			$textpos,     $closetagpos-$textpos,		# TEXT
			$closetagpos, $endpos-$closetagpos,		# CLOSING TAG
			$endpos,      length($$textref)-$endpos,	# REMAINDER
		       );

	failed:
		_failmsg("Did not find closing tag", pos $$textref) unless $@;
		pos($$textref) = $startpos;
		return;
	}
	424
	# Extract a Perl variable expression from the current pos().
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - prefix pattern to skip first (default: '\s*')
	# Returns ("","","") if the text is undefined.  Otherwise returns via
	# _succeed()/_fail(): in list context (match, remainder, prefix);
	# otherwise the match, or undef on failure.
	sub extract_variable (;$$)
	{
		my $textref = defined $_[0] ? \$_[0] : \$_;
		unless (defined $$textref)
		{
			return ("","","");
		}
		my $pre = '\s*';
		$pre = $_[1] if defined $_[1];

		my @match = _match_variable($textref, $pre);
		unless (@match)
		{
			return _fail(wantarray, $textref);
		}

		# MATCH, REMAINDER, PREFIX
		return _succeed(wantarray, $textref, @match[2..3,4..5,0..1]);
	}
	438
	# Match a Perl variable expression at the current pos(): either a
	# punctuation or numeric variable, or one or more dereferencers followed
	# by an identifier or a block, plus any trailing subscripts, method
	# calls and arrow dereferences.
	#   $textref - reference to the text
	#   $pre     - prefix pattern to skip
	# Returns three (position, length) pairs: prefix, variable, remainder.
	# On failure returns the empty list with the reason in $@.
	sub _match_variable($$)
	{
	#  $#
	#  $^
	#  $$
		my ($textref, $pre) = @_;
		my $startpos = pos($$textref) = pos($$textref)||0;
		unless ($$textref =~ m/\G($pre)/gc)
		{
			_failmsg("Did not find prefix: /$pre/", pos $$textref);
			return;
		}
		my $varpos = pos($$textref);
		# First try a special variable: $1, $&, $^W and the like.
		unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
		{
			unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
			{
				_failmsg("Did not find leading dereferencer", pos $$textref);
				pos $$textref = $startpos;
				return;
			}
			my $deref = $1;

			# A (possibly package-qualified) identifier or a block must
			# follow, unless the dereferencer is itself $# or $$.
			unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
				or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
				or $deref eq '$#' or $deref eq '$$' )
			{
				_failmsg("Bad identifier after dereferencer", pos $$textref);
				pos $$textref = $startpos;
				return;
			}
		}

		# Consume trailing subscripts, method calls and dereferences.
		while (1)
		{
			next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
			next if _match_codeblock($textref,
						 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
						 qr/[({[]/, qr/[)}\]]/,
						 qr/[({[]/, qr/[)}\]]/, 0);
			next if _match_codeblock($textref,
						 qr/\s*/, qr/[{[]/, qr/[}\]]/,
						 qr/[{[]/, qr/[}\]]/, 0);
			next if _match_variable($textref,'\s*->\s*');
			next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
			last;
		}

		my $endpos = pos($$textref);
		return ($startpos, $varpos-$startpos,
			$varpos,   $endpos-$varpos,
			$endpos,   length($$textref)-$endpos
		       );
	}
	493
	# Extract a bracket-delimited block of Perl code from the current pos().
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - brackets that may delimit nested blocks (default: '{')
	#   $_[2] - prefix pattern to skip first (default: '\s*')
	#   $_[3] - brackets that may delimit the outermost block
	#           (default: same as $_[1])
	#   $_[4] - flag passed through to _match_codeblock() as $rd
	# Returns via _succeed()/_fail(): in list context (match, remainder,
	# prefix); otherwise the match, or undef on failure.
	sub extract_codeblock (;$$$$$)
	{
		my $textref = defined $_[0] ? \$_[0] : \$_;
		my $wantarray = wantarray;
		my $ldel_inner = defined $_[1] ? $_[1] : '{';
		my $pre        = defined $_[2] ? $_[2] : '\s*';
		my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
		my $rd         = $_[4];
		my $rdel_inner = $ldel_inner;
		my $rdel_outer = $ldel_outer;
		# pos($_) is saved and restored around the loops below, which
		# alias $_.
		# NOTE(review): presumably a workaround for an old perl bug - confirm.
		my $posbug = pos;
		# Reduce each specification to its opening (resp. closing)
		# brackets, deleting every other character.
		for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
		for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
		# Turn each bracket set into a capturing alternation.
		for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
		{
			$_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
		}
		pos = $posbug;

		my @match = _match_codeblock($textref, $pre,
					     $ldel_outer, $rdel_outer,
					     $ldel_inner, $rdel_inner,
					     $rd);
		return _fail($wantarray, $textref) unless @match;
		return _succeed($wantarray, $textref,
				@match[2..3,4..5,0..1]	# MATCH, REMAINDER, PREFIX
			       );

	}
	523
	# Match a bracketed block of Perl code at the current pos(), stepping
	# over comments, variables, quotelike constructs and nested blocks.
	#   $textref                  - reference to the text
	#   $pre                      - prefix pattern to skip
	#   $ldel_outer, $rdel_outer  - patterns for the outermost brackets
	#   $ldel_inner, $rdel_inner  - patterns for nested brackets
	#   $rd                       - when true, the literal sequences "(?)",
	#                               "(s?)" and "(s)" are stepped over whole
	# Returns three (position, length) pairs: prefix, code block, remainder.
	# On failure returns the empty list with the reason in $@.
	sub _match_codeblock($$$$$$$)
	{
		my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
		my $startpos = pos($$textref) = pos($$textref) || 0;
		unless ($$textref =~ m/\G($pre)/gc)
		{
			_failmsg(qq{Did not match prefix /$pre/ at "} .
				     substr($$textref,pos($$textref),20) .
				     q{..."},
				 pos $$textref);
			return;
		}
		my $codepos = pos($$textref);
		unless ($$textref =~ m/\G($ldel_outer)/gc)	# OUTERMOST DELIMITER
		{
			_failmsg(qq{Did not find expected opening bracket at "} .
				     substr($$textref,pos($$textref),20) .
				     q{..."},
				 pos $$textref);
			pos $$textref = $startpos;
			return;
		}
		my $closing = $1;
		   $closing =~ tr/([<{/)]>}/;
		my $matched;
		# True while a following "/" or "?" may start a pattern match
		# rather than being an operator.
		my $patvalid = 1;
		while (pos($$textref) < length($$textref))
		{
			$matched = '';
			if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
			{
				$patvalid = 0;
				next;
			}

			# Skip a comment up to the end of its line.
			if ($$textref =~ m/\G\s*#.*/gc)
			{
				next;
			}

			if ($$textref =~ m/\G\s*($rdel_outer)/gc)
			{
				unless ($matched = ($closing && $1 eq $closing) )
				{
					next if $1 eq '>';	# MIGHT BE A "LESS THAN"
					_failmsg(q{Mismatched closing bracket at "} .
						     substr($$textref,pos($$textref),20) .
						     qq{...". Expected '$closing'},
						 pos $$textref);
				}
				last;
			}

			if (_match_variable($textref,'\s*') ||
			    _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
			{
				$patvalid = 0;
				next;
			}


			# NEED TO COVER MANY MORE CASES HERE!!!
			# After an operator or an opening ( or [ a pattern may follow.
			if ($$textref =~ m#\G\s*(?!$ldel_inner)
						( [-+*x/%^&|.]=?
						| [!=]~
						| =(?!>)
						| (\*\*|&&|\|\||<<|>>)=?
						| split|grep|map|return
						| [([]
						)#gcx)
			{
				$patvalid = 1;
				next;
			}

			if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
			{
				$patvalid = 1;
				next;
			}

			if ($$textref =~ m/\G\s*$ldel_outer/gc)
			{
				_failmsg(q{Improperly nested codeblock at "} .
					     substr($$textref,pos($$textref),20) .
					     q{..."},
					 pos $$textref);
				last;
			}

			# Anything else: step over one word, arrow or character.
			$patvalid = 0;
			$$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
		}
		continue { $@ = undef }

		unless ($matched)
		{
			_failmsg('No match found for opening bracket', pos $$textref)
				unless $@;
			return;
		}

		my $endpos = pos($$textref);
		return ( $startpos, $codepos-$startpos,
			 $codepos, $endpos-$codepos,
			 $endpos,  length($$textref)-$endpos,
		       );
	}
	632
	633
	# Trailing modifier letters accepted after each quotelike operator
	# ('none' covers a bare /.../ or ?...? match).
	my %mods   = (
			'none'	=> '[cgimsox]*',
			'm'	=> '[cgimsox]*',
			's'	=> '[cegimsox]*',
			'tr'	=> '[cds]*',
			'y'	=> '[cds]*',
			'qq'	=> '',
			'qx'	=> '',
			'qw'	=> '',
			'qr'	=> '[imsx]*',
			'q'	=> '',
		     );

	# Extract a Perl quote or quotelike operation from the current pos().
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - prefix pattern to skip first (default: '\s*')
	# Returns via _succeed()/_fail(): in list context (match, remainder,
	# prefix, followed by the individual components found by
	# _match_quotelike()); otherwise the match, or undef on failure.
	sub extract_quotelike (;$$)
	{
		# Test definedness, not truth, so that the texts "0" and "" are
		# parsed themselves instead of silently falling back to $_.
		my $textref = defined $_[0] ? \$_[0] : \$_;
		my $wantarray = wantarray;
		my $pre  = defined $_[1] ? $_[1] : '\s*';

		my @match = _match_quotelike($textref,$pre,1,0);
		return _fail($wantarray, $textref) unless @match;
		return _succeed($wantarray, $textref,
				$match[2], $match[18]-$match[2],	# MATCH
				@match[18,19],				# REMAINDER
				@match[0,1],				# PREFIX
				@match[2..17],				# THE BITS
				@match[20,21],				# ANY FILLET?
			       );
	};
	663
	# Match a Perl quote or quotelike operation at the current pos().
	#   $textref  - reference to the text
	#   $pre      - prefix pattern to skip
	#   $rawmatch - true if a bare /.../ counts as a pattern match
	#   $qmark    - true if a bare ?...? counts as a pattern match
	# Returns ten (position, length) pairs: prefix, operator, left
	# delimiter, string/pattern, right delimiter, second left delimiter,
	# second string, second right delimiter, modifiers, remainder.  For a
	# here-document an eleventh pair follows, covering the rest of the
	# introducer line (the "fillet").  On failure returns the empty list;
	# most failure paths store a reason in $@.
	sub _match_quotelike($$$$)	# ($textref, $prepat, $allow_raw_match, $allow_qmark)
	{
		my ($textref, $pre, $rawmatch, $qmark) = @_;

		my ($textlen,$startpos,
		    $oppos,
		    $preld1pos,$ld1pos,$str1pos,$rd1pos,
		    $preld2pos,$ld2pos,$str2pos,$rd2pos,
		    $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );

		unless ($$textref =~ m/\G($pre)/gc)
		{
			_failmsg(qq{Did not find prefix /$pre/ at "} .
				     substr($$textref, pos($$textref), 20) .
				     q{..."},
				 pos $$textref);
			return;
		}
		$oppos = pos($$textref);

		my $initial = substr($$textref,$oppos,1);

		# Plain quotes, and (when allowed) a bare /.../ or ?...?
		if ($initial && $initial =~ m{^["'`]}
			     || $rawmatch && $initial =~ m{^/}
			     || $qmark && $initial =~ m{^\?})
		{
			unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
			{
				_failmsg(qq{Did not find closing delimiter to match '$initial' at "} .
					     substr($$textref, $oppos, 20) .
					     q{..."},
					 pos $$textref);
				pos $$textref = $startpos;
				return;
			}
			$modpos= pos($$textref);
			$rd1pos = $modpos-1;

			if ($initial eq '/' || $initial eq '?')
			{
				$$textref =~ m/\G$mods{none}/gc
			}

			my $endpos = pos($$textref);
			return (
				$startpos,	$oppos-$startpos,	# PREFIX
				$oppos,		0,			# NO OPERATOR
				$oppos,		1,			# LEFT DEL
				$oppos+1,	$rd1pos-$oppos-1,	# STR/PAT
				$rd1pos,	1,			# RIGHT DEL
				$modpos,	0,			# NO 2ND LDEL
				$modpos,	0,			# NO 2ND STR
				$modpos,	0,			# NO 2ND RDEL
				$modpos,	$endpos-$modpos,	# MODIFIERS
				$endpos,	$textlen-$endpos,	# REMAINDER
			       );
		}

		unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
		{
			_failmsg(q{No quotelike operator found after prefix at "} .
				     substr($$textref, pos($$textref), 20) .
				     q{..."},
				 pos $$textref);
			pos $$textref = $startpos;
			return;
		}

		my $op = $1;
		$preld1pos = pos($$textref);
		if ($op eq '<<') {
			$ld1pos = pos($$textref);
			my $label;
			if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
				$label = $1;
			}
			elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
					     | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
					     | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
					     }gcsx) {
				$label = $+;
			}
			else {
				$label = "";
			}
			my $extrapos = pos($$textref);
			# Skip the rest of the introducer line (the fillet).
			$$textref =~ m{.*\n}gc;
			$str1pos = pos($$textref);
			# The label is quoted so that regex metacharacters in a
			# terminator are matched literally.
			unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
				_failmsg(qq{Missing here doc terminator ('$label') after "} .
					     substr($$textref, $startpos, 20) .
					     q{..."},
					 pos $$textref);
				pos $$textref = $startpos;
				return;
			}
			$rd1pos = pos($$textref);
			$$textref =~ m{\Q$label\E\n}gc;
			$ld2pos = pos($$textref);
			return (
				$startpos,	$oppos-$startpos,	# PREFIX
				$oppos,		length($op),		# OPERATOR
				$ld1pos,	$extrapos-$ld1pos,	# LEFT DEL
				$str1pos,	$rd1pos-$str1pos,	# STR/PAT
				$rd1pos,	$ld2pos-$rd1pos,	# RIGHT DEL
				$ld2pos,	0,			# NO 2ND LDEL
				$ld2pos,	0,			# NO 2ND STR
				$ld2pos,	0,			# NO 2ND RDEL
				$ld2pos,	0,			# NO MODIFIERS
				$ld2pos,	$textlen-$ld2pos,	# REMAINDER
				$extrapos,	$str1pos-$extrapos,	# FILLETED BIT
			       );
		}

		$$textref =~ m/\G\s*/gc;
		$ld1pos = pos($$textref);
		$str1pos = $ld1pos+1;

		unless ($$textref =~ m/\G(\S)/gc)	# SHOULD USE LOOKAHEAD
		{
			_failmsg("No block delimiter found after quotelike $op",
				 pos $$textref);
			pos $$textref = $startpos;
			return;
		}
		pos($$textref) = $ld1pos;	# HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
		my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
		if ($ldel1 =~ /[[(<{]/)
		{
			# Bracketing delimiters nest, so match them as brackets.
			$rdel1 =~ tr/[({</])}>/;
			_match_bracketed($textref,"",$ldel1,"","",$rdel1)
			|| do { pos $$textref = $startpos; return };
		}
		else
		{
			$$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
			|| do { pos $$textref = $startpos; return };
		}
		$ld2pos = $rd1pos = pos($$textref)-1;

		# s///, tr/// and y/// take a second block.
		my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
		if ($second_arg)
		{
			my ($ldel2, $rdel2);
			if ($ldel1 =~ /[[(<{]/)
			{
				unless ($$textref =~ /\G\s*(\S)/gc)	# SHOULD USE LOOKAHEAD
				{
					_failmsg("Missing second block for quotelike $op",
						 pos $$textref);
					pos $$textref = $startpos;
					return;
				}
				$ldel2 = $rdel2 = "\Q$1";
				$rdel2 =~ tr/[({</])}>/;
			}
			else
			{
				$ldel2 = $rdel2 = $ldel1;
			}
			$str2pos = $ld2pos+1;

			if ($ldel2 =~ /[[(<{]/)
			{
				pos($$textref)--;	# OVERCOME BROKEN LOOKAHEAD
				_match_bracketed($textref,"",$ldel2,"","",$rdel2)
				|| do { pos $$textref = $startpos; return };
			}
			else
			{
				$$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
				|| do { pos $$textref = $startpos; return };
			}
			$rd2pos = pos($$textref)-1;
		}
		else
		{
			$ld2pos = $str2pos = $rd2pos = $rd1pos;
		}

		$modpos = pos $$textref;

		$$textref =~ m/\G($mods{$op})/gc;
		my $endpos = pos $$textref;

		return (
			$startpos,	$oppos-$startpos,	# PREFIX
			$oppos,		length($op),		# OPERATOR
			$ld1pos,	1,			# LEFT DEL
			$str1pos,	$rd1pos-$str1pos,	# STR/PAT
			$rd1pos,	1,			# RIGHT DEL
			$ld2pos,	$second_arg,		# 2ND LDEL (MAYBE)
			$str2pos,	$rd2pos-$str2pos,	# 2ND STR (MAYBE)
			$rd2pos,	$second_arg,		# 2ND RDEL (MAYBE)
			$modpos,	$endpos-$modpos,	# MODIFIERS
			$endpos,	$textlen-$endpos,	# REMAINDER
		       );
	}
	862
	# Extractors tried by extract_multiple() when the caller supplies none:
	# a variable, then a quotelike operation, then a {...} code block, each
	# with an empty prefix pattern.
	my $def_func =
	[
		sub { extract_variable($_[0], '') },
		sub { extract_quotelike($_[0],'') },
		sub { extract_codeblock($_[0],'{}','') },
	];

	# Split a text into fields by repeatedly applying a list of extractors.
	#   $_[0] - text to parse (default: $_)
	#   $_[1] - array ref of extractors: code refs,
	#           Text::Balanced::Extractor objects, or patterns; a one-pair
	#           hash ref { Class => extractor } makes the field a scalar
	#           reference blessed into Class (default: $def_func)
	#   $_[2] - maximum number of fields (default: effectively unlimited;
	#           forced to 1 outside list context)
	#   $_[3] - true to drop text that no extractor recognised
	# List context:   returns the fields and advances pos() of the text.
	# Scalar context: returns the first field and tries to remove the
	#                 consumed text from the input.
	sub extract_multiple (;$$$$)	# ($text, $functions_ref, $max_fields, $ignoreunknown)
	{
		my $textref = defined($_[0]) ? \$_[0] : \$_;
		my ($lastpos, $firstpos);
		my @fields = ();

		#for ($$textref)
		{
			my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
			my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
			my $igunk = $_[3];

			pos $$textref ||= 0;

			unless (wantarray)
			{
				use Carp;
				carp "extract_multiple reset maximal count to 1 in scalar context"
					if $^W && defined($_[2]) && $max > 1;
				$max = 1
			}

			# Start of the current run of unrecognised text, if any.
			my $unkpos;
			my $func;
			my $class;

			# Separate each { Class => extractor } pair into its class
			# name and its extractor.
			my @class;
			foreach $func ( @func )
			{
				if (ref($func) eq 'HASH')
				{
					push @class, (keys %$func)[0];
					$func = (values %$func)[0];
				}
				else
				{
					push @class, undef;
				}
			}

			FIELD: while (pos($$textref) < length($$textref))
			{
				my ($field, $rem);
				my @bits;
				foreach my $i ( 0..$#func )
				{
					my $pref;
					$func = $func[$i];
					$class = $class[$i];
					$lastpos = pos $$textref;
					if (ref($func) eq 'CODE')
						{ ($field,$rem,$pref) = @bits = $func->($$textref);
						# print "[$field|$rem]" if $field;
						}
					elsif (ref($func) eq 'Text::Balanced::Extractor')
						{ @bits = $field = $func->extract($$textref) }
					elsif( $$textref =~ m/\G$func/gc )
						{ @bits = $field = defined($1) ? $1 : $& }
					$pref ||= "";
					if (defined($field) && length($field))
					{
						if (!$igunk) {
							$unkpos = pos $$textref
								if length($pref) && !defined($unkpos);
							if (defined $unkpos)
							{
								# Emit the pending unrecognised text first.
								push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
								$firstpos = $unkpos unless defined $firstpos;
								undef $unkpos;
								last FIELD if @fields == $max;
							}
						}
						push @fields, $class
							? bless (\$field, $class)
							: $field;
						$firstpos = $lastpos unless defined $firstpos;
						$lastpos = pos $$textref;
						last FIELD if @fields == $max;
						next FIELD;
					}
				}
				# No extractor matched here: step over one character.
				if ($$textref =~ /\G(.)/gcs)
				{
					$unkpos = pos($$textref)-1
						unless $igunk || defined $unkpos;
				}
			}

			if (defined $unkpos)
			{
				push @fields, substr($$textref, $unkpos);
				$firstpos = $unkpos unless defined $firstpos;
				$lastpos = length $$textref;
			}
			last;
		}

		pos $$textref = $lastpos;
		return @fields if wantarray;

		$firstpos ||= 0;
		# Remove the consumed text; silently skipped for read-only input.
		eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
		       pos $$textref = $firstpos };
		return $fields[0];
	}
	976
	977
	# Build a reusable tagged-text extractor with its patterns precompiled.
	#   $_[0] - opening tag pattern (default: any <...> tag)
	#   $_[1] - closing tag pattern (default: derived from the opening tag)
	#   $_[2] - prefix pattern to skip first (default: '\s*')
	#   $_[3] - hash ref of options (reject, ignore, fail), with the same
	#           meaning as for extract_tagged()
	# Returns a code reference blessed into Text::Balanced::Extractor; it
	# takes the text as its only argument (default: $_) and returns what
	# extract_tagged() would.
	sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
	{
		my $ldel    = $_[0];
		my $rdel    = $_[1];
		my $pre     = defined $_[2] ? $_[2] : '\s*';
		my %options = defined $_[3] ? %{$_[3]} : ();
		my $omode   = defined $options{fail} ? $options{fail} : '';
		my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
			    : defined($options{reject})	     ? $options{reject}
			    :					       ''
			    ;
		my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
			    : defined($options{ignore})	     ? $options{ignore}
			    :					       ''
			    ;

		if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }

		# Precompile the non-empty patterns; pos($_) is saved and
		# restored because the loop aliases $_.
		my $posbug = pos;
		for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
		pos = $posbug;

		my $closure = sub
		{
			my $textref = defined $_[0] ? \$_[0] : \$_;
			my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);

			return _fail(wantarray, $textref) unless @match;
			return _succeed(wantarray, $textref,
					$match[2], $match[3]+$match[5]+$match[7],	# MATCH
					@match[8..9,0..1,2..7]);			# REM, PRE, BITS
		};

		bless $closure, 'Text::Balanced::Extractor';
	}
	1013
	package Text::Balanced::Extractor;

	# Invoke the blessed code reference on the given text.  The text is
	# passed on as $_[1] itself, so the closure sees the caller's variable
	# rather than a copy.
	sub extract($$) # ($self, $text)
	{
		return $_[0]->($_[1]);
	}
	1020
	package Text::Balanced::ErrorMsg;

	# Stringify an error object as "<message>, detected at offset <pos>".
	use overload '""' => sub {
		my ($failure) = @_;
		return $failure->{error} . ", detected at offset " . $failure->{pos};
	};

	1;
	1026
	1027	__END__
	1028
	1029	=head1 NAME
	1030
	1031	Text::Balanced - Extract delimited text sequences from strings.
	1032
	1033
	1034	=head1 SYNOPSIS
	1035
	1036	use Text::Balanced qw (
	1037	extract_delimited
	1038	extract_bracketed
	1039	extract_quotelike
	1040	extract_codeblock
	1041	extract_variable
	1042	extract_tagged
	1043	extract_multiple
	1044
	1045	gen_delimited_pat
	1046	gen_extract_tagged
	1047	);
	1048
	1049	# Extract the initial substring of $text that is delimited by
	1050	# two (unescaped) instances of the first character in $delim.
	1051
	1052	($extracted, $remainder) = extract_delimited($text,$delim);
	1053
	1054
	1055	# Extract the initial substring of $text that is bracketed
	1056	# with a delimiter(s) specified by $delim (where the string
	1057	# in $delim contains one or more of '(){}[]<>').
	1058
	1059	($extracted, $remainder) = extract_bracketed($text,$delim);
	1060
	1061
	1062	# Extract the initial substring of $text that is bounded by
	1063	# an XML tag.
	1064
	1065	($extracted, $remainder) = extract_tagged($text);
	1066
	1067
	1068	# Extract the initial substring of $text that is bounded by
	1069	# a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
	1070
	1071	($extracted, $remainder) =
	1072	extract_tagged($text,"BEGIN","END",undef,{reject=>["BEGIN"]});
	1073
	1074
	1075	# Extract the initial substring of $text that represents a
	1076	# Perl "quote or quote-like operation"
	1077
	1078	($extracted, $remainder) = extract_quotelike($text);
	1079
	1080
	1081	# Extract the initial substring of $text that represents a block
	1082	# of Perl code, bracketed by any of character(s) specified by $delim
	1083	# (where the string $delim contains one or more of '(){}[]<>').
	1084
	1085	($extracted, $remainder) = extract_codeblock($text,$delim);
	1086
	1087
	1088	# Extract the initial substrings of $text that would be extracted by
	1089	# one or more sequential applications of the specified functions
	1090	# or regular expressions
	1091
	1092	@extracted = extract_multiple($text,
	1093	[ \&extract_bracketed,
	1094	\&extract_quotelike,
	1095	\&some_other_extractor_sub,
	1096	qr/[xyz]*/,
	1097	'literal',
	1098	]);
	1099
	1100	# Create a string representing an optimized pattern (a la Friedl)
	1101	# that matches a substring delimited by any of the specified characters
	1102	# (in this case: any type of quote or a slash)
	1103
	1104	$patstring = gen_delimited_pat(q{'"`/});
	1105
	1106
	1107	# Generate a reference to an anonymous sub that is just like extract_tagged
	1108	# but pre-compiled and optimized for a specific pair of tags, and consequently
	1109	# much faster (i.e. 3 times faster). It uses qr// for better performance on
	1110	# repeated calls, so it only works under Perl 5.005 or later.
	1111
	1112	$extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
	1113
	1114	($extracted, $remainder) = $extract_head->($text);
	1115
	1116
	1117	=head1 DESCRIPTION
	1118
	1119	The various C<extract_...> subroutines may be used to
	1120	extract a delimited substring, possibly after skipping a
	1121	specified prefix string. By default, that prefix is
	1122	optional whitespace (C</\s*/>), but you can change it to whatever
	1123	you wish (see below).
	1124
	1125	The substring to be extracted must appear at the
	1126	current C<pos> location of the string's variable
	1127	(or at index zero, if no C<pos> position is defined).
	1128	In other words, the C<extract_...> subroutines I<don't>
	1129	extract the first occurrence of a substring anywhere
	1130	in a string (like an unanchored regex would). Rather,
	1131	they extract an occurrence of the substring appearing
	1132	immediately at the current matching position in the
	1133	string (like a C<\G>-anchored regex would).
	1134
	1135
	1136
	1137	=head2 General behaviour in list contexts
	1138
	1139	In a list context, all the subroutines return a list, the first three
	1140	elements of which are always:
	1141
	1142	=over 4
	1143
	1144	=item [0]
	1145
	1146	The extracted string, including the specified delimiters.
	1147	If the extraction fails an empty string is returned.
	1148
	1149	=item [1]
	1150
	1151	The remainder of the input string (i.e. the characters after the
	1152	extracted string). On failure, the entire string is returned.
	1153
	1154	=item [2]
	1155
	1156	The skipped prefix (i.e. the characters before the extracted string).
	1157	On failure, the empty string is returned.
	1158
	1159	=back
	1160
	1161	Note that in a list context, the contents of the original input text (the first
	1162	argument) are not modified in any way.
	1163
	1164	However, if the input text was passed in a variable, that variable's
	1165	C<pos> value is updated to point at the first character after the
	1166	extracted text. That means that in a list context the various
	1167	subroutines can be used much like regular expressions. For example:
	1168
	1169	while ( $next = (extract_quotelike($text))[0] )
	1170	{
	1171	# process next quote-like (in $next)
	1172	}
	1173
	1174
	1175	=head2 General behaviour in scalar and void contexts
	1176
	1177	In a scalar context, the extracted string is returned, having first been
	1178	removed from the input text. Thus, the following code also processes
	1179	each quote-like operation, but actually removes them from $text:
	1180
	1181	while ( $next = extract_quotelike($text) )
	1182	{
	1183	# process next quote-like (in $next)
	1184	}
	1185
	1186	Note that if the input text is a read-only string (i.e. a literal),
	1187	no attempt is made to remove the extracted text.
	1188
	1189	In a void context the behaviour of the extraction subroutines is
	1190	exactly the same as in a scalar context, except (of course) that the
	1191	extracted substring is not returned.
	1192
	1193	=head2 A note about prefixes
	1194
	1195	Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
	1196	This can bite you if you're expecting a prefix specification like
	1197	'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
	1198	pattern will only succeed if the <H1> tag is on the current line, since
	1199	. normally doesn't match newlines.
	1200
	1201	To overcome this limitation, you need to turn on /s matching within
	1202	the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
	1203
	1204
	1205	=head2 C<extract_delimited>
	1206
	1207	The C<extract_delimited> function formalizes the common idiom
	1208	of extracting a single-character-delimited substring from the start of
	1209	a string. For example, to extract a single-quote delimited string, the
	1210	following code is typically used:
	1211
	1212	($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
	1213	$extracted = $1;
	1214
	1215	but with C<extract_delimited> it can be simplified to:
	1216
	1217	($extracted,$remainder) = extract_delimited($text, "'");
	1218
	1219	C<extract_delimited> takes up to four scalars (the input text, the
	1220	delimiters, a prefix pattern to be skipped, and any escape characters)
	1221	and extracts the initial substring of the text that
	1222	is appropriately delimited. If the delimiter string has multiple
	1223	characters, the first one encountered in the text is taken to delimit
	1224	the substring.
	1225	The third argument specifies a prefix pattern that is to be skipped
	1226	(but must be present!) before the substring is extracted.
	1227	The final argument specifies the escape character to be used for each
	1228	delimiter.
	1229
	1230	All arguments are optional. If the escape characters are not specified,
	1231	every delimiter is escaped with a backslash (C<\>).
	1232	If the prefix is not specified, the
	1233	pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
	1234	is also not specified, the set C</["'`]/> is used. If the text to be processed
	1235	is not specified either, C<$_> is used.
	1236
	1237	In list context, C<extract_delimited> returns an array of three
	1238	elements, the extracted substring (I<including the surrounding
	1239	delimiters>), the remainder of the text, and the skipped prefix (if
	1240	any). If a suitable delimited substring is not found, the first
	1241	element of the array is the empty string, the second is the complete
	1242	original text, and the prefix returned in the third element is an
	1243	empty string.
	1244
	1245	In a scalar context, just the extracted substring is returned. In
	1246	a void context, the extracted substring (and any prefix) are simply
	1247	removed from the beginning of the first argument.
	1248
	1249	Examples:
	1250
	1251	# Remove a single-quoted substring from the very beginning of $text:
	1252
	1253	$substring = extract_delimited($text, "'", '');
	1254
	1255	# Remove a single-quoted Pascalish substring (i.e. one in which
	1256	# doubling the quote character escapes it) from the very
	1257	# beginning of $text:
	1258
	1259	$substring = extract_delimited($text, "'", '', "'");
	1260
	1261	# Extract a single- or double- quoted substring from the
	1262	# beginning of $text, optionally after some whitespace
	1263	# (note the list context to protect $text from modification):
	1264
	1265	($substring) = extract_delimited $text, q{"'};
	1266
	1267
	1268	# Delete the substring delimited by the first '/' in $text:
	1269
	1270	$text = join '', (extract_delimited($text,'/','[^/]*'))[2,1];
	1271
	1272	Note that this last example is I<not> the same as deleting the first
	1273	quote-like pattern. For instance, if C<$text> contained the string:
	1274
	1275	"if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
	1276
	1277	then after the deletion it would contain:
	1278
	1279	"if ('.$UNIXCMD/s) { $cmd = $1; }"
	1280
	1281	not:
	1282
	1283	"if ('./cmd' =~ ms) { $cmd = $1; }"
	1284
	1285
	1286	See L<"extract_quotelike"> for a (partial) solution to this problem.
	1287
	1288
	1289	=head2 C<extract_bracketed>
	1290
	1291	Like C<"extract_delimited">, the C<extract_bracketed> function takes
	1292	up to three optional scalar arguments: a string to extract from, a delimiter
	1293	specifier, and a prefix pattern. As before, a missing prefix defaults to
	1294	optional whitespace and a missing text defaults to C<$_>. However, a missing
	1295	delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
	1296
	1297	C<extract_bracketed> extracts a balanced-bracket-delimited
	1298	substring (using any one (or more) of the user-specified delimiter
	1299	brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
	1300	respect quoted unbalanced brackets (see below).
	1301
	1302	A "delimiter bracket" is a bracket in the list of delimiters passed as
	1303	C<extract_bracketed>'s second argument. Delimiter brackets are
	1304	specified by giving either the left or right (or both!) versions
	1305	of the required bracket(s). Note that the order in which
	1306	two or more delimiter brackets are specified is not significant.
	1307
	1308	A "balanced-bracket-delimited substring" is a substring bounded by
	1309	matched brackets, such that any other (left or right) delimiter
	1310	bracket I<within> the substring is also matched by an opposite
	1311	(right or left) delimiter bracket I<at the same level of nesting>. Any
	1312	type of bracket not in the delimiter list is treated as an ordinary
	1313	character.
	1314
	1315	In other words, each type of bracket specified as a delimiter must be
	1316	balanced and correctly nested within the substring, and any other kind of
	1317	("non-delimiter") bracket in the substring is ignored.
	1318
	1319	For example, given the string:
	1320
	1321	$text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
	1322
	1323	then a call to C<extract_bracketed> in a list context:
	1324
	1325	@result = extract_bracketed( $text, '{}' );
	1326
	1327	would return:
	1328
	1329	( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
	1330
	1331	since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
	1332	(In a scalar context just the first element of the array would be returned. In
	1333	a void context, C<$text> would be replaced by an empty string.)
	1334
	1335	Likewise the call in:
	1336
	1337	@result = extract_bracketed( $text, '{[' );
	1338
	1339	would return the same result, since all sets of both types of specified
	1340	delimiter brackets are correctly nested and balanced.
	1341
	1342	However, the call in:
	1343
	1344	@result = extract_bracketed( $text, '{([<' );
	1345
	1346	would fail, returning:
	1347
	1348	( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
	1349
	1350	because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
	1351	the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
	1352	return an empty string. In a void context, C<$text> would be unchanged.)
	1353
	1354	Note that the embedded single-quotes in the string don't help in this
	1355	case, since they have not been specified as acceptable delimiters and are
	1356	therefore treated as non-delimiter characters (and ignored).
	1357
	1358	However, if a particular species of quote character is included in the
	1359	delimiter specification, then that type of quote will be correctly handled.
	1360	For example, if C<$text> is:
	1361
	1362	$text = '<A HREF=">>>>">link</A>';
	1363
	1364	then
	1365
	1366	@result = extract_bracketed( $text, '<">' );
	1367
	1368	returns:
	1369
	1370	( '<A HREF=">>>>">', 'link</A>', "" )
	1371
	1372	as expected. Without the specification of C<"> as an embedded quoter:
	1373
	1374	@result = extract_bracketed( $text, '<>' );
	1375
	1376	the result would be:
	1377
	1378	( '<A HREF=">', '>>>">link</A>', "" )
	1379
	1380	In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
	1381	quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
	1382	letter 'q' as a delimiter. Hence:
	1383
	1384	@result = extract_bracketed( $text, '<q>' );
	1385
	1386	would correctly match something like this:
	1387
	1388	$text = '<leftop: conj /and/ conj>';
	1389
	1390	See also: C<"extract_quotelike"> and C<"extract_codeblock">.
	1391
	1392
	1393	=head2 C<extract_variable>
	1394
	1395	C<extract_variable> extracts any valid Perl variable or
	1396	variable-involved expression, including scalars, arrays, hashes, array
	1397	accesses, hash look-ups, method calls through objects, subroutine calls
	1398	through subroutine references, etc.
	1399
	1400	The subroutine takes up to two optional arguments:
	1401
	1402	=over 4
	1403
	1404	=item 1.
	1405
	1406	A string to be processed (C<$_> if the string is omitted or C<undef>)
	1407
	1408	=item 2.
	1409
	1410	A string specifying a pattern to be matched as a prefix (which is to be
	1411	skipped). If omitted, optional whitespace is skipped.
	1412
	1413	=back
	1414
	1415	On success in a list context, an array of 3 elements is returned. The
	1416	elements are:
	1417
	1418	=over 4
	1419
	1420	=item [0]
	1421
	1422	the extracted variable, or variablish expression
	1423
	1424	=item [1]
	1425
	1426	the remainder of the input text,
	1427
	1428	=item [2]
	1429
	1430	the prefix substring (if any),
	1431
	1432	=back
	1433
	1434	On failure, all of these values (except the remaining text) are C<undef>.
	1435
	1436	In a scalar context, C<extract_variable> returns just the complete
	1437	substring that matched a variablish expression. C<undef> is returned on
	1438	failure. In addition, the original input text has the returned substring
	1439	(and any prefix) removed from it.
	1440
	1441	In a void context, the input text just has the matched substring (and
	1442	any specified prefix) removed.
	1443
	1444
	1445	=head2 C<extract_tagged>
	1446
	1447	C<extract_tagged> extracts and segments text between (balanced)
	1448	specified tags.
	1449
	1450	The subroutine takes up to five optional arguments:
	1451
	1452	=over 4
	1453
	1454	=item 1.
	1455
	1456	A string to be processed (C<$_> if the string is omitted or C<undef>)
	1457
	1458	=item 2.
	1459
	1460	A string specifying a pattern to be matched as the opening tag.
	1461	If the pattern string is omitted (or C<undef>) then a pattern
	1462	that matches any standard XML tag is used.
	1463
	1464	=item 3.
	1465
	1466	A string specifying a pattern to be matched at the closing tag.
	1467	If the pattern string is omitted (or C<undef>) then the closing
	1468	tag is constructed by inserting a C</> after any leading bracket
	1469	characters in the actual opening tag that was matched (I<not> the pattern
	1470	that matched the tag). For example, if the opening tag pattern
	1471	is specified as C<'{{\w+}}'> and actually matched the opening tag
	1472	C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
	1473
	1474	=item 4.
	1475
	1476	A string specifying a pattern to be matched as a prefix (which is to be
	1477	skipped). If omitted, optional whitespace is skipped.
	1478
	1479	=item 5.
	1480
	1481	A hash reference containing various parsing options (see below)
	1482
	1483	=back
	1484
	1485	The various options that can be specified are:
	1486
	1487	=over 4
	1488
	1489	=item C<reject =E<gt> $listref>
	1490
	1491	The list reference contains one or more strings specifying patterns
	1492	that must I<not> appear within the tagged text.
	1493
	1494	For example, to extract
	1495	an HTML link (which should not contain nested links) use:
	1496
	1497	extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
	1498
	1499	=item C<ignore =E<gt> $listref>
	1500
	1501	The list reference contains one or more strings specifying patterns
	1502	that are I<not> to be treated as nested tags within the tagged text
	1503	(even if they would match the start tag pattern).
	1504
	1505	For example, to extract an arbitrary XML tag, but ignore "empty" elements:
	1506
	1507	extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
	1508
	1509	(also see L<"gen_delimited_pat"> below).
	1510
	1511
	1512	=item C<fail =E<gt> $str>
	1513
	1514	The C<fail> option indicates the action to be taken if a matching end
	1515	tag is not encountered (i.e. before the end of the string or some
	1516	C<reject> pattern matches). By default, a failure to match a closing
	1517	tag causes C<extract_tagged> to immediately fail.
	1518
	1519	However, if the string value associated with C<fail> is "MAX", then
	1520	C<extract_tagged> returns the complete text up to the point of failure.
	1521	If the string is "PARA", C<extract_tagged> returns only the first paragraph
	1522	after the tag (up to the first line that is either empty or contains
	1523	only whitespace characters).
	1524	If the string is "", then the default behaviour (i.e. failure) is reinstated.
	1525
	1526	For example, suppose the start tag "/para" introduces a paragraph, which then
	1527	continues until the next "/endpara" tag or until another "/para" tag is
	1528	encountered:
	1529
	1530	$text = "/para line 1\n\nline 3\n/para line 4";
	1531
	1532	extract_tagged($text, '/para', '/endpara', undef,
	1533	{reject => '/para', fail => 'MAX'} );
	1534
	1535	# EXTRACTED: "/para line 1\n\nline 3\n"
	1536
	1537	Suppose instead, that if no matching "/endpara" tag is found, the "/para"
	1538	tag refers only to the immediately following paragraph:
	1539
	1540	$text = "/para line 1\n\nline 3\n/para line 4";
	1541
	1542	extract_tagged($text, '/para', '/endpara', undef,
	1543	{reject => '/para', fail => 'PARA'} );
	1544
	1545	# EXTRACTED: "/para line 1\n"
	1546
	1547	Note that the specified C<fail> behaviour applies to nested tags as well.
	1548
	1549	=back
	1550
	1551	On success in a list context, an array of 6 elements is returned. The elements are:
	1552
	1553	=over 4
	1554
	1555	=item [0]
	1556
	1557	the extracted tagged substring (including the outermost tags),
	1558
	1559	=item [1]
	1560
	1561	the remainder of the input text,
	1562
	1563	=item [2]
	1564
	1565	the prefix substring (if any),
	1566
	1567	=item [3]
	1568
	1569	the opening tag
	1570
	1571	=item [4]
	1572
	1573	the text between the opening and closing tags
	1574
	1575	=item [5]
	1576
	1577	the closing tag (or "" if no closing tag was found)
	1578
	1579	=back
	1580
	1581	On failure, all of these values (except the remaining text) are C<undef>.
	1582
	1583	In a scalar context, C<extract_tagged> returns just the complete
	1584	substring that matched a tagged text (including the start and end
	1585	tags). C<undef> is returned on failure. In addition, the original input
	1586	text has the returned substring (and any prefix) removed from it.
	1587
	1588	In a void context, the input text just has the matched substring (and
	1589	any specified prefix) removed.
	1590
	1591
	1592	=head2 C<gen_extract_tagged>
	1593
	1594	(Note: This subroutine is only available under Perl 5.005 or later)
	1595
	1596	C<gen_extract_tagged> generates a new anonymous subroutine which
	1597	extracts text between (balanced) specified tags. In other words,
	1598	it generates a function identical in function to C<extract_tagged>.
	1599
	1600	The difference between C<extract_tagged> and the anonymous
	1601	subroutines generated by
	1602	C<gen_extract_tagged>, is that those generated subroutines:
	1603
	1604	=over 4
	1605
	1606	=item *
	1607
	1608	do not have to reparse tag specification or parsing options every time
	1609	they are called (whereas C<extract_tagged> has to effectively rebuild
	1610	its tag parser on every call);
	1611
	1612	=item *
	1613
	1614	make use of the new qr// construct to pre-compile the regexes they use
	1615	(whereas C<extract_tagged> uses standard string variable interpolation
	1616	to create tag-matching patterns).
	1617
	1618	=back
	1619
	1620	The subroutine takes up to four optional arguments (the same set as
	1621	C<extract_tagged> except for the string to be processed). It returns
	1622	a reference to a subroutine which in turn takes a single argument (the text to
	1623	be extracted from).
	1624
	1625	In other words, the implementation of C<extract_tagged> is exactly
	1626	equivalent to:
	1627
	1628	sub extract_tagged
	1629	{
	1630	my $text = shift;
	1631	$extractor = gen_extract_tagged(@_);
	1632	return $extractor->($text);
	1633	}
	1634
	1635	(although C<extract_tagged> is not currently implemented that way, in order
	1636	to preserve pre-5.005 compatibility).
	1637
	1638	Using C<gen_extract_tagged> to create extraction functions for specific tags
	1639	is a good idea if those functions are going to be called more than once, since
	1640	their performance is typically twice as good as the more general-purpose
	1641	C<extract_tagged>.
	1642
	1643
	1644	=head2 C<extract_quotelike>
	1645
	1646	C<extract_quotelike> attempts to recognize, extract, and segment any
	1647	one of the various Perl quotes and quotelike operators (see
	1648	L<perlop(3)>). Nested backslashed delimiters, embedded balanced bracket
	1649	delimiters (for the quotelike operators), and trailing modifiers are
	1650	all caught. For example, in:
	1651
	1652	extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
	1653
	1654	extract_quotelike ' "You said, \"Use sed\"." '
	1655
	1656	extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
	1657
	1658	extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
	1659
	1660	the full Perl quotelike operations are all extracted correctly.
	1661
	1662	Note too that, when using the /x modifier on a regex, any comment
	1663	containing the current pattern delimiter will cause the regex to be
	1664	immediately terminated. In other words:
	1665
	1666	'm /
	1667	(?i) # CASE INSENSITIVE
	1668	[a-z_] # LEADING ALPHABETIC/UNDERSCORE
	1669	[a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
	1670	/x'
	1671
	1672	will be extracted as if it were:
	1673
	1674	'm /
	1675	(?i) # CASE INSENSITIVE
	1676	[a-z_] # LEADING ALPHABETIC/'
	1677
	1678	This behaviour is identical to that of the actual compiler.
	1679
	1680	C<extract_quotelike> takes two arguments: the text to be processed and
	1681	a prefix to be matched at the very beginning of the text. If no prefix
	1682	is specified, optional whitespace is the default. If no text is given,
	1683	C<$_> is used.
	1684
	1685	In a list context, an array of 11 elements is returned. The elements are:
	1686
	1687	=over 4
	1688
	1689	=item [0]
	1690
	1691	the extracted quotelike substring (including trailing modifiers),
	1692
	1693	=item [1]
	1694
	1695	the remainder of the input text,
	1696
	1697	=item [2]
	1698
	1699	the prefix substring (if any),
	1700
	1701	=item [3]
	1702
	1703	the name of the quotelike operator (if any),
	1704
	1705	=item [4]
	1706
	1707	the left delimiter of the first block of the operation,
	1708
	1709	=item [5]
	1710
	1711	the text of the first block of the operation
	1712	(that is, the contents of
	1713	a quote, the regex of a match or substitution or the target list of a
	1714	translation),
	1715
	1716	=item [6]
	1717
	1718	the right delimiter of the first block of the operation,
	1719
	1720	=item [7]
	1721
	1722	the left delimiter of the second block of the operation
	1723	(that is, if it is a C<s>, C<tr>, or C<y>),
	1724
	1725	=item [8]
	1726
	1727	the text of the second block of the operation
	1728	(that is, the replacement of a substitution or the translation list
	1729	of a translation),
	1730
	1731	=item [9]
	1732
	1733	the right delimiter of the second block of the operation (if any),
	1734
	1735	=item [10]
	1736
	1737	the trailing modifiers on the operation (if any).
	1738
	1739	=back
	1740
	1741	For each of the fields marked "(if any)" the default value on success is
	1742	an empty string.
	1743	On failure, all of these values (except the remaining text) are C<undef>.
	1744
	1745
	1746	In a scalar context, C<extract_quotelike> returns just the complete substring
	1747	that matched a quotelike operation (or C<undef> on failure). In a scalar or
	1748	void context, the input text has the same substring (and any specified
	1749	prefix) removed.
	1750
	1751	Examples:
	1752
	1753	# Remove the first quotelike literal that appears in text
	1754
	1755	$quotelike = extract_quotelike($text,'.*?');
	1756
	1757	# Replace one or more leading whitespace-separated quotelike
	1758	# literals in $_ with "<QLL>"
	1759
	1760	do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
	1761
	1762
	1763	# Isolate the search pattern in a quotelike operation from $text
	1764
	1765	($op,$pat) = (extract_quotelike $text)[3,5];
	1766	if ($op =~ /[ms]/)
	1767	{
	1768	print "search pattern: $pat\n";
	1769	}
	1770	else
	1771	{
	1772	print "$op is not a pattern matching operation\n";
	1773	}
	1774
	1775
	1776	=head2 C<extract_quotelike> and "here documents"
	1777
	1778	C<extract_quotelike> can successfully extract "here documents" from an input
	1779	string, but with an important caveat in list contexts.
	1780
	1781	Unlike other types of quote-like literals, a here document is rarely
	1782	a contiguous substring. For example, a typical piece of code using
	1783	a here document might look like this:
	1784
	1785	<<'EOMSG' || die;
	1786	This is the message.
	1787	EOMSG
	1788	exit;
	1789
	1790	Given this as an input string in a scalar context, C<extract_quotelike>
	1791	would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
	1792	leaving the string " || die;\nexit;" in the original variable. In other words,
	1793	the two separate pieces of the here document are successfully extracted and
	1794	concatenated.
	1795
	1796	In a list context, C<extract_quotelike> would return the list
	1797
	1798	=over 4
	1799
	1800	=item [0]
	1801
	1802	"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
	1803	including fore and aft delimiters),
	1804
	1805	=item [1]
	1806
	1807	" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
	1808
	1809	=item [2]
	1810
	1811	"" (i.e. the prefix substring -- trivial in this case),
	1812
	1813	=item [3]
	1814
	1815	"<<" (i.e. the "name" of the quotelike operator)
	1816
	1817	=item [4]
	1818
	1819	"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
	1820
	1821	=item [5]
	1822
	1823	"This is the message.\n" (i.e. the text of the here document),
	1824
	1825	=item [6]
	1826
	1827	"EOMSG" (i.e. the right delimiter of the here document),
	1828
	1829	=item [7..10]
	1830
	1831	"" (a here document has no second left delimiter, second text, second right
	1832	delimiter, or trailing modifiers).
	1833
	1834	=back
	1835
	1836	However, the matching position of the input variable would be set to
	1837	"exit;" (i.e. I<after> the closing delimiter of the here document),
	1838	which would cause the earlier " || die;\nexit;" to be skipped in any
	1839	sequence of code fragment extractions.
	1840
	1841	To avoid this problem, when it encounters a here document whilst
	1842	extracting from a modifiable string, C<extract_quotelike> silently
	1843	rearranges the string to an equivalent piece of Perl:
	1844
	1845	<<'EOMSG'
	1846	This is the message.
	1847	EOMSG
	1848	|| die;
	1849	exit;
	1850
	1851	in which the here document I<is> contiguous. It still leaves the
	1852	matching position after the here document, but now the rest of the line
	1853	on which the here document starts is not skipped.
	1854
	1855	To prevent C<extract_quotelike> from mucking about with the input in this way
	1856	(this is the only case where a list-context C<extract_quotelike> does so),
	1857	you can pass the input variable as an interpolated literal:
	1858
	1859	$quotelike = extract_quotelike("$var");
	1860
	1861
	1862	=head2 C<extract_codeblock>
	1863
	1864	C<extract_codeblock> attempts to recognize and extract a balanced
	1865	bracket delimited substring that may contain unbalanced brackets
	1866	inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
	1867	is like a combination of C<"extract_bracketed"> and
	1868	C<"extract_quotelike">.
	1869
	1870	C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
	1871	a text to process, a set of delimiter brackets to look for, and a prefix to
	1872	match first. It also takes an optional fourth parameter, which allows the
	1873	outermost delimiter brackets to be specified separately (see below).
	1874
	1875	Omitting the first argument (input text) means process C<$_> instead.
	1876	Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
	1877	Omitting the third argument (prefix argument) implies optional whitespace at the start.
	1878	Omitting the fourth argument (outermost delimiter brackets) indicates that the
	1879	value of the second argument is to be used for the outermost delimiters.
	1880
	1881	Once the prefix and the outermost opening delimiter bracket have been
	1882	recognized, code blocks are extracted by stepping through the input text and
	1883	trying the following alternatives in sequence:
	1884
	1885	=over 4
	1886
	1887	=item 1.
	1888
	1889	Try and match a closing delimiter bracket. If the bracket was the same
	1890	species as the last opening bracket, return the substring to that
	1891	point. If the bracket was mismatched, return an error.
	1892
	1893	=item 2.
	1894
	1895	Try to match a quote or quotelike operator. If found, call
	1896	C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
	1897	the error it returned. Otherwise go back to step 1.
	1898
	1899	=item 3.
	1900
	1901	Try to match an opening delimiter bracket. If found, call
	1902	C<extract_codeblock> recursively to eat the embedded block. If the
	1903	recursive call fails, return an error. Otherwise, go back to step 1.
	1904
	1905	=item 4.
	1906
	1907	Unconditionally match a bareword or any other single character, and
	1908	then go back to step 1.
	1909
	1910	=back
	1911
	1912
	1913	Examples:
	1914
	1915	# Find a while loop in the text
	1916
	1917	if ($text =~ s/.*?while\s*\{/{/)
	1918	{
	1919	$loop = "while " . extract_codeblock($text);
	1920	}
	1921
	1922	# Remove the first round-bracketed list (which may include
	1923	# round- or curly-bracketed code blocks or quotelike operators)
	1924
	1925	extract_codeblock $text, "(){}", '[^(]*';
	1926
	1927
	1928	The ability to specify a different outermost delimiter bracket is useful
	1929	in some circumstances. For example, in the Parse::RecDescent module,
	1930	parser actions which are to be performed only on a successful parse
	1931	are specified using a C<E<lt>defer:...E<gt>> directive. For example:
	1932
	1933	sentence: subject verb object
	1934	<defer: {$::theVerb = $item{verb}} >
	1935
	1936	Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
	1937	within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
	1938
	1939	A deferred action like this:
	1940
	1941	<defer: {if ($count>10) {$count--}} >
	1942
	1943	will be incorrectly parsed as:
	1944
	1945	<defer: {if ($count>
	1946
	1947	because the "greater than" operator is interpreted as a closing delimiter.
	1948
	1949	But, by extracting the directive using
	1950	S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
	1951	the '>' character is only treated as a delimiter at the outermost
	1952	level of the code block, so the directive is parsed correctly.
	1953
	1954	=head2 C<extract_multiple>
	1955
	1956	The C<extract_multiple> subroutine takes a string to be processed and a
	1957	list of extractors (subroutines or regular expressions) to apply to that string.
	1958
	1959	In an array context C<extract_multiple> returns an array of substrings
	1960	of the original string, as extracted by the specified extractors.
	1961	In a scalar context, C<extract_multiple> returns the first
	1962	substring successfully extracted from the original string. In both
	1963	scalar and void contexts the original string has the first successfully
	1964	extracted substring removed from it. In all contexts
	1965	C<extract_multiple> starts at the current C<pos> of the string, and
	1966	sets that C<pos> appropriately after it matches.
	1967
	1968	Hence, the aim of a call to C<extract_multiple> in a list context
	1969	is to split the processed string into as many non-overlapping fields as
	1970	possible, by repeatedly applying each of the specified extractors
	1971	to the remainder of the string. Thus C<extract_multiple> is
	1972	a generalized form of Perl's C<split> subroutine.
	1973
	1974	The subroutine takes up to four optional arguments:
	1975
	1976	=over 4
	1977
	1978	=item 1.
	1979
	1980	A string to be processed (C<$_> if the string is omitted or C<undef>)
	1981
	1982	=item 2.
	1983
	1984	A reference to a list of subroutine references and/or qr// objects and/or
	1985	literal strings and/or hash references, specifying the extractors
	1986	to be used to split the string. If this argument is omitted (or
	1987	C<undef>) the list:
	1988
	1989	[
	1990	sub { extract_variable($_[0], '') },
	1991	sub { extract_quotelike($_[0],'') },
	1992	sub { extract_codeblock($_[0],'{}','') },
	1993	]
	1994
	1995	is used.
	1996
	1997
	1998	=item 3.
	1999
	2000	A number specifying the maximum number of fields to return. If this
	2001	argument is omitted (or C<undef>), split continues as long as possible.
	2002
	2003	If the third argument is I<N>, then extraction continues until I<N> fields
	2004	have been successfully extracted, or until the string has been completely
	2005	processed.
	2006
	2007	Note that in scalar and void contexts the value of this argument is
	2008	automatically reset to 1 (under C<-w>, a warning is issued if the argument
	2009	has to be reset).
	2010
	2011	=item 4.
	2012
	2013	A value indicating whether unmatched substrings (see below) within the
	2014	text should be skipped or returned as fields. If the value is true,
	2015	such substrings are skipped. Otherwise, they are returned.
	2016
	2017	=back
	2018
	2019	The extraction process works by applying each extractor in
	2020	sequence to the text string.
	2021
	2022	If the extractor is a subroutine it is called in a list context and is
	2023	expected to return a list of a single element, namely the extracted
	2024	text. It may optionally also return two further arguments: a string
	2025	representing the text left after extraction (like $' for a pattern
	2026	match), and a string representing any prefix skipped before the
	2027	extraction (like $` in a pattern match). Note that this is designed
	2028	to facilitate the use of other Text::Balanced subroutines with
	2029	C<extract_multiple>. Note too that the value returned by an extractor
	2030	subroutine need not bear any relationship to the corresponding substring
	2031	of the original text (see examples below).
	2032
	2033	If the extractor is a precompiled regular expression or a string,
	2034	it is matched against the text in a scalar context with a leading
	2035	'\G' and the gc modifiers enabled. The extracted value is either
	2036	$1 if that variable is defined after the match, or else the
	2037	complete match (i.e. $&).
	2038
	2039	If the extractor is a hash reference, it must contain exactly one element.
	2040	The value of that element is one of the
	2041	above extractor types (subroutine reference, regular expression, or string).
	2042	The key of that element is the name of a class into which the successful
	2043	return value of the extractor will be blessed.
	2044
	2045	If an extractor returns a defined value, that value is immediately
	2046	treated as the next extracted field and pushed onto the list of fields.
	2047	If the extractor was specified in a hash reference, the field is also
	2048	blessed into the appropriate class.
	2049
	2050	If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
	2051	assumed to have failed to extract.
	2052	If none of the extractor subroutines succeeds, then one
	2053	character is extracted from the start of the text and the extraction
	2054	subroutines reapplied. Characters which are thus removed are accumulated and
	2055	eventually become the next field (unless the fourth argument is true, in which
	2056	case they are discarded).
	2057
	2058	For example, the following extracts substrings that are valid Perl variables:
	2059
	2060	@fields = extract_multiple($text,
	2061	[ sub { extract_variable($_[0]) } ],
	2062	undef, 1);
	2063
	2064	This example separates a text into fields which are quote delimited,
	2065	curly bracketed, and anything else. The delimited and bracketed
	2066	parts are also blessed to identify them (the "anything else" is unblessed):
	2067
	2068	@fields = extract_multiple($text,
	2069	[
	2070	{ Delim => sub { extract_delimited($_[0],q{'"}) } },
	2071	{ Brack => sub { extract_bracketed($_[0],'{}') } },
	2072	]);
	2073
	2074	This call extracts the next single substring that is a valid Perl quotelike
	2075	operator (and removes it from $text):
	2076
	2077	$quotelike = extract_multiple($text,
	2078	[
	2079	sub { extract_quotelike($_[0]) },
	2080	], undef, 1);
	2081
	2082	Finally, here is yet another way to do comma-separated value parsing:
	2083
	2084	@fields = extract_multiple($csv_text,
	2085	[
	2086	sub { extract_delimited($_[0],q{'"}) },
	2087	qr/([^,]+)(.*)/,
	2088	],
	2089	undef,1);
	2090
	2091	The list in the second argument means:
	2092	I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
	2093	The undef third argument means:
	2094	I<"...as many times as possible...">,
	2095	and the true value in the fourth argument means
	2096	I<"...discarding anything else that appears (i.e. the commas)">.
	2097
	2098	If you wanted the commas preserved as separate fields (i.e. like split
	2099	does if your split pattern has capturing parentheses), you would
	2100	just make the last parameter undefined (or remove it).
	2101
	2102
	2103	=head2 C<gen_delimited_pat>
	2104
	2105	The C<gen_delimited_pat> subroutine takes a single (string) argument and
	2106	builds a Friedl-style optimized regex that matches a string delimited
	2107	by any one of the characters in the single argument. For example:
	2108
	2109	gen_delimited_pat(q{'"})
	2110
	2111	returns the regex:
	2112
	2113	(?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
	2114
	2115	Note that the specified delimiters are automatically quotemeta'd.
	2116
	2117	A typical use of C<gen_delimited_pat> would be to build special purpose tags
	2118	for C<extract_tagged>. For example, to properly ignore "empty" XML elements
	2119	(which might contain quoted strings):
	2120
	2121	my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
	2122
	2123	extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
	2124
	2125
	2126	C<gen_delimited_pat> may also be called with an optional second argument,
	2127	which specifies the "escape" character(s) to be used for each delimiter.
	2128	For example, to match a Pascal-style string (where ' is the delimiter
	2129	and '' is a literal ' within the string):
	2130
	2131	gen_delimited_pat(q{'},q{'});
	2132
	2133	Different escape characters can be specified for different delimiters.
	2134	For example, to specify that '/' is the escape for single quotes
	2135	and '%' is the escape for double quotes:
	2136
	2137	gen_delimited_pat(q{'"},q{/%});
	2138
	2139	If more delimiters than escape chars are specified, the last escape char
	2140	is used for the remaining delimiters.
	2141	If no escape char is specified for a given specified delimiter, '\' is used.
	2142
	2143	Note that
	2144	C<gen_delimited_pat> was previously called
	2145	C<delimited_pat>. That name may still be used, but is now deprecated.
	2146
	2147
	2148	=head1 DIAGNOSTICS
	2149
	2150	In a list context, all the functions return C<(undef,$original_text)>
	2151	on failure. In a scalar context, failure is indicated by returning C<undef>
	2152	(in this case the input text is not modified in any way).
	2153
	2154	In addition, on failure in I<any> context, the C<$@> variable is set.
	2155	Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
	2156	below.
	2157	Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
	2158	which the error was detected (although not necessarily where it occurred!).
	2159	Printing C<$@> directly produces the error message, with the offset appended.
	2160	On success, the C<$@> variable is guaranteed to be C<undef>.
	2161
	2162	The available diagnostics are:
	2163
	2164	=over 4
	2165
	2166	=item C<Did not find a suitable bracket: "%s">
	2167
	2168	The delimiter provided to C<extract_bracketed> was not one of
	2169	C<'()[]E<lt>E<gt>{}'>.
	2170
	2171	=item C<Did not find prefix: /%s/>
	2172
	2173	A non-optional prefix was specified but wasn't found at the start of the text.
	2174
	2175	=item C<Did not find opening bracket after prefix: "%s">
	2176
	2177	C<extract_bracketed> or C<extract_codeblock> was expecting a
	2178	particular kind of bracket at the start of the text, and didn't find it.
	2179
	2180	=item C<No quotelike operator found after prefix: "%s">
	2181
	2182	C<extract_quotelike> didn't find one of the quotelike operators C<q>,
	2183	C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
	2184	it was extracting.
	2185
	2186	=item C<Unmatched closing bracket: "%c">
	2187
	2188	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
	2189	a closing bracket where none was expected.
	2190
	2191	=item C<Unmatched opening bracket(s): "%s">
	2192
	2193	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
	2194	out of characters in the text before closing one or more levels of nested
	2195	brackets.
	2196
	2197	=item C<Unmatched embedded quote (%s)>
	2198
	2199	C<extract_bracketed> attempted to match an embedded quoted substring, but
	2200	failed to find a closing quote to match it.
	2201
	2202	=item C<Did not find closing delimiter to match '%s'>
	2203
	2204	C<extract_quotelike> was unable to find a closing delimiter to match the
	2205	one that opened the quote-like operation.
	2206
	2207	=item C<Mismatched closing bracket: expected "%c" but found "%s">
	2208
	2209	C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
	2210	a valid bracket delimiter, but it was the wrong species. This usually
	2211	indicates a nesting error, but may indicate incorrect quoting or escaping.
	2212
	2213	=item C<No block delimiter found after quotelike "%s">
	2214
	2215	C<extract_quotelike> or C<extract_codeblock> found one of the
	2216	quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
	2217	without a suitable block after it.
	2218
	2219	=item C<Did not find leading dereferencer>
	2220
	2221	C<extract_variable> was expecting one of '$', '@', or '%' at the start of
	2222	a variable, but didn't find any of them.
	2223
	2224	=item C<Bad identifier after dereferencer>
	2225
	2226	C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
	2227	character was not followed by a legal Perl identifier.
	2228
	2229	=item C<Did not find expected opening bracket at %s>
	2230
	2231	C<extract_codeblock> failed to find any of the outermost opening brackets
	2232	that were specified.
	2233
	2234	=item C<Improperly nested codeblock at %s>
	2235
	2236	A nested code block was found that started with a delimiter that was specified
	2237	as being only to be used as an outermost bracket.
	2238
	2239	=item C<Missing second block for quotelike "%s">
	2240
	2241	C<extract_codeblock> or C<extract_quotelike> found one of the
	2242	quotelike operators C<s>, C<tr> or C<y> followed by only one block.
	2243
	2244	=item C<No match found for opening bracket>
	2245
	2246	C<extract_codeblock> failed to find a closing bracket to match the outermost
	2247	opening bracket.
	2248
	2249	=item C<Did not find opening tag: /%s/>
	2250
	2251	C<extract_tagged> did not find a suitable opening tag (after any specified
	2252	prefix was removed).
	2253
	2254	=item C<Unable to construct closing tag to match: /%s/>
	2255
	2256	C<extract_tagged> matched the specified opening tag and tried to
	2257	modify the matched text to produce a matching closing tag (because
	2258	none was specified). It failed to generate the closing tag, almost
	2259	certainly because the opening tag did not start with a
	2260	bracket of some kind.
	2261
	2262	=item C<Found invalid nested tag: %s>
	2263
	2264	C<extract_tagged> found a nested tag that appeared in the "reject" list
	2265	(and the failure mode was not "MAX" or "PARA").
	2266
	2267	=item C<Found unbalanced nested tag: %s>
	2268
	2269	C<extract_tagged> found a nested opening tag that was not matched by a
	2270	corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
	2271
	2272	=item C<Did not find closing tag>
	2273
	2274	C<extract_tagged> reached the end of the text without finding a closing tag
	2275	to match the original opening tag (and the failure mode was not
	2276	"MAX" or "PARA").
	2277
	2278
	2279
	2280
	2281	=back
	2282
	2283
	2284	=head1 AUTHOR
	2285
	2286	Damian Conway (damian@conway.org)
	2287
	2288
	2289	=head1 BUGS AND IRRITATIONS
	2290
	2291	There are undoubtedly serious bugs lurking somewhere in this code, if
	2292	only because parts of it give the impression of understanding a great deal
	2293	more about Perl than they really do.
	2294
	2295	Bug reports and other feedback are most welcome.
	2296
	2297
	2298	=head1 COPYRIGHT
	2299
	2300	Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
	2301	This module is free software. It may be used, redistributed
	2302	and/or modified under the same terms as Perl itself.