git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \|\(em\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
	104	.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
	106	.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
	108	.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
	109	.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "PERLRE 1"
	132	.TH PERLRE 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	.IX Xref "regular expression regex regexp"
	135	perlre \- Perl regular expressions
	136	.SH "DESCRIPTION"
	137	.IX Header "DESCRIPTION"
	138	This page describes the syntax of regular expressions in Perl.
	139	.PP
	140	If you haven't used regular expressions before, a quick-start
	141	introduction is available in perlrequick, and a longer tutorial
	142	introduction is available in perlretut.
	143	.PP
	144	For reference on how regular expressions are used in matching
	145	operations, plus various examples of the same, see discussions of
	146	\&\f(CW\(C`m//\(C'\fR, \f(CW\(C`s///\(C'\fR, \f(CW\(C`qr//\(C'\fR and \f(CW\(C`??\(C'\fR in \(L"Regexp Quote-Like Operators\(R" in perlop.
	147	.PP
	148	Matching operations can have various modifiers. Modifiers
	149	that relate to the interpretation of the regular expression inside
	150	are listed below. Modifiers that alter the way a regular expression
	151	is used by Perl are detailed in \(L"Regexp Quote-Like Operators\(R" in perlop and
	152	\&\(L"Gory details of parsing quoted constructs\(R" in perlop.
	153	.IP "i" 4
	154	.IX Xref " i regex, case-insensitive regexp, case-insensitive regular expression, case-insensitive"
	155	.IX Item "i"
	156	Do case-insensitive pattern matching.
	157	.Sp
	158	If \f(CW\(C`use locale\(C'\fR is in effect, the case map is taken from the current
	159	locale. See perllocale.
	160	.IP "m" 4
	161	.IX Xref " m regex, multiline regexp, multiline regular expression, multiline"
	162	.IX Item "m"
	163	Treat string as multiple lines. That is, change \(L"^\(R" and \(L"$\(R" from matching
	164	the start or end of the string to matching the start or end of any
	165	line anywhere within the string.
	166	.IP "s" 4
	167	.IX Xref " s regex, single-line regexp, single-line regular expression, single-line"
	168	.IX Item "s"
	169	Treat string as single line. That is, change \(L".\(R" to match any character
	170	whatsoever, even a newline, which normally it would not match.
	171	.Sp
	172	The \f(CW\*(C`/s\*(C'\fR and \f(CW\*(C`/m\*(C'\fR modifiers both override the \f(CW$*\fR setting. That
	173	is, no matter what \f(CW$*\fR contains, \f(CW\*(C`/s\*(C'\fR without \f(CW\*(C`/m\*(C'\fR will force
	174	\&\*(L"^\*(R" to match only at the beginning of the string and \*(L"$\*(R" to match
	175	only at the end (or just before a newline at the end) of the string.
	176	Together, as /ms, they let the \*(L".\*(R" match any character whatsoever,
	177	while still allowing \*(L"^\*(R" and \*(L"$\*(R" to match, respectively, just after
	178	and just before newlines within the string.
	179	.IP "x" 4
	180	.IX Xref " x"
	181	.IX Item "x"
	182	Extend your pattern's legibility by permitting whitespace and comments.
	183	.PP
	184	These are usually written as "the \f(CW\(C`/x\(C'\fR modifier", even though the delimiter
	185	in question might not really be a slash. Any of these
	186	modifiers may also be embedded within the regular expression itself using
	187	the \f(CW\(C`(?...)\(C'\fR construct. See below.
	188	.PP
	189	The \f(CW\(C`/x\(C'\fR modifier itself needs a little more explanation. It tells
	190	the regular expression parser to ignore whitespace that is neither
	191	backslashed nor within a character class. You can use this to break up
	192	your regular expression into (slightly) more readable parts. The \f(CW\(C`#\(C'\fR
	193	character is also treated as a metacharacter introducing a comment,
	194	just as in ordinary Perl code. This also means that if you want real
	195	whitespace or \f(CW\(C`#\(C'\fR characters in the pattern (outside a character
	196	class, where they are unaffected by \f(CW\(C`/x\(C'\fR), that you'll either have to
	197	escape them or encode them using octal or hex escapes. Taken together,
	198	these features go a long way towards making Perl's regular expressions
	199	more readable. Note that you have to be careful not to include the
	200	pattern delimiter in the comment\*(--perl has no way of knowing you did
	201	not intend to close the pattern early. See the C\-comment deletion code
	202	in perlop.
	203	.IX Xref " x"
	204	.Sh "Regular Expressions"
	205	.IX Subsection "Regular Expressions"
	206	The patterns used in Perl pattern matching derive from those supplied in
	207	the Version 8 regex routines. (The routines are derived
	208	(distantly) from Henry Spencer's freely redistributable reimplementation
	209	of the V8 routines.) See \*(L"Version 8 Regular Expressions\*(R" for
	210	details.
	211	.PP
	212	In particular the following metacharacters have their standard \fIegrep\fR\-ish
	213	meanings:
	214	.IX Xref "metacharacter \ ^ . $ | ( () [ []"
	215	.PP
	216	.Vb 7
	217	\& \e Quote the next metacharacter
	218	\& ^ Match the beginning of the line
	219	\& . Match any character (except newline)
	220	\& $ Match the end of the line (or before newline at the end)
	221	\& | Alternation
	222	\& () Grouping
	223	\& [] Character class
	224	.Ve
	225	.PP
	226	By default, the \(L"^\(R" character is guaranteed to match only the
	227	beginning of the string, the \(L"$\(R" character only the end (or before the
	228	newline at the end), and Perl does certain optimizations with the
	229	assumption that the string contains only one line. Embedded newlines
	230	will not be matched by \(L"^\(R" or \(L"$\(R". You may, however, wish to treat a
	231	string as a multi-line buffer, such that the \(L"^\(R" will match after any
	232	newline within the string, and \(L"$\(R" will match before any newline. At the
	233	cost of a little more overhead, you can do this by using the /m modifier
	234	on the pattern match operator. (Older programs did this by setting \f(CW$*\fR,
	235	but this practice is now deprecated.)
	236	.IX Xref "^ $ m"
	237	.PP
	238	To simplify multi-line substitutions, the \*(L".\*(R" character never matches a
	239	newline unless you use the \f(CW\*(C`/s\*(C'\fR modifier, which in effect tells Perl to pretend
	240	the string is a single line\*(--even if it isn't. The \f(CW\*(C`/s\*(C'\fR modifier also
	241	overrides the setting of \f(CW$*\fR, in case you have some (badly behaved) older
	242	code that sets it in another module.
	243	.IX Xref ". s"
	244	.PP
	245	The following standard quantifiers are recognized:
	246	.IX Xref "metacharacter quantifier * + ? {n} {n,} {n,m}"
	247	.PP
	248	.Vb 6
	249	\& * Match 0 or more times
	250	\& + Match 1 or more times
	251	\& ? Match 1 or 0 times
	252	\& {n} Match exactly n times
	253	\& {n,} Match at least n times
	254	\& {n,m} Match at least n but not more than m times
	255	.Ve
	256	.PP
	257	(If a curly bracket occurs in any other context, it is treated
	258	as a regular character. In particular, the lower bound
	259	is not optional.) The \*(L"*\*(R" modifier is equivalent to \f(CW\*(C`{0,}\*(C'\fR, the \*(L"+\*(R"
	260	modifier to \f(CW\*(C`{1,}\*(C'\fR, and the \*(L"?\*(R" modifier to \f(CW\*(C`{0,1}\*(C'\fR. n and m are limited
	261	to integral values less than a preset limit defined when perl is built.
	262	This is usually 32766 on the most common platforms. The actual limit can
	263	be seen in the error message generated by code such as this:
	264	.PP
	265	.Vb 1
	266	\& $_ **= $_ , / {$_} / for 2 .. 42;
	267	.Ve
	268	.PP
	269	By default, a quantified subpattern is \(L"greedy\(R", that is, it will match as
	270	many times as possible (given a particular starting location) while still
	271	allowing the rest of the pattern to match. If you want it to match the
	272	minimum number of times possible, follow the quantifier with a \(L"?\(R". Note
	273	that the meanings don't change, just the \(L"greediness\(R":
	274	.IX Xref "metacharacter greedy greedyness ? *? +? ?? {n}? {n,}? {n,m}?"
	275	.PP
	276	.Vb 6
	277	\& *? Match 0 or more times
	278	\& +? Match 1 or more times
	279	\& ?? Match 0 or 1 time
	280	\& {n}? Match exactly n times
	281	\& {n,}? Match at least n times
	282	\& {n,m}? Match at least n but not more than m times
	283	.Ve
	284	.PP
	285	Because patterns are processed as double quoted strings, the following
	286	also work:
	287	.IX Xref "\t \n \r \f \a \l \u \L \U \E \Q \0 \c \N \x"
	288	.PP
	289	.Vb 17
	290	\& \et tab (HT, TAB)
	291	\& \en newline (LF, NL)
	292	\& \er return (CR)
	293	\& \ef form feed (FF)
	294	\& \ea alarm (bell) (BEL)
	295	\& \ee escape (think troff) (ESC)
	296	\& \e033 octal char (think of a PDP-11)
	297	\& \ex1B hex char
	298	\& \ex{263a} wide hex char (Unicode SMILEY)
	299	\& \ec[ control char
	300	\& \eN{name} named char
	301	\& \el lowercase next char (think vi)
	302	\& \eu uppercase next char (think vi)
	303	\& \eL lowercase till \eE (think vi)
	304	\& \eU uppercase till \eE (think vi)
	305	\& \eE end case modification (think vi)
	306	\& \eQ quote (disable) pattern metacharacters till \eE
	307	.Ve
	308	.PP
	309	If \f(CW\(C`use locale\(C'\fR is in effect, the case map used by \f(CW\(C`\el\(C'\fR, \f(CW\(C`\eL\(C'\fR, \f(CW\(C`\eu\(C'\fR
	310	and \f(CW\(C`\eU\(C'\fR is taken from the current locale. See perllocale. For
	311	documentation of \f(CW\(C`\eN{name}\(C'\fR, see charnames.
	312	.PP
	313	You cannot include a literal \f(CW\(C`$\(C'\fR or \f(CW\(C`@\(C'\fR within a \f(CW\(C`\eQ\(C'\fR sequence.
	314	An unescaped \f(CW\(C`$\(C'\fR or \f(CW\(C`@\(C'\fR interpolates the corresponding variable,
	315	while escaping will cause the literal string \f(CW\(C`\e$\(C'\fR to be matched.
	316	You'll need to write something like \f(CW\(C`m/\eQuser\eE\e@\eQhost/\(C'\fR.
	317	.PP
	318	In addition, Perl defines the following:
	319	.IX Xref "metacharacter \w \W \s \S \d \D \X \p \P \C word whitespace"
	320	.PP
	321	.Vb 14
	322	\& \ew Match a "word" character (alphanumeric plus "_")
	323	\& \eW Match a non-"word" character
	324	\& \es Match a whitespace character
	325	\& \eS Match a non-whitespace character
	326	\& \ed Match a digit character
	327	\& \eD Match a non-digit character
	328	\& \epP Match P, named property. Use \ep{Prop} for longer names.
	329	\& \ePP Match non-P
	330	\& \eX Match eXtended Unicode "combining character sequence",
	331	\& equivalent to (?:\ePM\epM*)
	332	\& \eC Match a single C char (octet) even under Unicode.
	333	\& NOTE: breaks up characters into their UTF-8 bytes,
	334	\& so you may end up with malformed pieces of UTF-8.
	335	\& Unsupported in lookbehind.
	336	.Ve
	337	.PP
	338	A \f(CW\(C`\ew\(C'\fR matches a single alphanumeric character (an alphabetic
	339	character, or a decimal digit) or \f(CW\(C`_\(C'\fR, not a whole word. Use \f(CW\(C`\ew+\(C'\fR
	340	to match a string of Perl-identifier characters (which isn't the same
	341	as matching an English word). If \f(CW\(C`use locale\(C'\fR is in effect, the list
	342	of alphabetic characters generated by \f(CW\(C`\ew\(C'\fR is taken from the current
	343	locale. See perllocale. You may use \f(CW\(C`\ew\(C'\fR, \f(CW\(C`\eW\(C'\fR, \f(CW\(C`\es\(C'\fR, \f(CW\(C`\eS\(C'\fR,
	344	\&\f(CW\(C`\ed\(C'\fR, and \f(CW\(C`\eD\(C'\fR within character classes, but if you try to use them
	345	as endpoints of a range, that's not a range, the \*(L"\-\*(R" is understood
	346	literally. If Unicode is in effect, \f(CW\*(C`\es\*(C'\fR matches also \*(L"\ex{85}\*(R",
	347	\&\*(L"\ex{2028}\*(R", and \*(L"\ex{2029}\*(R"; see perlunicode for more details about
	348	\&\f(CW\*(C`\epP\*(C'\fR, \f(CW\*(C`\ePP\*(C'\fR, and \f(CW\*(C`\eX\*(C'\fR, and perluniintro about Unicode in general.
	349	You can define your own \f(CW\(C`\ep\(C'\fR and \f(CW\(C`\eP\(C'\fR properties, see perlunicode.
	350	.IX Xref "\w \W word"
	351	.PP
	352	The \s-1POSIX\s0 character class syntax
	353	.IX Xref "character class"
	354	.PP
	355	.Vb 1
	356	\& [:class:]
	357	.Ve
	358	.PP
	359	is also available. The available classes and their backslash
	360	equivalents (if available) are as follows:
	361	.IX Xref "character class alpha alnum ascii blank cntrl digit graph lower print punct space upper word xdigit"
	362	.PP
	363	.Vb 14
	364	\& alpha
	365	\& alnum
	366	\& ascii
	367	\& blank [1]
	368	\& cntrl
	369	\& digit \ed
	370	\& graph
	371	\& lower
	372	\& print
	373	\& punct
	374	\& space \es [2]
	375	\& upper
	376	\& word \ew [3]
	377	\& xdigit
	378	.Ve
	379	.IP "[1]" 4
	380	.IX Item "[1]"
	381	A \s-1GNU\s0 extension equivalent to \f(CW\(C`[ \et]\(C'\fR, \(L"all horizontal whitespace\(R".
	382	.IP "[2]" 4
	383	.IX Item "[2]"
	384	Not exactly equivalent to \f(CW\(C`\es\(C'\fR since the \f(CW\(C`[[:space:]]\(C'\fR includes
	385	also the (very rare) \(L"vertical tabulator\(R", \(L"\eck\(R", chr(11).
	386	.IP "[3]" 4
	387	.IX Item "[3]"
	388	A Perl extension, see above.
	389	.PP
	390	For example use \f(CW\(C`[:upper:]\(C'\fR to match all the uppercase characters.
	391	Note that the \f(CW\(C`[]\(C'\fR are part of the \f(CW\(C`[::]\(C'\fR construct, not part of the
	392	whole character class. For example:
	393	.PP
	394	.Vb 1
	395	\& [01[:alpha:]%]
	396	.Ve
	397	.PP
	398	matches zero, one, any alphabetic character, and the percentage sign.
	399	.PP
	400	The following equivalences to Unicode \ep{} constructs and equivalent
	401	backslash character classes (if available), will hold:
	402	.IX Xref "character class \p \p{}"
	403	.PP
	404	.Vb 1
	405	\& [:...:] \ep{...} backslash
	406	.Ve
	407	.PP
	408	.Vb 15
	409	\& alpha IsAlpha
	410	\& alnum IsAlnum
	411	\& ascii IsASCII
	412	\& blank IsSpace
	413	\& cntrl IsCntrl
	414	\& digit IsDigit \ed
	415	\& graph IsGraph
	416	\& lower IsLower
	417	\& print IsPrint
	418	\& punct IsPunct
	419	\& space IsSpace
	420	\& IsSpacePerl \es
	421	\& upper IsUpper
	422	\& word IsWord
	423	\& xdigit IsXDigit
	424	.Ve
	425	.PP
	426	For example \f(CW\(C`[:lower:]\(C'\fR and \f(CW\(C`\ep{IsLower}\(C'\fR are equivalent.
	427	.PP
	428	If the \f(CW\(C`utf8\(C'\fR pragma is not used but the \f(CW\(C`locale\(C'\fR pragma is, the
	429	classes correlate with the usual \fIisalpha\fR\|(3) interface (except for
	430	\&\*(L"word\*(R" and \*(L"blank\*(R").
	431	.PP
	432	The assumedly non-obviously named classes are:
	433	.IP "cntrl" 4
	434	.IX Xref "cntrl"
	435	.IX Item "cntrl"
	436	Any control character. Usually characters that don't produce output as
	437	such but instead control the terminal somehow: for example newline and
	438	backspace are control characters. All characters with \fIord()\fR less than
	439	32 are most often classified as control characters (assuming \s-1ASCII\s0,
	440	the \s-1ISO\s0 Latin character sets, and Unicode), as is the character with
	441	the \fIord()\fR value of 127 (\f(CW\(C`DEL\(C'\fR).
	442	.IP "graph" 4
	443	.IX Xref "graph"
	444	.IX Item "graph"
	445	Any alphanumeric or punctuation (special) character.
	446	.IP "print" 4
	447	.IX Xref "print"
	448	.IX Item "print"
	449	Any alphanumeric or punctuation (special) character or the space character.
	450	.IP "punct" 4
	451	.IX Xref "punct"
	452	.IX Item "punct"
	453	Any punctuation (special) character.
	454	.IP "xdigit" 4
	455	.IX Xref "xdigit"
	456	.IX Item "xdigit"
	457	Any hexadecimal digit. Though this may feel silly ([0\-9A\-Fa\-f] would
	458	work just fine) it is included for completeness.
	459	.PP
	460	You can negate the [::] character classes by prefixing the class name
	461	with a '^'. This is a Perl extension. For example:
	462	.IX Xref "character class, negation"
	463	.PP
	464	.Vb 1
	465	\& POSIX traditional Unicode
	466	.Ve
	467	.PP
	468	.Vb 3
	469	\& [:^digit:] \eD \eP{IsDigit}
	470	\& [:^space:] \eS \eP{IsSpace}
	471	\& [:^word:] \eW \eP{IsWord}
	472	.Ve
	473	.PP
	474	Perl respects the \s-1POSIX\s0 standard in that \s-1POSIX\s0 character classes are
	475	only supported within a character class. The \s-1POSIX\s0 character classes
	476	[.cc.] and [=cc=] are recognized but \fBnot\fR supported and trying to
	477	use them will cause an error.
	478	.PP
	479	Perl defines the following zero-width assertions:
	480	.IX Xref "zero-width assertion assertion regex, zero-width assertion regexp, zero-width assertion regular expression, zero-width assertion \b \B \A \Z \z \G"
	481	.PP
	482	.Vb 7
	483	\& \eb Match a word boundary
	484	\& \eB Match a non-(word boundary)
	485	\& \eA Match only at beginning of string
	486	\& \eZ Match only at end of string, or before newline at the end
	487	\& \ez Match only at end of string
	488	\& \eG Match only at pos() (e.g. at the end-of-match position
	489	\& of prior m//g)
	490	.Ve
	491	.PP
	492	A word boundary (\f(CW\(C`\eb\(C'\fR) is a spot between two characters
	493	that has a \f(CW\(C`\ew\(C'\fR on one side of it and a \f(CW\(C`\eW\(C'\fR on the other side
	494	of it (in either order), counting the imaginary characters off the
	495	beginning and end of the string as matching a \f(CW\(C`\eW\(C'\fR. (Within
	496	character classes \f(CW\(C`\eb\(C'\fR represents backspace rather than a word
	497	boundary, just as it normally does in any double-quoted string.)
	498	The \f(CW\(C`\eA\(C'\fR and \f(CW\(C`\eZ\(C'\fR are just like \(L"^\(R" and \(L"$\(R", except that they
	499	won't match multiple times when the \f(CW\(C`/m\(C'\fR modifier is used, while
	500	\&\(L"^\(R" and \(L"$\(R" will match at every internal line boundary. To match
	501	the actual end of the string and not ignore an optional trailing
	502	newline, use \f(CW\(C`\ez\(C'\fR.
	503	.IX Xref "\b \A \Z \z m"
	504	.PP
	505	The \f(CW\(C`\eG\(C'\fR assertion can be used to chain global matches (using
	506	\&\f(CW\(C`m//g\(C'\fR), as described in \(L"Regexp Quote-Like Operators\(R" in perlop.
	507	It is also useful when writing \f(CW\(C`lex\(C'\fR\-like scanners, when you have
	508	several patterns that you want to match against consequent substrings
	509	of your string, see the previous reference. The actual location
	510	where \f(CW\(C`\eG\(C'\fR will match can also be influenced by using \f(CW\(C`pos()\(C'\fR as
	511	an lvalue: see \(L"pos\(R" in perlfunc. Currently \f(CW\(C`\eG\(C'\fR is only fully
	512	supported when anchored to the start of the pattern; while it
	513	is permitted to use it elsewhere, as in \f(CW\(C`/(?<=\eG..)./g\(C'\fR, some
	514	such uses (\f(CW\(C`/.\eG/g\(C'\fR, for example) currently cause problems, and
	515	it is recommended that you avoid such usage for now.
	516	.IX Xref "\G"
	517	.PP
	518	The bracketing construct \f(CW\(C`( ... )\(C'\fR creates capture buffers. To
	519	refer to the digit'th buffer use \e<digit> within the
	520	match. Outside the match use \(L"$\(R" instead of \(L"\e\(R". (The
	521	\&\e<digit> notation works in certain circumstances outside
	522	the match. See the warning below about \e1 vs \f(CW$1\fR for details.)
	523	Referring back to another part of the match is called a
	524	\&\fIbackreference\fR.
	525	.IX Xref "regex, capture buffer regexp, capture buffer regular expression, capture buffer backreference"
	526	.PP
	527	There is no limit to the number of captured substrings that you may
	528	use. However Perl also uses \e10, \e11, etc. as aliases for \e010,
	529	\&\e011, etc. (Recall that 0 means octal, so \e011 is the character at
	530	number 9 in your coded character set; which would be the 10th character,
	531	a horizontal tab under \s-1ASCII\s0.) Perl resolves this
	532	ambiguity by interpreting \e10 as a backreference only if at least 10
	533	left parentheses have opened before it. Likewise \e11 is a
	534	backreference only if at least 11 left parentheses have opened
	535	before it. And so on. \e1 through \e9 are always interpreted as
	536	backreferences.
	537	.PP
	538	Examples:
	539	.PP
	540	.Vb 1
	541	\& s/^([^ ]) ([^ ]*)/$2 $1/; # swap first two words
	542	.Ve
	543	.PP
	544	.Vb 3
	545	\& if (/(.)\e1/) { # find first doubled char
	546	\& print "'$1' is the first doubled character\en";
	547	\& }
	548	.Ve
	549	.PP
	550	.Vb 5
	551	\& if (/Time: (..):(..):(..)/) { # parse out values
	552	\& $hours = $1;
	553	\& $minutes = $2;
	554	\& $seconds = $3;
	555	\& }
	556	.Ve
	557	.PP
	558	Several special variables also refer back to portions of the previous
	559	match. \f(CW$+\fR returns whatever the last bracket match matched.
	560	\&\f(CW$&\fR returns the entire matched string. (At one point \f(CW$0\fR did
	561	also, but now it returns the name of the program.) \f(CW$`\fR returns
	562	everything before the matched string. \f(CW$'\fR returns everything
	563	after the matched string. And \f(CW$^N\fR contains whatever was matched by
	564	the most-recently closed group (submatch). \f(CW$^N\fR can be used in
	565	extended patterns (see below), for example to assign a submatch to a
	566	variable.
	567	.IX Xref "$+ $^N $& $` $'"
	568	.PP
	569	The numbered match variables ($1, \f(CW$2\fR, \f(CW$3\fR, etc.) and the related punctuation
	570	set (\f(CW$+\fR, \f(CW$&\fR, \f(CW$`\fR, \f(CW$'\fR, and \f(CW$^N\fR) are all dynamically scoped
	571	until the end of the enclosing block or until the next successful
	572	match, whichever comes first. (See \(L"Compound Statements\(R" in perlsyn.)
	573	.IX Xref "$+ $^N $& $` $' $1 $2 $3 $4 $5 $6 $7 $8 $9"
	574	.PP
	575	\&\fB\s-1NOTE\s0\fR: failed matches in Perl do not reset the match variables,
	576	which makes it easier to write code that tests for a series of more
	577	specific cases and remembers the best match.
	578	.PP
	579	\&\fB\s-1WARNING\s0\fR: Once Perl sees that you need one of \f(CW$&\fR, \f(CW$`\fR, or
	580	\&\f(CW$'\fR anywhere in the program, it has to provide them for every
	581	pattern match. This may substantially slow your program. Perl
	582	uses the same mechanism to produce \f(CW$1\fR, \f(CW$2\fR, etc, so you also pay a
	583	price for each pattern that contains capturing parentheses. (To
	584	avoid this cost while retaining the grouping behaviour, use the
	585	extended regular expression \f(CW\(C`(?: ... )\(C'\fR instead.) But if you never
	586	use \f(CW$&\fR, \f(CW$`\fR or \f(CW$'\fR, then patterns \fIwithout\fR capturing
	587	parentheses will not be penalized. So avoid \f(CW$&\fR, \f(CW$'\fR, and \f(CW$`\fR
	588	if you can, but if you can't (and some algorithms really appreciate
	589	them), once you've used them once, use them at will, because you've
	590	already paid the price. As of 5.005, \f(CW$&\fR is not so costly as the
	591	other two.
	592	.IX Xref "$& $` $'"
	593	.PP
	594	Backslashed metacharacters in Perl are alphanumeric, such as \f(CW\(C`\eb\(C'\fR,
	595	\&\f(CW\(C`\ew\(C'\fR, \f(CW\(C`\en\(C'\fR. Unlike some other regular expression languages, there
	596	are no backslashed symbols that aren't alphanumeric. So anything
	597	that looks like \e\e, \e(, \e), \e<, \e>, \e{, or \e} is always
	598	interpreted as a literal character, not a metacharacter. This was
	599	once used in a common idiom to disable or quote the special meanings
	600	of regular expression metacharacters in a string that you want to
	601	use for a pattern. Simply quote all non\-\(L"word\(R" characters:
	602	.PP
	603	.Vb 1
	604	\& $pattern =~ s/(\eW)/\e\e$1/g;
	605	.Ve
	606	.PP
	607	(If \f(CW\(C`use locale\(C'\fR is set, then this depends on the current locale.)
	608	Today it is more common to use the \fIquotemeta()\fR function or the \f(CW\(C`\eQ\(C'\fR
	609	metaquoting escape sequence to disable all metacharacters' special
	610	meanings like this:
	611	.PP
	612	.Vb 1
	613	\& /$unquoted\eQ$quoted\eE$unquoted/
	614	.Ve
	615	.PP
	616	Beware that if you put literal backslashes (those not inside
	617	interpolated variables) between \f(CW\(C`\eQ\(C'\fR and \f(CW\(C`\eE\(C'\fR, double-quotish
	618	backslash interpolation may lead to confusing results. If you
	619	\&\fIneed\fR to use literal backslashes within \f(CW\(C`\eQ...\eE\(C'\fR,
	620	consult \(L"Gory details of parsing quoted constructs\(R" in perlop.
	621	.Sh "Extended Patterns"
	622	.IX Subsection "Extended Patterns"
	623	Perl also defines a consistent extension syntax for features not
	624	found in standard tools like \fBawk\fR and \fBlex\fR. The syntax is a
	625	pair of parentheses with a question mark as the first thing within
	626	the parentheses. The character after the question mark indicates
	627	the extension.
	628	.PP
	629	The stability of these extensions varies widely. Some have been
	630	part of the core language for many years. Others are experimental
	631	and may change without warning or be completely removed. Check
	632	the documentation on an individual feature to verify its current
	633	status.
	634	.PP
	635	A question mark was chosen for this and for the minimal-matching
	636	construct because 1) question marks are rare in older regular
	637	expressions, and 2) whenever you see one, you should stop and
	638	\&\(L"question\(R" exactly what is going on. That's psychology...
	639	.ie n .IP """(?#text)""" 10
	640	.el .IP "\f(CW(?#text)\fR" 10
	641	.IX Xref "(?#)"
	642	.IX Item "(?#text)"
	643	A comment. The text is ignored. If the \f(CW\(C`/x\(C'\fR modifier enables
	644	whitespace formatting, a simple \f(CW\(C`#\(C'\fR will suffice. Note that Perl closes
	645	the comment as soon as it sees a \f(CW\(C`)\(C'\fR, so there is no way to put a literal
	646	\&\f(CW\(C`)\(C'\fR in the comment.
	647	.ie n .IP """(?imsx\-imsx)""" 10
	648	.el .IP "\f(CW(?imsx\-imsx)\fR" 10
	649	.IX Xref "(?)"
	650	.IX Item "(?imsx-imsx)"
	651	One or more embedded pattern-match modifiers, to be turned on (or
	652	turned off, if preceded by \f(CW\(C`\-\(C'\fR) for the remainder of the pattern or
	653	the remainder of the enclosing pattern group (if any). This is
	654	particularly useful for dynamic patterns, such as those read in from a
	655	configuration file, taken from an argument, or specified in a table
	656	somewhere. Consider the case where some patterns want to be case
	657	sensitive and some do not. The case insensitive ones need to include
	658	merely \f(CW\(C`(?i)\(C'\fR at the front of the pattern. For example:
	659	.Sp
	660	.Vb 2
	661	\& $pattern = "foobar";
	662	\& if ( /$pattern/i ) { }
	663	.Ve
	664	.Sp
	665	.Vb 1
	666	\& # more flexible:
	667	.Ve
	668	.Sp
	669	.Vb 2
	670	\& $pattern = "(?i)foobar";
	671	\& if ( /$pattern/ ) { }
	672	.Ve
	673	.Sp
	674	These modifiers are restored at the end of the enclosing group. For example,
	675	.Sp
	676	.Vb 1
	677	\& ( (?i) blah ) \es+ \e1
	678	.Ve
	679	.Sp
	680	will match a repeated (\fIincluding the case\fR!) word \f(CW\(C`blah\(C'\fR in any
	681	case, assuming \f(CW\(C`x\(C'\fR modifier, and no \f(CW\(C`i\(C'\fR modifier outside this
	682	group.
	683	.ie n .IP """(?:pattern)""" 10
	684	.el .IP "\f(CW(?:pattern)\fR" 10
	685	.IX Xref "(?:)"
	686	.IX Item "(?:pattern)"
	687	.PD 0
	688	.ie n .IP """(?imsx\-imsx:pattern)""" 10
	689	.el .IP "\f(CW(?imsx\-imsx:pattern)\fR" 10
	690	.IX Item "(?imsx-imsx:pattern)"
	691	.PD
	692	This is for clustering, not capturing; it groups subexpressions like
	693	\&\(L"()\(R", but doesn't make backreferences as \(L"()\(R" does. So
	694	.Sp
	695	.Vb 1
	696	\& @fields = split(/\eb(?:a|b|c)\eb/)
	697	.Ve
	698	.Sp
	699	is like
	700	.Sp
	701	.Vb 1
	702	\& @fields = split(/\eb(a|b|c)\eb/)
	703	.Ve
	704	.Sp
	705	but doesn't spit out extra fields. It's also cheaper not to capture
	706	characters if you don't need to.
	707	.Sp
	708	Any letters between \f(CW\(C`?\(C'\fR and \f(CW\(C`:\(C'\fR act as flags modifiers as with
	709	\&\f(CW\(C`(?imsx\-imsx)\(C'\fR. For example,
	710	.Sp
	711	.Vb 1
	712	\& /(?s-i:more.than).million/i
	713	.Ve
	714	.Sp
	715	is equivalent to the more verbose
	716	.Sp
	717	.Vb 1
	718	\& /(?:(?s-i)more.than).million/i
	719	.Ve
	720	.ie n .IP """(?=pattern)""" 10
	721	.el .IP "\f(CW(?=pattern)\fR" 10
	722	.IX Xref "(?=) look-ahead, positive lookahead, positive"
	723	.IX Item "(?=pattern)"
	724	A zero-width positive look-ahead assertion. For example, \f(CW\(C`/\ew+(?=\et)/\(C'\fR
	725	matches a word followed by a tab, without including the tab in \f(CW$&\fR.
	726	.ie n .IP """(?!pattern)""" 10
	727	.el .IP "\f(CW(?!pattern)\fR" 10
	728	.IX Xref "(?!) look-ahead, negative lookahead, negative"
	729	.IX Item "(?!pattern)"
	730	A zero-width negative look-ahead assertion. For example \f(CW\(C`/foo(?!bar)/\(C'\fR
	731	matches any occurrence of \(L"foo\(R" that isn't followed by \(L"bar\(R". Note
	732	however that look-ahead and look-behind are \s-1NOT\s0 the same thing. You cannot
	733	use this for look\-behind.
	734	.Sp
	735	If you are looking for a \(L"bar\(R" that isn't preceded by a \(L"foo\(R", \f(CW\(C`/(?!foo)bar/\(C'\fR
	736	will not do what you want. That's because the \f(CW\(C`(?!foo)\(C'\fR is just saying that
	737	the next thing cannot be \(L"foo\(R"\-\-and it's not, it's a \(L"bar\(R", so \(L"foobar\(R" will
	738	match. You would have to do something like \f(CW\(C`/(?!foo)...bar/\(C'\fR for that. We
	739	say \(L"like\(R" because there's the case of your \(L"bar\(R" not having three characters
	740	before it. You could cover that this way: \f(CW\*(C`/(?:(?!foo)...|^.{0,2})bar/\*(C'\fR.
	741	Sometimes it's still easier just to say:
	742	.Sp
	743	.Vb 1
	744	\& if (/bar/ && $` !~ /foo$/)
	745	.Ve
	746	.Sp
	747	For look-behind see below.
	748	.ie n .IP """(?<=pattern)""" 10
	749	.el .IP "\f(CW(?<=pattern)\fR" 10
	750	.IX Xref "(?<=) look-behind, positive lookbehind, positive"
	751	.IX Item "(?<=pattern)"
	752	A zero-width positive look-behind assertion. For example, \f(CW\(C`/(?<=\et)\ew+/\(C'\fR
	753	matches a word that follows a tab, without including the tab in \f(CW$&\fR.
	754	Works only for fixed-width look\-behind.
	755	.ie n .IP """(?<!pattern)""" 10
	756	.el .IP "\f(CW(?<!pattern)\fR" 10
	757	.IX Xref "(?<!) look-behind, negative lookbehind, negative"
	758	.IX Item "(?<!pattern)"
	759	A zero-width negative look-behind assertion. For example \f(CW\(C`/(?<!bar)foo/\(C'\fR
	760	matches any occurrence of \(L"foo\(R" that does not follow \(L"bar\(R". Works
	761	only for fixed-width look\-behind.
	762	.ie n .IP """(?{ code })""" 10
	763	.el .IP "\f(CW(?{ code })\fR" 10
	764	.IX Xref "(?{}) regex, code in regexp, code in regular expression, code in"
	765	.IX Item "(?{ code })"
	766	\&\fB\s-1WARNING\s0\fR: This extended regular expression feature is considered
	767	highly experimental, and may be changed or deleted without notice.
	768	.Sp
	769	This zero-width assertion evaluates any embedded Perl code. It
	770	always succeeds, and its \f(CW\(C`code\(C'\fR is not interpolated. Currently,
	771	the rules to determine where the \f(CW\(C`code\(C'\fR ends are somewhat convoluted.
	772	.Sp
	773	This feature can be used together with the special variable \f(CW$^N\fR to
	774	capture the results of submatches in variables without having to keep
	775	track of the number of nested parentheses. For example:
	776	.Sp
	777	.Vb 3
	778	\& $_ = "The brown fox jumps over the lazy dog";
	779	\& /the (\eS+)(?{ $color = $^N }) (\eS+)(?{ $animal = $^N })/i;
	780	\& print "color = $color, animal = $animal\en";
	781	.Ve
	782	.Sp
	783	Inside the \f(CW\(C`(?{...})\(C'\fR block, \f(CW$_\fR refers to the string the regular
	784	expression is matching against. You can also use \f(CW\(C`pos()\(C'\fR to know what is
	785	the current position of matching within this string.
	786	.Sp
	787	The \f(CW\(C`code\(C'\fR is properly scoped in the following sense: If the assertion
	788	is backtracked (compare \(L"Backtracking\(R"), all changes introduced after
	789	\&\f(CW\(C`local\(C'\fRization are undone, so that
	790	.Sp
	791	.Vb 13
	792	\& $_ = 'a' x 8;
	793	\& m<
	794	\& (?{ $cnt = 0 }) # Initialize $cnt.
	795	\& (
	796	\& a
	797	\& (?{
	798	\& local $cnt = $cnt + 1; # Update $cnt, backtracking-safe.
	799	\& })
	800	\& )*
	801	\& aaaa
	802	\& (?{ $res = $cnt }) # On success copy to non-localized
	803	\& # location.
	804	\& >x;
	805	.Ve
	806	.Sp
	807	will set \f(CW\(C`$res = 4\(C'\fR. Note that after the match, \f(CW$cnt\fR returns to the globally
	808	introduced value, because the scopes that restrict \f(CW\(C`local\(C'\fR operators
	809	are unwound.
	810	.Sp
	811	This assertion may be used as a \f(CW\*(C`(?(condition)yes\-pattern|no\-pattern)\*(C'\fR
	812	switch. If \fInot\fR used in this way, the result of evaluation of
	813	\&\f(CW\(C`code\(C'\fR is put into the special variable \f(CW$^R\fR. This happens
	814	immediately, so \f(CW$^R\fR can be used from other \f(CW\(C`(?{ code })\(C'\fR assertions
	815	inside the same regular expression.
	816	.Sp
	817	The assignment to \f(CW$^R\fR above is properly localized, so the old
	818	value of \f(CW$^R\fR is restored if the assertion is backtracked; compare
	819	\&\(L"Backtracking\(R".
	820	.Sp
	821	For reasons of security, this construct is forbidden if the regular
	822	expression involves run-time interpolation of variables, unless the
	823	perilous \f(CW\(C`use re 'eval'\(C'\fR pragma has been used (see re), or the
	824	variables contain results of \f(CW\(C`qr//\(C'\fR operator (see
	825	\&\(L"qr/STRING/imosx\(R" in perlop).
	826	.Sp
	827	This restriction is because of the wide-spread and remarkably convenient
	828	custom of using run-time determined strings as patterns. For example:
	829	.Sp
	830	.Vb 3
	831	\& $re = <>;
	832	\& chomp $re;
	833	\& $string =~ /$re/;
	834	.Ve
	835	.Sp
	836	Before Perl knew how to execute interpolated code within a pattern,
	837	this operation was completely safe from a security point of view,
	838	although it could raise an exception from an illegal pattern. If
	839	you turn on the \f(CW\(C`use re 'eval'\(C'\fR, though, it is no longer secure,
	840	so you should only do so if you are also using taint checking.
	841	Better yet, use the carefully constrained evaluation within a Safe
	842	compartment. See perlsec for details about both these mechanisms.
	843	.ie n .IP """(??{ code })""" 10
	844	.el .IP "\f(CW(??{ code })\fR" 10
	845	.IX Xref "(??{}) regex, postponed regexp, postponed regular expression, postponed regex, recursive regexp, recursive regular expression, recursive"
	846	.IX Item "(??{ code })"
	847	\&\fB\s-1WARNING\s0\fR: This extended regular expression feature is considered
	848	highly experimental, and may be changed or deleted without notice.
	849	A simplified version of the syntax may be introduced for commonly
	850	used idioms.
	851	.Sp
	852	This is a \(L"postponed\(R" regular subexpression. The \f(CW\(C`code\(C'\fR is evaluated
	853	at run time, at the moment this subexpression may match. The result
	854	of evaluation is considered as a regular expression and matched as
	855	if it were inserted instead of this construct.
	856	.Sp
	857	The \f(CW\(C`code\(C'\fR is not interpolated. As before, the rules to determine
	858	where the \f(CW\(C`code\(C'\fR ends are currently somewhat convoluted.
	859	.Sp
	860	The following pattern matches a parenthesized group:
	861	.Sp
	862	.Vb 9
	863	\& $re = qr{
	864	\& \e(
	865	\& (?:
	866	\& (?> [^()]+ ) # Non-parens without backtracking
	867	\& |
	868	\& (??{ $re }) # Group with matching parens
	869	\& )*
	870	\& \e)
	871	\& }x;
	872	.Ve
	873	.ie n .IP """(?>pattern)""" 10
	874	.el .IP "\f(CW(?>pattern)\fR" 10
	875	.IX Xref "backtrack backtracking"
	876	.IX Item "(?>pattern)"
	877	\&\fB\s-1WARNING\s0\fR: This extended regular expression feature is considered
	878	highly experimental, and may be changed or deleted without notice.
	879	.Sp
	880	An \(L"independent\(R" subexpression, one which matches the substring
	881	that a \fIstandalone\fR \f(CW\(C`pattern\(C'\fR would match if anchored at the given
	882	position, and it matches \fInothing other than this substring\fR. This
	883	construct is useful for optimizations of what would otherwise be
	884	\&\(L"eternal\(R" matches, because it will not backtrack (see \(L"Backtracking\(R").
	885	It may also be useful in places where the \*(L"grab all you can, and do not
	886	give anything back\*(R" semantic is desirable.
	887	.Sp
	888	For example: \f(CW\*(C`^(?>a*)ab\*(C'\fR will never match, since \f(CW\*(C`(?>a*)\*(C'\fR
	889	(anchored at the beginning of string, as above) will match \fIall\fR
	890	characters \f(CW\*(C`a\*(C'\fR at the beginning of string, leaving no \f(CW\*(C`a\*(C'\fR for
	891	\&\f(CW\*(C`ab\*(C'\fR to match. In contrast, \f(CW\*(C`a*ab\*(C'\fR will match the same as \f(CW\*(C`a+b\*(C'\fR,
	892	since the match of the subgroup \f(CW\*(C`a*\*(C'\fR is influenced by the following
	893	group \f(CW\*(C`ab\*(C'\fR (see \*(L"Backtracking\*(R"). In particular, \f(CW\*(C`a*\*(C'\fR inside
	894	\&\f(CW\*(C`a*ab\*(C'\fR will match fewer characters than a standalone \f(CW\*(C`a*\*(C'\fR, since
	895	this makes the tail match.
	896	.Sp
	897	An effect similar to \f(CW\(C`(?>pattern)\(C'\fR may be achieved by writing
	898	\&\f(CW\(C`(?=(pattern))\e1\(C'\fR. This matches the same substring as a standalone
	899	\&\f(CW\(C`a+\(C'\fR, and the following \f(CW\(C`\e1\(C'\fR eats the matched string; it therefore
	900	makes a zero-length assertion into an analogue of \f(CW\(C`(?>...)\(C'\fR.
	901	(The difference between these two constructs is that the second one
	902	uses a capturing group, thus shifting ordinals of backreferences
	903	in the rest of a regular expression.)
	904	.Sp
	905	Consider this pattern:
	906	.Sp
	907	.Vb 8
	908	\& m{ \e(
	909	\& (
	910	\& [^()]+ # x+
	911	\& |
	912	\& \e( [^()]* \e)
	913	\& )+
	914	\& \e)
	915	\& }x
	916	.Ve
	917	.Sp
	918	That will efficiently match a nonempty group with matching parentheses
	919	two levels deep or less. However, if there is no such group, it
	920	will take virtually forever on a long string. That's because there
	921	are so many different ways to split a long string into several
	922	substrings. This is what \f(CW\(C`(.+)+\(C'\fR is doing, and \f(CW\(C`(.+)+\(C'\fR is similar
	923	to a subpattern of the above pattern. Consider how the pattern
	924	above detects no-match on \f(CW\(C`((()aaaaaaaaaaaaaaaaaa\(C'\fR in several
	925	seconds, but that each extra letter doubles this time. This
	926	exponential performance will make it appear that your program has
	927	hung. However, a tiny change to this pattern
	928	.Sp
	929	.Vb 8
	930	\& m{ \e(
	931	\& (
	932	\& (?> [^()]+ ) # change x+ above to (?> x+ )
	933	\& |
	934	\& \e( [^()]* \e)
	935	\& )+
	936	\& \e)
	937	\& }x
	938	.Ve
	939	.Sp
	940	which uses \f(CW\(C`(?>...)\(C'\fR matches exactly when the one above does (verifying
	941	this yourself would be a productive exercise), but finishes in a fourth
	942	the time when used on a similar string with 1000000 \f(CW\(C`a\(C'\fRs. Be aware,
	943	however, that this pattern currently triggers a warning message under
	944	the \f(CW\(C`use warnings\(C'\fR pragma or \fB\-w\fR switch saying it
	945	\&\f(CW"matches null string many times in regex"\fR.
	946	.Sp
	947	On simple groups, such as the pattern \f(CW\(C`(?> [^()]+ )\(C'\fR, a comparable
	948	effect may be achieved by negative look\-ahead, as in \f(CW\(C`[^()]+ (?! [^()] )\(C'\fR.
	949	This was only 4 times slower on a string with 1000000 \f(CW\(C`a\(C'\fRs.
	950	.Sp
	951	The \*(L"grab all you can, and do not give anything back\*(R" semantic is desirable
	952	in many situations where at first sight a simple \f(CW\*(C`()*\*(C'\fR looks like
	953	the correct solution. Suppose we parse text with comments being delimited
	954	by \f(CW\*(C`#\*(C'\fR followed by some optional (horizontal) whitespace. Contrary to
	955	its appearance, \f(CW\*(C`#[ \et]*\*(C'\fR \fIis not\fR the correct subexpression to match
	956	the comment delimiter, because it may \*(L"give up\*(R" some whitespace if
	957	the remainder of the pattern can be made to match that way. The correct
	958	answer is either one of these:
	959	.Sp
	960	.Vb 2
	961	\& (?>#[ \et]*)
	962	\& #[ \et]*(?![ \et])
	963	.Ve
	964	.Sp
	965	For example, to grab non-empty comments into \f(CW$1\fR, one should use either
	966	one of these:
	967	.Sp
	968	.Vb 2
	969	\& / (?> \e# [ \et]* ) ( .+ ) /x;
	970	\& / \e# [ \et]* ( [^ \et] .* ) /x;
	971	.Ve
	972	.Sp
	973	Which one you pick depends on which of these expressions better reflects
	974	the above specification of comments.
	975	.ie n .IP """(?(condition)yes\-pattern|no\-pattern)""" 10
	976	.el .IP "\f(CW(?(condition)yes\-pattern|no\-pattern)\fR" 10
	977	.IX Xref "(?()"
	978	.IX Item "(?(condition)yes-pattern|no-pattern)"
	979	.PD 0
	980	.ie n .IP """(?(condition)yes\-pattern)""" 10
	981	.el .IP "\f(CW(?(condition)yes\-pattern)\fR" 10
	982	.IX Item "(?(condition)yes-pattern)"
	983	.PD
	984	\&\fB\s-1WARNING\s0\fR: This extended regular expression feature is considered
	985	highly experimental, and may be changed or deleted without notice.
	986	.Sp
	987	Conditional expression. \f(CW\(C`(condition)\(C'\fR should be either an integer in
	988	parentheses (which is valid if the corresponding pair of parentheses
	989	matched), or look\-ahead/look\-behind/evaluate zero-width assertion.
	990	.Sp
	991	For example:
	992	.Sp
	993	.Vb 4
	994	\& m{ ( \e( )?
	995	\& [^()]+
	996	\& (?(1) \e) )
	997	\& }x
	998	.Ve
	999	.Sp
	1000	matches a chunk of non\-parentheses, possibly included in parentheses
	1001	themselves.
	1002	.Sh "Backtracking"
	1003	.IX Xref "backtrack backtracking"
	1004	.IX Subsection "Backtracking"
	1005	\&\s-1NOTE:\s0 This section presents an abstract approximation of regular
	1006	expression behavior. For a more rigorous (and complicated) view of
	1007	the rules involved in selecting a match among possible alternatives,
	1008	see \(L"Combining pieces together\(R".
	1009	.PP
	1010	A fundamental feature of regular expression matching involves the
	1011	notion called \fIbacktracking\fR, which is currently used (when needed)
	1012	by all regular expression quantifiers, namely \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+\*(C'\fR,
	1013	\&\f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{n,m}\*(C'\fR, and \f(CW\*(C`{n,m}?\*(C'\fR. Backtracking is often optimized
	1014	internally, but the general principle outlined here is valid.
	1015	.PP
	1016	For a regular expression to match, the \fIentire\fR regular expression must
	1017	match, not just part of it. So if the beginning of a pattern containing a
	1018	quantifier succeeds in a way that causes later parts in the pattern to
	1019	fail, the matching engine backs up and recalculates the beginning
	1020	part\*(--that's why it's called backtracking.
	1021	.PP
	1022	Here is an example of backtracking: Let's say you want to find the
	1023	word following \(L"foo\(R" in the string \(L"Food is on the foo table.\(R":
	1024	.PP
	1025	.Vb 4
	1026	\& $_ = "Food is on the foo table.";
	1027	\& if ( /\eb(foo)\es+(\ew+)/i ) {
	1028	\& print "$2 follows $1.\en";
	1029	\& }
	1030	.Ve
	1031	.PP
	1032	When the match runs, the first part of the regular expression (\f(CW\(C`\eb(foo)\(C'\fR)
	1033	finds a possible match right at the beginning of the string, and loads up
	1034	\&\f(CW$1\fR with \(L"Foo\(R". However, as soon as the matching engine sees that there's
	1035	no whitespace following the \(L"Foo\(R" that it had saved in \f(CW$1\fR, it realizes its
	1036	mistake and starts over again one character after where it had the
	1037	tentative match. This time it goes all the way until the next occurrence
	1038	of \(L"foo\(R". The complete regular expression matches this time, and you get
	1039	the expected output of \(L"table follows foo.\(R"
	1040	.PP
	1041	Sometimes minimal matching can help a lot. Imagine you'd like to match
	1042	everything between \(L"foo\(R" and \(L"bar\(R". Initially, you write something
	1043	like this:
	1044	.PP
	1045	.Vb 4
	1046	\& $_ = "The food is under the bar in the barn.";
	1047	\& if ( /foo(.*)bar/ ) {
	1048	\& print "got <$1>\en";
	1049	\& }
	1050	.Ve
	1051	.PP
	1052	Which perhaps unexpectedly yields:
	1053	.PP
	1054	.Vb 1
	1055	\& got <d is under the bar in the >
	1056	.Ve
	1057	.PP
	1058	That's because \f(CW\*(C`.*\*(C'\fR was greedy, so you get everything between the
	1059	\&\fIfirst\fR \(L"foo\(R" and the \fIlast\fR \(L"bar\(R". Here it's more effective
	1060	to use minimal matching to make sure you get the text between a \(L"foo\(R"
	1061	and the first \(L"bar\(R" thereafter.
	1062	.PP
	1063	.Vb 2
	1064	\& if ( /foo(.*?)bar/ ) { print "got <$1>\en" }
	1065	\& got <d is under the >
	1066	.Ve
	1067	.PP
	1068	Here's another example: let's say you'd like to match a number at the end
	1069	of a string, and you also want to keep the preceding part of the match.
	1070	So you write this:
	1071	.PP
	1072	.Vb 4
	1073	\& $_ = "I have 2 numbers: 53147";
	1074	\& if ( /(.*)(\ed*)/ ) { # Wrong!
	1075	\& print "Beginning is <$1>, number is <$2>.\en";
	1076	\& }
	1077	.Ve
	1078	.PP
	1079	That won't work at all, because \f(CW\*(C`.*\*(C'\fR was greedy and gobbled up the
	1080	whole string. As \f(CW\*(C`\ed*\*(C'\fR can match on an empty string the complete
	1081	regular expression matched successfully.
	1082	.PP
	1083	.Vb 1
	1084	\& Beginning is <I have 2 numbers: 53147>, number is <>.
	1085	.Ve
	1086	.PP
	1087	Here are some variants, most of which don't work:
	1088	.PP
	1089	.Vb 11
	1090	\& $_ = "I have 2 numbers: 53147";
	1091	\& @pats = qw{
	1092	\& (.*)(\ed*)
	1093	\& (.*)(\ed+)
	1094	\& (.*?)(\ed*)
	1095	\& (.*?)(\ed+)
	1096	\& (.*)(\ed+)$
	1097	\& (.*?)(\ed+)$
	1098	\& (.*)\eb(\ed+)$
	1099	\& (.*\eD)(\ed+)$
	1100	\& };
	1101	.Ve
	1102	.PP
	1103	.Vb 8
	1104	\& for $pat (@pats) {
	1105	\& printf "%-12s ", $pat;
	1106	\& if ( /$pat/ ) {
	1107	\& print "<$1> <$2>\en";
	1108	\& } else {
	1109	\& print "FAIL\en";
	1110	\& }
	1111	\& }
	1112	.Ve
	1113	.PP
	1114	That will print out:
	1115	.PP
	1116	.Vb 8
	1117	\& (.*)(\ed*) <I have 2 numbers: 53147> <>
	1118	\& (.*)(\ed+) <I have 2 numbers: 5314> <7>
	1119	\& (.*?)(\ed*) <> <>
	1120	\& (.*?)(\ed+) <I have > <2>
	1121	\& (.*)(\ed+)$ <I have 2 numbers: 5314> <7>
	1122	\& (.*?)(\ed+)$ <I have 2 numbers: > <53147>
	1123	\& (.*)\eb(\ed+)$ <I have 2 numbers: > <53147>
	1124	\& (.*\eD)(\ed+)$ <I have 2 numbers: > <53147>
	1125	.Ve
	1126	.PP
	1127	As you see, this can be a bit tricky. It's important to realize that a
	1128	regular expression is merely a set of assertions that gives a definition
	1129	of success. There may be 0, 1, or several different ways that the
	1130	definition might succeed against a particular string. And if there are
	1131	multiple ways it might succeed, you need to understand backtracking to
	1132	know which variety of success you will achieve.
	1133	.PP
	1134	When using look-ahead assertions and negations, this can all get even
	1135	trickier. Imagine you'd like to find a sequence of non-digits not
	1136	followed by \(L"123\(R". You might try to write that as
	1137	.PP
	1138	.Vb 4
	1139	\& $_ = "ABC123";
	1140	\& if ( /^\eD*(?!123)/ ) { # Wrong!
	1141	\& print "Yup, no 123 in $_\en";
	1142	\& }
	1143	.Ve
	1144	.PP
	1145	But that isn't going to match; at least, not the way you're hoping. It
	1146	claims that there is no 123 in the string. Here's a clearer picture of
	1147	why that pattern matches, contrary to popular expectations:
	1148	.PP
	1149	.Vb 2
	1150	\& $x = 'ABC123';
	1151	\& $y = 'ABC445';
	1152	.Ve
	1153	.PP
	1154	.Vb 2
	1155	\& print "1: got $1\en" if $x =~ /^(ABC)(?!123)/;
	1156	\& print "2: got $1\en" if $y =~ /^(ABC)(?!123)/;
	1157	.Ve
	1158	.PP
	1159	.Vb 2
	1160	\& print "3: got $1\en" if $x =~ /^(\eD*)(?!123)/;
	1161	\& print "4: got $1\en" if $y =~ /^(\eD*)(?!123)/;
	1162	.Ve
	1163	.PP
	1164	This prints
	1165	.PP
	1166	.Vb 3
	1167	\& 2: got ABC
	1168	\& 3: got AB
	1169	\& 4: got ABC
	1170	.Ve
	1171	.PP
	1172	You might have expected test 3 to fail because it seems to be a more
	1173	general purpose version of test 1. The important difference between
	1174	them is that test 3 contains a quantifier (\f(CW\*(C`\eD*\*(C'\fR) and so can use
	1175	backtracking, whereas test 1 will not. What's happening is
	1176	that you've asked \*(L"Is it true that at the start of \f(CW$x\fR, following 0 or more
	1177	non\-digits, you have something that's not 123?\*(R" If the pattern matcher had
	1178	let \f(CW\*(C`\eD*\*(C'\fR expand to \*(L"\s-1ABC\s0\*(R", this would have caused the whole pattern to
	1179	fail.
	1180	.PP
	1181	The search engine will initially match \f(CW\*(C`\eD*\*(C'\fR with \*(L"\s-1ABC\s0\*(R". Then it will
	1182	try to match \f(CW\*(C`(?!123)\*(C'\fR with \*(L"123\*(R", which fails. But because
	1183	a quantifier (\f(CW\*(C`\eD*\*(C'\fR) has been used in the regular expression, the
	1184	search engine can backtrack and retry the match differently
	1185	in the hope of matching the complete regular expression.
	1186	.PP
	1187	The pattern really, \fIreally\fR wants to succeed, so it uses the
	1188	standard pattern back-off-and-retry and lets \f(CW\*(C`\eD*\*(C'\fR expand to just \*(L"\s-1AB\s0\*(R" this
	1189	time. Now there's indeed something following \*(L"\s-1AB\s0\*(R" that is not
	1190	\&\*(L"123\*(R". It's \*(L"C123\*(R", which suffices.
	1191	.PP
	1192	We can deal with this by using both an assertion and a negation.
	1193	We'll say that the first part in \f(CW$1\fR must be followed both by a digit
	1194	and by something that's not \(L"123\(R". Remember that the look-aheads
	1195	are zero-width expressions\*(--they only look, but don't consume any
	1196	of the string in their match. So rewriting this way produces what
	1197	you'd expect; that is, case 5 will fail, but case 6 succeeds:
	1198	.PP
	1199	.Vb 2
	1200	\& print "5: got $1\en" if $x =~ /^(\eD*)(?=\ed)(?!123)/;
	1201	\& print "6: got $1\en" if $y =~ /^(\eD*)(?=\ed)(?!123)/;
	1202	.Ve
	1203	.PP
	1204	.Vb 1
	1205	\& 6: got ABC
	1206	.Ve
	1207	.PP
	1208	In other words, the two zero-width assertions next to each other work as though
	1209	they're ANDed together, just as you'd use any built-in assertions: \f(CW\(C`/^$/\(C'\fR
	1210	matches only if you're at the beginning of the line \s-1AND\s0 the end of the
	1211	line simultaneously. The deeper underlying truth is that juxtaposition in
	1212	regular expressions always means \s-1AND\s0, except when you write an explicit \s-1OR\s0
	1213	using the vertical bar. \f(CW\(C`/ab/\(C'\fR means match \(L"a\(R" \s-1AND\s0 (then) match \(L"b\(R",
	1214	although the attempted matches are made at different positions because \(L"a\(R"
	1215	is not a zero-width assertion, but a one-width assertion.
	1216	.PP
	1217	\&\fB\s-1WARNING\s0\fR: particularly complicated regular expressions can take
	1218	exponential time to solve because of the immense number of possible
	1219	ways they can use backtracking to try for a match. For example, without
	1220	internal optimizations done by the regular expression engine, this will
	1221	take a painfully long time to run:
	1222	.PP
	1223	.Vb 1
	1224	\& 'aaaaaaaaaaaa' =~ /((a{0,5}){0,5})*[c]/
	1225	.Ve
	1226	.PP
	1227	And if you used \f(CW\*(C`*\*(C'\fR's in the internal groups instead of limiting them
	1228	to 0 through 5 matches, then it would take forever\*(--or until you ran
	1229	out of stack space. Moreover, these internal optimizations are not
	1230	always applicable. For example, if you put \f(CW\*(C`{0,5}\*(C'\fR instead of \f(CW\*(C`*\*(C'\fR
	1231	on the external group, no current optimization is applicable, and the
	1232	match takes a long time to finish.
	1233	.PP
	1234	A powerful tool for optimizing such beasts is what is known as an
	1235	\&\(L"independent group\(R",
	1236	which does not backtrack (see "\f(CW\(C`(?>pattern)\(C'\fR"). Note also that
	1237	zero-length look\-ahead/look\-behind assertions will not backtrack to make
	1238	the tail match, since they are in \(L"logical\(R" context: only
	1239	whether they match is considered relevant. For an example
	1240	where side-effects of look-ahead \fImight\fR have influenced the
	1241	following match, see "\f(CW\(C`(?>pattern)\(C'\fR".
	1242	.Sh "Version 8 Regular Expressions"
	1243	.IX Xref "regular expression, version 8 regex, version 8 regexp, version 8"
	1244	.IX Subsection "Version 8 Regular Expressions"
	1245	In case you're not familiar with the \(L"regular\(R" Version 8 regex
	1246	routines, here are the pattern-matching rules not described above.
	1247	.PP
	1248	Any single character matches itself, unless it is a \fImetacharacter\fR
	1249	with a special meaning described here or above. You can cause
	1250	characters that normally function as metacharacters to be interpreted
	1251	literally by prefixing them with a \(L"\e\(R" (e.g., \(L"\e.\(R" matches a \(L".\(R", not any
	1252	character; \(L"\e\e\(R" matches a \(L"\e\(R"). A series of characters matches that
	1253	series of characters in the target string, so the pattern \f(CW\(C`blurfl\(C'\fR
	1254	would match \(L"blurfl\(R" in the target string.
	1255	.PP
	1256	You can specify a character class, by enclosing a list of characters
	1257	in \f(CW\(C`[]\(C'\fR, which will match any one character from the list. If the
	1258	first character after the \(L"[\(R" is \(L"^\(R", the class matches any character not
	1259	in the list. Within a list, the \(L"\-\(R" character specifies a
	1260	range, so that \f(CW\(C`a\-z\(C'\fR represents all characters between \(L"a\(R" and \(L"z\(R",
	1261	inclusive. If you want either \(L"\-\(R" or \(L"]\(R" itself to be a member of a
	1262	class, put it at the start of the list (possibly after a \(L"^\(R"), or
	1263	escape it with a backslash. \(L"\-\(R" is also taken literally when it is
	1264	at the end of the list, just before the closing \(L"]\(R". (The
	1265	following all specify the same class of three characters: \f(CW\(C`[\-az]\(C'\fR,
	1266	\&\f(CW\(C`[az\-]\(C'\fR, and \f(CW\(C`[a\e\-z]\(C'\fR. All are different from \f(CW\(C`[a\-z]\(C'\fR, which
	1267	specifies a class containing twenty-six characters, even on \s-1EBCDIC\s0
	1268	based coded character sets.) Also, if you try to use the character
	1269	classes \f(CW\(C`\ew\(C'\fR, \f(CW\(C`\eW\(C'\fR, \f(CW\(C`\es\(C'\fR, \f(CW\(C`\eS\(C'\fR, \f(CW\(C`\ed\(C'\fR, or \f(CW\(C`\eD\(C'\fR as endpoints of
	1270	a range, that's not a range, the \(L"\-\(R" is understood literally.
	1271	.PP
	1272	Note also that the whole range idea is rather unportable between
	1273	character sets\*(--and even within character sets they may cause results
	1274	you probably didn't expect. A sound principle is to use only ranges
	1275	that begin from and end at either alphabets of equal case ([a\-e],
	1276	[A\-E]), or digits ([0\-9]). Anything else is unsafe. If in doubt,
	1277	spell out the character sets in full.
	1278	.PP
	1279	Characters may be specified using a metacharacter syntax much like that
	1280	used in C: \(L"\en\(R" matches a newline, \(L"\et\(R" a tab, \(L"\er\(R" a carriage return,
	1281	\&\(L"\ef\(R" a form feed, etc. More generally, \e\fInnn\fR, where \fInnn\fR is a string
	1282	of octal digits, matches the character whose coded character set value
	1283	is \fInnn\fR. Similarly, \ex\fInn\fR, where \fInn\fR are hexadecimal digits,
	1284	matches the character whose numeric value is \fInn\fR. The expression \ec\fIx\fR
	1285	matches the character control\-\fIx\fR. Finally, the \(L".\(R" metacharacter
	1286	matches any character except \(L"\en\(R" (unless you use \f(CW\(C`/s\(C'\fR).
	1287	.PP
	1288	You can specify a series of alternatives for a pattern using \*(L"|\*(R" to
	1289	separate them, so that \f(CW\*(C`fee|fie|foe\*(C'\fR will match any of \*(L"fee\*(R", \*(L"fie\*(R",
	1290	or \*(L"foe\*(R" in the target string (as would \f(CW\*(C`f(e|i|o)e\*(C'\fR). The
	1291	first alternative includes everything from the last pattern delimiter
	1292	(\*(L"(\*(R", \*(L"[\*(R", or the beginning of the pattern) up to the first \*(L"|\*(R", and
	1293	the last alternative contains everything from the last \*(L"|\*(R" to the next
	1294	pattern delimiter. That's why it's common practice to include
	1295	alternatives in parentheses: to minimize confusion about where they
	1296	start and end.
	1297	.PP
	1298	Alternatives are tried from left to right, so the first
	1299	alternative found for which the entire expression matches, is the one that
	1300	is chosen. This means that alternatives are not necessarily greedy. For
	1301	example: when matching \f(CW\*(C`foo|foot\*(C'\fR against \*(L"barefoot\*(R", only the \*(L"foo\*(R"
	1302	part will match, as that is the first alternative tried, and it successfully
	1303	matches the target string. (This might not seem important, but it is
	1304	important when you are capturing matched text using parentheses.)
	1305	.PP
	1306	Also remember that \*(L"|\*(R" is interpreted as a literal within square brackets,
	1307	so if you write \f(CW\*(C`[fee|fie|foe]\*(C'\fR you're really only matching \f(CW\*(C`[feio|]\*(C'\fR.
	1308	.PP
	1309	Within a pattern, you may designate subpatterns for later reference
	1310	by enclosing them in parentheses, and you may refer back to the
	1311	\&\fIn\fRth subpattern later in the pattern using the metacharacter
	1312	\&\e\fIn\fR. Subpatterns are numbered based on the left to right order
	1313	of their opening parenthesis. A backreference matches whatever
	1314	actually matched the subpattern in the string being examined, not
	1315	the rules for that subpattern. Therefore, \f(CW\*(C`(0|0x)\ed*\es\e1\ed*\*(C'\fR will
	1316	match \*(L"0x1234 0x4321\*(R", but not \*(L"0x1234 01234\*(R", because subpattern
	1317	1 matched \*(L"0x\*(R", even though the rule \f(CW\*(C`0|0x\*(C'\fR could potentially match
	1318	the leading 0 in the second number.
	1319	.ie n .Sh "Warning on \e1 vs $1"
	1320	.el .Sh "Warning on \e1 vs \f(CW$1\fP"
	1321	.IX Subsection "Warning on 1 vs $1"
	1322	Some people get too used to writing things like:
	1323	.PP
	1324	.Vb 1
	1325	\& $pattern =~ s/(\eW)/\e\e\e1/g;
	1326	.Ve
	1327	.PP
	1328	This is grandfathered for the \s-1RHS\s0 of a substitute to avoid shocking the
	1329	\&\fBsed\fR addicts, but it's a dirty habit to get into. That's because in
	1330	PerlThink, the righthand side of an \f(CW\(C`s///\(C'\fR is a double-quoted string. \f(CW\(C`\e1\(C'\fR in
	1331	the usual double-quoted string means a control\-A. The customary Unix
	1332	meaning of \f(CW\(C`\e1\(C'\fR is kludged in for \f(CW\(C`s///\(C'\fR. However, if you get into the habit
	1333	of doing that, you get yourself into trouble if you then add an \f(CW\(C`/e\(C'\fR
	1334	modifier.
	1335	.PP
	1336	.Vb 1
	1337	\& s/(\ed+)/ \e1 + 1 /eg; # causes warning under -w
	1338	.Ve
	1339	.PP
	1340	Or if you try to do
	1341	.PP
	1342	.Vb 1
	1343	\& s/(\ed+)/\e1000/;
	1344	.Ve
	1345	.PP
	1346	You can't disambiguate that by saying \f(CW\(C`\e{1}000\(C'\fR, whereas you can fix it with
	1347	\&\f(CW\(C`${1}000\(C'\fR. The operation of interpolation should not be confused
	1348	with the operation of matching a backreference. Certainly they mean two
	1349	different things on the \fIleft\fR side of the \f(CW\(C`s///\(C'\fR.
	1350	.Sh "Repeated patterns matching zero-length substring"
	1351	.IX Subsection "Repeated patterns matching zero-length substring"
	1352	\&\fB\s-1WARNING\s0\fR: Difficult material (and prose) ahead. This section needs a rewrite.
	1353	.PP
	1354	Regular expressions provide a terse and powerful programming language. As
	1355	with most other power tools, power comes together with the ability
	1356	to wreak havoc.
	1357	.PP
	1358	A common abuse of this power stems from the ability to make infinite
	1359	loops using regular expressions, with something as innocuous as:
	1360	.PP
	1361	.Vb 1
	1362	\& 'foo' =~ m{ ( o? )* }x;
	1363	.Ve
	1364	.PP
	1365	The \f(CW\(C`o?\(C'\fR can match at the beginning of \f(CW'foo'\fR, and since the position
	1366	in the string is not moved by the match, \f(CW\(C`o?\(C'\fR would match again and again
	1367	because of the \f(CW\*(C`*\*(C'\fR modifier. Another common way to create a similar cycle
	1368	is with the looping modifier \f(CW\(C`//g\(C'\fR:
	1369	.PP
	1370	.Vb 1
	1371	\& @matches = ( 'foo' =~ m{ o? }xg );
	1372	.Ve
	1373	.PP
	1374	or
	1375	.PP
	1376	.Vb 1
	1377	\& print "match: <$&>\en" while 'foo' =~ m{ o? }xg;
	1378	.Ve
	1379	.PP
	1380	or the loop implied by \fIsplit()\fR.
	1381	.PP
	1382	However, long experience has shown that many programming tasks may
	1383	be significantly simplified by using repeated subexpressions that
	1384	may match zero-length substrings. Here's a simple example:
	1385	.PP
	1386	.Vb 2
	1387	\& @chars = split //, $string; # // is not magic in split
	1388	\& ($whitewashed = $string) =~ s/()/ /g; # parens avoid magic s// /
	1389	.Ve
	1390	.PP
	1391	Thus Perl allows such constructs, by \fIforcefully breaking
	1392	the infinite loop\fR. The rules for this are different for lower-level
	1393	loops given by the greedy modifiers \f(CW\*(C`*+{}\*(C'\fR, and for higher-level
	1394	ones like the \f(CW\(C`/g\(C'\fR modifier or \fIsplit()\fR operator.
	1395	.PP
	1396	The lower-level loops are \fIinterrupted\fR (that is, the loop is
	1397	broken) when Perl detects that a repeated expression matched a
	1398	zero-length substring. Thus
	1399	.PP
	1400	.Vb 1
	1401	\& m{ (?: NON_ZERO_LENGTH | ZERO_LENGTH )* }x;
	1402	.Ve
	1403	.PP
	1404	is made equivalent to
	1405	.PP
	1406	.Vb 4
	1407	\& m{ (?: NON_ZERO_LENGTH )*
	1408	\& |
	1409	\& (?: ZERO_LENGTH )?
	1410	\& }x;
	1411	.Ve
	1412	.PP
	1413	The higher-level loops preserve an additional state between iterations:
	1414	whether the last match was zero\-length. To break the loop, the following
	1415	match after a zero-length match is prohibited from having a length of zero.
	1416	This prohibition interacts with backtracking (see \(L"Backtracking\(R"),
	1417	and so the \fIsecond best\fR match is chosen if the \fIbest\fR match is of
	1418	zero length.
	1419	.PP
	1420	For example:
	1421	.PP
	1422	.Vb 2
	1423	\& $_ = 'bar';
	1424	\& s/\ew??/<$&>/g;
	1425	.Ve
	1426	.PP
	1427	results in \f(CW\(C`<><b><><a><><r><>\(C'\fR. At each position of the string the best
	1428	match given by non-greedy \f(CW\(C`??\(C'\fR is the zero-length match, and the \fIsecond
	1429	best\fR match is what is matched by \f(CW\(C`\ew\(C'\fR. Thus zero-length matches
	1430	alternate with one-character-long matches.
	1431	.PP
	1432	Similarly, for repeated \f(CW\(C`m/()/g\(C'\fR the second-best match is the match at the
	1433	position one notch further in the string.
	1434	.PP
	1435	The additional state of being \fImatched with zero-length\fR is associated with
	1436	the matched string, and is reset by each assignment to \fIpos()\fR.
	1437	Zero-length matches at the end of the previous match are ignored
	1438	during \f(CW\(C`split\(C'\fR.
	1439	.Sh "Combining pieces together"
	1440	.IX Subsection "Combining pieces together"
	1441	Each of the elementary pieces of regular expressions which were described
	1442	before (such as \f(CW\(C`ab\(C'\fR or \f(CW\(C`\eZ\(C'\fR) could match at most one substring
	1443	at the given position of the input string. However, in a typical regular
	1444	expression these elementary pieces are combined into more complicated
	1445	patterns using combining operators \f(CW\*(C`ST\*(C'\fR, \f(CW\*(C`S|T\*(C'\fR, \f(CW\*(C`S*\*(C'\fR etc.
	1446	(in these examples \f(CW\*(C`S\*(C'\fR and \f(CW\*(C`T\*(C'\fR are regular subexpressions).
	1447	.PP
	1448	Such combinations can include alternatives, leading to a problem of choice:
	1449	if we match a regular expression \f(CW\*(C`a|ab\*(C'\fR against \f(CW"abc"\fR, will it match
	1450	substring \f(CW"a"\fR or \f(CW"ab"\fR? One way to describe which substring is
	1451	actually matched is the concept of backtracking (see \(L"Backtracking\(R").
	1452	However, this description is too low-level and makes you think
	1453	in terms of a particular implementation.
	1454	.PP
	1455	Another description starts with notions of \(L"better\(R"/\(L"worse\(R". All the
	1456	substrings which may be matched by the given regular expression can be
	1457	sorted from the \(L"best\(R" match to the \(L"worst\(R" match, and it is the \(L"best\(R"
	1458	match which is chosen. This replaces the question of \*(L"what is chosen?\*(R"
	1459	with the question of \*(L"which matches are better, and which are worse?\*(R".
	1460	.PP
	1461	Again, for elementary pieces there is no such question, since at most
	1462	one match at a given position is possible. This section describes the
	1463	notion of better/worse for combining operators. In the description
	1464	below \f(CW\(C`S\(C'\fR and \f(CW\(C`T\(C'\fR are regular subexpressions.
	1465	.ie n .IP """ST""" 4
	1466	.el .IP "\f(CWST\fR" 4
	1467	.IX Item "ST"
	1468	Consider two possible matches, \f(CW\*(C`AB\*(C'\fR and \f(CW\*(C`A'B'\*(C'\fR, where \f(CW\*(C`A\*(C'\fR and \f(CW\*(C`A'\*(C'\fR are
	1469	substrings which can be matched by \f(CW\*(C`S\*(C'\fR, and \f(CW\*(C`B\*(C'\fR and \f(CW\*(C`B'\*(C'\fR are substrings
	1470	which can be matched by \f(CW\*(C`T\*(C'\fR.
	1471	.Sp
	1472	If \f(CW\*(C`A\*(C'\fR is a better match for \f(CW\*(C`S\*(C'\fR than \f(CW\*(C`A'\*(C'\fR, then \f(CW\*(C`AB\*(C'\fR is a better
	1473	match than \f(CW\*(C`A'B'\*(C'\fR.
	1474	.Sp
	1475	If \f(CW\*(C`A\*(C'\fR and \f(CW\*(C`A'\*(C'\fR coincide: \f(CW\*(C`AB\*(C'\fR is a better match than \f(CW\*(C`AB'\*(C'\fR if
	1476	\&\f(CW\*(C`B\*(C'\fR is a better match for \f(CW\*(C`T\*(C'\fR than \f(CW\*(C`B'\*(C'\fR.
	1477	.ie n .IP """S|T""" 4
	1478	.el .IP "\f(CWS|T\fR" 4
	1479	.IX Item "S|T"
	1480	When \f(CW\(C`S\(C'\fR can match, it is a better match than when only \f(CW\(C`T\(C'\fR can match.
	1481	.Sp
	1482	The ordering of two matches for \f(CW\*(C`S\*(C'\fR is the same as it is for \f(CW\*(C`S\*(C'\fR alone. Similarly for
	1483	two matches for \f(CW\*(C`T\*(C'\fR.
	1484	.ie n .IP """S{REPEAT_COUNT}""" 4
	1485	.el .IP "\f(CWS{REPEAT_COUNT}\fR" 4
	1486	.IX Item "S{REPEAT_COUNT}"
	1487	Matches as \f(CW\(C`SSS...S\(C'\fR (repeated as many times as necessary).
	1488	.ie n .IP """S{min,max}""" 4
	1489	.el .IP "\f(CWS{min,max}\fR" 4
	1490	.IX Item "S{min,max}"
	1491	Matches as \f(CW\*(C`S{max}|S{max\-1}|...|S{min+1}|S{min}\*(C'\fR.
	1492	.ie n .IP """S{min,max}?""" 4
	1493	.el .IP "\f(CWS{min,max}?\fR" 4
	1494	.IX Item "S{min,max}?"
	1495	Matches as \f(CW\*(C`S{min}|S{min+1}|...|S{max\-1}|S{max}\*(C'\fR.
	1496	.ie n .IP """S?""\fR, \f(CW""S*""\fR, \f(CW""S+""" 4
	1497	.el .IP "\f(CWS?\fR, \f(CWS*\fR, \f(CWS+\fR" 4
	1498	.IX Item "S?, S*, S+"
	1499	Same as \f(CW\(C`S{0,1}\(C'\fR, \f(CW\(C`S{0,BIG_NUMBER}\(C'\fR, \f(CW\(C`S{1,BIG_NUMBER}\(C'\fR respectively.
	1500	.ie n .IP """S??""\fR, \f(CW""S*?""\fR, \f(CW""S+?""" 4
	1501	.el .IP "\f(CWS??\fR, \f(CWS*?\fR, \f(CWS+?\fR" 4
	1502	.IX Item "S??, S*?, S+?"
	1503	Same as \f(CW\(C`S{0,1}?\(C'\fR, \f(CW\(C`S{0,BIG_NUMBER}?\(C'\fR, \f(CW\(C`S{1,BIG_NUMBER}?\(C'\fR respectively.
	1504	.ie n .IP """(?>S)""" 4
	1505	.el .IP "\f(CW(?>S)\fR" 4
	1506	.IX Item "(?>S)"
	1507	Matches the best match for \f(CW\(C`S\(C'\fR and only that.
	1508	.ie n .IP """(?=S)""\fR, \f(CW""(?<=S)""" 4
	1509	.el .IP "\f(CW(?=S)\fR, \f(CW(?<=S)\fR" 4
	1510	.IX Item "(?=S), (?<=S)"
	1511	Only the best match for \f(CW\(C`S\(C'\fR is considered. (This is important only if
	1512	\&\f(CW\(C`S\(C'\fR has capturing parentheses, and backreferences are used somewhere
	1513	else in the whole regular expression.)
	1514	.ie n .IP """(?!S)""\fR, \f(CW""(?<!S)""" 4
	1515	.el .IP "\f(CW(?!S)\fR, \f(CW(?<!S)\fR" 4
	1516	.IX Item "(?!S), (?<!S)"
	1517	For this grouping operator there is no need to describe the ordering, since
	1518	only whether or not \f(CW\(C`S\(C'\fR can match is important.
	1519	.ie n .IP """(??{ EXPR })""" 4
	1520	.el .IP "\f(CW(??{ EXPR })\fR" 4
	1521	.IX Item "(??{ EXPR })"
	1522	The ordering is the same as for the regular expression which is
	1523	the result of \s-1EXPR\s0.
	1524	.ie n .IP """(?(condition)yes\-pattern|no\-pattern)""" 4
	1525	.el .IP "\f(CW(?(condition)yes\-pattern|no\-pattern)\fR" 4
	1526	.IX Item "(?(condition)yes-pattern|no-pattern)"
	1527	Recall that which of \f(CW\(C`yes\-pattern\(C'\fR or \f(CW\(C`no\-pattern\(C'\fR actually matches is
	1528	already determined. The ordering of the matches is the same as for the
	1529	chosen subexpression.
	1530	.PP
	1531	The above recipes describe the ordering of matches \fIat a given position\fR.
	1532	One more rule is needed to understand how a match is determined for the
	1533	whole regular expression: a match at an earlier position is always better
	1534	than a match at a later position.
	1535	.Sh "Creating custom \s-1RE\s0 engines"
	1536	.IX Subsection "Creating custom RE engines"
	1537	Overloaded constants (see overload) provide a simple way to extend
	1538	the functionality of the \s-1RE\s0 engine.
	1539	.PP
	1540	Suppose that we want to enable a new \s-1RE\s0 escape-sequence \f(CW\*(C`\eY|\*(C'\fR which
	1541	matches at a boundary between whitespace characters and non-whitespace
	1542	characters. Note that \f(CW\*(C`(?=\eS)(?<!\eS)|(?!\eS)(?<=\eS)\*(C'\fR matches exactly
	1543	at these positions, so we want to have each \f(CW\*(C`\eY|\*(C'\fR in the place of the
	1544	more complicated version. We can create a module \f(CW\*(C`customre\*(C'\fR to do
	1545	this:
	1546	.PP
	1547	.Vb 2
	1548	\& package customre;
	1549	\& use overload;
	1550	.Ve
	1551	.PP
	1552	.Vb 5
	1553	\& sub import {
	1554	\& shift;
	1555	\& die "No argument to customre::import allowed" if @_;
	1556	\& overload::constant 'qr' => \e&convert;
	1557	\& }
	1558	.Ve
	1559	.PP
	1560	.Vb 1
	1561	\& sub invalid { die "/$_[0]/: invalid escape '\e\e$_[1]'"}
	1562	.Ve
	1563	.PP
	1564	.Vb 12
	1565	\& # We must also take care of not escaping the legitimate \e\eY|
	1566	\& # sequence, hence the presence of '\e\e' in the conversion rules.
	1567	\& my %rules = ( '\e\e' => '\e\e\e\e',
	1568	\& 'Y|' => qr/(?=\eS)(?<!\eS)|(?!\eS)(?<=\eS)/ );
	1569	\& sub convert {
	1570	\& my $re = shift;
	1571	\& $re =~ s{
	1572	\& \e\e ( \e\e | Y . )
	1573	\& }
	1574	\& { $rules{$1} or invalid($re,$1) }sgex;
	1575	\& return $re;
	1576	\& }
	1577	.Ve
	1578	.PP
	1579	Now \f(CW\(C`use customre\(C'\fR enables the new escape in constant regular
	1580	expressions, i.e., those without any runtime variable interpolations.
	1581	As documented in overload, this conversion will work only over
	1582	literal parts of regular expressions. For \f(CW\*(C`\eY|$re\eY|\*(C'\fR the variable
	1583	part of this regular expression needs to be converted explicitly
	1584	(but only if the special meaning of \f(CW\*(C`\eY|\*(C'\fR should be enabled inside \f(CW$re\fR):
	1585	.PP
	1586	.Vb 5
	1587	\& use customre;
	1588	\& $re = <>;
	1589	\& chomp $re;
	1590	\& $re = customre::convert $re;
	1591	\& /\eY|$re\eY|/;
	1592	.Ve
	1593	.SH "BUGS"
	1594	.IX Header "BUGS"
	1595	This document varies from difficult to understand to completely
	1596	and utterly opaque. The wandering prose riddled with jargon is
	1597	hard to fathom in several places.
	1598	.PP
	1599	This document needs a rewrite that separates the tutorial content
	1600	from the reference content.
	1601	.SH "SEE ALSO"
	1602	.IX Header "SEE ALSO"
	1603	perlrequick.
	1604	.PP
	1605	perlretut.
	1606	.PP
	1607	\&\(L"Regexp Quote-Like Operators\(R" in perlop.
	1608	.PP
	1609	\&\(L"Gory details of parsing quoted constructs\(R" in perlop.
	1610	.PP
	1611	perlfaq6.
	1612	.PP
	1613	\&\(L"pos\(R" in perlfunc.
	1614	.PP
	1615	perllocale.
	1616	.PP
	1617	perlebcdic.
	1618	.PP
	1619	\&\fIMastering Regular Expressions\fR by Jeffrey Friedl, published
	1620	by O'Reilly and Associates.