git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental

... / ...

Commit	Line	Data
	1	.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
	2	.\"
	3	.\" Standard preamble:
	4	.\" ========================================================================
	5	.de Sh \" Subsection heading
	6	.br
	7	.if t .Sp
	8	.ne 5
	9	.PP
	10	\fB\\$1\fR
	11	.PP
	12	..
	13	.de Sp \" Vertical space (when we can't use .PP)
	14	.if t .sp .5v
	15	.if n .sp
	16	..
	17	.de Vb \" Begin verbatim text
	18	.ft CW
	19	.nf
	20	.ne \\$1
	21	..
	22	.de Ve \" End verbatim text
	23	.ft R
	24	.fi
	25	..
	26	.\" Set up some character translations and predefined strings. \*(-- will
	27	.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
	28	.\" double quote, and \*(R" will give a right double quote. | will give a
	29	.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
	30	.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
	31	.\" expand to `' in nroff, nothing in troff, for use with C<>.
	32	.tr \(*W-|\(bv\*(Tr
	33	.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
	34	.ie n \{\
	35	. ds -- \(*W-
	36	. ds PI pi
	37	. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
	38	. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
	39	. ds L" ""
	40	. ds R" ""
	41	. ds C` ""
	42	. ds C' ""
	43	'br\}
	44	.el\{\
	45	. ds -- \\|\(em\\|
	46	. ds PI \(*p
	47	. ds L" ``
	48	. ds R" ''
	49	'br\}
	50	.\"
	51	.\" If the F register is turned on, we'll generate index entries on stderr for
	52	.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
	53	.\" entries marked with X<> in POD. Of course, you'll have to process the
	54	.\" output yourself in some meaningful fashion.
	55	.if \nF \{\
	56	. de IX
	57	. tm Index:\\$1\t\\n%\t"\\$2"
	58	..
	59	. nr % 0
	60	. rr F
	61	.\}
	62	.\"
	63	.\" For nroff, turn off justification. Always turn off hyphenation; it makes
	64	.\" way too many mistakes in technical documents.
	65	.hy 0
	66	.if n .na
	67	.\"
	68	.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
	69	.\" Fear. Run. Save yourself. No user-serviceable parts.
	70	. \" fudge factors for nroff and troff
	71	.if n \{\
	72	. ds #H 0
	73	. ds #V .8m
	74	. ds #F .3m
	75	. ds #[ \f1
	76	. ds #] \fP
	77	.\}
	78	.if t \{\
	79	. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
	80	. ds #V .6m
	81	. ds #F 0
	82	. ds #[ \&
	83	. ds #] \&
	84	.\}
	85	. \" simple accents for nroff and troff
	86	.if n \{\
	87	. ds ' \&
	88	. ds ` \&
	89	. ds ^ \&
	90	. ds , \&
	91	. ds ~ ~
	92	. ds /
	93	.\}
	94	.if t \{\
	95	. ds ' \\k:\h'-(\\n(.wu8/10-\(#H)'\'\h"\|\\n:u"
	96	. ds ` \\k:\h'-(\\n(.wu8/10-\(#H)'\`\h'\|\\n:u'
	97	. ds ^ \\k:\h'-(\\n(.wu10/11-\(#H)'^\h'\|\\n:u'
	98	. ds , \\k:\h'-(\\n(.wu*8/10)',\h'\|\\n:u'
	99	. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'\|\\n:u'
	100	. ds / \\k:\h'-(\\n(.wu8/10-\(#H)'\z\(sl\h'\|\\n:u'
	101	.\}
	102	. \" troff and (daisy-wheel) nroff accents
	103	.ds : \\k:\h'-(\\n(.wu8/10-\(#H+.1m+\(#F)'\v'-\(#V'\z.\h'.2m+\(#F'.\h'\|\\n:u'\v'\(#V'
	104	.ds 8 \h'\(#H'\(b\h'-\*(#H'
	105	.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\(#H)/2u'\v'-.3n'\(#[\z\(de\v'.3n'\h'\|\\n:u'\*(#]
	106	.ds d- \h'\(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\(#H'
	107	.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'\|\\n:u'
	108	.ds th \(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u2/3)'\s-1o\s+1\*(#]
	109	.ds Th \(#[\s+2I\s-2\h'-\w'I'u3/5'\v'-.3m'o\v'.3m'\*(#]
	110	.ds ae a\h'-(\w'a'u*4/10)'e
	111	.ds Ae A\h'-(\w'A'u*4/10)'E
	112	. \" corrections for vroff
	113	.if v .ds ~ \\k:\h'-(\\n(.wu9/10-\(#H)'\s-2\u~\d\s+2\h'\|\\n:u'
	114	.if v .ds ^ \\k:\h'-(\\n(.wu10/11-\(#H)'\v'-.4m'^\v'.4m'\h'\|\\n:u'
	115	. \" for low resolution devices (crt and lpr)
	116	.if \n(.H>23 .if \n(.V>19 \
	117	\{\
	118	. ds : e
	119	. ds 8 ss
	120	. ds o a
	121	. ds d- d\h'-1'\(ga
	122	. ds D- D\h'-1'\(hy
	123	. ds th \o'bp'
	124	. ds Th \o'LP'
	125	. ds ae ae
	126	. ds Ae AE
	127	.\}
	128	.rm #[ #] #H #V #F C
	129	.\" ========================================================================
	130	.\"
	131	.IX Title "PERLRETUT 1"
	132	.TH PERLRETUT 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide"
	133	.SH "NAME"
	134	perlretut \- Perl regular expressions tutorial
	135	.SH "DESCRIPTION"
	136	.IX Header "DESCRIPTION"
	137	This page provides a basic tutorial on understanding, creating and
	138	using regular expressions in Perl. It serves as a complement to the
	139	reference page on regular expressions perlre. Regular expressions
	140	are an integral part of the \f(CW\(C`m//\(C'\fR, \f(CW\(C`s///\(C'\fR, \f(CW\(C`qr//\(C'\fR and \f(CW\(C`split\(C'\fR
	141	operators and so this tutorial also overlaps with
	142	\&\(L"Regexp Quote-Like Operators\(R" in perlop and \(L"split\(R" in perlfunc.
	143	.PP
	144	Perl is widely renowned for excellence in text processing, and regular
	145	expressions are one of the big factors behind this fame. Perl regular
	146	expressions display an efficiency and flexibility unknown in most
	147	other computer languages. Mastering even the basics of regular
	148	expressions will allow you to manipulate text with surprising ease.
	149	.PP
	150	What is a regular expression? A regular expression is simply a string
	151	that describes a pattern. Patterns are in common use these days;
	152	examples are the patterns typed into a search engine to find web pages
	153	and the patterns used to list files in a directory, e.g., \f(CW\*(C`ls *.txt\*(C'\fR
	154	or \f(CW\*(C`dir *.*\*(C'\fR. In Perl, the patterns described by regular expressions
	155	are used to search strings, extract desired parts of strings, and to
	156	do search and replace operations.
	157	.PP
	158	Regular expressions have the undeserved reputation of being abstract
	159	and difficult to understand. Regular expressions are constructed using
	160	simple concepts like conditionals and loops and are no more difficult
	161	to understand than the corresponding \f(CW\(C`if\(C'\fR conditionals and \f(CW\(C`while\(C'\fR
	162	loops in the Perl language itself. In fact, the main challenge in
	163	learning regular expressions is just getting used to the terse
	164	notation used to express these concepts.
	165	.PP
	166	This tutorial flattens the learning curve by discussing regular
	167	expression concepts, along with their notation, one at a time and with
	168	many examples. The first part of the tutorial will progress from the
	169	simplest word searches to the basic regular expression concepts. If
	170	you master the first part, you will have all the tools needed to solve
	171	about 98% of your needs. The second part of the tutorial is for those
	172	comfortable with the basics and hungry for more power tools. It
	173	discusses the more advanced regular expression operators and
	174	introduces the latest cutting edge innovations in 5.6.0.
	175	.PP
	176	A note: to save time, 'regular expression' is often abbreviated as
	177	regexp or regex. Regexp is a more natural abbreviation than regex, but
	178	is harder to pronounce. The Perl pod documentation is evenly split on
	179	regexp vs regex; in Perl, there is more than one way to abbreviate it.
	180	We'll use regexp in this tutorial.
	181	.SH "Part 1: The basics"
	182	.IX Header "Part 1: The basics"
	183	.Sh "Simple word matching"
	184	.IX Subsection "Simple word matching"
	185	The simplest regexp is simply a word, or more generally, a string of
	186	characters. A regexp consisting of a word matches any string that
	187	contains that word:
	188	.PP
	189	.Vb 1
	190	\& "Hello World" =~ /World/; # matches
	191	.Ve
	192	.PP
	193	What is this perl statement all about? \f(CW"Hello World"\fR is a simple
	194	double quoted string. \f(CW\(C`World\(C'\fR is the regular expression and the
	195	\&\f(CW\(C`//\(C'\fR enclosing \f(CW\(C`/World/\(C'\fR tells perl to search a string for a match.
	196	The operator \f(CW\(C`=~\(C'\fR associates the string with the regexp match and
	197	produces a true value if the regexp matched, or false if the regexp
	198	did not match. In our case, \f(CW\(C`World\(C'\fR matches the second word in
	199	\&\f(CW"Hello World"\fR, so the expression is true. Expressions like this
	200	are useful in conditionals:
	201	.PP
	202	.Vb 6
	203	\& if ("Hello World" =~ /World/) {
	204	\& print "It matches\en";
	205	\& }
	206	\& else {
	207	\& print "It doesn't match\en";
	208	\& }
	209	.Ve
	210	.PP
	211	There are useful variations on this theme. The sense of the match can
	212	be reversed by using \f(CW\(C`!~\(C'\fR operator:
	213	.PP
	214	.Vb 6
	215	\& if ("Hello World" !~ /World/) {
	216	\& print "It doesn't match\en";
	217	\& }
	218	\& else {
	219	\& print "It matches\en";
	220	\& }
	221	.Ve
	222	.PP
	223	The literal string in the regexp can be replaced by a variable:
	224	.PP
	225	.Vb 7
	226	\& $greeting = "World";
	227	\& if ("Hello World" =~ /$greeting/) {
	228	\& print "It matches\en";
	229	\& }
	230	\& else {
	231	\& print "It doesn't match\en";
	232	\& }
	233	.Ve
	234	.PP
	235	If you're matching against the special default variable \f(CW$_\fR, the
	236	\&\f(CW\(C`$_ =~\(C'\fR part can be omitted:
	237	.PP
	238	.Vb 7
	239	\& $_ = "Hello World";
	240	\& if (/World/) {
	241	\& print "It matches\en";
	242	\& }
	243	\& else {
	244	\& print "It doesn't match\en";
	245	\& }
	246	.Ve
	247	.PP
	248	And finally, the \f(CW\(C`//\(C'\fR default delimiters for a match can be changed
	249	to arbitrary delimiters by putting an \f(CW'm'\fR out front:
	250	.PP
	251	.Vb 4
	252	\& "Hello World" =~ m!World!; # matches, delimited by '!'
	253	\& "Hello World" =~ m{World}; # matches, note the matching '{}'
	254	\& "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin',
	255	\& # '/' becomes an ordinary char
	256	.Ve
	257	.PP
	258	\&\f(CW\(C`/World/\(C'\fR, \f(CW\(C`m!World!\(C'\fR, and \f(CW\(C`m{World}\(C'\fR all represent the
	259	same thing. When, e.g., \f(CW""\fR is used as a delimiter, the forward
	260	slash \f(CW'/'\fR becomes an ordinary character and can be used in a regexp
	261	without trouble.
	262	.PP
	263	Let's consider how different regexps would match \f(CW"Hello World"\fR:
	264	.PP
	265	.Vb 4
	266	\& "Hello World" =~ /world/; # doesn't match
	267	\& "Hello World" =~ /o W/; # matches
	268	\& "Hello World" =~ /oW/; # doesn't match
	269	\& "Hello World" =~ /World /; # doesn't match
	270	.Ve
	271	.PP
	272	The first regexp \f(CW\(C`world\(C'\fR doesn't match because regexps are
	273	case\-sensitive. The second regexp matches because the substring
	274	\&\f(CW'o\ W'\fR\ occurs in the string \f(CW"Hello\ World"\fR\ . The space
	275	character ' ' is treated like any other character in a regexp and is
	276	needed to match in this case. The lack of a space character is the
	277	reason the third regexp \f(CW'oW'\fR doesn't match. The fourth regexp
	278	\&\f(CW'World '\fR doesn't match because there is a space at the end of the
	279	regexp, but not at the end of the string. The lesson here is that
	280	regexps must match a part of the string \fIexactly\fR in order for the
	281	statement to be true.
	282	.PP
	283	If a regexp matches in more than one place in the string, perl will
	284	always match at the earliest possible point in the string:
	285	.PP
	286	.Vb 2
	287	\& "Hello World" =~ /o/; # matches 'o' in 'Hello'
	288	\& "That hat is red" =~ /hat/; # matches 'hat' in 'That'
	289	.Ve
	290	.PP
	291	With respect to character matching, there are a few more points you
	292	need to know about. First of all, not all characters can be used 'as
	293	is' in a match. Some characters, called \fBmetacharacters\fR, are reserved
	294	for use in regexp notation. The metacharacters are
	295	.PP
	296	.Vb 1
	297	\& {}[]()^$.|*+?\e
	298	.Ve
	299	.PP
	300	The significance of each of these will be explained
	301	in the rest of the tutorial, but for now, it is important only to know
	302	that a metacharacter can be matched by putting a backslash before it:
	303	.PP
	304	.Vb 5
	305	\& "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter
	306	\& "2+2=4" =~ /2\e+2/; # matches, \e+ is treated like an ordinary +
	307	\& "The interval is [0,1)." =~ /[0,1)./ # is a syntax error!
	308	\& "The interval is [0,1)." =~ /\e[0,1\e)\e./ # matches
	309	\& "/usr/bin/perl" =~ /\e/usr\e/bin\e/perl/; # matches
	310	.Ve
	311	.PP
	312	In the last regexp, the forward slash \f(CW'/'\fR is also backslashed,
	313	because it is used to delimit the regexp. This can lead to \s-1LTS\s0
	314	(leaning toothpick syndrome), however, and it is often more readable
	315	to change delimiters.
	316	.PP
	317	.Vb 1
	318	\& "/usr/bin/perl" =~ m!/usr/bin/perl!; # easier to read
	319	.Ve
	320	.PP
	321	The backslash character \f(CW'\e'\fR is a metacharacter itself and needs to
	322	be backslashed:
	323	.PP
	324	.Vb 1
	325	\& 'C:\eWIN32' =~ /C:\e\eWIN/; # matches
	326	.Ve
	327	.PP
	328	In addition to the metacharacters, there are some \s-1ASCII\s0 characters
	329	which don't have printable character equivalents and are instead
	330	represented by \fBescape sequences\fR. Common examples are \f(CW\(C`\et\(C'\fR for a
	331	tab, \f(CW\(C`\en\(C'\fR for a newline, \f(CW\(C`\er\(C'\fR for a carriage return and \f(CW\(C`\ea\(C'\fR for a
	332	bell. If your string is better thought of as a sequence of arbitrary
	333	bytes, the octal escape sequence, e.g., \f(CW\(C`\e033\(C'\fR, or hexadecimal escape
	334	sequence, e.g., \f(CW\(C`\ex1B\(C'\fR may be a more natural representation for your
	335	bytes. Here are some examples of escapes:
	336	.PP
	337	.Vb 4
	338	\& "1000\et2000" =~ m(0\et2) # matches
	339	\& "1000\en2000" =~ /0\en20/ # matches
	340	\& "1000\et2000" =~ /\e000\et2/ # doesn't match, "0" ne "\e000"
	341	\& "cat" =~ /\e143\ex61\ex74/ # matches, but a weird way to spell cat
	342	.Ve
	343	.PP
	344	If you've been around Perl a while, all this talk of escape sequences
	345	may seem familiar. Similar escape sequences are used in double-quoted
	346	strings and in fact the regexps in Perl are mostly treated as
	347	double-quoted strings. This means that variables can be used in
	348	regexps as well. Just like double-quoted strings, the values of the
	349	variables in the regexp will be substituted in before the regexp is
	350	evaluated for matching purposes. So we have:
	351	.PP
	352	.Vb 4
	353	\& $foo = 'house';
	354	\& 'housecat' =~ /$foo/; # matches
	355	\& 'cathouse' =~ /cat$foo/; # matches
	356	\& 'housecat' =~ /${foo}cat/; # matches
	357	.Ve
	358	.PP
	359	So far, so good. With the knowledge above you can already perform
	360	searches with just about any literal string regexp you can dream up.
	361	Here is a \fIvery simple\fR emulation of the Unix grep program:
	362	.PP
	363	.Vb 7
	364	\& % cat > simple_grep
	365	\& #!/usr/bin/perl
	366	\& $regexp = shift;
	367	\& while (<>) {
	368	\& print if /$regexp/;
	369	\& }
	370	\& ^D
	371	.Ve
	372	.PP
	373	.Vb 1
	374	\& % chmod +x simple_grep
	375	.Ve
	376	.PP
	377	.Vb 10
	378	\& % simple_grep abba /usr/dict/words
	379	\& Babbage
	380	\& cabbage
	381	\& cabbages
	382	\& sabbath
	383	\& Sabbathize
	384	\& Sabbathizes
	385	\& sabbatical
	386	\& scabbard
	387	\& scabbards
	388	.Ve
	389	.PP
	390	This program is easy to understand. \f(CW\(C`#!/usr/bin/perl\(C'\fR is the standard
	391	way to invoke a perl program from the shell.
	392	\&\f(CW\(C`$regexp\ =\ shift;\(C'\fR\ saves the first command line argument as the
	393	regexp to be used, leaving the rest of the command line arguments to
	394	be treated as files. \f(CW\(C`while\ (<>)\(C'\fR\ loops over all the lines in
	395	all the files. For each line, \f(CW\(C`print\ if\ /$regexp/;\(C'\fR\ prints the
	396	line if the regexp matches the line. In this line, both \f(CW\(C`print\(C'\fR and
	397	\&\f(CW\(C`/$regexp/\(C'\fR use the default variable \f(CW$_\fR implicitly.
	398	.PP
	399	With all of the regexps above, if the regexp matched anywhere in the
	400	string, it was considered a match. Sometimes, however, we'd like to
	401	specify \fIwhere\fR in the string the regexp should try to match. To do
	402	this, we would use the \fBanchor\fR metacharacters \f(CW\(C`^\(C'\fR and \f(CW\(C`$\(C'\fR. The
	403	anchor \f(CW\(C`^\(C'\fR means match at the beginning of the string and the anchor
	404	\&\f(CW\(C`$\(C'\fR means match at the end of the string, or before a newline at the
	405	end of the string. Here is how they are used:
	406	.PP
	407	.Vb 4
	408	\& "housekeeper" =~ /keeper/; # matches
	409	\& "housekeeper" =~ /^keeper/; # doesn't match
	410	\& "housekeeper" =~ /keeper$/; # matches
	411	\& "housekeeper\en" =~ /keeper$/; # matches
	412	.Ve
	413	.PP
	414	The second regexp doesn't match because \f(CW\(C`^\(C'\fR constrains \f(CW\(C`keeper\(C'\fR to
	415	match only at the beginning of the string, but \f(CW"housekeeper"\fR has
	416	keeper starting in the middle. The third regexp does match, since the
	417	\&\f(CW\(C`$\(C'\fR constrains \f(CW\(C`keeper\(C'\fR to match only at the end of the string.
	418	.PP
	419	When both \f(CW\(C`^\(C'\fR and \f(CW\(C`$\(C'\fR are used at the same time, the regexp has to
	420	match both the beginning and the end of the string, i.e., the regexp
	421	matches the whole string. Consider
	422	.PP
	423	.Vb 3
	424	\& "keeper" =~ /^keep$/; # doesn't match
	425	\& "keeper" =~ /^keeper$/; # matches
	426	\& "" =~ /^$/; # ^$ matches an empty string
	427	.Ve
	428	.PP
	429	The first regexp doesn't match because the string has more to it than
	430	\&\f(CW\(C`keep\(C'\fR. Since the second regexp is exactly the string, it
	431	matches. Using both \f(CW\(C`^\(C'\fR and \f(CW\(C`$\(C'\fR in a regexp forces the complete
	432	string to match, so it gives you complete control over which strings
	433	match and which don't. Suppose you are looking for a fellow named
	434	bert, off in a string by himself:
	435	.PP
	436	.Vb 1
	437	\& "dogbert" =~ /bert/; # matches, but not what you want
	438	.Ve
	439	.PP
	440	.Vb 2
	441	\& "dilbert" =~ /^bert/; # doesn't match, but ..
	442	\& "bertram" =~ /^bert/; # matches, so still not good enough
	443	.Ve
	444	.PP
	445	.Vb 3
	446	\& "bertram" =~ /^bert$/; # doesn't match, good
	447	\& "dilbert" =~ /^bert$/; # doesn't match, good
	448	\& "bert" =~ /^bert$/; # matches, perfect
	449	.Ve
	450	.PP
	451	Of course, in the case of a literal string, one could just as easily
	452	use the string equivalence \f(CW\(C`$string\ eq\ 'bert'\(C'\fR\ and it would be
	453	more efficient. The \f(CW\(C`^...$\(C'\fR regexp really becomes useful when we
	454	add in the more powerful regexp tools below.
	455	.Sh "Using character classes"
	456	.IX Subsection "Using character classes"
	457	Although one can already do quite a lot with the literal string
	458	regexps above, we've only scratched the surface of regular expression
	459	technology. In this and subsequent sections we will introduce regexp
	460	concepts (and associated metacharacter notations) that will allow a
	461	regexp to not just represent a single character sequence, but a \fIwhole
	462	class\fR of them.
	463	.PP
	464	One such concept is that of a \fBcharacter class\fR. A character class
	465	allows a set of possible characters, rather than just a single
	466	character, to match at a particular point in a regexp. Character
	467	classes are denoted by brackets \f(CW\(C`[...]\(C'\fR, with the set of characters
	468	to be possibly matched inside. Here are some examples:
	469	.PP
	470	.Vb 4
	471	\& /cat/; # matches 'cat'
	472	\& /[bcr]at/; # matches 'bat', 'cat', or 'rat'
	473	\& /item[0123456789]/; # matches 'item0' or ... or 'item9'
	474	\& "abc" =~ /[cab]/; # matches 'a'
	475	.Ve
	476	.PP
	477	In the last statement, even though \f(CW'c'\fR is the first character in
	478	the class, \f(CW'a'\fR matches because the first character position in the
	479	string is the earliest point at which the regexp can match.
	480	.PP
	481	.Vb 2
	482	\& /[yY][eE][sS]/; # match 'yes' in a case-insensitive way
	483	\& # 'yes', 'Yes', 'YES', etc.
	484	.Ve
	485	.PP
	486	This regexp displays a common task: perform a case-insensitive
	487	match. Perl provides a way of avoiding all those brackets by simply
	488	appending an \f(CW'i'\fR to the end of the match. Then \f(CW\(C`/[yY][eE][sS]/;\(C'\fR
	489	can be rewritten as \f(CW\(C`/yes/i;\(C'\fR. The \f(CW'i'\fR stands for
	490	case-insensitive and is an example of a \fBmodifier\fR of the matching
	491	operation. We will meet other modifiers later in the tutorial.
	492	.PP
	493	We saw in the section above that there were ordinary characters, which
	494	represented themselves, and special characters, which needed a
	495	backslash \f(CW\(C`\e\(C'\fR to represent themselves. The same is true in a
	496	character class, but the sets of ordinary and special characters
	497	inside a character class are different than those outside a character
	498	class. The special characters for a character class are \f(CW\(C`\-]\e^$\(C'\fR. \f(CW\(C`]\(C'\fR
	499	is special because it denotes the end of a character class. \f(CW\(C`$\(C'\fR is
	500	special because it denotes a scalar variable. \f(CW\(C`\e\(C'\fR is special because
	501	it is used in escape sequences, just like above. Here is how the
	502	special characters \f(CW\(C`]$\e\(C'\fR are handled:
	503	.PP
	504	.Vb 5
	505	\& /[\e]c]def/; # matches ']def' or 'cdef'
	506	\& $x = 'bcr';
	507	\& /[$x]at/; # matches 'bat', 'cat', or 'rat'
	508	\& /[\e$x]at/; # matches '$at' or 'xat'
	509	\& /[\e\e$x]at/; # matches '\eat', 'bat', 'cat', or 'rat'
	510	.Ve
	511	.PP
	512	The last two are a little tricky. In \f(CW\(C`[\e$x]\(C'\fR, the backslash protects
	513	the dollar sign, so the character class has two members \f(CW\(C`$\(C'\fR and \f(CW\(C`x\(C'\fR.
	514	In \f(CW\(C`[\e\e$x]\(C'\fR, the backslash is protected, so \f(CW$x\fR is treated as a
	515	variable and substituted in double quote fashion.
	516	.PP
	517	The special character \f(CW'\-'\fR acts as a range operator within character
	518	classes, so that a contiguous set of characters can be written as a
	519	range. With ranges, the unwieldy \f(CW\(C`[0123456789]\(C'\fR and \f(CW\(C`[abc...xyz]\(C'\fR
	520	become the svelte \f(CW\(C`[0\-9]\(C'\fR and \f(CW\(C`[a\-z]\(C'\fR. Some examples are
	521	.PP
	522	.Vb 6
	523	\& /item[0-9]/; # matches 'item0' or ... or 'item9'
	524	\& /[0-9bx-z]aa/; # matches '0aa', ..., '9aa',
	525	\& # 'baa', 'xaa', 'yaa', or 'zaa'
	526	\& /[0-9a-fA-F]/; # matches a hexadecimal digit
	527	\& /[0-9a-zA-Z_]/; # matches a "word" character,
	528	\& # like those in a perl variable name
	529	.Ve
	530	.PP
	531	If \f(CW'\-'\fR is the first or last character in a character class, it is
	532	treated as an ordinary character; \f(CW\(C`[\-ab]\(C'\fR, \f(CW\(C`[ab\-]\(C'\fR and \f(CW\(C`[a\e\-b]\(C'\fR are
	533	all equivalent.
	534	.PP
	535	The special character \f(CW\(C`^\(C'\fR in the first position of a character class
	536	denotes a \fBnegated character class\fR, which matches any character but
	537	those in the brackets. Both \f(CW\(C`[...]\(C'\fR and \f(CW\(C`[^...]\(C'\fR must match a
	538	character, or the match fails. Then
	539	.PP
	540	.Vb 4
	541	\& /[^a]at/; # doesn't match 'aat' or 'at', but matches
	542	\& # all other 'bat', 'cat', '0at', '%at', etc.
	543	\& /[^0-9]/; # matches a non-numeric character
	544	\& /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary
	545	.Ve
	546	.PP
	547	Now, even \f(CW\(C`[0\-9]\(C'\fR can be a bother to write multiple times, so in the
	548	interest of saving keystrokes and making regexps more readable, Perl
	549	has several abbreviations for common character classes:
	550	.IP "\(bu" 4
	551	\&\ed is a digit and represents [0\-9]
	552	.IP "\(bu" 4
	553	\&\es is a whitespace character and represents [\e \et\er\en\ef]
	554	.IP "\(bu" 4
	555	\&\ew is a word character (alphanumeric or _) and represents [0\-9a\-zA\-Z_]
	556	.IP "\(bu" 4
	557	\&\eD is a negated \ed; it represents any character but a digit [^0\-9]
	558	.IP "\(bu" 4
	559	\&\eS is a negated \es; it represents any non-whitespace character [^\es]
	560	.IP "\(bu" 4
	561	\&\eW is a negated \ew; it represents any non-word character [^\ew]
	562	.IP "\(bu" 4
	563	The period '.' matches any character but \(L"\en\(R"
	564	.PP
	565	The \f(CW\(C`\ed\es\ew\eD\eS\eW\(C'\fR abbreviations can be used both inside and outside
	566	of character classes. Here are some in use:
	567	.PP
	568	.Vb 7
	569	\& /\ed\ed:\ed\ed:\ed\ed/; # matches a hh:mm:ss time format
	570	\& /[\ed\es]/; # matches any digit or whitespace character
	571	\& /\ew\eW\ew/; # matches a word char, followed by a
	572	\& # non-word char, followed by a word char
	573	\& /..rt/; # matches any two chars, followed by 'rt'
	574	\& /end\e./; # matches 'end.'
	575	\& /end[.]/; # same thing, matches 'end.'
	576	.Ve
	577	.PP
	578	Because a period is a metacharacter, it needs to be escaped to match
	579	as an ordinary period. Because, for example, \f(CW\(C`\ed\(C'\fR and \f(CW\(C`\ew\(C'\fR are sets
	580	of characters, it is incorrect to think of \f(CW\(C`[^\ed\ew]\(C'\fR as \f(CW\(C`[\eD\eW]\(C'\fR; in
	581	fact \f(CW\(C`[^\ed\ew]\(C'\fR is the same as \f(CW\(C`[^\ew]\(C'\fR, which is the same as
	582	\&\f(CW\(C`[\eW]\(C'\fR. Think DeMorgan's laws.
	583	.PP
	584	An anchor useful in basic regexps is the \fBword\ anchor\fR\
	585	\&\f(CW\(C`\eb\(C'\fR. This matches a boundary between a word character and a non-word
	586	character \f(CW\(C`\ew\eW\(C'\fR or \f(CW\(C`\eW\ew\(C'\fR:
	587	.PP
	588	.Vb 5
	589	\& $x = "Housecat catenates house and cat";
	590	\& $x =~ /cat/; # matches cat in 'Housecat'
	591	\& $x =~ /\ebcat/; # matches cat in 'catenates'
	592	\& $x =~ /cat\eb/; # matches cat in 'Housecat'
	593	\& $x =~ /\ebcat\eb/; # matches 'cat' at end of string
	594	.Ve
	595	.PP
	596	Note in the last example, the end of the string is considered a word
	597	boundary.
	598	.PP
	599	You might wonder why \f(CW'.'\fR matches everything but \f(CW"\en"\fR \- why not
	600	every character? The reason is that often one is matching against
	601	lines and would like to ignore the newline characters. For instance,
	602	while the string \f(CW"\en"\fR represents one line, we would like to think
	603	of it as empty. Then
	604	.PP
	605	.Vb 2
	606	\& "" =~ /^$/; # matches
	607	\& "\en" =~ /^$/; # matches, "\en" is ignored
	608	.Ve
	609	.PP
	610	.Vb 5
	611	\& "" =~ /./; # doesn't match; it needs a char
	612	\& "" =~ /^.$/; # doesn't match; it needs a char
	613	\& "\en" =~ /^.$/; # doesn't match; it needs a char other than "\en"
	614	\& "a" =~ /^.$/; # matches
	615	\& "a\en" =~ /^.$/; # matches, ignores the "\en"
	616	.Ve
	617	.PP
	618	This behavior is convenient, because we usually want to ignore
	619	newlines when we count and match characters in a line. Sometimes,
	620	however, we want to keep track of newlines. We might even want \f(CW\(C`^\(C'\fR
	621	and \f(CW\(C`$\(C'\fR to anchor at the beginning and end of lines within the
	622	string, rather than just the beginning and end of the string. Perl
	623	allows us to choose between ignoring and paying attention to newlines
	624	by using the \f(CW\(C`//s\(C'\fR and \f(CW\(C`//m\(C'\fR modifiers. \f(CW\(C`//s\(C'\fR and \f(CW\(C`//m\(C'\fR stand for
	625	single line and multi-line and they determine whether a string is to
	626	be treated as one continuous string, or as a set of lines. The two
	627	modifiers affect two aspects of how the regexp is interpreted: 1) how
	628	the \f(CW'.'\fR character class is defined, and 2) where the anchors \f(CW\(C`^\(C'\fR
	629	and \f(CW\(C`$\(C'\fR are able to match. Here are the four possible combinations:
	630	.IP "\(bu" 4
	631	no modifiers (//): Default behavior. \f(CW'.'\fR matches any character
	632	except \f(CW"\en"\fR. \f(CW\(C`^\(C'\fR matches only at the beginning of the string and
	633	\&\f(CW\(C`$\(C'\fR matches only at the end or before a newline at the end.
	634	.IP "\(bu" 4
	635	s modifier (//s): Treat string as a single long line. \f(CW'.'\fR matches
	636	any character, even \f(CW"\en"\fR. \f(CW\(C`^\(C'\fR matches only at the beginning of
	637	the string and \f(CW\(C`$\(C'\fR matches only at the end or before a newline at the
	638	end.
	639	.IP "\(bu" 4
	640	m modifier (//m): Treat string as a set of multiple lines. \f(CW'.'\fR
	641	matches any character except \f(CW"\en"\fR. \f(CW\(C`^\(C'\fR and \f(CW\(C`$\(C'\fR are able to match
	642	at the start or end of \fIany\fR line within the string.
	643	.IP "\(bu" 4
	644	both s and m modifiers (//sm): Treat string as a single long line, but
	645	detect multiple lines. \f(CW'.'\fR matches any character, even
	646	\&\f(CW"\en"\fR. \f(CW\(C`^\(C'\fR and \f(CW\(C`$\(C'\fR, however, are able to match at the start or end
	647	of \fIany\fR line within the string.
	648	.PP
	649	Here are examples of \f(CW\(C`//s\(C'\fR and \f(CW\(C`//m\(C'\fR in action:
	650	.PP
	651	.Vb 1
	652	\& $x = "There once was a girl\enWho programmed in Perl\en";
	653	.Ve
	654	.PP
	655	.Vb 4
	656	\& $x =~ /^Who/; # doesn't match, "Who" not at start of string
	657	\& $x =~ /^Who/s; # doesn't match, "Who" not at start of string
	658	\& $x =~ /^Who/m; # matches, "Who" at start of second line
	659	\& $x =~ /^Who/sm; # matches, "Who" at start of second line
	660	.Ve
	661	.PP
	662	.Vb 4
	663	\& $x =~ /girl.Who/; # doesn't match, "." doesn't match "\en"
	664	\& $x =~ /girl.Who/s; # matches, "." matches "\en"
	665	\& $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\en"
	666	\& $x =~ /girl.Who/sm; # matches, "." matches "\en"
	667	.Ve
	668	.PP
	669	Most of the time, the default behavior is what is wanted, but \f(CW\(C`//s\(C'\fR and
	670	\&\f(CW\(C`//m\(C'\fR are occasionally very useful. If \f(CW\(C`//m\(C'\fR is being used, the start
	671	of the string can still be matched with \f(CW\(C`\eA\(C'\fR and the end of string
	672	can still be matched with the anchors \f(CW\(C`\eZ\(C'\fR (matches both the end and
	673	the newline before, like \f(CW\(C`$\(C'\fR), and \f(CW\(C`\ez\(C'\fR (matches only the end):
	674	.PP
	675	.Vb 2
	676	\& $x =~ /^Who/m; # matches, "Who" at start of second line
	677	\& $x =~ /\eAWho/m; # doesn't match, "Who" is not at start of string
	678	.Ve
	679	.PP
	680	.Vb 2
	681	\& $x =~ /girl$/m; # matches, "girl" at end of first line
	682	\& $x =~ /girl\eZ/m; # doesn't match, "girl" is not at end of string
	683	.Ve
	684	.PP
	685	.Vb 2
	686	\& $x =~ /Perl\eZ/m; # matches, "Perl" is at newline before end
	687	\& $x =~ /Perl\ez/m; # doesn't match, "Perl" is not at end of string
	688	.Ve
	689	.PP
	690	We now know how to create choices among classes of characters in a
	691	regexp. What about choices among words or character strings? Such
	692	choices are described in the next section.
	693	.Sh "Matching this or that"
	694	.IX Subsection "Matching this or that"
	695	Sometimes we would like our regexp to be able to match different
	696	possible words or character strings. This is accomplished by using
	697	the \fBalternation\fR metacharacter \f(CW\(C`|\(C'\fR. To match \f(CW\(C`dog\(C'\fR or \f(CW\(C`cat\(C'\fR, we
	698	form the regexp \f(CW\(C`dog|cat\(C'\fR. As before, perl will try to match the
	699	regexp at the earliest possible point in the string. At each
	700	character position, perl will first try to match the first
	701	alternative, \f(CW\(C`dog\(C'\fR. If \f(CW\(C`dog\(C'\fR doesn't match, perl will then try the
	702	next alternative, \f(CW\(C`cat\(C'\fR. If \f(CW\(C`cat\(C'\fR doesn't match either, then the
	703	match fails and perl moves to the next position in the string. Some
	704	examples:
	705	.PP
	706	.Vb 2
	707	\& "cats and dogs" =~ /cat|dog|bird/; # matches "cat"
	708	\& "cats and dogs" =~ /dog|cat|bird/; # matches "cat"
	709	.Ve
	710	.PP
	711	Even though \f(CW\(C`dog\(C'\fR is the first alternative in the second regexp,
	712	\&\f(CW\(C`cat\(C'\fR is able to match earlier in the string.
	713	.PP
	714	.Vb 2
	715	\& "cats" =~ /c|ca|cat|cats/; # matches "c"
	716	\& "cats" =~ /cats|cat|ca|c/; # matches "cats"
	717	.Ve
	718	.PP
	719	Here, all the alternatives match at the first string position, so the
	720	first alternative is the one that matches. If some of the
	721	alternatives are truncations of the others, put the longest ones first
	722	to give them a chance to match.
	723	.PP
	724	.Vb 2
	725	\& "cab" =~ /a\|b\|c/ # matches "c"
	726	\& # /a\|b\|c/ == /[abc]/
	727	.Ve
	728	.PP
	729	The last example points out that character classes are like
	730	alternations of characters. At a given character position, the first
	731	alternative that allows the regexp match to succeed will be the one
	732	that matches.
	733	.Sh "Grouping things and hierarchical matching"
	734	.IX Subsection "Grouping things and hierarchical matching"
	735	Alternation allows a regexp to choose among alternatives, but by
	736	itself it is unsatisfying. The reason is that each alternative is a whole
	737	regexp, but sometimes we want alternatives for just part of a
	738	regexp. For instance, suppose we want to search for housecats or
	739	housekeepers. The regexp \f(CW\(C`housecat\|housekeeper\(C'\fR fits the bill, but is
	740	inefficient because we had to type \f(CW\(C`house\(C'\fR twice. It would be nice to
	741	have parts of the regexp be constant, like \f(CW\(C`house\(C'\fR, and some
	742	parts have alternatives, like \f(CW\(C`cat\|keeper\(C'\fR.
	743	.PP
	744	The \fBgrouping\fR metacharacters \f(CW\(C`()\(C'\fR solve this problem. Grouping
	745	allows parts of a regexp to be treated as a single unit. Parts of a
	746	regexp are grouped by enclosing them in parentheses. Thus we could solve
	747	the \f(CW\*(C`housecat|housekeeper\*(C'\fR problem by forming the regexp as
	748	\&\f(CW\(C`house(cat\|keeper)\(C'\fR. The regexp \f(CW\(C`house(cat\|keeper)\(C'\fR means match
	749	\&\f(CW\(C`house\(C'\fR followed by either \f(CW\(C`cat\(C'\fR or \f(CW\(C`keeper\(C'\fR. Some more examples
	750	are
	751	.PP
	752	.Vb 4
	753	\& /(a\|b)b/; # matches 'ab' or 'bb'
	754	\& /(ac\|b)b/; # matches 'acb' or 'bb'
	755	\& /(^a\|b)c/; # matches 'ac' at start of string or 'bc' anywhere
	756	\& /(a\|[bc])d/; # matches 'ad', 'bd', or 'cd'
	757	.Ve
	758	.PP
	759	.Vb 3
	760	\& /house(cat\|)/; # matches either 'housecat' or 'house'
	761	\& /house(cat(s\|)\|)/; # matches either 'housecats' or 'housecat' or
	762	\& # 'house'. Note groups can be nested.
	763	.Ve
	764	.PP
	765	.Vb 3
	766	\& /(19\|20\|)\ed\ed/; # match years 19xx, 20xx, or the Y2K problem, xx
	767	\& "20" =~ /(19\|20\|)\ed\ed/; # matches the null alternative '()\ed\ed',
	768	\& # because '20\ed\ed' can't match
	769	.Ve
	770	.PP
	771	Alternations behave the same way in groups as out of them: at a given
	772	string position, the leftmost alternative that allows the regexp to
	773	match is taken. So in the last example at the first string position,
	774	\&\f(CW"20"\fR matches the second alternative, but there is nothing left over
	775	to match the next two digits \f(CW\(C`\ed\ed\(C'\fR. So perl moves on to the next
	776	alternative, which is the null alternative and that works, since
	777	\&\f(CW"20"\fR is two digits.
	778	.PP
	779	The process of trying one alternative, seeing if it matches, and
	780	moving on to the next alternative if it doesn't, is called
	781	\&\fBbacktracking\fR. The term 'backtracking' comes from the idea that
	782	matching a regexp is like a walk in the woods. Successfully matching
	783	a regexp is like arriving at a destination. There are many possible
	784	trailheads, one for each string position, and each one is tried in
	785	order, left to right. From each trailhead there may be many paths,
	786	some of which get you there, and some which are dead ends. When you
	787	walk along a trail and hit a dead end, you have to backtrack along the
	788	trail to an earlier point to try another trail. If you hit your
	789	destination, you stop immediately and forget about trying all the
	790	other trails. You are persistent, and only if you have tried all the
	791	trails from all the trailheads and not arrived at your destination, do
	792	you declare failure. To be concrete, here is a step-by-step analysis
	793	of what perl does when it tries to match the regexp
	794	.PP
	795	.Vb 1
	796	\& "abcde" =~ /(abd\|abc)(df\|d\|de)/;
	797	.Ve
	798	.IP "0" 4
	799	Start with the first letter in the string 'a'.
	800	.IP "1" 4
	801	.IX Item "1"
	802	Try the first alternative in the first group 'abd'.
	803	.IP "2" 4
	804	.IX Item "2"
	805	Match 'a' followed by 'b'. So far so good.
	806	.IP "3" 4
	807	.IX Item "3"
	808	\&'d' in the regexp doesn't match 'c' in the string \- a dead
	809	end. So backtrack two characters and pick the second alternative in
	810	the first group 'abc'.
	811	.IP "4" 4
	812	.IX Item "4"
	813	Match 'a' followed by 'b' followed by 'c'. We are on a roll
	814	and have satisfied the first group. Set \f(CW$1\fR to 'abc'.
	815	.IP "5" 4
	816	.IX Item "5"
	817	Move on to the second group and pick the first alternative
	818	\&'df'.
	819	.IP "6" 4
	820	.IX Item "6"
	821	Match the 'd'.
	822	.IP "7" 4
	823	.IX Item "7"
	824	\&'f' in the regexp doesn't match 'e' in the string, so a dead
	825	end. Backtrack one character and pick the second alternative in the
	826	second group 'd'.
	827	.IP "8" 4
	828	.IX Item "8"
	829	\&'d' matches. The second grouping is satisfied, so set \f(CW$2\fR to
	830	\&'d'.
	831	.IP "9" 4
	832	.IX Item "9"
	833	We are at the end of the regexp, so we are done! We have
	834	matched 'abcd' out of the string \*(L"abcde\*(R".
	835	.PP
	836	There are a couple of things to note about this analysis. First, the
	837	third alternative in the second group 'de' also allows a match, but we
	838	stopped before we got to it \- at a given character position, leftmost
	839	wins. Second, we were able to get a match at the first character
	840	position of the string 'a'. If there were no matches at the first
	841	position, perl would move to the second character position 'b' and
	842	attempt the match all over again. Only when all possible paths at all
	843	possible character positions have been exhausted does perl give
	844	up and declare \f(CW\(C`$string\ =~\ /(abd\|abc)(df\|d\|de)/;\(C'\fR\ to be false.
	845	.PP
	846	Even with all this work, regexp matching happens remarkably fast. To
	847	speed things up, during compilation stage, perl compiles the regexp
	848	into a compact sequence of opcodes that can often fit inside a
	849	processor cache. When the code is executed, these opcodes can then run
	850	at full throttle and search very quickly.
	851	.Sh "Extracting matches"
	852	.IX Subsection "Extracting matches"
	853	The grouping metacharacters \f(CW\(C`()\(C'\fR also serve another completely
	854	different function: they allow the extraction of the parts of a string
	855	that matched. This is very useful to find out what matched and for
	856	text processing in general. For each grouping, the part that matched
	857	inside goes into the special variables \f(CW$1\fR, \f(CW$2\fR, etc. They can be
	858	used just as ordinary variables:
	859	.PP
	860	.Vb 6
	861	\& # extract hours, minutes, seconds
	862	\& if ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/) { # match hh:mm:ss format
	863	\& $hours = $1;
	864	\& $minutes = $2;
	865	\& $seconds = $3;
	866	\& }
	867	.Ve
	868	.PP
	869	Now, we know that in scalar context,
	870	\&\f(CW\(C`$time\ =~\ /(\ed\ed):(\ed\ed):(\ed\ed)/\(C'\fR\ returns a true or false
	871	value. In list context, however, it returns the list of matched values
	872	\&\f(CW\(C`($1,$2,$3)\(C'\fR. So we could write the code more compactly as
	873	.PP
	874	.Vb 2
	875	\& # extract hours, minutes, seconds
	876	\& ($hours, $minutes, $seconds) = ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/);
	877	.Ve
	878	.PP
	879	If the groupings in a regexp are nested, \f(CW$1\fR gets the group with the
	880	leftmost opening parenthesis, \f(CW$2\fR the next opening parenthesis,
	881	etc. For example, here is a complex regexp and the matching variables
	882	indicated below it:
	883	.PP
	884	.Vb 2
	885	\& /(ab(cd|ef)((gi)|j))/;
	886	\&  1  2      34
	887	.Ve
	888	.PP
	889	so that if the regexp matched, e.g., \f(CW$2\fR would contain 'cd' or 'ef'. For
	890	convenience, perl sets \f(CW$+\fR to the string held by the highest numbered
	891	\&\f(CW$1\fR, \f(CW$2\fR, ... that got assigned (and, somewhat related, \f(CW$^N\fR to the
	892	value of the \f(CW$1\fR, \f(CW$2\fR, ... most-recently assigned; i.e. the \f(CW$1\fR,
	893	\&\f(CW$2\fR, ... associated with the rightmost closing parenthesis used in the
	894	match).
	895	.PP
	896	Closely associated with the matching variables \f(CW$1\fR, \f(CW$2\fR, ... are
	897	the \fBbackreferences\fR \f(CW\(C`\e1\(C'\fR, \f(CW\(C`\e2\(C'\fR, ... . Backreferences are simply
	898	matching variables that can be used \fIinside\fR a regexp. This is a
	899	really nice feature \- what matches later in a regexp can depend on
	900	what matched earlier in the regexp. Suppose we wanted to look
	901	for doubled words in text, like 'the the'. The following regexp finds
	902	all 3\-letter doubles with a space in between:
	903	.PP
	904	.Vb 1
	905	\& /(\ew\ew\ew)\es\e1/;
	906	.Ve
	907	.PP
	908	The grouping assigns a value to \e1, so that the same 3 letter sequence
	909	is used for both parts. Here are some words with repeated parts:
	910	.PP
	911	.Vb 7
	912	\& % simple_grep '^(\ew\ew\ew\ew\|\ew\ew\ew\|\ew\ew\|\ew)\e1$' /usr/dict/words
	913	\& beriberi
	914	\& booboo
	915	\& coco
	916	\& mama
	917	\& murmur
	918	\& papa
	919	.Ve
	920	.PP
	921	The regexp has a single grouping which considers 4\-letter
	922	combinations, then 3\-letter combinations, etc. and uses \f(CW\(C`\e1\(C'\fR to look for
	923	a repeat. Although \f(CW$1\fR and \f(CW\(C`\e1\(C'\fR represent the same thing, care should be
	924	taken to use matched variables \f(CW$1\fR, \f(CW$2\fR, ... only outside a regexp
	925	and backreferences \f(CW\(C`\e1\(C'\fR, \f(CW\(C`\e2\(C'\fR, ... only inside a regexp; not doing
	926	so may lead to surprising and/or undefined results.
	927	.PP
	928	In addition to what was matched, Perl 5.6.0 also provides the
	929	positions of what was matched with the \f(CW\(C`@\-\(C'\fR and \f(CW\(C`@+\(C'\fR
	930	arrays. \f(CW\(C`$\-[0]\(C'\fR is the position of the start of the entire match and
	931	\&\f(CW$+[0]\fR is the position of the end. Similarly, \f(CW\(C`$\-[n]\(C'\fR is the
	932	position of the start of the \f(CW$n\fR match and \f(CW$+[n]\fR is the position
	933	of the end. If \f(CW$n\fR is undefined, so are \f(CW\(C`$\-[n]\(C'\fR and \f(CW$+[n]\fR. Then
	934	this code
	935	.PP
	936	.Vb 5
	937	\& $x = "Mmm...donut, thought Homer";
	938	\& $x =~ /^(Mmm\|Yech)\e.\e.\e.(donut\|peas)/; # matches
	939	\& foreach $expr (1..$#-) {
	940	\& print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\en";
	941	\& }
	942	.Ve
	943	.PP
	944	prints
	945	.PP
	946	.Vb 2
	947	\& Match 1: 'Mmm' at position (0,3)
	948	\& Match 2: 'donut' at position (6,11)
	949	.Ve
	950	.PP
	951	Even if there are no groupings in a regexp, it is still possible to
	952	find out what exactly matched in a string. If you use them, perl
	953	will set \f(CW$`\fR to the part of the string before the match, will set \f(CW$&\fR
	954	to the part of the string that matched, and will set \f(CW$'\fR to the part
	955	of the string after the match. An example:
	956	.PP
	957	.Vb 3
	958	\& $x = "the cat caught the mouse";
	959	\& $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse'
	960	\& $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse'
	961	.Ve
	962	.PP
	963	In the second match, \f(CW\*(C`$`\ =\ ''\*(C'\fR\ because the regexp matched at the
	964	first character position in the string and stopped; it never saw the
	965	second 'the'. It is important to note that using \f(CW$`\fR and \f(CW$'\fR
	966	slows down regexp matching quite a bit, and \f(CW $& \fR slows it down to a
	967	lesser extent, because if they are used in one regexp in a program,
	968	they are generated for \fIall\fR regexps in the program. So if raw
	969	performance is a goal of your application, they should be avoided.
	970	If you need them, use \f(CW\(C`@\-\(C'\fR and \f(CW\(C`@+\(C'\fR instead:
	971	.PP
	972	.Vb 3
	973	\& $` is the same as substr( $x, 0, $-[0] )
	974	\& $& is the same as substr( $x, $-[0], $+[0]-$-[0] )
	975	\& $' is the same as substr( $x, $+[0] )
	976	.Ve
	977	.Sh "Matching repetitions"
	978	.IX Subsection "Matching repetitions"
	979	The examples in the previous section display an annoying weakness. We
	980	were only matching 3\-letter words, or syllables of 4 letters or
	981	less. We'd like to be able to match words or syllables of any length,
	982	without writing out tedious alternatives like
	983	\&\f(CW\(C`\ew\ew\ew\ew\|\ew\ew\ew\|\ew\ew\|\ew\(C'\fR.
	984	.PP
	985	This is exactly the problem the \fBquantifier\fR metacharacters \f(CW\*(C`?\*(C'\fR,
	986	\&\f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR, and \f(CW\*(C`{}\*(C'\fR were created for. They allow us to determine the
	987	number of repeats of a portion of a regexp we consider to be a
	988	match. Quantifiers are put immediately after the character, character
	989	class, or grouping that we want to specify. They have the following
	990	meanings:
	991	.IP "\(bu" 4
	992	\&\f(CW\(C`a?\(C'\fR = match 'a' 1 or 0 times
	993	.IP "\(bu" 4
	994	\&\f(CW\*(C`a*\*(C'\fR = match 'a' 0 or more times, i.e., any number of times
	995	.IP "\(bu" 4
	996	\&\f(CW\(C`a+\(C'\fR = match 'a' 1 or more times, i.e., at least once
	997	.IP "\(bu" 4
	998	\&\f(CW\(C`a{n,m}\(C'\fR = match at least \f(CW\(C`n\(C'\fR times, but not more than \f(CW\(C`m\(C'\fR
	999	times.
	1000	.IP "\(bu" 4
	1001	\&\f(CW\*(C`a{n,}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times
	1002	.IP "\(bu" 4
	1003	\&\f(CW\(C`a{n}\(C'\fR = match exactly \f(CW\(C`n\(C'\fR times
	1004	.PP
	1005	Here are some examples:
	1006	.PP
	1007	.Vb 9
	1008	\& /[a-z]+\es+\ed*/; # match a lowercase word, at least some space, and
	1009	\& # any number of digits
	1010	\& /(\ew+)\es+\e1/; # match doubled words of arbitrary length
	1011	\& /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes'
	1012	\& $year =~ /\ed{2,4}/; # make sure year is at least 2 but not more
	1013	\& # than 4 digits
	1014	\& $year =~ /\ed{4}\|\ed{2}/; # better match; throw out 3 digit dates
	1015	\& $year =~ /\ed{2}(\ed{2})?/; # same thing written differently. However,
	1016	\& # this produces $1 and the other does not.
	1017	.Ve
	1018	.PP
	1019	.Vb 7
	1020	\& % simple_grep '^(\ew+)\e1$' /usr/dict/words # isn't this easier?
	1021	\& beriberi
	1022	\& booboo
	1023	\& coco
	1024	\& mama
	1025	\& murmur
	1026	\& papa
	1027	.Ve
	1028	.PP
	1029	For all of these quantifiers, perl will try to match as much of the
	1030	string as possible, while still allowing the regexp to succeed. Thus
	1031	with \f(CW\(C`/a?.../\(C'\fR, perl will first try to match the regexp with the \f(CW\(C`a\(C'\fR
	1032	present; if that fails, perl will try to match the regexp without the
	1033	\&\f(CW\*(C`a\*(C'\fR present. For the quantifier \f(CW\*(C`*\*(C'\fR, we get the following:
	1034	.PP
	1035	.Vb 5
	1036	\& $x = "the cat in the hat";
	1037	\& $x =~ /^(.*)(cat)(.*)$/; # matches,
	1038	\& # $1 = 'the '
	1039	\& # $2 = 'cat'
	1040	\& # $3 = ' in the hat'
	1041	.Ve
	1042	.PP
	1043	Which is what we might expect, the match finds the only \f(CW\(C`cat\(C'\fR in the
	1044	string and locks onto it. Consider, however, this regexp:
	1045	.PP
	1046	.Vb 4
	1047	\& $x =~ /^(.*)(at)(.*)$/; # matches,
	1048	\& # $1 = 'the cat in the h'
	1049	\& # $2 = 'at'
	1050	\& # $3 = '' (0 matches)
	1051	.Ve
	1052	.PP
	1053	One might initially guess that perl would find the \f(CW\(C`at\(C'\fR in \f(CW\(C`cat\(C'\fR and
	1054	stop there, but that wouldn't give the longest possible string to the
	1055	first quantifier \f(CW\*(C`.*\*(C'\fR. Instead, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs as
	1056	much of the string as possible while still having the regexp match. In
	1057	this example, that means having the \f(CW\(C`at\(C'\fR sequence with the final \f(CW\(C`at\(C'\fR
	1058	in the string. The other important principle illustrated here is that
	1059	when there are two or more elements in a regexp, the \fIleftmost\fR
	1060	quantifier, if there is one, gets to grab as much of the string as
	1061	possible, leaving the rest of the regexp to fight over scraps. Thus in
	1062	our example, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs most of the string, while
	1063	the second quantifier \f(CW\*(C`.*\*(C'\fR gets the empty string. Quantifiers that
	1064	grab as much of the string as possible are called \fBmaximal match\fR or
	1065	\&\fBgreedy\fR quantifiers.
	1066	.PP
	1067	When a regexp can match a string in several different ways, we can use
	1068	the principles above to predict which way the regexp will match:
	1069	.IP "\(bu" 4
	1070	Principle 0: Taken as a whole, any regexp will be matched at the
	1071	earliest possible position in the string.
	1072	.IP "\(bu" 4
	1073	Principle 1: In an alternation \f(CW\(C`a\|b\|c...\(C'\fR, the leftmost alternative
	1074	that allows a match for the whole regexp will be the one used.
	1075	.IP "\(bu" 4
	1076	Principle 2: The maximal matching quantifiers \f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR and
	1077	\&\f(CW\(C`{n,m}\(C'\fR will in general match as much of the string as possible while
	1078	still allowing the whole regexp to match.
	1079	.IP "\(bu" 4
	1080	Principle 3: If there are two or more elements in a regexp, the
	1081	leftmost greedy quantifier, if any, will match as much of the string
	1082	as possible while still allowing the whole regexp to match. The next
	1083	leftmost greedy quantifier, if any, will try to match as much of the
	1084	string remaining available to it as possible, while still allowing the
	1085	whole regexp to match. And so on, until all the regexp elements are
	1086	satisfied.
	1087	.PP
	1088	As we have seen above, Principle 0 overrides the others \- the regexp
	1089	will be matched as early as possible, with the other principles
	1090	determining how the regexp matches at that earliest character
	1091	position.
	1092	.PP
	1093	Here is an example of these principles in action:
	1094	.PP
	1095	.Vb 5
	1096	\& $x = "The programming republic of Perl";
	1097	\& $x =~ /^(.+)(e\|r)(.*)$/; # matches,
	1098	\& # $1 = 'The programming republic of Pe'
	1099	\& # $2 = 'r'
	1100	\& # $3 = 'l'
	1101	.Ve
	1102	.PP
	1103	This regexp matches at the earliest string position, \f(CW'T'\fR. One
	1104	might think that \f(CW\(C`e\(C'\fR, being leftmost in the alternation, would be
	1105	matched, but \f(CW\(C`r\(C'\fR produces the longest string in the first quantifier.
	1106	.PP
	1107	.Vb 3
	1108	\& $x =~ /(m{1,2})(.*)$/; # matches,
	1109	\& # $1 = 'mm'
	1110	\& # $2 = 'ing republic of Perl'
	1111	.Ve
	1112	.PP
	1113	Here, the earliest possible match is at the first \f(CW'm'\fR in
	1114	\&\f(CW\(C`programming\(C'\fR. \f(CW\(C`m{1,2}\(C'\fR is the first quantifier, so it gets to match
	1115	a maximal \f(CW\(C`mm\(C'\fR.
	1116	.PP
	1117	.Vb 3
	1118	\& $x =~ /.*(m{1,2})(.*)$/; # matches,
	1119	\& # $1 = 'm'
	1120	\& # $2 = 'ing republic of Perl'
	1121	.Ve
	1122	.PP
	1123	Here, the regexp matches at the start of the string. The first
	1124	quantifier \f(CW\*(C`.*\*(C'\fR grabs as much as possible, leaving just a single
	1125	\&\f(CW'm'\fR for the second quantifier \f(CW\(C`m{1,2}\(C'\fR.
	1126	.PP
	1127	.Vb 4
	1128	\& $x =~ /(.?)(m{1,2})(.*)$/; # matches,
	1129	\& # $1 = 'a'
	1130	\& # $2 = 'mm'
	1131	\& # $3 = 'ing republic of Perl'
	1132	.Ve
	1133	.PP
	1134	Here, \f(CW\(C`.?\(C'\fR eats its maximal one character at the earliest possible
	1135	position in the string, \f(CW'a'\fR in \f(CW\(C`programming\(C'\fR, leaving \f(CW\(C`m{1,2}\(C'\fR
	1136	the opportunity to match both \f(CW\(C`m\(C'\fR's. Finally,
	1137	.PP
	1138	.Vb 1
	1139	\& "aXXXb" =~ /(X*)/; # matches with $1 = ''
	1140	.Ve
	1141	.PP
	1142	because it can match zero copies of \f(CW'X'\fR at the beginning of the
	1143	string. If you definitely want to match at least one \f(CW'X'\fR, use
	1144	\&\f(CW\*(C`X+\*(C'\fR, not \f(CW\*(C`X*\*(C'\fR.
	1145	.PP
	1146	Sometimes greed is not good. At times, we would like quantifiers to
	1147	match a \fIminimal\fR piece of string, rather than a maximal piece. For
	1148	this purpose, Larry Wall created the \fBminimal\ match\fR\ or
	1149	\&\fBnon-greedy\fR quantifiers \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, and \f(CW\*(C`{}?\*(C'\fR. These are
	1150	the usual quantifiers with a \f(CW\(C`?\(C'\fR appended to them. They have the
	1151	following meanings:
	1152	.IP "\(bu" 4
	1153	\&\f(CW\(C`a??\(C'\fR = match 'a' 0 or 1 times. Try 0 first, then 1.
	1154	.IP "\(bu" 4
	1155	\&\f(CW\*(C`a*?\*(C'\fR = match 'a' 0 or more times, i.e., any number of times,
	1156	but as few times as possible
	1157	.IP "\(bu" 4
	1158	\&\f(CW\(C`a+?\(C'\fR = match 'a' 1 or more times, i.e., at least once, but
	1159	as few times as possible
	1160	.IP "\(bu" 4
	1161	\&\f(CW\(C`a{n,m}?\(C'\fR = match at least \f(CW\(C`n\(C'\fR times, not more than \f(CW\(C`m\(C'\fR
	1162	times, as few times as possible
	1163	.IP "\(bu" 4
	1164	\&\f(CW\(C`a{n,}?\(C'\fR = match at least \f(CW\(C`n\(C'\fR times, but as few times as
	1165	possible
	1166	.IP "\(bu" 4
	1167	\&\f(CW\(C`a{n}?\(C'\fR = match exactly \f(CW\(C`n\(C'\fR times. Because we match exactly
	1168	\&\f(CW\(C`n\(C'\fR times, \f(CW\(C`a{n}?\(C'\fR is equivalent to \f(CW\(C`a{n}\(C'\fR and is just there for
	1169	notational consistency.
	1170	.PP
	1171	Let's look at the example above, but with minimal quantifiers:
	1172	.PP
	1173	.Vb 5
	1174	\& $x = "The programming republic of Perl";
	1175	\& $x =~ /^(.+?)(e\|r)(.*)$/; # matches,
	1176	\& # $1 = 'Th'
	1177	\& # $2 = 'e'
	1178	\& # $3 = ' programming republic of Perl'
	1179	.Ve
	1180	.PP
	1181	The minimal string that will allow both the start of the string \f(CW\(C`^\(C'\fR
	1182	and the alternation to match is \f(CW\(C`Th\(C'\fR, with the alternation \f(CW\(C`e\|r\(C'\fR
	1183	matching \f(CW\*(C`e\*(C'\fR. The second quantifier \f(CW\*(C`.*\*(C'\fR is free to gobble up the
	1184	rest of the string.
	1185	.PP
	1186	.Vb 3
	1187	\& $x =~ /(m{1,2}?)(.*?)$/; # matches,
	1188	\& # $1 = 'm'
	1189	\& # $2 = 'ming republic of Perl'
	1190	.Ve
	1191	.PP
	1192	The first string position that this regexp can match is at the first
	1193	\&\f(CW'm'\fR in \f(CW\(C`programming\(C'\fR. At this position, the minimal \f(CW\(C`m{1,2}?\(C'\fR
	1194	matches just one \f(CW'm'\fR. Although the second quantifier \f(CW\*(C`.*?\*(C'\fR would
	1195	prefer to match no characters, it is constrained by the end-of-string
	1196	anchor \f(CW\(C`$\(C'\fR to match the rest of the string.
	1197	.PP
	1198	.Vb 4
	1199	\& $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches,
	1200	\& # $1 = 'The progra'
	1201	\& # $2 = 'm'
	1202	\& # $3 = 'ming republic of Perl'
	1203	.Ve
	1204	.PP
	1205	In this regexp, you might expect the first minimal quantifier \f(CW\*(C`.*?\*(C'\fR
	1206	to match the empty string, because it is not constrained by a \f(CW\(C`^\(C'\fR
	1207	anchor to match the beginning of the word. Principle 0 applies here,
	1208	however. Because it is possible for the whole regexp to match at the
	1209	start of the string, it \fIwill\fR match at the start of the string. Thus
	1210	the first quantifier has to match everything up to the first \f(CW\(C`m\(C'\fR. The
	1211	second minimal quantifier matches just one \f(CW\(C`m\(C'\fR and the third
	1212	quantifier matches the rest of the string.
	1213	.PP
	1214	.Vb 4
	1215	\& $x =~ /(.??)(m{1,2})(.*)$/; # matches,
	1216	\& # $1 = 'a'
	1217	\& # $2 = 'mm'
	1218	\& # $3 = 'ing republic of Perl'
	1219	.Ve
	1220	.PP
	1221	Just as in the previous regexp, the first quantifier \f(CW\(C`.??\(C'\fR can match
	1222	earliest at position \f(CW'a'\fR, so it does. The second quantifier is
	1223	greedy, so it matches \f(CW\(C`mm\(C'\fR, and the third matches the rest of the
	1224	string.
	1225	.PP
	1226	We can modify principle 3 above to take into account non-greedy
	1227	quantifiers:
	1228	.IP "\(bu" 4
	1229	Principle 3: If there are two or more elements in a regexp, the
	1230	leftmost greedy (non\-greedy) quantifier, if any, will match as much
	1231	(little) of the string as possible while still allowing the whole
	1232	regexp to match. The next leftmost greedy (non\-greedy) quantifier, if
	1233	any, will try to match as much (little) of the string remaining
	1234	available to it as possible, while still allowing the whole regexp to
	1235	match. And so on, until all the regexp elements are satisfied.
	1236	.PP
	1237	Just like alternation, quantifiers are also susceptible to
	1238	backtracking. Here is a step-by-step analysis of the example
	1239	.PP
	1240	.Vb 5
	1241	\& $x = "the cat in the hat";
	1242	\& $x =~ /^(.*)(at)(.*)$/; # matches,
	1243	\& # $1 = 'the cat in the h'
	1244	\& # $2 = 'at'
	1245	\& # $3 = '' (0 matches)
	1246	.Ve
	1247	.IP "0" 4
	1248	Start with the first letter in the string 't'.
	1249	.IP "1" 4
	1250	.IX Item "1"
	1251	The first quantifier '.*' starts out by matching the whole
	1252	string 'the cat in the hat'.
	1253	.IP "2" 4
	1254	.IX Item "2"
	1255	\&'a' in the regexp element 'at' doesn't match the end of the
	1256	string. Backtrack one character.
	1257	.IP "3" 4
	1258	.IX Item "3"
	1259	\&'a' in the regexp element 'at' still doesn't match the last
	1260	letter of the string 't', so backtrack one more character.
	1261	.IP "4" 4
	1262	.IX Item "4"
	1263	Now we can match the 'a' and the 't'.
	1264	.IP "5" 4
	1265	.IX Item "5"
	1266	Move on to the third element '.*'. Since we are at the end of
	1267	the string and '.*' can match 0 times, assign it the empty string.
	1268	.IP "6" 4
	1269	.IX Item "6"
	1270	We are done!
	1271	.PP
	1272	Most of the time, all this moving forward and backtracking happens
	1273	quickly and searching is fast. There are some pathological regexps,
	1274	however, whose execution time exponentially grows with the size of the
	1275	string. A typical structure that blows up in your face is of the form
	1276	.PP
	1277	.Vb 1
	1278	\& /(a\|b+)*/;
	1279	.Ve
	1280	.PP
	1281	The problem is the nested indeterminate quantifiers. There are many
	1282	different ways of partitioning a string of length n between the \f(CW\(C`+\(C'\fR
	1283	and \f(CW\*(C`*\*(C'\fR: one repetition with \f(CW\*(C`b+\*(C'\fR of length n, two repetitions with
	1284	the first \f(CW\*(C`b+\*(C'\fR of length k and the second with length n\-k, m repetitions
	1285	whose bits add up to length n, etc. In fact there are an exponential
	1286	number of ways to partition a string as a function of length. A
	1287	regexp may get lucky and match early in the process, but if there is
	1288	no match, perl will try \fIevery\fR possibility before giving up. So be
	1289	careful with nested \f(CW\*(C`*\*(C'\fR's, \f(CW\*(C`{n,m}\*(C'\fR's, and \f(CW\*(C`+\*(C'\fR's. The book
	1290	\&\fIMastering regular expressions\fR by Jeffrey Friedl gives a wonderful
	1291	discussion of this and other efficiency issues.
	1292	.Sh "Building a regexp"
	1293	.IX Subsection "Building a regexp"
	1294	At this point, we have all the basic regexp concepts covered, so let's
	1295	give a more involved example of a regular expression. We will build a
	1296	regexp that matches numbers.
	1297	.PP
	1298	The first task in building a regexp is to decide what we want to match
	1299	and what we want to exclude. In our case, we want to match both
	1300	integers and floating point numbers and we want to reject any string
	1301	that isn't a number.
	1302	.PP
	1303	The next task is to break the problem down into smaller problems that
	1304	are easily converted into a regexp.
	1305	.PP
	1306	The simplest case is integers. These consist of a sequence of digits,
	1307	with an optional sign in front. The digits we can represent with
	1308	\&\f(CW\(C`\ed+\(C'\fR and the sign can be matched with \f(CW\(C`[+\-]\(C'\fR. Thus the integer
	1309	regexp is
	1310	.PP
	1311	.Vb 1
	1312	\& /[+-]?\ed+/; # matches integers
	1313	.Ve
	1314	.PP
	1315	A floating point number potentially has a sign, an integral part, a
	1316	decimal point, a fractional part, and an exponent. One or more of these
	1317	parts is optional, so we need to check out the different
	1318	possibilities. Floating point numbers which are in proper form include
	1319	123., 0.345, .34, \-1e6, and 25.4E\-72. As with integers, the sign out
	1320	front is completely optional and can be matched by \f(CW\(C`[+\-]?\(C'\fR. We can
	1321	see that if there is no exponent, floating point numbers must have a
	1322	decimal point, otherwise they are integers. We might be tempted to
	1323	model these with \f(CW\*(C`\ed*\e.\ed*\*(C'\fR, but this would also match just a single
	1324	decimal point, which is not a number. So the three cases of floating
	1325	point number sans exponent are
	1326	.PP
	1327	.Vb 3
	1328	\& /[+-]?\ed+\e./; # 1., 321., etc.
	1329	\& /[+-]?\e.\ed+/; # .1, .234, etc.
	1330	\& /[+-]?\ed+\e.\ed+/; # 1.0, 30.56, etc.
	1331	.Ve
	1332	.PP
	1333	These can be combined into a single regexp with a three-way alternation:
	1334	.PP
	1335	.Vb 1
	1336	\& /[+-]?(\ed+\e.\ed+\|\ed+\e.\|\e.\ed+)/; # floating point, no exponent
	1337	.Ve
	1338	.PP
	1339	In this alternation, it is important to put \f(CW'\ed+\e.\ed+'\fR before
	1340	\&\f(CW'\ed+\e.'\fR. If \f(CW'\ed+\e.'\fR were first, the regexp would happily match that
	1341	and ignore the fractional part of the number.
	1342	.PP
	1343	Now consider floating point numbers with exponents. The key
	1344	observation here is that \fIboth\fR integers and numbers with decimal
	1345	points are allowed in front of an exponent. Then exponents, like the
	1346	overall sign, are independent of whether we are matching numbers with
	1347	or without decimal points, and can be 'decoupled' from the
	1348	mantissa. The overall form of the regexp now becomes clear:
	1349	.PP
	1350	.Vb 1
	1351	\& /^(optional sign)(integer \| f.p. mantissa)(optional exponent)$/;
	1352	.Ve
	1353	.PP
	1354	The exponent is an \f(CW\(C`e\(C'\fR or \f(CW\(C`E\(C'\fR, followed by an integer. So the
	1355	exponent regexp is
	1356	.PP
	1357	.Vb 1
	1358	\& /[eE][+-]?\ed+/; # exponent
	1359	.Ve
	1360	.PP
	1361	Putting all the parts together, we get a regexp that matches numbers:
	1362	.PP
	1363	.Vb 1
	1364	\& /^[+-]?(\ed+\e.\ed+\|\ed+\e.\|\e.\ed+\|\ed+)([eE][+-]?\ed+)?$/; # Ta da!
	1365	.Ve
	1366	.PP
	1367	Long regexps like this may impress your friends, but can be hard to
	1368	decipher. In complex situations like this, the \f(CW\(C`//x\(C'\fR modifier for a
	1369	match is invaluable. It allows one to put nearly arbitrary whitespace
	1370	and comments into a regexp without affecting its meaning. Using it,
	1371	we can rewrite our 'extended' regexp in the more pleasing form
	1372	.PP
	1373	.Vb 10
	1374	\& /^
	1375	\& [+-]? # first, match an optional sign
	1376	\& ( # then match integers or f.p. mantissas:
	1377	\& \ed+\e.\ed+ # mantissa of the form a.b
	1378	\& \|\ed+\e. # mantissa of the form a.
	1379	\& \|\e.\ed+ # mantissa of the form .b
	1380	\& \|\ed+ # integer of the form a
	1381	\& )
	1382	\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
	1383	\& $/x;
	1384	.Ve
	1385	.PP
	1386	If whitespace is mostly irrelevant, how does one include space
	1387	characters in an extended regexp? The answer is to backslash it
	1388	\&\f(CW'\e\ '\fR\ or put it in a character class \f(CW\(C`[\ ]\(C'\fR\ . The same thing
	1389	goes for pound signs, use \f(CW\(C`\e#\(C'\fR or \f(CW\(C`[#]\(C'\fR. For instance, Perl allows
	1390	a space between the sign and the mantissa/integer, and we could add
	1391	this to our regexp as follows:
	1392	.PP
	1393	.Vb 10
	1394	\& /^
	1395	\& [+-]?\e * # first, match an optional sign and space
	1396	\& ( # then match integers or f.p. mantissas:
	1397	\& \ed+\e.\ed+ # mantissa of the form a.b
	1398	\& \|\ed+\e. # mantissa of the form a.
	1399	\& \|\e.\ed+ # mantissa of the form .b
	1400	\& \|\ed+ # integer of the form a
	1401	\& )
	1402	\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
	1403	\& $/x;
	1404	.Ve
	1405	.PP
	1406	In this form, it is easier to see a way to simplify the
	1407	alternation. Alternatives 1, 2, and 4 all start with \f(CW\(C`\ed+\(C'\fR, so it
	1408	could be factored out:
	1409	.PP
	1410	.Vb 11
	1411	\& /^
	1412	\& [+-]?\e * # first, match an optional sign
	1413	\& ( # then match integers or f.p. mantissas:
	1414	\& \ed+ # start out with a ...
	1415	\& (
	1416	\& \e.\ed* # mantissa of the form a.b or a.
	1417	\& )? # ? takes care of integers of the form a
	1418	\& \|\e.\ed+ # mantissa of the form .b
	1419	\& )
	1420	\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
	1421	\& $/x;
	1422	.Ve
	1423	.PP
	1424	or written in the compact form,
	1425	.PP
	1426	.Vb 1
	1427	\& /^[+-]?\e *(\ed+(\e.\ed*)?\|\e.\ed+)([eE][+-]?\ed+)?$/;
	1428	.Ve
	1429	.PP
	1430	This is our final regexp. To recap, we built a regexp by
	1431	.IP "\(bu" 4
	1432	specifying the task in detail,
	1433	.IP "\(bu" 4
	1434	breaking down the problem into smaller parts,
	1435	.IP "\(bu" 4
	1436	translating the small parts into regexps,
	1437	.IP "\(bu" 4
	1438	combining the regexps,
	1439	.IP "\(bu" 4
	1440	and optimizing the final combined regexp.
	1441	.PP
	1442	These are also the typical steps involved in writing a computer
	1443	program. This makes perfect sense, because regular expressions are
	1444	essentially programs written in a little computer language that specifies
	1445	patterns.
	1446	.Sh "Using regular expressions in Perl"
	1447	.IX Subsection "Using regular expressions in Perl"
	1448	The last topic of Part 1 briefly covers how regexps are used in Perl
	1449	programs. Where do they fit into Perl syntax?
	1450	.PP
	1451	We have already introduced the matching operator in its default
	1452	\&\f(CW\(C`/regexp/\(C'\fR and arbitrary delimiter \f(CW\(C`m!regexp!\(C'\fR forms. We have used
	1453	the binding operator \f(CW\(C`=~\(C'\fR and its negation \f(CW\(C`!~\(C'\fR to test for string
	1454	matches. Associated with the matching operator, we have discussed the
	1455	single line \f(CW\(C`//s\(C'\fR, multi-line \f(CW\(C`//m\(C'\fR, case-insensitive \f(CW\(C`//i\(C'\fR and
	1456	extended \f(CW\(C`//x\(C'\fR modifiers.
	1457	.PP
	1458	There are a few more things you might want to know about matching
	1459	operators. First, we pointed out earlier that variables in regexps are
	1460	substituted before the regexp is evaluated:
	1461	.PP
	1462	.Vb 4
	1463	\& $pattern = 'Seuss';
	1464	\& while (<>) {
	1465	\& print if /$pattern/;
	1466	\& }
	1467	.Ve
	1468	.PP
	1469	This will print any lines containing the word \f(CW\(C`Seuss\(C'\fR. It is not as
	1470	efficient as it could be, however, because perl has to re-evaluate
	1471	\&\f(CW$pattern\fR each time through the loop. If \f(CW$pattern\fR won't be
	1472	changing over the lifetime of the script, we can add the \f(CW\(C`//o\(C'\fR
	1473	modifier, which directs perl to only perform variable substitutions
	1474	once:
	1475	.PP
	1476	.Vb 6
	1477	\& #!/usr/bin/perl
	1478	\& # Improved simple_grep
	1479	\& $regexp = shift;
	1480	\& while (<>) {
	1481	\& print if /$regexp/o; # a good deal faster
	1482	\& }
	1483	.Ve
	1484	.PP
	1485	If you change \f(CW$pattern\fR after the first substitution happens, perl
	1486	will ignore it. If you don't want any substitutions at all, use the
	1487	special delimiter \f(CW\(C`m''\(C'\fR:
	1488	.PP
	1489	.Vb 4
	1490	\& @pattern = ('Seuss');
	1491	\& while (<>) {
	1492	\& print if m'@pattern'; # matches literal '@pattern', not 'Seuss'
	1493	\& }
	1494	.Ve
	1495	.PP
	1496	\&\f(CW\(C`m''\(C'\fR acts like single quotes on a regexp; all other \f(CW\(C`m\(C'\fR delimiters
	1497	act like double quotes. If the regexp evaluates to the empty string,
	1498	the regexp in the \fIlast successful match\fR is used instead. So we have
	1499	.PP
	1500	.Vb 2
	1501	\& "dog" =~ /d/; # 'd' matches
	1502	\& "dogbert" =~ //; # this matches the 'd' regexp used before
	1503	.Ve
	1504	.PP
	1505	The final two modifiers \f(CW\(C`//g\(C'\fR and \f(CW\(C`//c\(C'\fR concern multiple matches.
	1506	The modifier \f(CW\(C`//g\(C'\fR stands for global matching and allows the
	1507	matching operator to match within a string as many times as possible.
	1508	In scalar context, successive invocations against a string will have
	1509	\&\f(CW\(C`//g\(C'\fR jump from match to match, keeping track of position in the
	1510	string as it goes along. You can get or set the position with the
	1511	\&\f(CW\(C`pos()\(C'\fR function.
	1512	.PP
	1513	The use of \f(CW\(C`//g\(C'\fR is shown in the following example. Suppose we have
	1514	a string that consists of words separated by spaces. If we know how
	1515	many words there are in advance, we could extract the words using
	1516	groupings:
	1517	.PP
	1518	.Vb 5
	1519	\& $x = "cat dog house"; # 3 words
	1520	\& $x =~ /^\es*(\ew+)\es+(\ew+)\es+(\ew+)\es*$/; # matches,
	1521	\& # $1 = 'cat'
	1522	\& # $2 = 'dog'
	1523	\& # $3 = 'house'
	1524	.Ve
	1525	.PP
	1526	But what if we had an indeterminate number of words? This is the sort
	1527	of task \f(CW\(C`//g\(C'\fR was made for. To extract all words, form the simple
	1528	regexp \f(CW\(C`(\ew+)\(C'\fR and loop over all matches with \f(CW\(C`/(\ew+)/g\(C'\fR:
	1529	.PP
	1530	.Vb 3
	1531	\& while ($x =~ /(\ew+)/g) {
	1532	\& print "Word is $1, ends at position ", pos $x, "\en";
	1533	\& }
	1534	.Ve
	1535	.PP
	1536	prints
	1537	.PP
	1538	.Vb 3
	1539	\& Word is cat, ends at position 3
	1540	\& Word is dog, ends at position 7
	1541	\& Word is house, ends at position 13
	1542	.Ve
	1543	.PP
	1544	A failed match or changing the target string resets the position. If
	1545	you don't want the position reset after failure to match, add the
	1546	\&\f(CW\(C`//c\(C'\fR, as in \f(CW\(C`/regexp/gc\(C'\fR. The current position in the string is
	1547	associated with the string, not the regexp. This means that different
	1548	strings have different positions and their respective positions can be
	1549	set or read independently.
	1550	.PP
	1551	In list context, \f(CW\(C`//g\(C'\fR returns a list of matched groupings, or if
	1552	there are no groupings, a list of matches to the whole regexp. So if
	1553	we wanted just the words, we could use
	1554	.PP
	1555	.Vb 4
	1556	\& @words = ($x =~ /(\ew+)/g); # matches,
	1557	\& # $words[0] = 'cat'
	1558	\& # $words[1] = 'dog'
	1559	\& # $words[2] = 'house'
	1560	.Ve
	1561	.PP
	1562	Closely associated with the \f(CW\(C`//g\(C'\fR modifier is the \f(CW\(C`\eG\(C'\fR anchor. The
	1563	\&\f(CW\(C`\eG\(C'\fR anchor matches at the point where the previous \f(CW\(C`//g\(C'\fR match left
	1564	off. \f(CW\(C`\eG\(C'\fR allows us to easily do context-sensitive matching:
	1565	.PP
	1566	.Vb 12
	1567	\& $metric = 1; # use metric units
	1568	\& ...
	1569	\& $x = <FILE>; # read in measurement
	1570	\& $x =~ /^([+-]?\ed+)\es*/g; # get magnitude
	1571	\& $weight = $1;
	1572	\& if ($metric) { # error checking
	1573	\& print "Units error!" unless $x =~ /\eGkg\e./g;
	1574	\& }
	1575	\& else {
	1576	\& print "Units error!" unless $x =~ /\eGlbs\e./g;
	1577	\& }
	1578	\& $x =~ /\eG\es+(widget\|sprocket)/g; # continue processing
	1579	.Ve
	1580	.PP
	1581	The combination of \f(CW\(C`//g\(C'\fR and \f(CW\(C`\eG\(C'\fR allows us to process the string a
	1582	bit at a time and use arbitrary Perl logic to decide what to do next.
	1583	Currently, the \f(CW\(C`\eG\(C'\fR anchor is only fully supported when used to anchor
	1584	to the start of the pattern.
	1585	.PP
	1586	\&\f(CW\(C`\eG\(C'\fR is also invaluable in processing fixed length records with
	1587	regexps. Suppose we have a snippet of coding region \s-1DNA\s0, encoded as
	1588	base pair letters \f(CW\(C`ATCGTTGAAT...\(C'\fR and we want to find all the stop
	1589	codons \f(CW\(C`TGA\(C'\fR. In a coding region, codons are 3\-letter sequences, so
	1590	we can think of the \s-1DNA\s0 snippet as a sequence of 3\-letter records. The
	1591	naive regexp
	1592	.PP
	1593	.Vb 3
	1594	\& # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC"
	1595	\& $dna = "ATCGTTGAATGCAAATGACATGAC";
	1596	\& $dna =~ /TGA/;
	1597	.Ve
	1598	.PP
	1599	doesn't work; it may match a \f(CW\(C`TGA\(C'\fR, but there is no guarantee that
	1600	the match is aligned with codon boundaries, e.g., the substring
	1601	\&\f(CW\(C`GTT\ GAA\(C'\fR\ gives a match. A better solution is
	1602	.PP
	1603	.Vb 3
	1604	\& while ($dna =~ /(\ew\ew\ew)*?TGA/g) { # note the minimal *?
	1605	\& print "Got a TGA stop codon at position ", pos $dna, "\en";
	1606	\& }
	1607	.Ve
	1608	.PP
	1609	which prints
	1610	.PP
	1611	.Vb 2
	1612	\& Got a TGA stop codon at position 18
	1613	\& Got a TGA stop codon at position 23
	1614	.Ve
	1615	.PP
	1616	Position 18 is good, but position 23 is bogus. What happened?
	1617	.PP
	1618	The answer is that our regexp works well until we get past the last
	1619	real match. Then the regexp will fail to match a synchronized \f(CW\(C`TGA\(C'\fR
	1620	and start stepping ahead one character position at a time, not what we
	1621	want. The solution is to use \f(CW\(C`\eG\(C'\fR to anchor the match to the codon
	1622	alignment:
	1623	.PP
	1624	.Vb 3
	1625	\& while ($dna =~ /\eG(\ew\ew\ew)*?TGA/g) {
	1626	\& print "Got a TGA stop codon at position ", pos $dna, "\en";
	1627	\& }
	1628	.Ve
	1629	.PP
	1630	This prints
	1631	.PP
	1632	.Vb 1
	1633	\& Got a TGA stop codon at position 18
	1634	.Ve
	1635	.PP
	1636	which is the correct answer. This example illustrates that it is
	1637	important not only to match what is desired, but to reject what is not
	1638	desired.
	1639	.PP
	1640	\&\fBsearch and replace\fR
	1641	.PP
	1642	Regular expressions also play a big role in \fBsearch and replace\fR
	1643	operations in Perl. Search and replace is accomplished with the
	1644	\&\f(CW\(C`s///\(C'\fR operator. The general form is
	1645	\&\f(CW\(C`s/regexp/replacement/modifiers\(C'\fR, with everything we know about
	1646	regexps and modifiers applying in this case as well. The
	1647	\&\f(CW\(C`replacement\(C'\fR is a Perl double quoted string that replaces in the
	1648	string whatever is matched with the \f(CW\(C`regexp\(C'\fR. The operator \f(CW\(C`=~\(C'\fR is
	1649	also used here to associate a string with \f(CW\(C`s///\(C'\fR. If matching
	1650	against \f(CW$_\fR, the \f(CW\(C`$_\ =~\(C'\fR\ can be dropped. If there is a match,
	1651	\&\f(CW\(C`s///\(C'\fR returns the number of substitutions made, otherwise it returns
	1652	false. Here are a few examples:
	1653	.PP
	1654	.Vb 8
	1655	\& $x = "Time to feed the cat!";
	1656	\& $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!"
	1657	\& if ($x =~ s/^(Time.*hacker)!$/$1 now!/) {
	1658	\& $more_insistent = 1;
	1659	\& }
	1660	\& $y = "'quoted words'";
	1661	\& $y =~ s/^'(.*)'$/$1/; # strip single quotes,
	1662	\& # $y contains "quoted words"
	1663	.Ve
	1664	.PP
	1665	In the last example, the whole string was matched, but only the part
	1666	inside the single quotes was grouped. With the \f(CW\(C`s///\(C'\fR operator, the
	1667	matched variables \f(CW$1\fR, \f(CW$2\fR, etc. are immediately available for use
	1668	in the replacement expression, so we use \f(CW$1\fR to replace the quoted
	1669	string with just what was quoted. With the global modifier, \f(CW\(C`s///g\(C'\fR
	1670	will search and replace all occurrences of the regexp in the string:
	1671	.PP
	1672	.Vb 6
	1673	\& $x = "I batted 4 for 4";
	1674	\& $x =~ s/4/four/; # doesn't do it all:
	1675	\& # $x contains "I batted four for 4"
	1676	\& $x = "I batted 4 for 4";
	1677	\& $x =~ s/4/four/g; # does it all:
	1678	\& # $x contains "I batted four for four"
	1679	.Ve
	1680	.PP
	1681	If you prefer 'regex' over 'regexp' in this tutorial, you could use
	1682	the following program to replace it:
	1683	.PP
	1684	.Vb 9
	1685	\& % cat > simple_replace
	1686	\& #!/usr/bin/perl
	1687	\& $regexp = shift;
	1688	\& $replacement = shift;
	1689	\& while (<>) {
	1690	\& s/$regexp/$replacement/go;
	1691	\& print;
	1692	\& }
	1693	\& ^D
	1694	.Ve
	1695	.PP
	1696	.Vb 1
	1697	\& % simple_replace regexp regex perlretut.pod
	1698	.Ve
	1699	.PP
	1700	In \f(CW\(C`simple_replace\(C'\fR we used the \f(CW\(C`s///g\(C'\fR modifier to replace all
	1701	occurrences of the regexp on each line and the \f(CW\(C`s///o\(C'\fR modifier to
	1702	compile the regexp only once. As with \f(CW\(C`simple_grep\(C'\fR, both the
	1703	\&\f(CW\(C`print\(C'\fR and the \f(CW\(C`s/$regexp/$replacement/go\(C'\fR use \f(CW$_\fR implicitly.
	1704	.PP
	1705	A modifier available specifically to search and replace is the
	1706	\&\f(CW\(C`s///e\(C'\fR evaluation modifier. \f(CW\(C`s///e\(C'\fR wraps an \f(CW\(C`eval{...}\(C'\fR around
	1707	the replacement string and the evaluated result is substituted for the
	1708	matched substring. \f(CW\(C`s///e\(C'\fR is useful if you need to do a bit of
	1709	computation in the process of replacing text. This example counts
	1710	character frequencies in a line:
	1711	.PP
	1712	.Vb 4
	1713	\& $x = "Bill the cat";
	1714	\& $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself
	1715	\& print "frequency of '$_' is $chars{$_}\en"
	1716	\& foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars);
	1717	.Ve
	1718	.PP
	1719	This prints
	1720	.PP
	1721	.Vb 9
	1722	\& frequency of ' ' is 2
	1723	\& frequency of 't' is 2
	1724	\& frequency of 'l' is 2
	1725	\& frequency of 'B' is 1
	1726	\& frequency of 'c' is 1
	1727	\& frequency of 'e' is 1
	1728	\& frequency of 'h' is 1
	1729	\& frequency of 'i' is 1
	1730	\& frequency of 'a' is 1
	1731	.Ve
	1732	.PP
	1733	As with the match \f(CW\(C`m//\(C'\fR operator, \f(CW\(C`s///\(C'\fR can use other delimiters,
	1734	such as \f(CW\(C`s!!!\(C'\fR and \f(CW\(C`s{}{}\(C'\fR, and even \f(CW\(C`s{}//\(C'\fR. If single quotes are
	1735	used \f(CW\(C`s'''\(C'\fR, then the regexp and replacement are treated as single
	1736	quoted strings and there are no substitutions. \f(CW\(C`s///\(C'\fR in list context
	1737	returns the same thing as in scalar context, i.e., the number of
	1738	matches.
	1739	.PP
	1740	\&\fBThe split operator\fR
	1741	.PP
	1742	The \fB\f(CB\(C`split\(C'\fB \fR function can also optionally use a matching operator
	1743	\&\f(CW\(C`m//\(C'\fR to split a string. \f(CW\(C`split /regexp/, string, limit\(C'\fR splits
	1744	\&\f(CW\(C`string\(C'\fR into a list of substrings and returns that list. The regexp
	1745	is used to match the character sequence that the \f(CW\(C`string\(C'\fR is split
	1746	with respect to. The \f(CW\(C`limit\(C'\fR, if present, constrains splitting into
	1747	no more than \f(CW\(C`limit\(C'\fR number of strings. For example, to split a
	1748	string into words, use
	1749	.PP
	1750	.Vb 4
	1751	\& $x = "Calvin and Hobbes";
	1752	\& @words = split /\es+/, $x; # $words[0] = 'Calvin'
	1753	\& # $words[1] = 'and'
	1754	\& # $words[2] = 'Hobbes'
	1755	.Ve
	1756	.PP
	1757	If the empty regexp \f(CW\(C`//\(C'\fR is used, the regexp always matches and
	1758	the string is split into individual characters. If the regexp has
	1759	groupings, then the list produced contains the matched substrings from the
	1760	groupings as well. For instance,
	1761	.PP
	1762	.Vb 12
	1763	\& $x = "/usr/bin/perl";
	1764	\& @dirs = split m!/!, $x; # $dirs[0] = ''
	1765	\& # $dirs[1] = 'usr'
	1766	\& # $dirs[2] = 'bin'
	1767	\& # $dirs[3] = 'perl'
	1768	\& @parts = split m!(/)!, $x; # $parts[0] = ''
	1769	\& # $parts[1] = '/'
	1770	\& # $parts[2] = 'usr'
	1771	\& # $parts[3] = '/'
	1772	\& # $parts[4] = 'bin'
	1773	\& # $parts[5] = '/'
	1774	\& # $parts[6] = 'perl'
	1775	.Ve
	1776	.PP
	1777	Since the first character of \f(CW$x\fR matched the regexp, \f(CW\(C`split\(C'\fR prepended
	1778	an empty initial element to the list.
	1779	.PP
	1780	If you have read this far, congratulations! You now have all the basic
	1781	tools needed to use regular expressions to solve a wide range of text
	1782	processing problems. If this is your first time through the tutorial,
	1783	why not stop here and play around with regexps a while... Part\ 2
	1784	concerns the more esoteric aspects of regular expressions and those
	1785	concepts certainly aren't needed right at the start.
	1786	.SH "Part 2: Power tools"
	1787	.IX Header "Part 2: Power tools"
	1788	\&\s-1OK\s0, you know the basics of regexps and you want to know more. If
	1789	matching regular expressions is analogous to a walk in the woods, then
	1790	the tools discussed in Part 1 are analogous to topo maps and a
	1791	compass, basic tools we use all the time. Most of the tools in part 2
	1792	are analogous to flare guns and satellite phones. They aren't used
	1793	too often on a hike, but when we are stuck, they can be invaluable.
	1794	.PP
	1795	What follows are the more advanced, less used, or sometimes esoteric
	1796	capabilities of perl regexps. In Part 2, we will assume you are
	1797	comfortable with the basics and concentrate on the new features.
	1798	.Sh "More on characters, strings, and character classes"
	1799	.IX Subsection "More on characters, strings, and character classes"
	1800	There are a number of escape sequences and character classes that we
	1801	haven't covered yet.
	1802	.PP
	1803	There are several escape sequences that convert characters or strings
	1804	between upper and lower case. \f(CW\(C`\el\(C'\fR and \f(CW\(C`\eu\(C'\fR convert the next
	1805	character to lower or upper case, respectively:
	1806	.PP
	1807	.Vb 4
	1808	\& $x = "perl";
	1809	\& $string =~ /\eu$x/; # matches 'Perl' in $string
	1810	\& $x = "M(rs?\|s)\e\e."; # note the double backslash
	1811	\& $string =~ /\el$x/; # matches 'mr.', 'mrs.', and 'ms.',
	1812	.Ve
	1813	.PP
	1814	\&\f(CW\(C`\eL\(C'\fR and \f(CW\(C`\eU\(C'\fR convert a whole substring, delimited by \f(CW\(C`\eL\(C'\fR or
	1815	\&\f(CW\(C`\eU\(C'\fR and \f(CW\(C`\eE\(C'\fR, to lower or upper case:
	1816	.PP
	1817	.Vb 4
	1818	\& $x = "This word is in lower case:\eL SHOUT\eE";
	1819	\& $x =~ /shout/; # matches
	1820	\& $x = "I STILL KEYPUNCH CARDS FOR MY 360";
	1821	\& $x =~ /\eUkeypunch/; # matches punch card string
	1822	.Ve
	1823	.PP
	1824	If there is no \f(CW\(C`\eE\(C'\fR, case is converted until the end of the
	1825	string. The regexps \f(CW\(C`\eL\eu$word\(C'\fR or \f(CW\(C`\eu\eL$word\(C'\fR convert the first
	1826	character of \f(CW$word\fR to uppercase and the rest of the characters to
	1827	lowercase.
	1828	.PP
	1829	Control characters can be escaped with \f(CW\(C`\ec\(C'\fR, so that a control-Z
	1830	character would be matched with \f(CW\(C`\ecZ\(C'\fR. The escape sequence
	1831	\&\f(CW\(C`\eQ\(C'\fR...\f(CW\(C`\eE\(C'\fR quotes, or protects most non-alphabetic characters. For
	1832	instance,
	1833	.PP
	1834	.Vb 2
	1835	\& $x = "\eQThat !^*&%~& cat!";
	1836	\& $x =~ /\eQ!^*&%~&\eE/; # check for rough language
	1837	.Ve
	1838	.PP
	1839	It does not protect \f(CW\(C`$\(C'\fR or \f(CW\(C`@\(C'\fR, so that variables can still be
	1840	substituted.
	1841	.PP
	1842	With the advent of 5.6.0, perl regexps can handle more than just the
	1843	standard \s-1ASCII\s0 character set. Perl now supports \fBUnicode\fR, a standard
	1844	for encoding the character sets from many of the world's written
	1845	languages. Unicode does this by allowing characters to be more than
	1846	one byte wide. Perl uses the \s-1UTF\-8\s0 encoding, in which \s-1ASCII\s0 characters
	1847	are still encoded as one byte, but characters greater than \f(CW\(C`chr(127)\(C'\fR
	1848	may be stored as two or more bytes.
	1849	.PP
	1850	What does this mean for regexps? Well, regexp users don't need to know
	1851	much about perl's internal representation of strings. But they do need
	1852	to know 1) how to represent Unicode characters in a regexp and 2) when
	1853	a matching operation will treat the string to be searched as a
	1854	sequence of bytes (the old way) or as a sequence of Unicode characters
	1855	(the new way). The answer to 1) is that Unicode characters greater
	1856	than \f(CW\(C`chr(127)\(C'\fR may be represented using the \f(CW\(C`\ex{hex}\(C'\fR notation,
	1857	with \f(CW\(C`hex\(C'\fR a hexadecimal integer:
	1858	.PP
	1859	.Vb 1
	1860	\& /\ex{263a}/; # match a Unicode smiley face :)
	1861	.Ve
	1862	.PP
	1863	Unicode characters in the range of 128\-255 use two hexadecimal digits
	1864	with braces: \f(CW\(C`\ex{ab}\(C'\fR. Note that this is different than \f(CW\(C`\exab\(C'\fR,
	1865	which is just a hexadecimal byte with no Unicode significance.
	1866	.PP
	1867	\&\fB\s-1NOTE\s0\fR: in Perl 5.6.0 it used to be that one needed to say \f(CW\*(C`use
	1868	utf8\*(C'\fR to use any Unicode features. This is no longer the case: for
	1869	almost all Unicode processing, the explicit \f(CW\(C`utf8\(C'\fR pragma is not
	1870	needed. (The only case where it matters is if your Perl script is in
	1871	Unicode and encoded in \s-1UTF\-8\s0, then an explicit \f(CW\(C`use utf8\(C'\fR is needed.)
	1872	.PP
	1873	Figuring out the hexadecimal sequence of a Unicode character you want
	1874	or deciphering someone else's hexadecimal Unicode regexp is about as
	1875	much fun as programming in machine code. So another way to specify
	1876	Unicode characters is to use the \fBnamed\ character\fR\ escape
	1877	sequence \f(CW\(C`\eN{name}\(C'\fR. \f(CW\(C`name\(C'\fR is a name for the Unicode character, as
	1878	specified in the Unicode standard. For instance, if we wanted to
	1879	represent or match the astrological sign for the planet Mercury, we
	1880	could use
	1881	.PP
	1882	.Vb 3
	1883	\& use charnames ":full"; # use named chars with Unicode full names
	1884	\& $x = "abc\eN{MERCURY}def";
	1885	\& $x =~ /\eN{MERCURY}/; # matches
	1886	.Ve
	1887	.PP
	1888	One can also use short names or restrict names to a certain alphabet:
	1889	.PP
	1890	.Vb 2
	1891	\& use charnames ':full';
	1892	\& print "\eN{GREEK SMALL LETTER SIGMA} is called sigma.\en";
	1893	.Ve
	1894	.PP
	1895	.Vb 2
	1896	\& use charnames ":short";
	1897	\& print "\eN{greek:Sigma} is an upper-case sigma.\en";
	1898	.Ve
	1899	.PP
	1900	.Vb 2
	1901	\& use charnames qw(greek);
	1902	\& print "\eN{sigma} is Greek sigma\en";
	1903	.Ve
	1904	.PP
	1905	A list of full names is found in the file Names.txt in the
	1906	lib/perl5/5.X.X/unicore directory.
	1907	.PP
	1908	The answer to requirement 2), as of 5.6.0, is that if a regexp
	1909	contains Unicode characters, the string is searched as a sequence of
	1910	Unicode characters. Otherwise, the string is searched as a sequence of
	1911	bytes. If the string is being searched as a sequence of Unicode
	1912	characters, but matching a single byte is required, we can use the \f(CW\(C`\eC\(C'\fR
	1913	escape sequence. \f(CW\(C`\eC\(C'\fR is a character class akin to \f(CW\(C`.\(C'\fR except that
	1914	it matches \fIany\fR byte 0\-255. So
	1915	.PP
	1916	.Vb 7
	1917	\& use charnames ":full"; # use named chars with Unicode full names
	1918	\& $x = "a";
	1919	\& $x =~ /\eC/; # matches 'a', eats one byte
	1920	\& $x = "";
	1921	\& $x =~ /\eC/; # doesn't match, no bytes to match
	1922	\& $x = "\eN{MERCURY}"; # two-byte Unicode character
	1923	\& $x =~ /\eC/; # matches, but dangerous!
	1924	.Ve
	1925	.PP
	1926	The last regexp matches, but is dangerous because the string
	1927	\&\fIcharacter\fR position is no longer synchronized to the string \fIbyte\fR
	1928	position. This generates the warning 'Malformed \s-1UTF\-8\s0
	1929	character'. The \f(CW\(C`\eC\(C'\fR is best used for matching the binary data in strings
	1930	with binary data intermixed with Unicode characters.
	1931	.PP
	1932	Let us now discuss the rest of the character classes. Just as with
	1933	Unicode characters, there are named Unicode character classes
	1934	represented by the \f(CW\(C`\ep{name}\(C'\fR escape sequence. Closely associated is
	1935	the \f(CW\(C`\eP{name}\(C'\fR character class, which is the negation of the
	1936	\&\f(CW\(C`\ep{name}\(C'\fR class. For example, to match lower and uppercase
	1937	characters,
	1938	.PP
	1939	.Vb 6
	1940	\& use charnames ":full"; # use named chars with Unicode full names
	1941	\& $x = "BOB";
	1942	\& $x =~ /^\ep{IsUpper}/; # matches, uppercase char class
	1943	\& $x =~ /^\eP{IsUpper}/; # doesn't match, char class sans uppercase
	1944	\& $x =~ /^\ep{IsLower}/; # doesn't match, lowercase char class
	1945	\& $x =~ /^\eP{IsLower}/; # matches, char class sans lowercase
	1946	.Ve
	1947	.PP
	1948	Here is the association between some Perl named classes and the
	1949	traditional Unicode classes:
	1950	.PP
	1951	.Vb 1
	1952	\& Perl class name Unicode class name or regular expression
	1953	.Ve
	1954	.PP
	1955	.Vb 15
	1956	\& IsAlpha /^[LM]/
	1957	\& IsAlnum /^[LMN]/
	1958	\& IsASCII $code <= 127
	1959	\& IsCntrl /^C/
	1960	\& IsBlank $code =~ /^(0020\|0009)$/ \|\| /^Z[^lp]/
	1961	\& IsDigit Nd
	1962	\& IsGraph /^([LMNPS]\|Co)/
	1963	\& IsLower Ll
	1964	\& IsPrint /^([LMNPS]\|Co\|Zs)/
	1965	\& IsPunct /^P/
	1966	\& IsSpace /^Z/ \|\| ($code =~ /^(0009\|000A\|000B\|000C\|000D)$/)
	1967	\& IsSpacePerl /^Z/ \|\| ($code =~ /^(0009\|000A\|000C\|000D\|0085\|2028\|2029)$/)
	1968	\& IsUpper /^L[ut]/
	1969	\& IsWord /^[LMN]/ \|\| $code eq "005F"
	1970	\& IsXDigit $code =~ /^00(3[0-9]\|[46][1-6])$/
	1971	.Ve
	1972	.PP
	1973	You can also use the official Unicode class names with the \f(CW\(C`\ep\(C'\fR and
	1974	\&\f(CW\(C`\eP\(C'\fR, like \f(CW\(C`\ep{L}\(C'\fR for Unicode 'letters', or \f(CW\(C`\ep{Lu}\(C'\fR for uppercase
	1975	letters, or \f(CW\(C`\eP{Nd}\(C'\fR for non\-digits. If a \f(CW\(C`name\(C'\fR is just one
	1976	letter, the braces can be dropped. For instance, \f(CW\(C`\epM\(C'\fR is the
	1977	character class of Unicode 'marks', for example accent marks.
	1978	For the full list see perlunicode.
	1979	.PP
	1980	Unicode has also been separated into various sets of characters
	1981	which you can test with \f(CW\(C`\ep{In...}\(C'\fR (in) and \f(CW\(C`\eP{In...}\(C'\fR (not in),
	1982	for example \f(CW\(C`\ep{Latin}\(C'\fR, \f(CW\(C`\ep{Greek}\(C'\fR, or \f(CW\(C`\eP{Katakana}\(C'\fR.
	1983	For the full list see perlunicode.
	1984	.PP
	1985	\&\f(CW\(C`\eX\(C'\fR is an abbreviation for a character class sequence that includes
	1986	the Unicode 'combining character sequences'. A 'combining character
	1987	sequence' is a base character followed by any number of combining
	1988	characters. An example of a combining character is an accent. Using
	1989	the Unicode full names, e.g., \f(CW\(C`A\ +\ COMBINING\ RING\(C'\fR\ is a combining
	1990	character sequence with base character \f(CW\(C`A\(C'\fR and combining character
	1991	\&\f(CW\(C`COMBINING\ RING\(C'\fR\ , which translates in Danish to A with the circle
	1992	atop it, as in the word Angstrom. \f(CW\(C`\eX\(C'\fR is equivalent to \f(CW\(C`\ePM\epM*\*(C'\fR,
	1993	i.e., a non-mark followed by zero or more marks.
	1994	.PP
	1995	For the full and latest information about Unicode see the latest
	1996	Unicode standard, or the Unicode Consortium's website http://www.unicode.org/
	1997	.PP
	1998	As if all those classes weren't enough, Perl also defines \s-1POSIX\s0 style
	1999	character classes. These have the form \f(CW\(C`[:name:]\(C'\fR, with \f(CW\(C`name\(C'\fR the
	2000	name of the \s-1POSIX\s0 class. The \s-1POSIX\s0 classes are \f(CW\(C`alpha\(C'\fR, \f(CW\(C`alnum\(C'\fR,
	2001	\&\f(CW\(C`ascii\(C'\fR, \f(CW\(C`cntrl\(C'\fR, \f(CW\(C`digit\(C'\fR, \f(CW\(C`graph\(C'\fR, \f(CW\(C`lower\(C'\fR, \f(CW\(C`print\(C'\fR, \f(CW\(C`punct\(C'\fR,
	2002	\&\f(CW\(C`space\(C'\fR, \f(CW\(C`upper\(C'\fR, and \f(CW\(C`xdigit\(C'\fR, and two extensions, \f(CW\(C`word\(C'\fR (a Perl
	2003	extension to match \f(CW\(C`\ew\(C'\fR), and \f(CW\(C`blank\(C'\fR (a \s-1GNU\s0 extension). If \f(CW\(C`utf8\(C'\fR
	2004	is being used, then these classes are defined the same as their
	2005	corresponding perl Unicode classes: \f(CW\(C`[:upper:]\(C'\fR is the same as
	2006	\&\f(CW\(C`\ep{IsUpper}\(C'\fR, etc. The \s-1POSIX\s0 character classes, however, don't
	2007	require using \f(CW\(C`utf8\(C'\fR. The \f(CW\(C`[:digit:]\(C'\fR, \f(CW\(C`[:word:]\(C'\fR, and
	2008	\&\f(CW\(C`[:space:]\(C'\fR correspond to the familiar \f(CW\(C`\ed\(C'\fR, \f(CW\(C`\ew\(C'\fR, and \f(CW\(C`\es\(C'\fR
	2009	character classes. To negate a \s-1POSIX\s0 class, put a \f(CW\(C`^\(C'\fR in front of
	2010	the name, so that, e.g., \f(CW\(C`[:^digit:]\(C'\fR corresponds to \f(CW\(C`\eD\(C'\fR and under
	2011	\&\f(CW\(C`utf8\(C'\fR, \f(CW\(C`\eP{IsDigit}\(C'\fR. The Unicode and \s-1POSIX\s0 character classes can
	2012	be used just like \f(CW\(C`\ed\(C'\fR, with the exception that \s-1POSIX\s0 character
	2013	classes can only be used inside of a character class:
	2014	.PP
	2015	.Vb 7
	2016	\& /\es+[abc[:digit:]xyz]\es*/; # match a,b,c,x,y,z, or a digit
	2017	\& /^=item\es[[:digit:]]/; # match '=item',
	2018	\& # followed by a space and a digit
	2019	\& use charnames ":full";
	2020	\& /\es+[abc\ep{IsDigit}xyz]\es+/; # match a,b,c,x,y,z, or a digit
	2021	\& /^=item\es\ep{IsDigit}/; # match '=item',
	2022	\& # followed by a space and a digit
	2023	.Ve
	2024	.PP
	2025	Whew! That is all the rest of the characters and character classes.
	2026	.Sh "Compiling and saving regular expressions"
	2027	.IX Subsection "Compiling and saving regular expressions"
	2028	In Part 1 we discussed the \f(CW\(C`//o\(C'\fR modifier, which compiles a regexp
	2029	just once. This suggests that a compiled regexp is some data structure
	2030	that can be stored once and used again and again. The regexp quote
	2031	\&\f(CW\(C`qr//\(C'\fR does exactly that: \f(CW\(C`qr/string/\(C'\fR compiles the \f(CW\(C`string\(C'\fR as a
	2032	regexp and transforms the result into a form that can be assigned to a
	2033	variable:
	2034	.PP
	2035	.Vb 1
	2036	\& $reg = qr/foo+bar?/; # reg contains a compiled regexp
	2037	.Ve
	2038	.PP
	2039	Then \f(CW$reg\fR can be used as a regexp:
	2040	.PP
	2041	.Vb 3
	2042	\& $x = "fooooba";
	2043	\& $x =~ $reg; # matches, just like /foo+bar?/
	2044	\& $x =~ /$reg/; # same thing, alternate form
	2045	.Ve
	2046	.PP
	2047	\&\f(CW$reg\fR can also be interpolated into a larger regexp:
	2048	.PP
	2049	.Vb 1
	2050	\& $x =~ /(abc)?$reg/; # still matches
	2051	.Ve
	2052	.PP
	2053	As with the matching operator, the regexp quote can use different
	2054	delimiters, e.g., \f(CW\(C`qr!!\(C'\fR, \f(CW\(C`qr{}\(C'\fR and \f(CW\(C`qr~~\(C'\fR. The single quote
	2055	delimiters \f(CW\(C`qr''\(C'\fR prevent any interpolation from taking place.
	2056	.PP
	2057	Pre-compiled regexps are useful for creating dynamic matches that
	2058	don't need to be recompiled each time they are encountered. Using
	2059	pre-compiled regexps, the \f(CW\(C`simple_grep\(C'\fR program can be expanded into a
	2060	program that matches multiple patterns:
	2061	.PP
	2062	.Vb 4
	2063	\& % cat > multi_grep
	2064	\& #!/usr/bin/perl
	2065	\& # multi_grep - match any of <number> regexps
	2066	\& # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ...
	2067	.Ve
	2068	.PP
	2069	.Vb 12
	2070	\& $number = shift;
	2071	\& $regexp[$_] = shift foreach (0..$number-1);
	2072	\& @compiled = map qr/$_/, @regexp;
	2073	\& while ($line = <>) {
	2074	\& foreach $pattern (@compiled) {
	2075	\& if ($line =~ /$pattern/) {
	2076	\& print $line;
	2077	\& last; # we matched, so move onto the next line
	2078	\& }
	2079	\& }
	2080	\& }
	2081	\& ^D
	2082	.Ve
	2083	.PP
	2084	.Vb 4
	2085	\& % multi_grep 2 last for multi_grep
	2086	\& $regexp[$_] = shift foreach (0..$number-1);
	2087	\& foreach $pattern (@compiled) {
	2088	\& last;
	2089	.Ve
	2090	.PP
	2091	Storing pre-compiled regexps in an array \f(CW@compiled\fR allows us to
	2092	simply loop through the regexps without any recompilation, thus gaining
	2093	flexibility without sacrificing speed.
	2094	.Sh "Embedding comments and modifiers in a regular expression"
	2095	.IX Subsection "Embedding comments and modifiers in a regular expression"
	2096	Starting with this section, we will be discussing Perl's set of
	2097	\&\fBextended patterns\fR. These are extensions to the traditional regular
	2098	expression syntax that provide powerful new tools for pattern
	2099	matching. We have already seen extensions in the form of the minimal
	2100	matching constructs \f(CW\(C`??\(C'\fR, \f(CW\(C`*?\(C'\fR, \f(CW\(C`+?\(C'\fR, \f(CW\(C`{n,m}?\(C'\fR, and \f(CW\(C`{n,}?\*(C'\fR. The
	2101	rest of the extensions below have the form \f(CW\(C`(?char...)\(C'\fR, where the
	2102	\&\f(CW\(C`char\(C'\fR is a character that determines the type of extension.
	2103	.PP
	2104	The first extension is an embedded comment \f(CW\(C`(?#text)\(C'\fR. This embeds a
	2105	comment into the regular expression without affecting its meaning. The
	2106	comment should not have any closing parentheses in the text. An
	2107	example is
	2108	.PP
	2109	.Vb 1
	2110	\& /(?# Match an integer:)[+-]?\ed+/;
	2111	.Ve
	2112	.PP
	2113	This style of commenting has been largely superseded by the raw,
	2114	freeform commenting that is allowed with the \f(CW\(C`//x\(C'\fR modifier.
	2115	.PP
	2116	The modifiers \f(CW\*(C`//i\*(C'\fR, \f(CW\*(C`//m\*(C'\fR, \f(CW\*(C`//s\*(C'\fR, and \f(CW\*(C`//x\*(C'\fR can also be embedded in
	2117	a regexp using \f(CW\(C`(?i)\(C'\fR, \f(CW\(C`(?m)\(C'\fR, \f(CW\(C`(?s)\(C'\fR, and \f(CW\(C`(?x)\(C'\fR. For instance,
	2118	.PP
	2119	.Vb 7
	2120	\& /(?i)yes/; # match 'yes' case insensitively
	2121	\& /yes/i; # same thing
	2122	\& /(?x)( # freeform version of an integer regexp
	2123	\& [+-]? # match an optional sign
	2124	\& \ed+ # match a sequence of digits
	2125	\& )
	2126	\& /x;
	2127	.Ve
	2128	.PP
	2129	Embedded modifiers can have two important advantages over the usual
	2130	modifiers. Embedded modifiers allow a custom set of modifiers for
	2131	\&\fIeach\fR regexp pattern. This is great for matching an array of regexps
	2132	that must have different modifiers:
	2133	.PP
	2134	.Vb 8
	2135	\& $pattern[0] = '(?i)doctor';
	2136	\& $pattern[1] = 'Johnson';
	2137	\& ...
	2138	\& while (<>) {
	2139	\& foreach $patt (@pattern) {
	2140	\& print if /$patt/;
	2141	\& }
	2142	\& }
	2143	.Ve
	2144	.PP
	2145	The second advantage is that embedded modifiers only affect the regexp
	2146	inside the group the embedded modifier is contained in. So grouping
	2147	can be used to localize the modifier's effects:
	2148	.PP
	2149	.Vb 1
	2150	\& /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc.
	2151	.Ve
	2152	.PP
	2153	Embedded modifiers can also turn off any modifiers already present
	2154	by using, e.g., \f(CW\(C`(?\-i)\(C'\fR. Modifiers can also be combined into
	2155	a single expression, e.g., \f(CW\(C`(?s\-i)\(C'\fR turns on single line mode and
	2156	turns off case insensitivity.
	2157	.Sh "Non-capturing groupings"
	2158	.IX Subsection "Non-capturing groupings"
	2159	We noted in Part 1 that groupings \f(CW\(C`()\(C'\fR had two distinct functions: 1)
	2160	group regexp elements together as a single unit, and 2) extract, or
	2161	capture, substrings that matched the regexp in the
	2162	grouping. Non-capturing groupings, denoted by \f(CW\(C`(?:regexp)\(C'\fR, allow the
	2163	regexp to be treated as a single unit, but don't extract substrings or
	2164	set matching variables \f(CW$1\fR, etc. Both capturing and non-capturing
	2165	groupings are allowed to co-exist in the same regexp. Because there is
	2166	no extraction, non-capturing groupings are faster than capturing
	2167	groupings. Non-capturing groupings are also handy for choosing exactly
	2168	which parts of a regexp are to be extracted to matching variables:
	2169	.PP
	2170	.Vb 2
	2171	\& # match a number, $1-$4 are set, but we only want $1
	2172	\& /([+-]?\e *(\ed+(\e.\ed*)?\|\e.\ed+)([eE][+-]?\ed+)?)/;
	2173	.Ve
	2174	.PP
	2175	.Vb 2
	2176	\& # match a number faster, only $1 is set
	2177	\& /([+-]?\e *(?:\ed+(?:\e.\ed*)?\|\e.\ed+)(?:[eE][+-]?\ed+)?)/;
	2178	.Ve
	2179	.PP
	2180	.Vb 2
	2181	\& # match a number, get $1 = whole number, $2 = exponent
	2182	\& /([+-]?\e *(?:\ed+(?:\e.\ed*)?\|\e.\ed+)(?:[eE]([+-]?\ed+))?)/;
	2183	.Ve
	2184	.PP
	2185	Non-capturing groupings are also useful for removing nuisance
	2186	elements gathered from a split operation:
	2187	.PP
	2188	.Vb 3
	2189	\& $x = '12a34b5';
	2190	\& @num = split /(a\|b)/, $x; # @num = ('12','a','34','b','5')
	2191	\& @num = split /(?:a\|b)/, $x; # @num = ('12','34','5')
	2192	.Ve
	2193	.PP
	2194	Non-capturing groupings may also have embedded modifiers:
	2195	\&\f(CW\(C`(?i\-m:regexp)\(C'\fR is a non-capturing grouping that matches \f(CW\(C`regexp\(C'\fR
	2196	case insensitively and turns off multi-line mode.
	2197	.Sh "Looking ahead and looking behind"
	2198	.IX Subsection "Looking ahead and looking behind"
	2199	This section concerns the lookahead and lookbehind assertions. First,
	2200	a little background.
	2201	.PP
	2202	In Perl regular expressions, most regexp elements 'eat up' a certain
	2203	amount of string when they match. For instance, the regexp element
	2204	\&\f(CW\*(C`[abc]\*(C'\fR eats up one character of the string when it matches, in the
	2205	sense that perl moves to the next character position in the string
	2206	after the match. There are some elements, however, that don't eat up
	2207	characters (advance the character position) if they match. The examples
	2208	we have seen so far are the anchors. The anchor \f(CW\(C`^\(C'\fR matches the
	2209	beginning of the line, but doesn't eat any characters. Similarly, the
	2210	word boundary anchor \f(CW\(C`\eb\(C'\fR matches, e.g., if the character to the left
	2211	is a word character and the character to the right is a non-word
	2212	character, but it doesn't eat up any characters itself. Anchors are
	2213	examples of 'zero\-width assertions'. Zero\-width, because they consume
	2214	no characters, and assertions, because they test some property of the
	2215	string. In the context of our walk in the woods analogy to regexp
	2216	matching, most regexp elements move us along a trail, but anchors have
	2217	us stop a moment and check our surroundings. If the local environment
	2218	checks out, we can proceed forward. But if the local environment
	2219	doesn't satisfy us, we must backtrack.
	2220	.PP
	2221	Checking the environment entails either looking ahead on the trail,
	2222	looking behind, or both. \f(CW\(C`^\(C'\fR looks behind, to see that there are no
	2223	characters before. \f(CW\(C`$\(C'\fR looks ahead, to see that there are no
	2224	characters after. \f(CW\(C`\eb\(C'\fR looks both ahead and behind, to see if the
	2225	characters on either side differ in their 'word'\-ness.
	2226	.PP
	2227	The lookahead and lookbehind assertions are generalizations of the
	2228	anchor concept. Lookahead and lookbehind are zero-width assertions
	2229	that let us specify which characters we want to test for. The
	2230	lookahead assertion is denoted by \f(CW\(C`(?=regexp)\(C'\fR and the lookbehind
	2231	assertion is denoted by \f(CW\(C`(?<=fixed\-regexp)\(C'\fR. Some examples are
	2232	.PP
	2233	.Vb 8
	2234	\& $x = "I catch the housecat 'Tom-cat' with catnip";
	2235	\& $x =~ /cat(?=\es+)/; # matches 'cat' in 'housecat'
	2236	\& @catwords = ($x =~ /(?<=\es)cat\ew+/g); # matches,
	2237	\& # $catwords[0] = 'catch'
	2238	\& # $catwords[1] = 'catnip'
	2239	\& $x =~ /\ebcat\eb/; # matches 'cat' in 'Tom-cat'
	2240	\& $x =~ /(?<=\es)cat(?=\es)/; # doesn't match; no isolated 'cat' in
	2241	\& # middle of $x
	2242	.Ve
	2243	.PP
	2244	Note that the parentheses in \f(CW\(C`(?=regexp)\(C'\fR and \f(CW\(C`(?<=regexp)\(C'\fR are
	2245	non\-capturing, since these are zero-width assertions. Thus in the
	2246	second regexp, the substrings captured are those of the whole regexp
	2247	itself. Lookahead \f(CW\(C`(?=regexp)\(C'\fR can match arbitrary regexps, but
	2248	lookbehind \f(CW\(C`(?<=fixed\-regexp)\(C'\fR only works for regexps of fixed
	2249	width, i.e., a fixed number of characters long. Thus
	2250	\&\f(CW\*(C`(?<=(ab\|bc))\*(C'\fR is fine, but \f(CW\*(C`(?<=(ab)*)\*(C'\fR is not. The
	2251	negated versions of the lookahead and lookbehind assertions are
	2252	denoted by \f(CW\(C`(?!regexp)\(C'\fR and \f(CW\(C`(?<!fixed\-regexp)\(C'\fR respectively.
	2253	They evaluate true if the regexps do \fInot\fR match:
	2254	.PP
	2255	.Vb 4
	2256	\& $x = "foobar";
	2257	\& $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo'
	2258	\& $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo'
	2259	\& $x =~ /(?<!\es)foo/; # matches, there is no \es before 'foo'
	2260	.Ve
	2261	.PP
	2262	The \f(CW\(C`\eC\(C'\fR is unsupported in lookbehind, because the already
	2263	treacherous definition of \f(CW\(C`\eC\(C'\fR would become even more so
	2264	when going backwards.
	2265	.Sh "Using independent subexpressions to prevent backtracking"
	2266	.IX Subsection "Using independent subexpressions to prevent backtracking"
	2267	The last few extended patterns in this tutorial are experimental as of
	2268	5.6.0. Play with them, use them in some code, but don't rely on them
	2269	just yet for production code.
	2270	.PP
	2271	\&\fBIndependent\ subexpressions\fR\ are regular expressions, in the
	2272	context of a larger regular expression, that function independently of
	2273	the larger regular expression. That is, they consume as much or as
	2274	little of the string as they wish without regard for the ability of
	2275	the larger regexp to match. Independent subexpressions are represented
	2276	by \f(CW\(C`(?>regexp)\(C'\fR. We can illustrate their behavior by first
	2277	considering an ordinary regexp:
	2278	.PP
	2279	.Vb 2
	2280	\& $x = "ab";
	2281	\& $x =~ /a*ab/; # matches
	2282	.Ve
	2283	.PP
	2284	This obviously matches, but in the process of matching, the
	2285	subexpression \f(CW\*(C`a*\*(C'\fR first grabbed the \f(CW\*(C`a\*(C'\fR. Doing so, however,
	2286	wouldn't allow the whole regexp to match, so after backtracking, \f(CW\*(C`a*\*(C'\fR
	2287	eventually gave back the \f(CW\*(C`a\*(C'\fR and matched the empty string. Here, what
	2288	\&\f(CW\*(C`a*\*(C'\fR matched was \fIdependent\fR on what the rest of the regexp matched.
	2289	.PP
	2290	Contrast that with an independent subexpression:
	2291	.PP
	2292	.Vb 1
	2293	\& $x =~ /(?>a*)ab/; # doesn't match!
	2294	.Ve
	2295	.PP
	2296	The independent subexpression \f(CW\*(C`(?>a*)\*(C'\fR doesn't care about the rest
	2297	of the regexp, so it sees an \f(CW\*(C`a\*(C'\fR and grabs it. Then the rest of the
	2298	regexp \f(CW\*(C`ab\*(C'\fR cannot match. Because \f(CW\*(C`(?>a*)\*(C'\fR is independent, there
	2299	is no backtracking and the independent subexpression does not give
	2300	up its \f(CW\*(C`a\*(C'\fR. Thus the match of the regexp as a whole fails. A similar
	2301	behavior occurs with completely independent regexps:
	2302	.PP
	2303	.Vb 3
	2304	\& $x = "ab";
	2305	\& $x =~ /a*/g; # matches, eats an 'a'
	2306	\& $x =~ /\eGab/g; # doesn't match, no 'a' available
	2307	.Ve
	2308	.PP
	2309	Here \f(CW\(C`//g\(C'\fR and \f(CW\(C`\eG\(C'\fR create a 'tag team' handoff of the string from
	2310	one regexp to the other. Regexps with an independent subexpression are
	2311	much like this, with a handoff of the string to the independent
	2312	subexpression, and a handoff of the string back to the enclosing
	2313	regexp.
	2314	.PP
	2315	The ability of an independent subexpression to prevent backtracking
	2316	can be quite useful. Suppose we want to match a non-empty string
	2317	enclosed in parentheses up to two levels deep. Then the following
	2318	regexp matches:
	2319	.PP
	2320	.Vb 2
	2321	\& $x = "abc(de(fg)h"; # unbalanced parentheses
	2322	\& $x =~ /\e( ( [^()]+ \| \e([^()]*\e) )+ \e)/x;
	2323	.Ve
	2324	.PP
	2325	The regexp matches an open parenthesis, one or more copies of an
	2326	alternation, and a close parenthesis. The alternation is two\-way, with
	2327	the first alternative \f(CW\(C`[^()]+\(C'\fR matching a substring with no
	2328	parentheses and the second alternative \f(CW\*(C`\e([^()]*\e)\*(C'\fR matching a
	2329	substring delimited by parentheses. The problem with this regexp is
	2330	that it is pathological: it has nested indeterminate quantifiers
	2331	of the form \f(CW\(C`(a+\|b)+\(C'\fR. We discussed in Part 1 how nested quantifiers
	2332	like this could take an exponentially long time to execute if there
	2333	was no match possible. To prevent the exponential blowup, we need to
	2334	prevent useless backtracking at some point. This can be done by
	2335	enclosing the inner quantifier as an independent subexpression:
	2336	.PP
	2337	.Vb 1
	2338	\& $x =~ /\e( ( (?>[^()]+) \| \e([^()]*\e) )+ \e)/x;
	2339	.Ve
	2340	.PP
	2341	Here, \f(CW\(C`(?>[^()]+)\(C'\fR breaks the degeneracy of string partitioning
	2342	by gobbling up as much of the string as possible and keeping it. Then
	2343	match failures fail much more quickly.
	2344	.Sh "Conditional expressions"
	2345	.IX Subsection "Conditional expressions"
	2346	A \fBconditional\ expression\fR\ is a form of if-then-else statement
	2347	that allows one to choose which patterns are to be matched, based on
	2348	some condition. There are two types of conditional expression:
	2349	\&\f(CW\(C`(?(condition)yes\-regexp)\(C'\fR and
	2350	\&\f(CW\(C`(?(condition)yes\-regexp\|no\-regexp)\(C'\fR. \f(CW\(C`(?(condition)yes\-regexp)\(C'\fR is
	2351	like an \f(CW'if\ ()\ {}'\fR\ statement in Perl. If the \f(CW\(C`condition\(C'\fR is true,
	2352	the \f(CW\(C`yes\-regexp\(C'\fR will be matched. If the \f(CW\(C`condition\(C'\fR is false, the
	2353	\&\f(CW\(C`yes\-regexp\(C'\fR will be skipped and perl will move onto the next regexp
	2354	element. The second form is like an \f(CW'if\ ()\ {}\ else\ {}'\fR\ statement
	2355	in Perl. If the \f(CW\(C`condition\(C'\fR is true, the \f(CW\(C`yes\-regexp\(C'\fR will be
	2356	matched, otherwise the \f(CW\(C`no\-regexp\(C'\fR will be matched.
	2357	.PP
	2358	The \f(CW\(C`condition\(C'\fR can have two forms. The first form is simply an
	2359	integer in parentheses \f(CW\(C`(integer)\(C'\fR. It is true if the corresponding
	2360	backreference \f(CW\(C`\einteger\(C'\fR matched earlier in the regexp. The second
	2361	form is a bare zero width assertion \f(CW\(C`(?...)\(C'\fR, either a
	2362	lookahead, a lookbehind, or a code assertion (discussed in the next
	2363	section).
	2364	.PP
	2365	The integer form of the \f(CW\(C`condition\(C'\fR allows us to choose, with more
	2366	flexibility, what to match based on what matched earlier in the
	2367	regexp. This searches for words of the form \f(CW"$x$x"\fR or
	2368	\&\f(CW"$x$y$y$x"\fR:
	2369	.PP
	2370	.Vb 9
	2371	\& % simple_grep '^(\ew+)(\ew+)?(?(2)\e2\e1\|\e1)$' /usr/dict/words
	2372	\& beriberi
	2373	\& coco
	2374	\& couscous
	2375	\& deed
	2376	\& ...
	2377	\& toot
	2378	\& toto
	2379	\& tutu
	2380	.Ve
	2381	.PP
	2382	The lookbehind \f(CW\(C`condition\(C'\fR allows, along with backreferences,
	2383	an earlier part of the match to influence a later part of the
	2384	match. For instance,
	2385	.PP
	2386	.Vb 1
	2387	\& /[ATGC]+(?(?<=AA)G\|C)$/;
	2388	.Ve
	2389	.PP
	2390	matches a \s-1DNA\s0 sequence such that it either ends in \f(CW\(C`AAG\(C'\fR, or some
	2391	other base pair combination and \f(CW\(C`C\(C'\fR. Note that the form is
	2392	\&\f(CW\(C`(?(?<=AA)G\|C)\(C'\fR and not \f(CW\(C`(?((?<=AA))G\|C)\(C'\fR; for the
	2393	lookahead, lookbehind or code assertions, the parentheses around the
	2394	conditional are not needed.
	2395	.Sh "A bit of magic: executing Perl code in a regular expression"
	2396	.IX Subsection "A bit of magic: executing Perl code in a regular expression"
	2397	Normally, regexps are a part of Perl expressions.
	2398	\&\fBCode\ evaluation\fR\ expressions turn that around by allowing
	2399	arbitrary Perl code to be a part of a regexp. A code evaluation
	2400	expression is denoted \f(CW\(C`(?{code})\(C'\fR, with \f(CW\(C`code\(C'\fR a string of Perl
	2401	statements.
	2402	.PP
	2403	Code expressions are zero-width assertions, and the value they return
	2404	depends on their environment. There are two possibilities: either the
	2405	code expression is used as a conditional in a conditional expression
	2406	\&\f(CW\(C`(?(condition)...)\(C'\fR, or it is not. If the code expression is a
	2407	conditional, the code is evaluated and the result (i.e., the result of
	2408	the last statement) is used to determine truth or falsehood. If the
	2409	code expression is not used as a conditional, the assertion always
	2410	evaluates true and the result is put into the special variable
	2411	\&\f(CW$^R\fR. The variable \f(CW$^R\fR can then be used in code expressions later
	2412	in the regexp. Here are some silly examples:
	2413	.PP
	2414	.Vb 5
	2415	\& $x = "abcdef";
	2416	\& $x =~ /abc(?{print "Hi Mom!";})def/; # matches,
	2417	\& # prints 'Hi Mom!'
	2418	\& $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match,
	2419	\& # no 'Hi Mom!'
	2420	.Ve
	2421	.PP
	2422	Pay careful attention to the next example:
	2423	.PP
	2424	.Vb 3
	2425	\& $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match,
	2426	\& # no 'Hi Mom!'
	2427	\& # but why not?
	2428	.Ve
	2429	.PP
	2430	At first glance, you'd think that it shouldn't print, because obviously
	2431	the \f(CW\(C`ddd\(C'\fR isn't going to match the target string. But look at this
	2432	example:
	2433	.PP
	2434	.Vb 2
	2435	\& $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match,
	2436	\& # but _does_ print
	2437	.Ve
	2438	.PP
	2439	Hmm. What happened here? If you've been following along, you know that
	2440	the above pattern should be effectively the same as the last one \*(--
	2441	enclosing the d in a character class isn't going to change what it
	2442	matches. So why does the first not print while the second one does?
	2443	.PP
	2444	The answer lies in the optimizations the REx engine makes. In the first
	2445	case, all the engine sees are plain old characters (aside from the
	2446	\&\f(CW\(C`?{}\(C'\fR construct). It's smart enough to realize that the string 'ddd'
	2447	doesn't occur in our target string before actually running the pattern
	2448	through. But in the second case, we've tricked it into thinking that our
	2449	pattern is more complicated than it is. It takes a look, sees our
	2450	character class, and decides that it will have to actually run the
	2451	pattern to determine whether or not it matches, and in the process of
	2452	running it hits the print statement before it discovers that we don't
	2453	have a match.
	2454	.PP
	2455	To take a closer look at how the engine does optimizations, see the
	2456	section \(L"Pragmas and debugging\(R" below.
	2457	.PP
	2458	More fun with \f(CW\(C`?{}\(C'\fR:
	2459	.PP
	2460	.Vb 6
	2461	\& $x =~ /(?{print "Hi Mom!";})/; # matches,
	2462	\& # prints 'Hi Mom!'
	2463	\& $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches,
	2464	\& # prints '1'
	2465	\& $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches,
	2466	\& # prints '1'
	2467	.Ve
	2468	.PP
	2469	The bit of magic mentioned in the section title occurs when the regexp
	2470	backtracks in the process of searching for a match. If the regexp
	2471	backtracks over a code expression and if the variables used within are
	2472	localized using \f(CW\(C`local\(C'\fR, the changes in the variables produced by the
	2473	code expression are undone! Thus, if we wanted to count how many times
	2474	a character got matched inside a group, we could use, e.g.,
	2475	.PP
	2476	.Vb 11
	2477	\& $x = "aaaa";
	2478	\& $count = 0; # initialize 'a' count
	2479	\& $c = "bob"; # test if $c gets clobbered
	2480	\& $x =~ /(?{local $c = 0;}) # initialize count
	2481	\& ( a # match 'a'
	2482	\& (?{local $c = $c + 1;}) # increment count
	2483	\& )* # do this any number of times,
	2484	\& aa # but match 'aa' at the end
	2485	\& (?{$count = $c;}) # copy local $c var into $count
	2486	\& /x;
	2487	\& print "'a' count is $count, \e$c variable is '$c'\en";
	2488	.Ve
	2489	.PP
	2490	This prints
	2491	.PP
	2492	.Vb 1
	2493	\& 'a' count is 2, $c variable is 'bob'
	2494	.Ve
	2495	.PP
	2496	If we replace the \f(CW\(C`\ (?{local\ $c\ =\ $c\ +\ 1;})\(C'\fR\ with
	2497	\&\f(CW\(C`\ (?{$c\ =\ $c\ +\ 1;})\(C'\fR\ , the variable changes are \fInot\fR undone
	2498	during backtracking, and we get
	2499	.PP
	2500	.Vb 1
	2501	\& 'a' count is 4, $c variable is 'bob'
	2502	.Ve
	2503	.PP
	2504	Note that only localized variable changes are undone. Other side
	2505	effects of code expression execution are permanent. Thus
	2506	.PP
	2507	.Vb 2
	2508	\& $x = "aaaa";
	2509	\& $x =~ /(a(?{print "Yow\en";}))*aa/;
	2510	.Ve
	2511	.PP
	2512	produces
	2513	.PP
	2514	.Vb 4
	2515	\& Yow
	2516	\& Yow
	2517	\& Yow
	2518	\& Yow
	2519	.Ve
	2520	.PP
	2521	The result \f(CW$^R\fR is automatically localized, so that it will behave
	2522	properly in the presence of backtracking.
	2523	.PP
	2524	This example uses a code expression in a conditional to match the
	2525	article 'the' in either English or German:
	2526	.PP
	2527	.Vb 11
	2528	\& $lang = 'DE'; # use German
	2529	\& ...
	2530	\& $text = "das";
	2531	\& print "matched\en"
	2532	\& if $text =~ /(?(?{
	2533	\& $lang eq 'EN'; # is the language English?
	2534	\& })
	2535	\& the \| # if so, then match 'the'
	2536	\& (die\|das\|der) # else, match 'die\|das\|der'
	2537	\& )
	2538	\& /xi;
	2539	.Ve
	2540	.PP
	2541	Note that the syntax here is \f(CW\(C`(?(?{...})yes\-regexp\|no\-regexp)\(C'\fR, not
	2542	\&\f(CW\(C`(?((?{...}))yes\-regexp\|no\-regexp)\(C'\fR. In other words, in the case of a
	2543	code expression, we don't need the extra parentheses around the
	2544	conditional.
	2545	.PP
	2546	If you try to use code expressions with interpolating variables, perl
	2547	may surprise you:
	2548	.PP
	2549	.Vb 5
	2550	\& $bar = 5;
	2551	\& $pat = '(?{ 1 })';
	2552	\& /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated
	2553	\& /foo(?{ 1 })$bar/; # compile error!
	2554	\& /foo${pat}bar/; # compile error!
	2555	.Ve
	2556	.PP
	2557	.Vb 2
	2558	\& $pat = qr/(?{ $foo = 1 })/; # precompile code regexp
	2559	\& /foo${pat}bar/; # compiles ok
	2560	.Ve
	2561	.PP
	2562	If a regexp has (1) code expressions and interpolating variables, or
	2563	(2) a variable that interpolates a code expression, perl treats the
	2564	regexp as an error. If the code expression is precompiled into a
	2565	variable, however, interpolating is ok. The question is, why is this
	2566	an error?
	2567	.PP
	2568	The reason is that variable interpolation and code expressions
	2569	together pose a security risk. The combination is dangerous because
	2570	many programmers who write search engines often take user input and
	2571	plug it directly into a regexp:
	2572	.PP
	2573	.Vb 3
	2574	\& $regexp = <>; # read user-supplied regexp
	2575	\& chomp $regexp; # get rid of possible newline
	2576	\& $text =~ /$regexp/; # search $text for the $regexp
	2577	.Ve
	2578	.PP
	2579	If the \f(CW$regexp\fR variable contains a code expression, the user could
	2580	then execute arbitrary Perl code. For instance, some joker could
	2581	search for \f(CW\*(C`system('rm\ \-rf\ *');\*(C'\fR\ to erase your files. In this
	2582	sense, the combination of interpolation and code expressions \fBtaints\fR
	2583	your regexp. So by default, using both interpolation and code
	2584	expressions in the same regexp is not allowed. If you're not
	2585	concerned about malicious users, it is possible to bypass this
	2586	security check by invoking \f(CW\(C`use\ re\ 'eval'\(C'\fR\ :
	2587	.PP
	2588	.Vb 5
	2589	\& use re 'eval'; # throw caution out the door
	2590	\& $bar = 5;
	2591	\& $pat = '(?{ 1 })';
	2592	\& /foo(?{ 1 })$bar/; # compiles ok
	2593	\& /foo${pat}bar/; # compiles ok
	2594	.Ve
	2595	.PP
	2596	Another form of code expression is the \fBpattern\ code\ expression\fR\ .
	2597	The pattern code expression is like a regular code expression, except
	2598	that the result of the code evaluation is treated as a regular
	2599	expression and matched immediately. A simple example is
	2600	.PP
	2601	.Vb 4
	2602	\& $length = 5;
	2603	\& $char = 'a';
	2604	\& $x = 'aaaaabb';
	2605	\& $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a'
	2606	.Ve
	2607	.PP
	2608	This final example contains both ordinary and pattern code
	2609	expressions. It detects if a binary string \f(CW1101010010001...\fR has a
	2610	Fibonacci spacing 0,1,1,2,3,5,... of the \f(CW1\fR's:
	2611	.PP
	2612	.Vb 17
	2613	\& $s0 = 0; $s1 = 1; # initial conditions
	2614	\& $x = "1101010010001000001";
	2615	\& print "It is a Fibonacci sequence\en"
	2616	\& if $x =~ /^1 # match an initial '1'
	2617	\& (
	2618	\& (??{'0' x $s0}) # match $s0 of '0'
	2619	\& 1 # and then a '1'
	2620	\& (?{
	2621	\& $largest = $s0; # largest seq so far
	2622	\& $s2 = $s1 + $s0; # compute next term
	2623	\& $s0 = $s1; # in Fibonacci sequence
	2624	\& $s1 = $s2;
	2625	\& })
	2626	\& )+ # repeat as needed
	2627	\& $ # that is all there is
	2628	\& /x;
	2629	\& print "Largest sequence matched was $largest\en";
	2630	.Ve
	2631	.PP
	2632	This prints
	2633	.PP
	2634	.Vb 2
	2635	\& It is a Fibonacci sequence
	2636	\& Largest sequence matched was 5
	2637	.Ve
	2638	.PP
	2639	Ha! Try that with your garden variety regexp package...
	2640	.PP
	2641	Note that the variables \f(CW$s0\fR and \f(CW$s1\fR are not substituted when the
	2642	regexp is compiled, as happens for ordinary variables outside a code
	2643	expression. Rather, the code expressions are evaluated when perl
	2644	encounters them during the search for a match.
	2645	.PP
	2646	The regexp without the \f(CW\(C`//x\(C'\fR modifier is
	2647	.PP
	2648	.Vb 1
	2649	\& /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/;
	2650	.Ve
	2651	.PP
	2652	and is a great start on an Obfuscated Perl entry :\-) When working with
	2653	code and conditional expressions, the extended form of regexps is
	2654	almost necessary in creating and debugging regexps.
	2655	.Sh "Pragmas and debugging"
	2656	.IX Subsection "Pragmas and debugging"
	2657	Speaking of debugging, there are several pragmas available to control
	2658	and debug regexps in Perl. We have already encountered one pragma in
	2659	the previous section, \f(CW\(C`use\ re\ 'eval';\(C'\fR\ , that allows variable
	2660	interpolation and code expressions to coexist in a regexp. The other
	2661	pragmas are
	2662	.PP
	2663	.Vb 3
	2664	\& use re 'taint';
	2665	\& $tainted = <>;
	2666	\& @parts = ($tainted =~ /(\ew+)\es+(\ew+)/); # @parts is now tainted
	2667	.Ve
	2668	.PP
	2669	The \f(CW\(C`taint\(C'\fR pragma causes any substrings from a match with a tainted
	2670	variable to be tainted as well. This is not normally the case, as
	2671	regexps are often used to extract the safe bits from a tainted
	2672	variable. Use \f(CW\(C`taint\(C'\fR when you are not extracting safe bits, but are
	2673	performing some other processing. Both \f(CW\(C`taint\(C'\fR and \f(CW\(C`eval\(C'\fR pragmas
	2674	are lexically scoped, which means they are in effect only until
	2675	the end of the block enclosing the pragmas.
	2676	.PP
	2677	.Vb 2
	2678	\& use re 'debug';
	2679	\& /^(.*)$/s; # output debugging info
	2680	.Ve
	2681	.PP
	2682	.Vb 2
	2683	\& use re 'debugcolor';
	2684	\& /^(.*)$/s; # output debugging info in living color
	2685	.Ve
	2686	.PP
	2687	The global \f(CW\(C`debug\(C'\fR and \f(CW\(C`debugcolor\(C'\fR pragmas allow one to get
	2688	detailed debugging info about regexp compilation and
	2689	execution. \f(CW\(C`debugcolor\(C'\fR is the same as debug, except the debugging
	2690	information is displayed in color on terminals that can display
	2691	termcap color sequences. Here is example output:
	2692	.PP
	2693	.Vb 25
	2694	\& % perl -e 'use re "debug"; "abc" =~ /a*b+c/;'
	2695	\& Compiling REx `a*b+c'
	2696	\& size 9 first at 1
	2697	\& 1: STAR(4)
	2698	\& 2: EXACT <a>(0)
	2699	\& 4: PLUS(7)
	2700	\& 5: EXACT <b>(0)
	2701	\& 7: EXACT <c>(9)
	2702	\& 9: END(0)
	2703	\& floating `bc' at 0..2147483647 (checking floating) minlen 2
	2704	\& Guessing start of match, REx `a*b+c' against `abc'...
	2705	\& Found floating substr `bc' at offset 1...
	2706	\& Guessed: match at offset 0
	2707	\& Matching REx `a*b+c' against `abc'
	2708	\& Setting an EVAL scope, savestack=3
	2709	\& 0 <> <abc> \| 1: STAR
	2710	\& EXACT <a> can match 1 times out of 32767...
	2711	\& Setting an EVAL scope, savestack=3
	2712	\& 1 <a> <bc> \| 4: PLUS
	2713	\& EXACT <b> can match 1 times out of 32767...
	2714	\& Setting an EVAL scope, savestack=3
	2715	\& 2 <ab> <c> \| 7: EXACT <c>
	2716	\& 3 <abc> <> \| 9: END
	2717	\& Match successful!
	2718	\& Freeing REx: `a*b+c'
	2719	.Ve
	2720	.PP
	2721	If you have gotten this far into the tutorial, you can probably guess
	2722	what the different parts of the debugging output tell you. The first
	2723	part
	2724	.PP
	2725	.Vb 8
	2726	\& Compiling REx `a*b+c'
	2727	\& size 9 first at 1
	2728	\& 1: STAR(4)
	2729	\& 2: EXACT <a>(0)
	2730	\& 4: PLUS(7)
	2731	\& 5: EXACT <b>(0)
	2732	\& 7: EXACT <c>(9)
	2733	\& 9: END(0)
	2734	.Ve
	2735	.PP
	2736	describes the compilation stage. \f(CWSTAR(4)\fR means that there is a
	2737	starred object, in this case \f(CW'a'\fR, and if it matches, goto line 4,
	2738	i.e., \f(CWPLUS(7)\fR. The middle lines describe some heuristics and
	2739	optimizations performed before a match:
	2740	.PP
	2741	.Vb 4
	2742	\& floating `bc' at 0..2147483647 (checking floating) minlen 2
	2743	\& Guessing start of match, REx `a*b+c' against `abc'...
	2744	\& Found floating substr `bc' at offset 1...
	2745	\& Guessed: match at offset 0
	2746	.Ve
	2747	.PP
	2748	Then the match is executed and the remaining lines describe the
	2749	process:
	2750	.PP
	2751	.Vb 12
	2752	\& Matching REx `a*b+c' against `abc'
	2753	\& Setting an EVAL scope, savestack=3
	2754	\& 0 <> <abc> \| 1: STAR
	2755	\& EXACT <a> can match 1 times out of 32767...
	2756	\& Setting an EVAL scope, savestack=3
	2757	\& 1 <a> <bc> \| 4: PLUS
	2758	\& EXACT <b> can match 1 times out of 32767...
	2759	\& Setting an EVAL scope, savestack=3
	2760	\& 2 <ab> <c> \| 7: EXACT <c>
	2761	\& 3 <abc> <> \| 9: END
	2762	\& Match successful!
	2763	\& Freeing REx: `a*b+c'
	2764	.Ve
	2765	.PP
	2766	Each step is of the form \f(CW\(C`n\ <x>\ <y>\(C'\fR\ , with \f(CW\(C`<x>\(C'\fR the
	2767	part of the string matched and \f(CW\(C`<y>\(C'\fR the part not yet
	2768	matched. The \f(CW\(C`\|\ 1:\ STAR\(C'\fR\ says that perl is at line number 1
	2769	in the compilation list above. See
	2770	\&\(L"Debugging regular expressions\(R" in perldebguts for much more detail.
	2771	.PP
	2772	An alternative method of debugging regexps is to embed \f(CW\(C`print\(C'\fR
	2773	statements within the regexp. This provides a blow-by-blow account of
	2774	the backtracking in an alternation:
	2775	.PP
	2776	.Vb 12
	2777	\& "that this" =~ m@(?{print "Start at position ", pos, "\en";})
	2778	\& t(?{print "t1\en";})
	2779	\& h(?{print "h1\en";})
	2780	\& i(?{print "i1\en";})
	2781	\& s(?{print "s1\en";})
	2782	\& \|
	2783	\& t(?{print "t2\en";})
	2784	\& h(?{print "h2\en";})
	2785	\& a(?{print "a2\en";})
	2786	\& t(?{print "t2\en";})
	2787	\& (?{print "Done at position ", pos, "\en";})
	2788	\& @x;
	2789	.Ve
	2790	.PP
	2791	prints
	2792	.PP
	2793	.Vb 8
	2794	\& Start at position 0
	2795	\& t1
	2796	\& h1
	2797	\& t2
	2798	\& h2
	2799	\& a2
	2800	\& t2
	2801	\& Done at position 4
	2802	.Ve
	2803	.SH "BUGS"
	2804	.IX Header "BUGS"
	2805	Code expressions, conditional expressions, and independent expressions
	2806	are \fBexperimental\fR. Don't use them in production code. Yet.
	2807	.SH "SEE ALSO"
	2808	.IX Header "SEE ALSO"
	2809	This is just a tutorial. For the full story on perl regular
	2810	expressions, see the perlre regular expressions reference page.
	2811	.PP
	2812	For more information on the matching \f(CW\(C`m//\(C'\fR and substitution \f(CW\(C`s///\(C'\fR
	2813	operators, see \(L"Regexp Quote-Like Operators\(R" in perlop. For
	2814	information on the \f(CW\(C`split\(C'\fR operation, see \(L"split\(R" in perlfunc.
	2815	.PP
	2816	For an excellent all-around resource on the care and feeding of
	2817	regular expressions, see the book \fIMastering Regular Expressions\fR by
	2818	Jeffrey Friedl (published by O'Reilly, \s-1ISBN\s0 1\-56592\-257\-3).
	2819	.SH "AUTHOR AND COPYRIGHT"
	2820	.IX Header "AUTHOR AND COPYRIGHT"
	2821	Copyright (c) 2000 Mark Kvale
	2822	All rights reserved.
	2823	.PP
	2824	This document may be distributed under the same terms as Perl itself.
	2825	.Sh "Acknowledgments"
	2826	.IX Subsection "Acknowledgments"
	2827	The inspiration for the stop codon \s-1DNA\s0 example came from the \s-1ZIP\s0
	2828	code example in chapter 7 of \fIMastering Regular Expressions\fR.
	2829	.PP
	2830	The author would like to thank Jeff Pinyan, Andrew Johnson, Peter
	2831	Haworth, Ronald J Kimball, and Joe Smith for all their helpful
	2832	comments.