Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man1 / perlretut.1
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "PERLRETUT 1"
132.TH PERLRETUT 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide"
133.SH "NAME"
134perlretut \- Perl regular expressions tutorial
135.SH "DESCRIPTION"
136.IX Header "DESCRIPTION"
137This page provides a basic tutorial on understanding, creating and
138using regular expressions in Perl. It serves as a complement to the
139reference page on regular expressions perlre. Regular expressions
140are an integral part of the \f(CW\*(C`m//\*(C'\fR, \f(CW\*(C`s///\*(C'\fR, \f(CW\*(C`qr//\*(C'\fR and \f(CW\*(C`split\*(C'\fR
141operators and so this tutorial also overlaps with
142\&\*(L"Regexp Quote-Like Operators\*(R" in perlop and \*(L"split\*(R" in perlfunc.
143.PP
144Perl is widely renowned for excellence in text processing, and regular
145expressions are one of the big factors behind this fame. Perl regular
146expressions display an efficiency and flexibility unknown in most
147other computer languages. Mastering even the basics of regular
148expressions will allow you to manipulate text with surprising ease.
149.PP
150What is a regular expression? A regular expression is simply a string
151that describes a pattern. Patterns are in common use these days;
152examples are the patterns typed into a search engine to find web pages
153and the patterns used to list files in a directory, e.g., \f(CW\*(C`ls *.txt\*(C'\fR
154or \f(CW\*(C`dir *.*\*(C'\fR. In Perl, the patterns described by regular expressions
155are used to search strings, extract desired parts of strings, and to
156do search and replace operations.
157.PP
158Regular expressions have the undeserved reputation of being abstract
159and difficult to understand. Regular expressions are constructed using
160simple concepts like conditionals and loops and are no more difficult
161to understand than the corresponding \f(CW\*(C`if\*(C'\fR conditionals and \f(CW\*(C`while\*(C'\fR
162loops in the Perl language itself. In fact, the main challenge in
163learning regular expressions is just getting used to the terse
164notation used to express these concepts.
165.PP
166This tutorial flattens the learning curve by discussing regular
167expression concepts, along with their notation, one at a time and with
168many examples. The first part of the tutorial will progress from the
169simplest word searches to the basic regular expression concepts. If
170you master the first part, you will have all the tools needed to solve
171about 98% of your needs. The second part of the tutorial is for those
172comfortable with the basics and hungry for more power tools. It
173discusses the more advanced regular expression operators and
174introduces the latest cutting edge innovations in 5.6.0.
175.PP
176A note: to save time, 'regular expression' is often abbreviated as
177regexp or regex. Regexp is a more natural abbreviation than regex, but
178is harder to pronounce. The Perl pod documentation is evenly split on
179regexp vs regex; in Perl, there is more than one way to abbreviate it.
180We'll use regexp in this tutorial.
181.SH "Part 1: The basics"
182.IX Header "Part 1: The basics"
183.Sh "Simple word matching"
184.IX Subsection "Simple word matching"
185The simplest regexp is simply a word, or more generally, a string of
186characters. A regexp consisting of a word matches any string that
187contains that word:
188.PP
189.Vb 1
190\& "Hello World" =~ /World/; # matches
191.Ve
192.PP
193What is this perl statement all about? \f(CW"Hello World"\fR is a simple
194double quoted string. \f(CW\*(C`World\*(C'\fR is the regular expression and the
195\&\f(CW\*(C`//\*(C'\fR enclosing \f(CW\*(C`/World/\*(C'\fR tells perl to search a string for a match.
196The operator \f(CW\*(C`=~\*(C'\fR associates the string with the regexp match and
197produces a true value if the regexp matched, or false if the regexp
198did not match. In our case, \f(CW\*(C`World\*(C'\fR matches the second word in
199\&\f(CW"Hello World"\fR, so the expression is true. Expressions like this
200are useful in conditionals:
201.PP
202.Vb 6
203\& if ("Hello World" =~ /World/) {
204\& print "It matches\en";
205\& }
206\& else {
207\& print "It doesn't match\en";
208\& }
209.Ve
210.PP
211There are useful variations on this theme. The sense of the match can
212be reversed by using the \f(CW\*(C`!~\*(C'\fR operator:
213.PP
214.Vb 6
215\& if ("Hello World" !~ /World/) {
216\& print "It doesn't match\en";
217\& }
218\& else {
219\& print "It matches\en";
220\& }
221.Ve
222.PP
223The literal string in the regexp can be replaced by a variable:
224.PP
225.Vb 7
226\& $greeting = "World";
227\& if ("Hello World" =~ /$greeting/) {
228\& print "It matches\en";
229\& }
230\& else {
231\& print "It doesn't match\en";
232\& }
233.Ve
234.PP
235If you're matching against the special default variable \f(CW$_\fR, the
236\&\f(CW\*(C`$_ =~\*(C'\fR part can be omitted:
237.PP
238.Vb 7
239\& $_ = "Hello World";
240\& if (/World/) {
241\& print "It matches\en";
242\& }
243\& else {
244\& print "It doesn't match\en";
245\& }
246.Ve
247.PP
248And finally, the \f(CW\*(C`//\*(C'\fR default delimiters for a match can be changed
249to arbitrary delimiters by putting an \f(CW'm'\fR out front:
250.PP
251.Vb 4
252\& "Hello World" =~ m!World!; # matches, delimited by '!'
253\& "Hello World" =~ m{World}; # matches, note the matching '{}'
254\& "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin',
255\& # '/' becomes an ordinary char
256.Ve
257.PP
258\&\f(CW\*(C`/World/\*(C'\fR, \f(CW\*(C`m!World!\*(C'\fR, and \f(CW\*(C`m{World}\*(C'\fR all represent the
259same thing. When, e.g., \f(CW""\fR is used as a delimiter, the forward
260slash \f(CW'/'\fR becomes an ordinary character and can be used in a regexp
261without trouble.
262.PP
263Let's consider how different regexps would match \f(CW"Hello World"\fR:
264.PP
265.Vb 4
266\& "Hello World" =~ /world/; # doesn't match
267\& "Hello World" =~ /o W/; # matches
268\& "Hello World" =~ /oW/; # doesn't match
269\& "Hello World" =~ /World /; # doesn't match
270.Ve
271.PP
272The first regexp \f(CW\*(C`world\*(C'\fR doesn't match because regexps are
273case\-sensitive. The second regexp matches because the substring
274\&\f(CW'o\ W'\fR\ occurs in the string \f(CW"Hello\ World"\fR\ . The space
275character ' ' is treated like any other character in a regexp and is
276needed to match in this case. The lack of a space character is the
277reason the third regexp \f(CW'oW'\fR doesn't match. The fourth regexp
278\&\f(CW'World '\fR doesn't match because there is a space at the end of the
279regexp, but not at the end of the string. The lesson here is that
280regexps must match a part of the string \fIexactly\fR in order for the
281statement to be true.
282.PP
283If a regexp matches in more than one place in the string, perl will
284always match at the earliest possible point in the string:
285.PP
286.Vb 2
287\& "Hello World" =~ /o/; # matches 'o' in 'Hello'
288\& "That hat is red" =~ /hat/; # matches 'hat' in 'That'
289.Ve
290.PP
291With respect to character matching, there are a few more points you
292need to know about. First of all, not all characters can be used 'as
293is' in a match. Some characters, called \fBmetacharacters\fR, are reserved
294for use in regexp notation. The metacharacters are
295.PP
296.Vb 1
297\& {}[]()^$.|*+?\e
298.Ve
299.PP
300The significance of each of these will be explained
301in the rest of the tutorial, but for now, it is important only to know
302that a metacharacter can be matched by putting a backslash before it:
303.PP
304.Vb 5
305\& "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter
306\& "2+2=4" =~ /2\e+2/; # matches, \e+ is treated like an ordinary +
307\& "The interval is [0,1)." =~ /[0,1)./ # is a syntax error!
308\& "The interval is [0,1)." =~ /\e[0,1\e)\e./ # matches
309\& "/usr/bin/perl" =~ /\e/usr\e/bin\e/perl/; # matches
310.Ve
311.PP
312In the last regexp, the forward slash \f(CW'/'\fR is also backslashed,
313because it is used to delimit the regexp. This can lead to \s-1LTS\s0
314(leaning toothpick syndrome), however, and it is often more readable
315to change delimiters.
316.PP
317The backslash character \f(CW'\e'\fR is a metacharacter itself and needs to
318be backslashed:
319.PP
320.Vb 1
321\& 'C:\eWIN32' =~ /C:\e\eWIN/; # matches
322.Ve
323.PP
324In addition to the metacharacters, there are some \s-1ASCII\s0 characters
325which don't have printable character equivalents and are instead
326represented by \fBescape sequences\fR. Common examples are \f(CW\*(C`\et\*(C'\fR for a
327tab, \f(CW\*(C`\en\*(C'\fR for a newline, \f(CW\*(C`\er\*(C'\fR for a carriage return and \f(CW\*(C`\ea\*(C'\fR for a
328bell. If your string is better thought of as a sequence of arbitrary
329bytes, the octal escape sequence, e.g., \f(CW\*(C`\e033\*(C'\fR, or hexadecimal escape
330sequence, e.g., \f(CW\*(C`\ex1B\*(C'\fR may be a more natural representation for your
331bytes. Here are some examples of escapes:
332.PP
333.Vb 4
334\& "1000\et2000" =~ m(0\et2) # matches
335\& "1000\en2000" =~ /0\en20/ # matches
336\& "1000\et2000" =~ /\e000\et2/ # doesn't match, "0" ne "\e000"
337\& "cat" =~ /\e143\ex61\ex74/ # matches, but a weird way to spell cat
338.Ve
339.PP
340If you've been around Perl a while, all this talk of escape sequences
341may seem familiar. Similar escape sequences are used in double-quoted
342strings and in fact the regexps in Perl are mostly treated as
343double-quoted strings. This means that variables can be used in
344regexps as well. Just like double-quoted strings, the values of the
345variables in the regexp will be substituted in before the regexp is
346evaluated for matching purposes. So we have:
347.PP
348.Vb 4
349\& $foo = 'house';
350\& 'housecat' =~ /$foo/; # matches
351\& 'cathouse' =~ /cat$foo/; # matches
352\& 'housecat' =~ /${foo}cat/; # matches
353.Ve
354.PP
355So far, so good. With the knowledge above you can already perform
356searches with just about any literal string regexp you can dream up.
357Here is a \fIvery simple\fR emulation of the Unix grep program:
358.PP
359.Vb 7
360\& % cat > simple_grep
361\& #!/usr/bin/perl
362\& $regexp = shift;
363\& while (<>) {
364\& print if /$regexp/;
365\& }
366\& ^D
367.Ve
368.PP
369.Vb 1
370\& % chmod +x simple_grep
371.Ve
372.PP
373.Vb 10
374\& % simple_grep abba /usr/dict/words
375\& Babbage
376\& cabbage
377\& cabbages
378\& sabbath
379\& Sabbathize
380\& Sabbathizes
381\& sabbatical
382\& scabbard
383\& scabbards
384.Ve
385.PP
386This program is easy to understand. \f(CW\*(C`#!/usr/bin/perl\*(C'\fR is the standard
387way to invoke a perl program from the shell.
388\&\f(CW\*(C`$regexp\ =\ shift;\*(C'\fR\ saves the first command line argument as the
389regexp to be used, leaving the rest of the command line arguments to
390be treated as files. \f(CW\*(C`while\ (<>)\*(C'\fR\ loops over all the lines in
391all the files. For each line, \f(CW\*(C`print\ if\ /$regexp/;\*(C'\fR\ prints the
392line if the regexp matches the line. In this line, both \f(CW\*(C`print\*(C'\fR and
393\&\f(CW\*(C`/$regexp/\*(C'\fR use the default variable \f(CW$_\fR implicitly.
394.PP
395With all of the regexps above, if the regexp matched anywhere in the
396string, it was considered a match. Sometimes, however, we'd like to
397specify \fIwhere\fR in the string the regexp should try to match. To do
398this, we would use the \fBanchor\fR metacharacters \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR. The
399anchor \f(CW\*(C`^\*(C'\fR means match at the beginning of the string and the anchor
400\&\f(CW\*(C`$\*(C'\fR means match at the end of the string, or before a newline at the
401end of the string. Here is how they are used:
402.PP
403.Vb 4
404\& "housekeeper" =~ /keeper/; # matches
405\& "housekeeper" =~ /^keeper/; # doesn't match
406\& "housekeeper" =~ /keeper$/; # matches
407\& "housekeeper\en" =~ /keeper$/; # matches
408.Ve
409.PP
410The second regexp doesn't match because \f(CW\*(C`^\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to
411match only at the beginning of the string, but \f(CW"housekeeper"\fR has
412keeper starting in the middle. The third regexp does match, since the
413\&\f(CW\*(C`$\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to match only at the end of the string.
414.PP
415When both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are used at the same time, the regexp has to
416match both the beginning and the end of the string, i.e., the regexp
417matches the whole string. Consider
418.PP
419.Vb 3
420\& "keeper" =~ /^keep$/; # doesn't match
421\& "keeper" =~ /^keeper$/; # matches
422\& "" =~ /^$/; # ^$ matches an empty string
423.Ve
424.PP
425The first regexp doesn't match because the string has more to it than
426\&\f(CW\*(C`keep\*(C'\fR. Since the second regexp is exactly the string, it
427matches. Using both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR in a regexp forces the complete
428string to match, so it gives you complete control over which strings
429match and which don't. Suppose you are looking for a fellow named
430bert, off in a string by himself:
431.PP
432.Vb 1
433\& "dogbert" =~ /bert/; # matches, but not what you want
434.Ve
435.PP
436.Vb 2
437\& "dilbert" =~ /^bert/; # doesn't match, but ..
438\& "bertram" =~ /^bert/; # matches, so still not good enough
439.Ve
440.PP
441.Vb 3
442\& "bertram" =~ /^bert$/; # doesn't match, good
443\& "dilbert" =~ /^bert$/; # doesn't match, good
444\& "bert" =~ /^bert$/; # matches, perfect
445.Ve
446.PP
447Of course, in the case of a literal string, one could just as easily
448use the string equivalence \f(CW\*(C`$string\ eq\ 'bert'\*(C'\fR\ and it would be
449more efficient. The \f(CW\*(C`^...$\*(C'\fR regexp really becomes useful when we
450add in the more powerful regexp tools below.
451.Sh "Using character classes"
452.IX Subsection "Using character classes"
453Although one can already do quite a lot with the literal string
454regexps above, we've only scratched the surface of regular expression
455technology. In this and subsequent sections we will introduce regexp
456concepts (and associated metacharacter notations) that will allow a
457regexp to not just represent a single character sequence, but a \fIwhole
458class\fR of them.
459.PP
460One such concept is that of a \fBcharacter class\fR. A character class
461allows a set of possible characters, rather than just a single
462character, to match at a particular point in a regexp. Character
463classes are denoted by brackets \f(CW\*(C`[...]\*(C'\fR, with the set of characters
464to be possibly matched inside. Here are some examples:
465.PP
466.Vb 4
467\& /cat/; # matches 'cat'
468\& /[bcr]at/; # matches 'bat', 'cat', or 'rat'
469\& /item[0123456789]/; # matches 'item0' or ... or 'item9'
470\& "abc" =~ /[cab]/; # matches 'a'
471.Ve
472.PP
473In the last statement, even though \f(CW'c'\fR is the first character in
474the class, \f(CW'a'\fR matches because the first character position in the
475string is the earliest point at which the regexp can match.
476.PP
477.Vb 2
478\& /[yY][eE][sS]/; # match 'yes' in a case-insensitive way
479\& # 'yes', 'Yes', 'YES', etc.
480.Ve
481.PP
482This regexp displays a common task: perform a case-insensitive
483match. Perl provides a way of avoiding all those brackets by simply
484appending an \f(CW'i'\fR to the end of the match. Then \f(CW\*(C`/[yY][eE][sS]/;\*(C'\fR
485can be rewritten as \f(CW\*(C`/yes/i;\*(C'\fR. The \f(CW'i'\fR stands for
486case-insensitive and is an example of a \fBmodifier\fR of the matching
487operation. We will meet other modifiers later in the tutorial.
488.PP
489We saw in the section above that there were ordinary characters, which
490represented themselves, and special characters, which needed a
491backslash \f(CW\*(C`\e\*(C'\fR to represent themselves. The same is true in a
492character class, but the sets of ordinary and special characters
493inside a character class are different than those outside a character
494class. The special characters for a character class are \f(CW\*(C`\-]\e^$\*(C'\fR. \f(CW\*(C`]\*(C'\fR
495is special because it denotes the end of a character class. \f(CW\*(C`$\*(C'\fR is
496special because it denotes a scalar variable. \f(CW\*(C`\e\*(C'\fR is special because
497it is used in escape sequences, just like above. Here is how the
498special characters \f(CW\*(C`]$\e\*(C'\fR are handled:
499.PP
500.Vb 5
501\& /[\e]c]def/; # matches ']def' or 'cdef'
502\& $x = 'bcr';
503\& /[$x]at/; # matches 'bat', 'cat', or 'rat'
504\& /[\e$x]at/; # matches '$at' or 'xat'
505\& /[\e\e$x]at/; # matches '\eat', 'bat', 'cat', or 'rat'
506.Ve
507.PP
508The last two are a little tricky. In \f(CW\*(C`[\e$x]\*(C'\fR, the backslash protects
509the dollar sign, so the character class has two members \f(CW\*(C`$\*(C'\fR and \f(CW\*(C`x\*(C'\fR.
510In \f(CW\*(C`[\e\e$x]\*(C'\fR, the backslash is protected, so \f(CW$x\fR is treated as a
511variable and substituted in double quote fashion.
512.PP
513The special character \f(CW'\-'\fR acts as a range operator within character
514classes, so that a contiguous set of characters can be written as a
515range. With ranges, the unwieldy \f(CW\*(C`[0123456789]\*(C'\fR and \f(CW\*(C`[abc...xyz]\*(C'\fR
516become the svelte \f(CW\*(C`[0\-9]\*(C'\fR and \f(CW\*(C`[a\-z]\*(C'\fR. Some examples are
517.PP
518.Vb 6
519\& /item[0-9]/; # matches 'item0' or ... or 'item9'
520\& /[0-9bx-z]aa/; # matches '0aa', ..., '9aa',
521\& # 'baa', 'xaa', 'yaa', or 'zaa'
522\& /[0-9a-fA-F]/; # matches a hexadecimal digit
523\& /[0-9a-zA-Z_]/; # matches a "word" character,
524\& # like those in a perl variable name
525.Ve
526.PP
527If \f(CW'\-'\fR is the first or last character in a character class, it is
528treated as an ordinary character; \f(CW\*(C`[\-ab]\*(C'\fR, \f(CW\*(C`[ab\-]\*(C'\fR and \f(CW\*(C`[a\e\-b]\*(C'\fR are
529all equivalent.
530.PP
531The special character \f(CW\*(C`^\*(C'\fR in the first position of a character class
532denotes a \fBnegated character class\fR, which matches any character but
533those in the brackets. Both \f(CW\*(C`[...]\*(C'\fR and \f(CW\*(C`[^...]\*(C'\fR must match a
534character, or the match fails. Then
535.PP
536.Vb 4
537\& /[^a]at/; # doesn't match 'aat' or 'at', but matches
538\& # all other 'bat', 'cat', '0at', '%at', etc.
539\& /[^0-9]/; # matches a non-numeric character
540\& /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary
541.Ve
542.PP
543Now, even \f(CW\*(C`[0\-9]\*(C'\fR can be a bother to write multiple times, so in the
544interest of saving keystrokes and making regexps more readable, Perl
545has several abbreviations for common character classes:
546.IP "\(bu" 4
547\&\ed is a digit and represents [0\-9]
548.IP "\(bu" 4
549\&\es is a whitespace character and represents [\e \et\er\en\ef]
550.IP "\(bu" 4
551\&\ew is a word character (alphanumeric or _) and represents [0\-9a\-zA\-Z_]
552.IP "\(bu" 4
553\&\eD is a negated \ed; it represents any character but a digit [^0\-9]
554.IP "\(bu" 4
555\&\eS is a negated \es; it represents any non-whitespace character [^\es]
556.IP "\(bu" 4
557\&\eW is a negated \ew; it represents any non-word character [^\ew]
558.IP "\(bu" 4
559The period '.' matches any character but \*(L"\en\*(R"
560.PP
561The \f(CW\*(C`\ed\es\ew\eD\eS\eW\*(C'\fR abbreviations can be used both inside and outside
562of character classes. Here are some in use:
563.PP
564.Vb 7
565\& /\ed\ed:\ed\ed:\ed\ed/; # matches a hh:mm:ss time format
566\& /[\ed\es]/; # matches any digit or whitespace character
567\& /\ew\eW\ew/; # matches a word char, followed by a
568\& # non-word char, followed by a word char
569\& /..rt/; # matches any two chars, followed by 'rt'
570\& /end\e./; # matches 'end.'
571\& /end[.]/; # same thing, matches 'end.'
572.Ve
573.PP
574Because a period is a metacharacter, it needs to be escaped to match
575as an ordinary period. Because, for example, \f(CW\*(C`\ed\*(C'\fR and \f(CW\*(C`\ew\*(C'\fR are sets
576of characters, it is incorrect to think of \f(CW\*(C`[^\ed\ew]\*(C'\fR as \f(CW\*(C`[\eD\eW]\*(C'\fR; in
577fact \f(CW\*(C`[^\ed\ew]\*(C'\fR is the same as \f(CW\*(C`[^\ew]\*(C'\fR, which is the same as
578\&\f(CW\*(C`[\eW]\*(C'\fR. Think DeMorgan's laws.
579.PP
580An anchor useful in basic regexps is the \fBword\ anchor\fR\
581\&\f(CW\*(C`\eb\*(C'\fR. This matches a boundary between a word character and a non-word
582character \f(CW\*(C`\ew\eW\*(C'\fR or \f(CW\*(C`\eW\ew\*(C'\fR:
583.PP
584.Vb 5
585\& $x = "Housecat catenates house and cat";
586\& $x =~ /cat/; # matches cat in 'housecat'
587\& $x =~ /\ebcat/; # matches cat in 'catenates'
588\& $x =~ /cat\eb/; # matches cat in 'housecat'
589\& $x =~ /\ebcat\eb/; # matches 'cat' at end of string
590.Ve
591.PP
592Note in the last example, the end of the string is considered a word
593boundary.
594.PP
595You might wonder why \f(CW'.'\fR matches everything but \f(CW"\en"\fR \- why not
596every character? The reason is that often one is matching against
597lines and would like to ignore the newline characters. For instance,
598while the string \f(CW"\en"\fR represents one line, we would like to think
599of it as empty. Then
600.PP
601.Vb 2
602\& "" =~ /^$/; # matches
603\& "\en" =~ /^$/; # matches, "\en" is ignored
604.Ve
605.PP
606.Vb 5
607\& "" =~ /./; # doesn't match; it needs a char
608\& "" =~ /^.$/; # doesn't match; it needs a char
609\& "\en" =~ /^.$/; # doesn't match; it needs a char other than "\en"
610\& "a" =~ /^.$/; # matches
611\& "a\en" =~ /^.$/; # matches, ignores the "\en"
612.Ve
613.PP
614This behavior is convenient, because we usually want to ignore
615newlines when we count and match characters in a line. Sometimes,
616however, we want to keep track of newlines. We might even want \f(CW\*(C`^\*(C'\fR
617and \f(CW\*(C`$\*(C'\fR to anchor at the beginning and end of lines within the
618string, rather than just the beginning and end of the string. Perl
619allows us to choose between ignoring and paying attention to newlines
620by using the \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR modifiers. \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR stand for
621single line and multi-line and they determine whether a string is to
622be treated as one continuous string, or as a set of lines. The two
623modifiers affect two aspects of how the regexp is interpreted: 1) how
624the \f(CW'.'\fR character class is defined, and 2) where the anchors \f(CW\*(C`^\*(C'\fR
625and \f(CW\*(C`$\*(C'\fR are able to match. Here are the four possible combinations:
626.IP "\(bu" 4
627no modifiers (//): Default behavior. \f(CW'.'\fR matches any character
628except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of the string and
629\&\f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the end.
630.IP "\(bu" 4
631s modifier (//s): Treat string as a single long line. \f(CW'.'\fR matches
632any character, even \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of
633the string and \f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the
634end.
635.IP "\(bu" 4
636m modifier (//m): Treat string as a set of multiple lines. \f(CW'.'\fR
637matches any character except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are able to match
638at the start or end of \fIany\fR line within the string.
639.IP "\(bu" 4
640both s and m modifiers (//sm): Treat string as a single long line, but
641detect multiple lines. \f(CW'.'\fR matches any character, even
642\&\f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR, however, are able to match at the start or end
643of \fIany\fR line within the string.
644.PP
645Here are examples of \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR in action:
646.PP
647.Vb 1
648\& $x = "There once was a girl\enWho programmed in Perl\en";
649.Ve
650.PP
651.Vb 4
652\& $x =~ /^Who/; # doesn't match, "Who" not at start of string
653\& $x =~ /^Who/s; # doesn't match, "Who" not at start of string
654\& $x =~ /^Who/m; # matches, "Who" at start of second line
655\& $x =~ /^Who/sm; # matches, "Who" at start of second line
656.Ve
657.PP
658.Vb 4
659\& $x =~ /girl.Who/; # doesn't match, "." doesn't match "\en"
660\& $x =~ /girl.Who/s; # matches, "." matches "\en"
661\& $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\en"
662\& $x =~ /girl.Who/sm; # matches, "." matches "\en"
663.Ve
664.PP
665Most of the time, the default behavior is what is wanted, but \f(CW\*(C`//s\*(C'\fR and
666\&\f(CW\*(C`//m\*(C'\fR are occasionally very useful. If \f(CW\*(C`//m\*(C'\fR is being used, the start
667of the string can still be matched with \f(CW\*(C`\eA\*(C'\fR and the end of string
668can still be matched with the anchors \f(CW\*(C`\eZ\*(C'\fR (matches both the end and
669the newline before, like \f(CW\*(C`$\*(C'\fR), and \f(CW\*(C`\ez\*(C'\fR (matches only the end):
670.PP
671.Vb 2
672\& $x =~ /^Who/m; # matches, "Who" at start of second line
673\& $x =~ /\eAWho/m; # doesn't match, "Who" is not at start of string
674.Ve
675.PP
676.Vb 2
677\& $x =~ /girl$/m; # matches, "girl" at end of first line
678\& $x =~ /girl\eZ/m; # doesn't match, "girl" is not at end of string
679.Ve
680.PP
681.Vb 2
682\& $x =~ /Perl\eZ/m; # matches, "Perl" is at newline before end
683\& $x =~ /Perl\ez/m; # doesn't match, "Perl" is not at end of string
684.Ve
685.PP
686We now know how to create choices among classes of characters in a
687regexp. What about choices among words or character strings? Such
688choices are described in the next section.
689.Sh "Matching this or that"
690.IX Subsection "Matching this or that"
691Sometimes we would like our regexp to be able to match different
692possible words or character strings. This is accomplished by using
693the \fBalternation\fR metacharacter \f(CW\*(C`|\*(C'\fR. To match \f(CW\*(C`dog\*(C'\fR or \f(CW\*(C`cat\*(C'\fR, we
694form the regexp \f(CW\*(C`dog|cat\*(C'\fR. As before, perl will try to match the
695regexp at the earliest possible point in the string. At each
696character position, perl will first try to match the first
697alternative, \f(CW\*(C`dog\*(C'\fR. If \f(CW\*(C`dog\*(C'\fR doesn't match, perl will then try the
698next alternative, \f(CW\*(C`cat\*(C'\fR. If \f(CW\*(C`cat\*(C'\fR doesn't match either, then the
699match fails and perl moves to the next position in the string. Some
700examples:
701.PP
702.Vb 2
703\& "cats and dogs" =~ /cat|dog|bird/; # matches "cat"
704\& "cats and dogs" =~ /dog|cat|bird/; # matches "cat"
705.Ve
706.PP
707Even though \f(CW\*(C`dog\*(C'\fR is the first alternative in the second regexp,
708\&\f(CW\*(C`cat\*(C'\fR is able to match earlier in the string.
709.PP
710.Vb 2
711\& "cats" =~ /c|ca|cat|cats/; # matches "c"
712\& "cats" =~ /cats|cat|ca|c/; # matches "cats"
713.Ve
714.PP
715Here, all the alternatives match at the first string position, so the
716first alternative is the one that matches. If some of the
717alternatives are truncations of the others, put the longest ones first
718to give them a chance to match.
719.PP
720.Vb 2
721\& "cab" =~ /a|b|c/ # matches "c"
722\& # /a|b|c/ == /[abc]/
723.Ve
724.PP
725The last example points out that character classes are like
726alternations of characters. At a given character position, the first
727alternative that allows the regexp match to succeed will be the one
728that matches.
729.Sh "Grouping things and hierarchical matching"
730.IX Subsection "Grouping things and hierarchical matching"
731Alternation allows a regexp to choose among alternatives, but by
732itself it is unsatisfying. The reason is that each alternative is a whole
733regexp, but sometimes we want alternatives for just part of a
734regexp. For instance, suppose we want to search for housecats or
735housekeepers. The regexp \f(CW\*(C`housecat|housekeeper\*(C'\fR fits the bill, but is
736inefficient because we had to type \f(CW\*(C`house\*(C'\fR twice. It would be nice to
737have parts of the regexp be constant, like \f(CW\*(C`house\*(C'\fR, and some
738parts have alternatives, like \f(CW\*(C`cat|keeper\*(C'\fR.
739.PP
740The \fBgrouping\fR metacharacters \f(CW\*(C`()\*(C'\fR solve this problem. Grouping
741allows parts of a regexp to be treated as a single unit. Parts of a
742regexp are grouped by enclosing them in parentheses. Thus we could solve
743the \f(CW\*(C`housecat|housekeeper\*(C'\fR problem by forming the regexp as
744\&\f(CW\*(C`house(cat|keeper)\*(C'\fR. The regexp \f(CW\*(C`house(cat|keeper)\*(C'\fR means match
745\&\f(CW\*(C`house\*(C'\fR followed by either \f(CW\*(C`cat\*(C'\fR or \f(CW\*(C`keeper\*(C'\fR. Some more examples
746are
747.PP
748.Vb 4
749\& /(a|b)b/; # matches 'ab' or 'bb'
750\& /(ac|b)b/; # matches 'acb' or 'bb'
751\& /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere
752\& /(a|[bc])d/; # matches 'ad', 'bd', or 'cd'
753.Ve
754.PP
755.Vb 3
756\& /house(cat|)/; # matches either 'housecat' or 'house'
757\& /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or
758\& # 'house'. Note groups can be nested.
759.Ve
760.PP
761.Vb 3
762\& /(19|20|)\ed\ed/; # match years 19xx, 20xx, or the Y2K problem, xx
763\& "20" =~ /(19|20|)\ed\ed/; # matches the null alternative '()\ed\ed',
764\& # because '20\ed\ed' can't match
765.Ve
766.PP
767Alternations behave the same way in groups as out of them: at a given
768string position, the leftmost alternative that allows the regexp to
769match is taken. So in the last example at the first string position,
770\&\f(CW"20"\fR matches the second alternative, but there is nothing left over
771to match the next two digits \f(CW\*(C`\ed\ed\*(C'\fR. So perl moves on to the next
772alternative, which is the null alternative and that works, since
773\&\f(CW"20"\fR is two digits.
774.PP
775The process of trying one alternative, seeing if it matches, and
776moving on to the next alternative if it doesn't, is called
777\&\fBbacktracking\fR. The term 'backtracking' comes from the idea that
778matching a regexp is like a walk in the woods. Successfully matching
779a regexp is like arriving at a destination. There are many possible
780trailheads, one for each string position, and each one is tried in
781order, left to right. From each trailhead there may be many paths,
782some of which get you there, and some which are dead ends. When you
783walk along a trail and hit a dead end, you have to backtrack along the
784trail to an earlier point to try another trail. If you hit your
785destination, you stop immediately and forget about trying all the
786other trails. You are persistent, and only if you have tried all the
787trails from all the trailheads and not arrived at your destination, do
788you declare failure. To be concrete, here is a step-by-step analysis
789of what perl does when it tries to match the regexp
790.PP
791.Vb 1
792\& "abcde" =~ /(abd|abc)(df|d|de)/;
793.Ve
794.IP "\(bu" 4
795Start with the first letter in the string 'a'.
796.IP "1" 4
797.IX Item "1"
798Try the first alternative in the first group 'abd'.
799.IP "2" 4
800.IX Item "2"
801Match 'a' followed by 'b'. So far so good.
802.IP "3" 4
803.IX Item "3"
804\&'d' in the regexp doesn't match 'c' in the string \- a dead
805end. So backtrack two characters and pick the second alternative in
806the first group 'abc'.
807.IP "4" 4
808.IX Item "4"
809Match 'a' followed by 'b' followed by 'c'. We are on a roll
810and have satisfied the first group. Set \f(CW$1\fR to 'abc'.
811.IP "5" 4
812.IX Item "5"
813Move on to the second group and pick the first alternative
814\&'df'.
815.IP "6" 4
816.IX Item "6"
817Match the 'd'.
818.IP "7" 4
819.IX Item "7"
820\&'f' in the regexp doesn't match 'e' in the string, so a dead
821end. Backtrack one character and pick the second alternative in the
822second group 'd'.
823.IP "8" 4
824.IX Item "8"
825\&'d' matches. The second grouping is satisfied, so set \f(CW$2\fR to
826\&'d'.
827.IP "9" 4
828.IX Item "9"
829We are at the end of the regexp, so we are done! We have
830matched 'abcd' out of the string \*(L"abcde\*(R".
831.PP
832There are a couple of things to note about this analysis. First, the
833third alternative in the second group 'de' also allows a match, but we
834stopped before we got to it \- at a given character position, leftmost
835wins. Second, we were able to get a match at the first character
836position of the string 'a'. If there were no matches at the first
837position, perl would move to the second character position 'b' and
838attempt the match all over again. Only when all possible paths at all
839possible character positions have been exhausted does perl give
840up and declare \f(CW\*(C`$string\ =~\ /(abd|abc)(df|d|de)/;\*(C'\fR\ to be false.
841.PP
842Even with all this work, regexp matching happens remarkably fast. To
843speed things up, during the compilation stage, perl compiles the regexp
844into a compact sequence of opcodes that can often fit inside a
845processor cache. When the code is executed, these opcodes can then run
846at full throttle and search very quickly.
847.Sh "Extracting matches"
848.IX Subsection "Extracting matches"
849The grouping metacharacters \f(CW\*(C`()\*(C'\fR also serve another completely
850different function: they allow the extraction of the parts of a string
851that matched. This is very useful to find out what matched and for
852text processing in general. For each grouping, the part that matched
853inside goes into the special variables \f(CW$1\fR, \f(CW$2\fR, etc. They can be
854used just as ordinary variables:
855.PP
856.Vb 5
857\& # extract hours, minutes, seconds
858\& $time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/; # match hh:mm:ss format
859\& $hours = $1;
860\& $minutes = $2;
861\& $seconds = $3;
862.Ve
863.PP
864Now, we know that in scalar context,
865\&\f(CW\*(C`$time\ =~\ /(\ed\ed):(\ed\ed):(\ed\ed)/\*(C'\fR\ returns a true or false
866value. In list context, however, it returns the list of matched values
867\&\f(CW\*(C`($1,$2,$3)\*(C'\fR. So we could write the code more compactly as
868.PP
869.Vb 2
870\& # extract hours, minutes, seconds
871\& ($hours, $minutes, $seconds) = ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/);
872.Ve
873.PP
874If the groupings in a regexp are nested, \f(CW$1\fR gets the group with the
875leftmost opening parenthesis, \f(CW$2\fR the next opening parenthesis,
876etc. For example, here is a complex regexp and the matching variables
877indicated below it:
878.PP
879.Vb 2
880\& /(ab(cd|ef)((gi)|j))/;
881\&  1  2      34
882.Ve
883.PP
884so that if the regexp matched, e.g., \f(CW$2\fR would contain 'cd' or 'ef'. For
885convenience, perl sets \f(CW$+\fR to the string held by the highest numbered
886\&\f(CW$1\fR, \f(CW$2\fR, ... that got assigned (and, somewhat related, \f(CW$^N\fR to the
887value of the \f(CW$1\fR, \f(CW$2\fR, ... most-recently assigned; i.e. the \f(CW$1\fR,
888\&\f(CW$2\fR, ... associated with the rightmost closing parenthesis used in the
889match).
890.PP
891Closely associated with the matching variables \f(CW$1\fR, \f(CW$2\fR, ... are
892the \fBbackreferences\fR \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... . Backreferences are simply
893matching variables that can be used \fIinside\fR a regexp. This is a
894really nice feature \- what matches later in a regexp can depend on
895what matched earlier in the regexp. Suppose we wanted to look
896for doubled words in text, like 'the the'. The following regexp finds
897all 3\-letter doubles with a space in between:
898.PP
899.Vb 1
900\& /(\ew\ew\ew)\es\e1/;
901.Ve
902.PP
903The grouping assigns a value to \e1, so that the same 3 letter sequence
904is used for both parts. Here are some words with repeated parts:
905.PP
906.Vb 7
907\& % simple_grep '^(\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew)\e1$' /usr/dict/words
908\& beriberi
909\& booboo
910\& coco
911\& mama
912\& murmur
913\& papa
914.Ve
915.PP
916The regexp has a single grouping which considers 4\-letter
917combinations, then 3\-letter combinations, etc. and uses \f(CW\*(C`\e1\*(C'\fR to look for
918a repeat. Although \f(CW$1\fR and \f(CW\*(C`\e1\*(C'\fR represent the same thing, care should be
919taken to use matched variables \f(CW$1\fR, \f(CW$2\fR, ... only outside a regexp
920and backreferences \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... only inside a regexp; not doing
921so may lead to surprising and/or undefined results.
922.PP
923In addition to what was matched, Perl 5.6.0 also provides the
924positions of what was matched with the \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR
925arrays. \f(CW\*(C`$\-[0]\*(C'\fR is the position of the start of the entire match and
926\&\f(CW$+[0]\fR is the position of the end. Similarly, \f(CW\*(C`$\-[n]\*(C'\fR is the
927position of the start of the \f(CW$n\fR match and \f(CW$+[n]\fR is the position
928of the end. If \f(CW$n\fR is undefined, so are \f(CW\*(C`$\-[n]\*(C'\fR and \f(CW$+[n]\fR. Then
929this code
930.PP
931.Vb 5
932\& $x = "Mmm...donut, thought Homer";
933\& $x =~ /^(Mmm|Yech)\e.\e.\e.(donut|peas)/; # matches
934\& foreach $expr (1..$#-) {
935\& print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\en";
936\& }
937.Ve
938.PP
939prints
940.PP
941.Vb 2
942\& Match 1: 'Mmm' at position (0,3)
943\& Match 2: 'donut' at position (6,11)
944.Ve
945.PP
946Even if there are no groupings in a regexp, it is still possible to
947find out what exactly matched in a string. If you use them, perl
948will set \f(CW$`\fR to the part of the string before the match, will set \f(CW$&\fR
949to the part of the string that matched, and will set \f(CW$'\fR to the part
950of the string after the match. An example:
951.PP
952.Vb 3
953\& $x = "the cat caught the mouse";
954\& $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse'
955\& $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse'
956.Ve
957.PP
958In the second match, \f(CW\*(C`$`\ =\ ''\*(C'\fR\ because the regexp matched at the
959first character position in the string and stopped; it never saw the
960second 'the'. It is important to note that using \f(CW$`\fR and \f(CW$'\fR
961slows down regexp matching quite a bit, and \f(CW $& \fR slows it down to a
962lesser extent, because if they are used in one regexp in a program,
963they are generated for \fIall\fR regexps in the program. So if raw
964performance is a goal of your application, they should be avoided.
965If you need them, use \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR instead:
966.PP
967.Vb 3
968\& $` is the same as substr( $x, 0, $-[0] )
969\& $& is the same as substr( $x, $-[0], $+[0]-$-[0] )
970\& $' is the same as substr( $x, $+[0] )
971.Ve
972.Sh "Matching repetitions"
973.IX Subsection "Matching repetitions"
974The examples in the previous section display an annoying weakness. We
975were only matching 3\-letter words, or syllables of 4 letters or
976less. We'd like to be able to match words or syllables of any length,
977without writing out tedious alternatives like
978\&\f(CW\*(C`\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew\*(C'\fR.
979.PP
980This is exactly the problem the \fBquantifier\fR metacharacters \f(CW\*(C`?\*(C'\fR,
981\&\f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR, and \f(CW\*(C`{}\*(C'\fR were created for. They allow us to determine the
982number of repeats of a portion of a regexp we consider to be a
983match. Quantifiers are put immediately after the character, character
984class, or grouping that we want to specify. They have the following
985meanings:
986.IP "\(bu" 4
987\&\f(CW\*(C`a?\*(C'\fR = match 'a' 1 or 0 times
988.IP "\(bu" 4
989\&\f(CW\*(C`a*\*(C'\fR = match 'a' 0 or more times, i.e., any number of times
990.IP "\(bu" 4
991\&\f(CW\*(C`a+\*(C'\fR = match 'a' 1 or more times, i.e., at least once
992.IP "\(bu" 4
993\&\f(CW\*(C`a{n,m}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but not more than \f(CW\*(C`m\*(C'\fR
994times.
995.IP "\(bu" 4
996\&\f(CW\*(C`a{n,}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times
997.IP "\(bu" 4
998\&\f(CW\*(C`a{n}\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times
999.PP
1000Here are some examples:
1001.PP
1002.Vb 9
1003\& /[a-z]+\es+\ed*/; # match a lowercase word, at least some space, and
1004\& # any number of digits
1005\& /(\ew+)\es+\e1/; # match doubled words of arbitrary length
1006\& /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes'
1007\& $year =~ /\ed{2,4}/; # make sure year is at least 2 but not more
1008\& # than 4 digits
1009\& $year =~ /\ed{4}|\ed{2}/; # better match; throw out 3 digit dates
1010\& $year =~ /\ed{2}(\ed{2})?/; # same thing written differently. However,
1011\& # this produces $1 and the other does not.
1012.Ve
1013.PP
1014.Vb 7
1015\& % simple_grep '^(\ew+)\e1$' /usr/dict/words # isn't this easier?
1016\& beriberi
1017\& booboo
1018\& coco
1019\& mama
1020\& murmur
1021\& papa
1022.Ve
1023.PP
1024For all of these quantifiers, perl will try to match as much of the
1025string as possible, while still allowing the regexp to succeed. Thus
1026with \f(CW\*(C`/a?.../\*(C'\fR, perl will first try to match the regexp with the \f(CW\*(C`a\*(C'\fR
1027present; if that fails, perl will try to match the regexp without the
1028\&\f(CW\*(C`a\*(C'\fR present. For the quantifier \f(CW\*(C`*\*(C'\fR, we get the following:
1029.PP
1030.Vb 5
1031\& $x = "the cat in the hat";
1032\& $x =~ /^(.*)(cat)(.*)$/; # matches,
1033\& # $1 = 'the '
1034\& # $2 = 'cat'
1035\& # $3 = ' in the hat'
1036.Ve
1037.PP
1038Which is what we might expect: the match finds the only \f(CW\*(C`cat\*(C'\fR in the
1039string and locks onto it. Consider, however, this regexp:
1040.PP
1041.Vb 4
1042\& $x =~ /^(.*)(at)(.*)$/; # matches,
1043\& # $1 = 'the cat in the h'
1044\& # $2 = 'at'
1045\& # $3 = '' (0 matches)
1046.Ve
1047.PP
1048One might initially guess that perl would find the \f(CW\*(C`at\*(C'\fR in \f(CW\*(C`cat\*(C'\fR and
1049stop there, but that wouldn't give the longest possible string to the
1050first quantifier \f(CW\*(C`.*\*(C'\fR. Instead, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs as
1051much of the string as possible while still having the regexp match. In
1052this example, that means having the \f(CW\*(C`at\*(C'\fR sequence with the final \f(CW\*(C`at\*(C'\fR
1053in the string. The other important principle illustrated here is that
1054when there are two or more elements in a regexp, the \fIleftmost\fR
1055quantifier, if there is one, gets to grab as much of the string as
1056possible, leaving the rest of the regexp to fight over scraps. Thus in
1057our example, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs most of the string, while
1058the second quantifier \f(CW\*(C`.*\*(C'\fR gets the empty string. Quantifiers that
1059grab as much of the string as possible are called \fBmaximal match\fR or
1060\&\fBgreedy\fR quantifiers.
1061.PP
1062When a regexp can match a string in several different ways, we can use
1063the principles above to predict which way the regexp will match:
1064.IP "\(bu" 4
1065Principle 0: Taken as a whole, any regexp will be matched at the
1066earliest possible position in the string.
1067.IP "\(bu" 4
1068Principle 1: In an alternation \f(CW\*(C`a|b|c...\*(C'\fR, the leftmost alternative
1069that allows a match for the whole regexp will be the one used.
1070.IP "\(bu" 4
1071Principle 2: The maximal matching quantifiers \f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR and
1072\&\f(CW\*(C`{n,m}\*(C'\fR will in general match as much of the string as possible while
1073still allowing the whole regexp to match.
1074.IP "\(bu" 4
1075Principle 3: If there are two or more elements in a regexp, the
1076leftmost greedy quantifier, if any, will match as much of the string
1077as possible while still allowing the whole regexp to match. The next
1078leftmost greedy quantifier, if any, will try to match as much of the
1079string remaining available to it as possible, while still allowing the
1080whole regexp to match. And so on, until all the regexp elements are
1081satisfied.
1082.PP
1083As we have seen above, Principle 0 overrides the others \- the regexp
1084will be matched as early as possible, with the other principles
1085determining how the regexp matches at that earliest character
1086position.
1087.PP
1088Here is an example of these principles in action:
1089.PP
1090.Vb 5
1091\& $x = "The programming republic of Perl";
1092\& $x =~ /^(.+)(e|r)(.*)$/; # matches,
1093\& # $1 = 'The programming republic of Pe'
1094\& # $2 = 'r'
1095\& # $3 = 'l'
1096.Ve
1097.PP
1098This regexp matches at the earliest string position, \f(CW'T'\fR. One
1099might think that \f(CW\*(C`e\*(C'\fR, being leftmost in the alternation, would be
1100matched, but \f(CW\*(C`r\*(C'\fR produces the longest string in the first quantifier.
1101.PP
1102.Vb 3
1103\& $x =~ /(m{1,2})(.*)$/; # matches,
1104\& # $1 = 'mm'
1105\& # $2 = 'ing republic of Perl'
1106.Ve
1107.PP
1108Here, the earliest possible match is at the first \f(CW'm'\fR in
1109\&\f(CW\*(C`programming\*(C'\fR. \f(CW\*(C`m{1,2}\*(C'\fR is the first quantifier, so it gets to match
1110a maximal \f(CW\*(C`mm\*(C'\fR.
1111.PP
1112.Vb 3
1113\& $x =~ /.*(m{1,2})(.*)$/; # matches,
1114\& # $1 = 'm'
1115\& # $2 = 'ing republic of Perl'
1116.Ve
1117.PP
1118Here, the regexp matches at the start of the string. The first
1119quantifier \f(CW\*(C`.*\*(C'\fR grabs as much as possible, leaving just a single
1120\&\f(CW'm'\fR for the second quantifier \f(CW\*(C`m{1,2}\*(C'\fR.
1121.PP
1122.Vb 4
1123\& $x =~ /(.?)(m{1,2})(.*)$/; # matches,
1124\& # $1 = 'a'
1125\& # $2 = 'mm'
1126\& # $3 = 'ing republic of Perl'
1127.Ve
1128.PP
1129Here, \f(CW\*(C`.?\*(C'\fR eats its maximal one character at the earliest possible
1130position in the string, \f(CW'a'\fR in \f(CW\*(C`programming\*(C'\fR, leaving \f(CW\*(C`m{1,2}\*(C'\fR
1131the opportunity to match both \f(CW\*(C`m\*(C'\fR's. Finally,
1132.PP
1133.Vb 1
1134\& "aXXXb" =~ /(X*)/; # matches with $1 = ''
1135.Ve
1136.PP
1137because it can match zero copies of \f(CW'X'\fR at the beginning of the
1138string. If you definitely want to match at least one \f(CW'X'\fR, use
1139\&\f(CW\*(C`X+\*(C'\fR, not \f(CW\*(C`X*\*(C'\fR.
1140.PP
1141Sometimes greed is not good. At times, we would like quantifiers to
1142match a \fIminimal\fR piece of string, rather than a maximal piece. For
1143this purpose, Larry Wall created the \fBminimal\ match\fR\ or
1144\&\fBnon-greedy\fR quantifiers \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, and \f(CW\*(C`{}?\*(C'\fR. These are
1145the usual quantifiers with a \f(CW\*(C`?\*(C'\fR appended to them. They have the
1146following meanings:
1147.IP "\(bu" 4
1148\&\f(CW\*(C`a??\*(C'\fR = match 'a' 0 or 1 times. Try 0 first, then 1.
1149.IP "\(bu" 4
1150\&\f(CW\*(C`a*?\*(C'\fR = match 'a' 0 or more times, i.e., any number of times,
1151but as few times as possible
1152.IP "\(bu" 4
1153\&\f(CW\*(C`a+?\*(C'\fR = match 'a' 1 or more times, i.e., at least once, but
1154as few times as possible
1155.IP "\(bu" 4
1156\&\f(CW\*(C`a{n,m}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, not more than \f(CW\*(C`m\*(C'\fR
1157times, as few times as possible
1158.IP "\(bu" 4
1159\&\f(CW\*(C`a{n,}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but as few times as
1160possible
1161.IP "\(bu" 4
1162\&\f(CW\*(C`a{n}?\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times. Because we match exactly
1163\&\f(CW\*(C`n\*(C'\fR times, \f(CW\*(C`a{n}?\*(C'\fR is equivalent to \f(CW\*(C`a{n}\*(C'\fR and is just there for
1164notational consistency.
1165.PP
1166Let's look at the example above, but with minimal quantifiers:
1167.PP
1168.Vb 5
1169\& $x = "The programming republic of Perl";
1170\& $x =~ /^(.+?)(e|r)(.*)$/; # matches,
1171\& # $1 = 'Th'
1172\& # $2 = 'e'
1173\& # $3 = ' programming republic of Perl'
1174.Ve
1175.PP
1176The minimal string that will allow both the start of the string \f(CW\*(C`^\*(C'\fR
1177and the alternation to match is \f(CW\*(C`Th\*(C'\fR, with the alternation \f(CW\*(C`e|r\*(C'\fR
1178matching \f(CW\*(C`e\*(C'\fR. The second quantifier \f(CW\*(C`.*\*(C'\fR is free to gobble up the
1179rest of the string.
1180.PP
1181.Vb 3
1182\& $x =~ /(m{1,2}?)(.*?)$/; # matches,
1183\& # $1 = 'm'
1184\& # $2 = 'ming republic of Perl'
1185.Ve
1186.PP
1187The first string position that this regexp can match is at the first
1188\&\f(CW'm'\fR in \f(CW\*(C`programming\*(C'\fR. At this position, the minimal \f(CW\*(C`m{1,2}?\*(C'\fR
1189matches just one \f(CW'm'\fR. Although the second quantifier \f(CW\*(C`.*?\*(C'\fR would
1190prefer to match no characters, it is constrained by the end-of-string
1191anchor \f(CW\*(C`$\*(C'\fR to match the rest of the string.
1192.PP
1193.Vb 4
1194\& $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches,
1195\& # $1 = 'The progra'
1196\& # $2 = 'm'
1197\& # $3 = 'ming republic of Perl'
1198.Ve
1199.PP
1200In this regexp, you might expect the first minimal quantifier \f(CW\*(C`.*?\*(C'\fR
1201to match the empty string, because it is not constrained by a \f(CW\*(C`^\*(C'\fR
1202anchor to match the beginning of the word. Principle 0 applies here,
1203however. Because it is possible for the whole regexp to match at the
1204start of the string, it \fIwill\fR match at the start of the string. Thus
1205the first quantifier has to match everything up to the first \f(CW\*(C`m\*(C'\fR. The
1206second minimal quantifier matches just one \f(CW\*(C`m\*(C'\fR and the third
1207quantifier matches the rest of the string.
1208.PP
1209.Vb 4
1210\& $x =~ /(.??)(m{1,2})(.*)$/; # matches,
1211\& # $1 = 'a'
1212\& # $2 = 'mm'
1213\& # $3 = 'ing republic of Perl'
1214.Ve
1215.PP
1216Just as in the previous regexp, the first quantifier \f(CW\*(C`.??\*(C'\fR can match
1217earliest at position \f(CW'a'\fR, so it does. The second quantifier is
1218greedy, so it matches \f(CW\*(C`mm\*(C'\fR, and the third matches the rest of the
1219string.
1220.PP
1221We can modify principle 3 above to take into account non-greedy
1222quantifiers:
1223.IP "\(bu" 4
1224Principle 3: If there are two or more elements in a regexp, the
1225leftmost greedy (non\-greedy) quantifier, if any, will match as much
1226(little) of the string as possible while still allowing the whole
1227regexp to match. The next leftmost greedy (non\-greedy) quantifier, if
1228any, will try to match as much (little) of the string remaining
1229available to it as possible, while still allowing the whole regexp to
1230match. And so on, until all the regexp elements are satisfied.
1231.PP
1232Just like alternation, quantifiers are also susceptible to
1233backtracking. Here is a step-by-step analysis of the example
1234.PP
1235.Vb 5
1236\& $x = "the cat in the hat";
1237\& $x =~ /^(.*)(at)(.*)$/; # matches,
1238\& # $1 = 'the cat in the h'
1239\& # $2 = 'at'
1240\& # $3 = '' (0 matches)
1241.Ve
1242.IP "\(bu" 4
1243Start with the first letter in the string 't'.
1244.IP "1" 4
1245.IX Item "1"
1246The first quantifier '.*' starts out by matching the whole
1247string 'the cat in the hat'.
1248.IP "2" 4
1249.IX Item "2"
1250\&'a' in the regexp element 'at' doesn't match the end of the
1251string. Backtrack one character.
1252.IP "3" 4
1253.IX Item "3"
1254\&'a' in the regexp element 'at' still doesn't match the last
1255letter of the string 't', so backtrack one more character.
1256.IP "4" 4
1257.IX Item "4"
1258Now we can match the 'a' and the 't'.
1259.IP "5" 4
1260.IX Item "5"
1261Move on to the third element '.*'. Since we are at the end of
1262the string and '.*' can match 0 times, assign it the empty string.
1263.IP "6" 4
1264.IX Item "6"
1265We are done!
1266.PP
1267Most of the time, all this moving forward and backtracking happens
1268quickly and searching is fast. There are some pathological regexps,
1269however, whose execution time exponentially grows with the size of the
1270string. A typical structure that blows up in your face is of the form
1271.PP
1272.Vb 1
1273\& /(a|b+)*/;
1274.Ve
1275.PP
1276The problem is the nested indeterminate quantifiers. There are many
1277different ways of partitioning a string of length n between the \f(CW\*(C`+\*(C'\fR
1278and \f(CW\*(C`*\*(C'\fR: one repetition with \f(CW\*(C`b+\*(C'\fR of length n, two repetitions with
1279the first \f(CW\*(C`b+\*(C'\fR length k and the second with length n\-k, m repetitions
1280whose bits add up to length n, etc. In fact there are an exponential
1281number of ways to partition a string as a function of length. A
1282regexp may get lucky and match early in the process, but if there is
1283no match, perl will try \fIevery\fR possibility before giving up. So be
1284careful with nested \f(CW\*(C`*\*(C'\fR's, \f(CW\*(C`{n,m}\*(C'\fR's, and \f(CW\*(C`+\*(C'\fR's. The book
1285\&\fIMastering Regular Expressions\fR by Jeffrey Friedl gives a wonderful
1286discussion of this and other efficiency issues.
1287.Sh "Building a regexp"
1288.IX Subsection "Building a regexp"
1289At this point, we have all the basic regexp concepts covered, so let's
1290give a more involved example of a regular expression. We will build a
1291regexp that matches numbers.
1292.PP
1293The first task in building a regexp is to decide what we want to match
1294and what we want to exclude. In our case, we want to match both
1295integers and floating point numbers and we want to reject any string
1296that isn't a number.
1297.PP
1298The next task is to break the problem down into smaller problems that
1299are easily converted into a regexp.
1300.PP
1301The simplest case is integers. These consist of a sequence of digits,
1302with an optional sign in front. The digits we can represent with
1303\&\f(CW\*(C`\ed+\*(C'\fR and the sign can be matched with \f(CW\*(C`[+\-]\*(C'\fR. Thus the integer
1304regexp is
1305.PP
1306.Vb 1
1307\& /[+-]?\ed+/; # matches integers
1308.Ve
1309.PP
1310A floating point number potentially has a sign, an integral part, a
1311decimal point, a fractional part, and an exponent. One or more of these
1312parts is optional, so we need to check out the different
1313possibilities. Floating point numbers which are in proper form include
1314123., 0.345, .34, \-1e6, and 25.4E\-72. As with integers, the sign out
1315front is completely optional and can be matched by \f(CW\*(C`[+\-]?\*(C'\fR. We can
1316see that if there is no exponent, floating point numbers must have a
1317decimal point, otherwise they are integers. We might be tempted to
1318model these with \f(CW\*(C`\ed*\e.\ed*\*(C'\fR, but this would also match just a single
1319decimal point, which is not a number. So the three cases of floating
1320point number sans exponent are
1321.PP
1322.Vb 3
1323\& /[+-]?\ed+\e./; # 1., 321., etc.
1324\& /[+-]?\e.\ed+/; # .1, .234, etc.
1325\& /[+-]?\ed+\e.\ed+/; # 1.0, 30.56, etc.
1326.Ve
1327.PP
1328These can be combined into a single regexp with a three-way alternation:
1329.PP
1330.Vb 1
1331\& /[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+)/; # floating point, no exponent
1332.Ve
1333.PP
1334In this alternation, it is important to put \f(CW'\ed+\e.\ed+'\fR before
1335\&\f(CW'\ed+\e.'\fR. If \f(CW'\ed+\e.'\fR were first, the regexp would happily match that
1336and ignore the fractional part of the number.
1337.PP
1338Now consider floating point numbers with exponents. The key
1339observation here is that \fIboth\fR integers and numbers with decimal
1340points are allowed in front of an exponent. Then exponents, like the
1341overall sign, are independent of whether we are matching numbers with
1342or without decimal points, and can be 'decoupled' from the
1343mantissa. The overall form of the regexp now becomes clear:
1344.PP
1345.Vb 1
1346\& /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/;
1347.Ve
1348.PP
1349The exponent is an \f(CW\*(C`e\*(C'\fR or \f(CW\*(C`E\*(C'\fR, followed by an integer. So the
1350exponent regexp is
1351.PP
1352.Vb 1
1353\& /[eE][+-]?\ed+/; # exponent
1354.Ve
1355.PP
1356Putting all the parts together, we get a regexp that matches numbers:
1357.PP
1358.Vb 1
1359\& /^[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+|\ed+)([eE][+-]?\ed+)?$/; # Ta da!
1360.Ve
1361.PP
1362Long regexps like this may impress your friends, but can be hard to
1363decipher. In complex situations like this, the \f(CW\*(C`//x\*(C'\fR modifier for a
1364match is invaluable. It allows one to put nearly arbitrary whitespace
1365and comments into a regexp without affecting its meaning. Using it,
1366we can rewrite our 'extended' regexp in the more pleasing form
1367.PP
1368.Vb 10
1369\& /^
1370\& [+-]? # first, match an optional sign
1371\& ( # then match integers or f.p. mantissas:
1372\& \ed+\e.\ed+ # mantissa of the form a.b
1373\& |\ed+\e. # mantissa of the form a.
1374\& |\e.\ed+ # mantissa of the form .b
1375\& |\ed+ # integer of the form a
1376\& )
1377\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
1378\& $/x;
1379.Ve
1380.PP
1381If whitespace is mostly irrelevant, how does one include space
1382characters in an extended regexp? The answer is to backslash it
1383\&\f(CW'\e\ '\fR\ or put it in a character class \f(CW\*(C`[\ ]\*(C'\fR\ . The same thing
1384goes for pound signs, use \f(CW\*(C`\e#\*(C'\fR or \f(CW\*(C`[#]\*(C'\fR. For instance, Perl allows
1385a space between the sign and the mantissa/integer, and we could add
1386this to our regexp as follows:
1387.PP
1388.Vb 10
1389\& /^
1390\& [+-]?\e * # first, match an optional sign *and space*
1391\& ( # then match integers or f.p. mantissas:
1392\& \ed+\e.\ed+ # mantissa of the form a.b
1393\& |\ed+\e. # mantissa of the form a.
1394\& |\e.\ed+ # mantissa of the form .b
1395\& |\ed+ # integer of the form a
1396\& )
1397\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
1398\& $/x;
1399.Ve
1400.PP
1401In this form, it is easier to see a way to simplify the
1402alternation. Alternatives 1, 2, and 4 all start with \f(CW\*(C`\ed+\*(C'\fR, so it
1403could be factored out:
1404.PP
1405.Vb 11
1406\& /^
1407\& [+-]?\e * # first, match an optional sign
1408\& ( # then match integers or f.p. mantissas:
1409\& \ed+ # start out with a ...
1410\& (
1411\& \e.\ed* # mantissa of the form a.b or a.
1412\& )? # ? takes care of integers of the form a
1413\& |\e.\ed+ # mantissa of the form .b
1414\& )
1415\& ([eE][+-]?\ed+)? # finally, optionally match an exponent
1416\& $/x;
1417.Ve
1418.PP
1419or written in the compact form,
1420.PP
1421.Vb 1
1422\& /^[+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?$/;
1423.Ve
1424.PP
1425This is our final regexp. To recap, we built a regexp by
1426.IP "\(bu" 4
1427specifying the task in detail,
1428.IP "\(bu" 4
1429breaking down the problem into smaller parts,
1430.IP "\(bu" 4
1431translating the small parts into regexps,
1432.IP "\(bu" 4
1433combining the regexps,
1434.IP "\(bu" 4
1435and optimizing the final combined regexp.
1436.PP
1437These are also the typical steps involved in writing a computer
1438program. This makes perfect sense, because regular expressions are
1439essentially programs written in a little computer language that specifies
1440patterns.
1441.Sh "Using regular expressions in Perl"
1442.IX Subsection "Using regular expressions in Perl"
1443The last topic of Part 1 briefly covers how regexps are used in Perl
1444programs. Where do they fit into Perl syntax?
1445.PP
1446We have already introduced the matching operator in its default
1447\&\f(CW\*(C`/regexp/\*(C'\fR and arbitrary delimiter \f(CW\*(C`m!regexp!\*(C'\fR forms. We have used
1448the binding operator \f(CW\*(C`=~\*(C'\fR and its negation \f(CW\*(C`!~\*(C'\fR to test for string
1449matches. Associated with the matching operator, we have discussed the
1450single line \f(CW\*(C`//s\*(C'\fR, multi-line \f(CW\*(C`//m\*(C'\fR, case-insensitive \f(CW\*(C`//i\*(C'\fR and
1451extended \f(CW\*(C`//x\*(C'\fR modifiers.
1452.PP
1453There are a few more things you might want to know about matching
1454operators. First, we pointed out earlier that variables in regexps are
1455substituted before the regexp is evaluated:
1456.PP
1457.Vb 4
1458\& $pattern = 'Seuss';
1459\& while (<>) {
1460\& print if /$pattern/;
1461\& }
1462.Ve
1463.PP
1464This will print any lines containing the word \f(CW\*(C`Seuss\*(C'\fR. It is not as
1465efficient as it could be, however, because perl has to re-evaluate
1466\&\f(CW$pattern\fR each time through the loop. If \f(CW$pattern\fR won't be
1467changing over the lifetime of the script, we can add the \f(CW\*(C`//o\*(C'\fR
1468modifier, which directs perl to only perform variable substitutions
1469once:
1470.PP
1471.Vb 6
1472\& #!/usr/bin/perl
1473\& # Improved simple_grep
1474\& $regexp = shift;
1475\& while (<>) {
1476\& print if /$regexp/o; # a good deal faster
1477\& }
1478.Ve
1479.PP
1480If you change \f(CW$pattern\fR after the first substitution happens, perl
1481will ignore it. If you don't want any substitutions at all, use the
1482special delimiter \f(CW\*(C`m''\*(C'\fR:
1483.PP
1484.Vb 4
1485\& $pattern = 'Seuss';
1486\& while (<>) {
1487\& print if m'$pattern'; # matches '$pattern', not 'Seuss'
1488\& }
1489.Ve
1490.PP
1491\&\f(CW\*(C`m''\*(C'\fR acts like single quotes on a regexp; all other \f(CW\*(C`m\*(C'\fR delimiters
1492act like double quotes. If the regexp evaluates to the empty string,
1493the regexp in the \fIlast successful match\fR is used instead. So we have
1494.PP
1495.Vb 2
1496\& "dog" =~ /d/; # 'd' matches
1497\& "dogbert" =~ //; # this matches the 'd' regexp used before
1498.Ve
1499.PP
1500The final two modifiers \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`//c\*(C'\fR concern multiple matches.
1501The modifier \f(CW\*(C`//g\*(C'\fR stands for global matching and allows the
1502matching operator to match within a string as many times as possible.
1503In scalar context, successive invocations against a string will have
1504\&\f(CW\*(C`//g\*(C'\fR jump from match to match, keeping track of position in the
1505string as it goes along. You can get or set the position with the
1506\&\f(CW\*(C`pos()\*(C'\fR function.
1507.PP
1508The use of \f(CW\*(C`//g\*(C'\fR is shown in the following example. Suppose we have
1509a string that consists of words separated by spaces. If we know how
1510many words there are in advance, we could extract the words using
1511groupings:
1512.PP
1513.Vb 5
1514\& $x = "cat dog house"; # 3 words
1515\& $x =~ /^\es*(\ew+)\es+(\ew+)\es+(\ew+)\es*$/; # matches,
1516\& # $1 = 'cat'
1517\& # $2 = 'dog'
1518\& # $3 = 'house'
1519.Ve
1520.PP
1521But what if we had an indeterminate number of words? This is the sort
1522of task \f(CW\*(C`//g\*(C'\fR was made for. To extract all words, form the simple
1523regexp \f(CW\*(C`(\ew+)\*(C'\fR and loop over all matches with \f(CW\*(C`/(\ew+)/g\*(C'\fR:
1524.PP
1525.Vb 3
1526\& while ($x =~ /(\ew+)/g) {
1527\& print "Word is $1, ends at position ", pos $x, "\en";
1528\& }
1529.Ve
1530.PP
1531prints
1532.PP
1533.Vb 3
1534\& Word is cat, ends at position 3
1535\& Word is dog, ends at position 7
1536\& Word is house, ends at position 13
1537.Ve
1538.PP
1539A failed match or changing the target string resets the position. If
1540you don't want the position reset after failure to match, add the
1541\&\f(CW\*(C`//c\*(C'\fR modifier, as in \f(CW\*(C`/regexp/gc\*(C'\fR. The current position in the string is
1542associated with the string, not the regexp. This means that different
1543strings have different positions and their respective positions can be
1544set or read independently.
1545.PP
1546In list context, \f(CW\*(C`//g\*(C'\fR returns a list of matched groupings, or if
1547there are no groupings, a list of matches to the whole regexp. So if
1548we wanted just the words, we could use
1549.PP
1550.Vb 4
1551\& @words = ($x =~ /(\ew+)/g); # matches,
1552\& # $words[0] = 'cat'
1553\& # $words[1] = 'dog'
1554\& # $words[2] = 'house'
1555.Ve
1556.PP
1557Closely associated with the \f(CW\*(C`//g\*(C'\fR modifier is the \f(CW\*(C`\eG\*(C'\fR anchor. The
1558\&\f(CW\*(C`\eG\*(C'\fR anchor matches at the point where the previous \f(CW\*(C`//g\*(C'\fR match left
1559off. \f(CW\*(C`\eG\*(C'\fR allows us to easily do context-sensitive matching:
1560.PP
1561.Vb 12
1562\& $metric = 1; # use metric units
1563\& ...
1564\& $x = <FILE>; # read in measurement
1565\& $x =~ /^([+-]?\ed+)\es*/g; # get magnitude
1566\& $weight = $1;
1567\& if ($metric) { # error checking
1568\& print "Units error!" unless $x =~ /\eGkg\e./g;
1569\& }
1570\& else {
1571\& print "Units error!" unless $x =~ /\eGlbs\e./g;
1572\& }
1573\& $x =~ /\eG\es+(widget|sprocket)/g; # continue processing
1574.Ve
1575.PP
1576The combination of \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR allows us to process the string a
1577bit at a time and use arbitrary Perl logic to decide what to do next.
1578Currently, the \f(CW\*(C`\eG\*(C'\fR anchor is only fully supported when used to anchor
1579to the start of the pattern.
1580.PP
1581\&\f(CW\*(C`\eG\*(C'\fR is also invaluable in processing fixed length records with
1582regexps. Suppose we have a snippet of coding region \s-1DNA\s0, encoded as
1583base pair letters \f(CW\*(C`ATCGTTGAAT...\*(C'\fR and we want to find all the stop
1584codons \f(CW\*(C`TGA\*(C'\fR. In a coding region, codons are 3\-letter sequences, so
1585we can think of the \s-1DNA\s0 snippet as a sequence of 3\-letter records. The
1586naive regexp
1587.PP
1588.Vb 3
1589\& # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC"
1590\& $dna = "ATCGTTGAATGCAAATGACATGAC";
1591\& $dna =~ /TGA/;
1592.Ve
1593.PP
1594doesn't work; it may match a \f(CW\*(C`TGA\*(C'\fR, but there is no guarantee that
1595the match is aligned with codon boundaries, e.g., the substring
1596\&\f(CW\*(C`GTT\ GAA\*(C'\fR\ gives a match. A better solution is
1597.PP
1598.Vb 3
1599\& while ($dna =~ /(\ew\ew\ew)*?TGA/g) { # note the minimal *?
1600\& print "Got a TGA stop codon at position ", pos $dna, "\en";
1601\& }
1602.Ve
1603.PP
1604which prints
1605.PP
1606.Vb 2
1607\& Got a TGA stop codon at position 18
1608\& Got a TGA stop codon at position 23
1609.Ve
1610.PP
1611Position 18 is good, but position 23 is bogus. What happened?
1612.PP
1613The answer is that our regexp works well until we get past the last
1614real match. Then the regexp will fail to match a synchronized \f(CW\*(C`TGA\*(C'\fR
1615and start stepping ahead one character position at a time, not what we
1616want. The solution is to use \f(CW\*(C`\eG\*(C'\fR to anchor the match to the codon
1617alignment:
1618.PP
1619.Vb 3
1620\& while ($dna =~ /\eG(\ew\ew\ew)*?TGA/g) {
1621\& print "Got a TGA stop codon at position ", pos $dna, "\en";
1622\& }
1623.Ve
1624.PP
1625This prints
1626.PP
1627.Vb 1
1628\& Got a TGA stop codon at position 18
1629.Ve
1630.PP
1631which is the correct answer. This example illustrates that it is
1632important not only to match what is desired, but to reject what is not
1633desired.
1634.PP
1635\&\fBsearch and replace\fR
1636.PP
1637Regular expressions also play a big role in \fBsearch and replace\fR
1638operations in Perl. Search and replace is accomplished with the
1639\&\f(CW\*(C`s///\*(C'\fR operator. The general form is
1640\&\f(CW\*(C`s/regexp/replacement/modifiers\*(C'\fR, with everything we know about
1641regexps and modifiers applying in this case as well. The
1642\&\f(CW\*(C`replacement\*(C'\fR is a Perl double quoted string that replaces in the
1643string whatever is matched with the \f(CW\*(C`regexp\*(C'\fR. The operator \f(CW\*(C`=~\*(C'\fR is
1644also used here to associate a string with \f(CW\*(C`s///\*(C'\fR. If matching
1645against \f(CW$_\fR, the \f(CW\*(C`$_\ =~\*(C'\fR\ can be dropped. If there is a match,
1646\&\f(CW\*(C`s///\*(C'\fR returns the number of substitutions made, otherwise it returns
1647false. Here are a few examples:
1648.PP
1649.Vb 8
1650\& $x = "Time to feed the cat!";
1651\& $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!"
1652\& if ($x =~ s/^(Time.*hacker)!$/$1 now!/) {
1653\& $more_insistent = 1;
1654\& }
1655\& $y = "'quoted words'";
1656\& $y =~ s/^'(.*)'$/$1/; # strip single quotes,
1657\& # $y contains "quoted words"
1658.Ve
1659.PP
1660In the last example, the whole string was matched, but only the part
1661inside the single quotes was grouped. With the \f(CW\*(C`s///\*(C'\fR operator, the
1662matched variables \f(CW$1\fR, \f(CW$2\fR, etc. are immediately available for use
1663in the replacement expression, so we use \f(CW$1\fR to replace the quoted
1664string with just what was quoted. With the global modifier, \f(CW\*(C`s///g\*(C'\fR
1665will search and replace all occurrences of the regexp in the string:
1666.PP
1667.Vb 6
1668\& $x = "I batted 4 for 4";
1669\& $x =~ s/4/four/; # doesn't do it all:
1670\& # $x contains "I batted four for 4"
1671\& $x = "I batted 4 for 4";
1672\& $x =~ s/4/four/g; # does it all:
1673\& # $x contains "I batted four for four"
1674.Ve
1675.PP
1676If you prefer 'regex' over 'regexp' in this tutorial, you could use
1677the following program to replace it:
1678.PP
1679.Vb 9
1680\& % cat > simple_replace
1681\& #!/usr/bin/perl
1682\& $regexp = shift;
1683\& $replacement = shift;
1684\& while (<>) {
1685\& s/$regexp/$replacement/go;
1686\& print;
1687\& }
1688\& ^D
1689.Ve
1690.PP
1691.Vb 1
1692\& % simple_replace regexp regex perlretut.pod
1693.Ve
1694.PP
1695In \f(CW\*(C`simple_replace\*(C'\fR we used the \f(CW\*(C`s///g\*(C'\fR modifier to replace all
1696occurrences of the regexp on each line and the \f(CW\*(C`s///o\*(C'\fR modifier to
1697compile the regexp only once. As with \f(CW\*(C`simple_grep\*(C'\fR, both the
1698\&\f(CW\*(C`print\*(C'\fR and the \f(CW\*(C`s/$regexp/$replacement/go\*(C'\fR use \f(CW$_\fR implicitly.
1699.PP
1700A modifier available specifically to search and replace is the
1701\&\f(CW\*(C`s///e\*(C'\fR evaluation modifier. \f(CW\*(C`s///e\*(C'\fR wraps an \f(CW\*(C`eval{...}\*(C'\fR around
1702the replacement string and the evaluated result is substituted for the
1703matched substring. \f(CW\*(C`s///e\*(C'\fR is useful if you need to do a bit of
1704computation in the process of replacing text. This example counts
1705character frequencies in a line:
1706.PP
1707.Vb 4
1708\& $x = "Bill the cat";
1709\& $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself
1710\& print "frequency of '$_' is $chars{$_}\en"
1711\& foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars);
1712.Ve
1713.PP
1714This prints
1715.PP
1716.Vb 9
1717\& frequency of ' ' is 2
1718\& frequency of 't' is 2
1719\& frequency of 'l' is 2
1720\& frequency of 'B' is 1
1721\& frequency of 'c' is 1
1722\& frequency of 'e' is 1
1723\& frequency of 'h' is 1
1724\& frequency of 'i' is 1
1725\& frequency of 'a' is 1
1726.Ve
1727.PP
1728As with the match \f(CW\*(C`m//\*(C'\fR operator, \f(CW\*(C`s///\*(C'\fR can use other delimiters,
1729such as \f(CW\*(C`s!!!\*(C'\fR and \f(CW\*(C`s{}{}\*(C'\fR, and even \f(CW\*(C`s{}//\*(C'\fR. If single quotes are
1730used \f(CW\*(C`s'''\*(C'\fR, then the regexp and replacement are treated as single
1731quoted strings and there are no substitutions. \f(CW\*(C`s///\*(C'\fR in list context
1732returns the same thing as in scalar context, i.e., the number of
1733matches.
1734.PP
1735\&\fBThe split operator\fR
1736.PP
1737The \fB\f(CB\*(C`split\*(C'\fB \fR function can also optionally use a matching operator
1738\&\f(CW\*(C`m//\*(C'\fR to split a string. \f(CW\*(C`split /regexp/, string, limit\*(C'\fR splits
1739\&\f(CW\*(C`string\*(C'\fR into a list of substrings and returns that list. The regexp
1740is used to match the character sequence that the \f(CW\*(C`string\*(C'\fR is split
1741with respect to. The \f(CW\*(C`limit\*(C'\fR, if present, constrains splitting into
1742no more than \f(CW\*(C`limit\*(C'\fR number of strings. For example, to split a
1743string into words, use
1744.PP
1745.Vb 4
1746\& $x = "Calvin and Hobbes";
1747\& @words = split /\es+/, $x; # $words[0] = 'Calvin'
1748\& # $words[1] = 'and'
1749\& # $words[2] = 'Hobbes'
1750.Ve
1751.PP
1752If the empty regexp \f(CW\*(C`//\*(C'\fR is used, the regexp always matches and
1753the string is split into individual characters. If the regexp has
1754groupings, then the list produced contains the matched substrings from the
1755groupings as well. For instance,
1756.PP
1757.Vb 12
1758\& $x = "/usr/bin/perl";
1759\& @dirs = split m!/!, $x; # $dirs[0] = ''
1760\& # $dirs[1] = 'usr'
1761\& # $dirs[2] = 'bin'
1762\& # $dirs[3] = 'perl'
1763\& @parts = split m!(/)!, $x; # $parts[0] = ''
1764\& # $parts[1] = '/'
1765\& # $parts[2] = 'usr'
1766\& # $parts[3] = '/'
1767\& # $parts[4] = 'bin'
1768\& # $parts[5] = '/'
1769\& # $parts[6] = 'perl'
1770.Ve
1771.PP
1772Since the first character of \f(CW$x\fR matched the regexp, \f(CW\*(C`split\*(C'\fR prepended
1773an empty initial element to the list.
1774.PP
1775If you have read this far, congratulations! You now have all the basic
1776tools needed to use regular expressions to solve a wide range of text
1777processing problems. If this is your first time through the tutorial,
1778why not stop here and play around with regexps a while... Part\ 2
1779concerns the more esoteric aspects of regular expressions and those
1780concepts certainly aren't needed right at the start.
1781.SH "Part 2: Power tools"
1782.IX Header "Part 2: Power tools"
1783\&\s-1OK\s0, you know the basics of regexps and you want to know more. If
1784matching regular expressions is analogous to a walk in the woods, then
1785the tools discussed in Part 1 are analogous to topo maps and a
1786compass, basic tools we use all the time. Most of the tools in Part 2
1787are analogous to flare guns and satellite phones. They aren't used
1788too often on a hike, but when we are stuck, they can be invaluable.
1789.PP
1790What follows are the more advanced, less used, or sometimes esoteric
1791capabilities of perl regexps. In Part 2, we will assume you are
1792comfortable with the basics and concentrate on the new features.
1793.Sh "More on characters, strings, and character classes"
1794.IX Subsection "More on characters, strings, and character classes"
1795There are a number of escape sequences and character classes that we
1796haven't covered yet.
1797.PP
1798There are several escape sequences that convert characters or strings
1799between upper and lower case. \f(CW\*(C`\el\*(C'\fR and \f(CW\*(C`\eu\*(C'\fR convert the next
1800character to lower or upper case, respectively:
1801.PP
1802.Vb 4
1803\& $x = "perl";
1804\& $string =~ /\eu$x/; # matches 'Perl' in $string
1805\& $x = "M(rs?|s)\e\e."; # note the double backslash
1806\& $string =~ /\el$x/; # matches 'mr.', 'mrs.', and 'ms.'
1807.Ve
1808.PP
1809\&\f(CW\*(C`\eL\*(C'\fR and \f(CW\*(C`\eU\*(C'\fR convert a whole substring, delimited by \f(CW\*(C`\eL\*(C'\fR or
1810\&\f(CW\*(C`\eU\*(C'\fR and \f(CW\*(C`\eE\*(C'\fR, to lower or upper case:
1811.PP
1812.Vb 4
1813\& $x = "This word is in lower case:\eL SHOUT\eE";
1814\& $x =~ /shout/; # matches
1815\& $x = "I STILL KEYPUNCH CARDS FOR MY 360";
1816\& $x =~ /\eUkeypunch/; # matches punch card string
1817.Ve
1818.PP
1819If there is no \f(CW\*(C`\eE\*(C'\fR, case is converted until the end of the
1820string. The regexps \f(CW\*(C`\eL\eu$word\*(C'\fR or \f(CW\*(C`\eu\eL$word\*(C'\fR convert the first
1821character of \f(CW$word\fR to uppercase and the rest of the characters to
1822lowercase.
1823.PP
1824Control characters can be escaped with \f(CW\*(C`\ec\*(C'\fR, so that a control-Z
1825character would be matched with \f(CW\*(C`\ecZ\*(C'\fR. The escape sequence
1826\&\f(CW\*(C`\eQ\*(C'\fR...\f(CW\*(C`\eE\*(C'\fR quotes, or protects most non-alphabetic characters. For
1827instance,
1828.PP
1829.Vb 2
1830\& $x = "\eQThat !^*&%~& cat!";
1831\& $x =~ /\eQ!^*&%~&\eE/; # check for rough language
1832.Ve
1833.PP
1834It does not protect \f(CW\*(C`$\*(C'\fR or \f(CW\*(C`@\*(C'\fR, so that variables can still be
1835substituted.
1836.PP
1837With the advent of 5.6.0, perl regexps can handle more than just the
1838standard \s-1ASCII\s0 character set. Perl now supports \fBUnicode\fR, a standard
1839for encoding the character sets from many of the world's written
1840languages. Unicode does this by allowing characters to be more than
1841one byte wide. Perl uses the \s-1UTF\-8\s0 encoding, in which \s-1ASCII\s0 characters
1842are still encoded as one byte, but characters greater than \f(CW\*(C`chr(127)\*(C'\fR
1843may be stored as two or more bytes.
1844.PP
1845What does this mean for regexps? Well, regexp users don't need to know
1846much about perl's internal representation of strings. But they do need
1847to know 1) how to represent Unicode characters in a regexp and 2) when
1848a matching operation will treat the string to be searched as a
1849sequence of bytes (the old way) or as a sequence of Unicode characters
1850(the new way). The answer to 1) is that Unicode characters greater
1851than \f(CW\*(C`chr(127)\*(C'\fR may be represented using the \f(CW\*(C`\ex{hex}\*(C'\fR notation,
1852with \f(CW\*(C`hex\*(C'\fR a hexadecimal integer:
1853.PP
1854.Vb 1
1855\& /\ex{263a}/; # match a Unicode smiley face :)
1856.Ve
1857.PP
1858Unicode characters in the range of 128\-255 use two hexadecimal digits
1859with braces: \f(CW\*(C`\ex{ab}\*(C'\fR. Note that this is different than \f(CW\*(C`\exab\*(C'\fR,
1860which is just a hexadecimal byte with no Unicode significance.
1861.PP
1862\&\fB\s-1NOTE\s0\fR: in Perl 5.6.0 it used to be that one needed to say \f(CW\*(C`use
1863utf8\*(C'\fR to use any Unicode features. This is no longer the case: for
1864almost all Unicode processing, the explicit \f(CW\*(C`utf8\*(C'\fR pragma is not
1865needed. (The only case where it matters is if your Perl script is in
1866Unicode and encoded in \s-1UTF\-8\s0, then an explicit \f(CW\*(C`use utf8\*(C'\fR is needed.)
1867.PP
1868Figuring out the hexadecimal sequence of a Unicode character you want
1869or deciphering someone else's hexadecimal Unicode regexp is about as
1870much fun as programming in machine code. So another way to specify
1871Unicode characters is to use the \fBnamed\ character\fR\ escape
1872sequence \f(CW\*(C`\eN{name}\*(C'\fR. \f(CW\*(C`name\*(C'\fR is a name for the Unicode character, as
1873specified in the Unicode standard. For instance, if we wanted to
1874represent or match the astrological sign for the planet Mercury, we
1875could use
1876.PP
1877.Vb 3
1878\& use charnames ":full"; # use named chars with Unicode full names
1879\& $x = "abc\eN{MERCURY}def";
1880\& $x =~ /\eN{MERCURY}/; # matches
1881.Ve
1882.PP
1883One can also use short names or restrict names to a certain alphabet:
1884.PP
1885.Vb 2
1886\& use charnames ':full';
1887\& print "\eN{GREEK SMALL LETTER SIGMA} is called sigma.\en";
1888.Ve
1889.PP
1890.Vb 2
1891\& use charnames ":short";
1892\& print "\eN{greek:Sigma} is an upper-case sigma.\en";
1893.Ve
1894.PP
1895.Vb 2
1896\& use charnames qw(greek);
1897\& print "\eN{sigma} is Greek sigma\en";
1898.Ve
1899.PP
1900A list of full names is found in the file Names.txt in the
1901lib/perl5/5.X.X/unicore directory.
1902.PP
1903The answer to requirement 2), as of 5.6.0, is that if a regexp
1904contains Unicode characters, the string is searched as a sequence of
1905Unicode characters. Otherwise, the string is searched as a sequence of
1906bytes. If the string is being searched as a sequence of Unicode
1907characters, but matching a single byte is required, we can use the \f(CW\*(C`\eC\*(C'\fR
1908escape sequence. \f(CW\*(C`\eC\*(C'\fR is a character class akin to \f(CW\*(C`.\*(C'\fR except that
1909it matches \fIany\fR byte 0\-255. So
1910.PP
1911.Vb 7
1912\& use charnames ":full"; # use named chars with Unicode full names
1913\& $x = "a";
1914\& $x =~ /\eC/; # matches 'a', eats one byte
1915\& $x = "";
1916\& $x =~ /\eC/; # doesn't match, no bytes to match
1917\& $x = "\eN{MERCURY}"; # multi-byte Unicode character
1918\& $x =~ /\eC/; # matches, but dangerous!
1919.Ve
1920.PP
1921The last regexp matches, but is dangerous because the string
1922\&\fIcharacter\fR position is no longer synchronized to the string \fIbyte\fR
1923position. This generates the warning 'Malformed \s-1UTF\-8\s0
1924character'. \f(CW\*(C`\eC\*(C'\fR is best used for matching the binary data in strings
1925with binary data intermixed with Unicode characters.
1926.PP
1927Let us now discuss the rest of the character classes. Just as with
1928Unicode characters, there are named Unicode character classes
1929represented by the \f(CW\*(C`\ep{name}\*(C'\fR escape sequence. Closely associated is
1930the \f(CW\*(C`\eP{name}\*(C'\fR character class, which is the negation of the
1931\&\f(CW\*(C`\ep{name}\*(C'\fR class. For example, to match lower and uppercase
1932characters,
1933.PP
1934.Vb 6
1935\& use charnames ":full"; # use named chars with Unicode full names
1936\& $x = "BOB";
1937\& $x =~ /^\ep{IsUpper}/; # matches, uppercase char class
1938\& $x =~ /^\eP{IsUpper}/; # doesn't match, char class sans uppercase
1939\& $x =~ /^\ep{IsLower}/; # doesn't match, lowercase char class
1940\& $x =~ /^\eP{IsLower}/; # matches, char class sans lowercase
1941.Ve
1942.PP
1943Here is the association between some Perl named classes and the
1944traditional Unicode classes:
1945.PP
1946.Vb 1
1947\& Perl class name Unicode class name or regular expression
1948.Ve
1949.PP
1950.Vb 15
1951\& IsAlpha /^[LM]/
1952\& IsAlnum /^[LMN]/
1953\& IsASCII $code <= 127
1954\& IsCntrl /^C/
1955\& IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/
1956\& IsDigit Nd
1957\& IsGraph /^([LMNPS]|Co)/
1958\& IsLower Ll
1959\& IsPrint /^([LMNPS]|Co|Zs)/
1960\& IsPunct /^P/
1961\& IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/)
1962\& IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D|0085|2028|2029)$/)
1963\& IsUpper /^L[ut]/
1964\& IsWord /^[LMN]/ || $code eq "005F"
1965\& IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/
1966.Ve
1967.PP
1968You can also use the official Unicode class names with the \f(CW\*(C`\ep\*(C'\fR and
1969\&\f(CW\*(C`\eP\*(C'\fR, like \f(CW\*(C`\ep{L}\*(C'\fR for Unicode 'letters', or \f(CW\*(C`\ep{Lu}\*(C'\fR for uppercase
1970letters, or \f(CW\*(C`\eP{Nd}\*(C'\fR for non\-digits. If a \f(CW\*(C`name\*(C'\fR is just one
1971letter, the braces can be dropped. For instance, \f(CW\*(C`\epM\*(C'\fR is the
1972character class of Unicode 'marks', for example accent marks.
1973For the full list see perlunicode.
1974.PP
1975Unicode has also been separated into various sets of characters
1976which you can test with \f(CW\*(C`\ep{In...}\*(C'\fR (in) and \f(CW\*(C`\eP{In...}\*(C'\fR (not in),
1977for example \f(CW\*(C`\ep{Latin}\*(C'\fR, \f(CW\*(C`\ep{Greek}\*(C'\fR, or \f(CW\*(C`\eP{Katakana}\*(C'\fR.
1978For the full list see perlunicode.
1979.PP
1980\&\f(CW\*(C`\eX\*(C'\fR is an abbreviation for a character class sequence that includes
1981the Unicode 'combining character sequences'. A 'combining character
1982sequence' is a base character followed by any number of combining
1983characters. An example of a combining character is an accent. Using
1984the Unicode full names, e.g., \f(CW\*(C`A\ +\ COMBINING\ RING\*(C'\fR\ is a combining
1985character sequence with base character \f(CW\*(C`A\*(C'\fR and combining character
1986\&\f(CW\*(C`COMBINING\ RING\*(C'\fR\ , which translates in Danish to A with the circle
1987atop it, as in the word Angstrom. \f(CW\*(C`\eX\*(C'\fR is equivalent to \f(CW\*(C`\ePM\epM*\*(C'\fR,
1988i.e., a non-mark followed by zero or more marks.
1989.PP
1990For the full and latest information about Unicode see the latest
1991Unicode standard, or the Unicode Consortium's website http://www.unicode.org/
1992.PP
1993As if all those classes weren't enough, Perl also defines \s-1POSIX\s0 style
1994character classes. These have the form \f(CW\*(C`[:name:]\*(C'\fR, with \f(CW\*(C`name\*(C'\fR the
1995name of the \s-1POSIX\s0 class. The \s-1POSIX\s0 classes are \f(CW\*(C`alpha\*(C'\fR, \f(CW\*(C`alnum\*(C'\fR,
1996\&\f(CW\*(C`ascii\*(C'\fR, \f(CW\*(C`cntrl\*(C'\fR, \f(CW\*(C`digit\*(C'\fR, \f(CW\*(C`graph\*(C'\fR, \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`print\*(C'\fR, \f(CW\*(C`punct\*(C'\fR,
1997\&\f(CW\*(C`space\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, and \f(CW\*(C`xdigit\*(C'\fR, and two extensions, \f(CW\*(C`word\*(C'\fR (a Perl
1998extension to match \f(CW\*(C`\ew\*(C'\fR), and \f(CW\*(C`blank\*(C'\fR (a \s-1GNU\s0 extension). If \f(CW\*(C`utf8\*(C'\fR
1999is being used, then these classes are defined the same as their
2000corresponding perl Unicode classes: \f(CW\*(C`[:upper:]\*(C'\fR is the same as
2001\&\f(CW\*(C`\ep{IsUpper}\*(C'\fR, etc. The \s-1POSIX\s0 character classes, however, don't
2002require using \f(CW\*(C`utf8\*(C'\fR. The \f(CW\*(C`[:digit:]\*(C'\fR, \f(CW\*(C`[:word:]\*(C'\fR, and
2003\&\f(CW\*(C`[:space:]\*(C'\fR correspond to the familiar \f(CW\*(C`\ed\*(C'\fR, \f(CW\*(C`\ew\*(C'\fR, and \f(CW\*(C`\es\*(C'\fR
2004character classes. To negate a \s-1POSIX\s0 class, put a \f(CW\*(C`^\*(C'\fR in front of
2005the name, so that, e.g., \f(CW\*(C`[:^digit:]\*(C'\fR corresponds to \f(CW\*(C`\eD\*(C'\fR and under
2006\&\f(CW\*(C`utf8\*(C'\fR, \f(CW\*(C`\eP{IsDigit}\*(C'\fR. The Unicode and \s-1POSIX\s0 character classes can
2007be used just like \f(CW\*(C`\ed\*(C'\fR, with the exception that \s-1POSIX\s0 character
2008classes can only be used inside of a character class:
2009.PP
2010.Vb 7
2011\& /\es+[abc[:digit:]xyz]\es*/; # match a,b,c,x,y,z, or a digit
2012\& /^=item\es[[:digit:]]/; # match '=item',
2013\& # followed by a space and a digit
2014\& use charnames ":full";
2015\& /\es+[abc\ep{IsDigit}xyz]\es+/; # match a,b,c,x,y,z, or a digit
2016\& /^=item\es\ep{IsDigit}/; # match '=item',
2017\& # followed by a space and a digit
2018.Ve
2019.PP
2020Whew! That is all the rest of the characters and character classes.
2021.Sh "Compiling and saving regular expressions"
2022.IX Subsection "Compiling and saving regular expressions"
2023In Part 1 we discussed the \f(CW\*(C`//o\*(C'\fR modifier, which compiles a regexp
2024just once. This suggests that a compiled regexp is some data structure
2025that can be stored once and used again and again. The regexp quote
2026\&\f(CW\*(C`qr//\*(C'\fR does exactly that: \f(CW\*(C`qr/string/\*(C'\fR compiles the \f(CW\*(C`string\*(C'\fR as a
2027regexp and transforms the result into a form that can be assigned to a
2028variable:
2029.PP
2030.Vb 1
2031\& $reg = qr/foo+bar?/; # reg contains a compiled regexp
2032.Ve
2033.PP
2034Then \f(CW$reg\fR can be used as a regexp:
2035.PP
2036.Vb 3
2037\& $x = "fooooba";
2038\& $x =~ $reg; # matches, just like /foo+bar?/
2039\& $x =~ /$reg/; # same thing, alternate form
2040.Ve
2041.PP
2042\&\f(CW$reg\fR can also be interpolated into a larger regexp:
2043.PP
2044.Vb 1
2045\& $x =~ /(abc)?$reg/; # still matches
2046.Ve
2047.PP
2048As with the matching operator, the regexp quote can use different
2049delimiters, e.g., \f(CW\*(C`qr!!\*(C'\fR, \f(CW\*(C`qr{}\*(C'\fR and \f(CW\*(C`qr~~\*(C'\fR. The single quote
2050delimiters \f(CW\*(C`qr''\*(C'\fR prevent any interpolation from taking place.
2051.PP
2052Pre-compiled regexps are useful for creating dynamic matches that
2053don't need to be recompiled each time they are encountered. Using
2054pre-compiled regexps, the \f(CW\*(C`simple_grep\*(C'\fR program can be expanded into a
2055program that matches multiple patterns:
2056.PP
2057.Vb 4
2058\& % cat > multi_grep
2059\& #!/usr/bin/perl
2060\& # multi_grep - match any of <number> regexps
2061\& # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ...
2062.Ve
2063.PP
2064.Vb 12
2065\& $number = shift;
2066\& $regexp[$_] = shift foreach (0..$number-1);
2067\& @compiled = map qr/$_/, @regexp;
2068\& while ($line = <>) {
2069\& foreach $pattern (@compiled) {
2070\& if ($line =~ /$pattern/) {
2071\& print $line;
2072\& last; # we matched, so move onto the next line
2073\& }
2074\& }
2075\& }
2076\& ^D
2077.Ve
2078.PP
2079.Vb 4
2080\& % multi_grep 2 last for multi_grep
2081\& $regexp[$_] = shift foreach (0..$number-1);
2082\& foreach $pattern (@compiled) {
2083\& last;
2084.Ve
2085.PP
2086Storing pre-compiled regexps in an array \f(CW@compiled\fR allows us to
2087simply loop through the regexps without any recompilation, thus gaining
2088flexibility without sacrificing speed.
2089.Sh "Embedding comments and modifiers in a regular expression"
2090.IX Subsection "Embedding comments and modifiers in a regular expression"
2091Starting with this section, we will be discussing Perl's set of
2092\&\fBextended patterns\fR. These are extensions to the traditional regular
2093expression syntax that provide powerful new tools for pattern
2094matching. We have already seen extensions in the form of the minimal
2095matching constructs \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{n,m}?\*(C'\fR, and \f(CW\*(C`{n,}?\*(C'\fR. The
2096rest of the extensions below have the form \f(CW\*(C`(?char...)\*(C'\fR, where the
2097\&\f(CW\*(C`char\*(C'\fR is a character that determines the type of extension.
2098.PP
2099The first extension is an embedded comment \f(CW\*(C`(?#text)\*(C'\fR. This embeds a
2100comment into the regular expression without affecting its meaning. The
2101comment should not have any closing parentheses in the text. An
2102example is
2103.PP
2104.Vb 1
2105\& /(?# Match an integer:)[+-]?\ed+/;
2106.Ve
2107.PP
2108This style of commenting has been largely superseded by the raw,
2109freeform commenting that is allowed with the \f(CW\*(C`//x\*(C'\fR modifier.
2110.PP
2111The modifiers \f(CW\*(C`//i\*(C'\fR, \f(CW\*(C`//m\*(C'\fR, \f(CW\*(C`//s\*(C'\fR, and \f(CW\*(C`//x\*(C'\fR can also be embedded in
2112a regexp using \f(CW\*(C`(?i)\*(C'\fR, \f(CW\*(C`(?m)\*(C'\fR, \f(CW\*(C`(?s)\*(C'\fR, and \f(CW\*(C`(?x)\*(C'\fR. For instance,
2113.PP
2114.Vb 7
2115\& /(?i)yes/; # match 'yes' case insensitively
2116\& /yes/i; # same thing
2117\& /(?x)( # freeform version of an integer regexp
2118\& [+-]? # match an optional sign
2119\& \ed+ # match a sequence of digits
2120\& )
2121\& /x;
2122.Ve
2123.PP
2124Embedded modifiers can have two important advantages over the usual
2125modifiers. Embedded modifiers allow a custom set of modifiers to
2126\&\fIeach\fR regexp pattern. This is great for matching an array of regexps
2127that must have different modifiers:
2128.PP
2129.Vb 8
2130\& $pattern[0] = '(?i)doctor';
2131\& $pattern[1] = 'Johnson';
2132\& ...
2133\& while (<>) {
2134\& foreach $patt (@pattern) {
2135\& print if /$patt/;
2136\& }
2137\& }
2138.Ve
2139.PP
2140The second advantage is that embedded modifiers only affect the regexp
2141inside the group the embedded modifier is contained in. So grouping
2142can be used to localize the modifier's effects:
2143.PP
2144.Vb 1
2145\& /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc.
2146.Ve
2147.PP
2148Embedded modifiers can also turn off any modifiers already present
2149by using, e.g., \f(CW\*(C`(?\-i)\*(C'\fR. Modifiers can also be combined into
2150a single expression, e.g., \f(CW\*(C`(?s\-i)\*(C'\fR turns on single line mode and
2151turns off case insensitivity.
2152.Sh "Non-capturing groupings"
2153.IX Subsection "Non-capturing groupings"
2154We noted in Part 1 that groupings \f(CW\*(C`()\*(C'\fR had two distinct functions: 1)
2155group regexp elements together as a single unit, and 2) extract, or
2156capture, substrings that matched the regexp in the
2157grouping. Non-capturing groupings, denoted by \f(CW\*(C`(?:regexp)\*(C'\fR, allow the
2158regexp to be treated as a single unit, but don't extract substrings or
2159set matching variables \f(CW$1\fR, etc. Both capturing and non-capturing
2160groupings are allowed to co-exist in the same regexp. Because there is
2161no extraction, non-capturing groupings are faster than capturing
2162groupings. Non-capturing groupings are also handy for choosing exactly
2163which parts of a regexp are to be extracted to matching variables:
2164.PP
2165.Vb 2
2166\& # match a number, $1-$4 are set, but we only want $1
2167\& /([+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?)/;
2168.Ve
2169.PP
2170.Vb 2
2171\& # match a number faster, only $1 is set
2172\& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE][+-]?\ed+)?)/;
2173.Ve
2174.PP
2175.Vb 2
2176\& # match a number, get $1 = whole number, $2 = exponent
2177\& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE]([+-]?\ed+))?)/;
2178.Ve
2179.PP
2180Non-capturing groupings are also useful for removing nuisance
2181elements gathered from a split operation:
2182.PP
2183.Vb 3
2184\& $x = '12a34b5';
2185\& @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5')
2186\& @num = split /(?:a|b)/, $x; # @num = ('12','34','5')
2187.Ve
2188.PP
2189Non-capturing groupings may also have embedded modifiers:
2190\&\f(CW\*(C`(?i\-m:regexp)\*(C'\fR is a non-capturing grouping that matches \f(CW\*(C`regexp\*(C'\fR
2191case insensitively and turns off multi-line mode.
2192.Sh "Looking ahead and looking behind"
2193.IX Subsection "Looking ahead and looking behind"
2194This section concerns the lookahead and lookbehind assertions. First,
2195a little background.
2196.PP
2197In Perl regular expressions, most regexp elements 'eat up' a certain
2198amount of string when they match. For instance, the regexp element
2199\&\f(CW\*(C`[abc]\*(C'\fR eats up one character of the string when it matches, in the
2200sense that perl moves to the next character position in the string
2201after the match. There are some elements, however, that don't eat up
2202characters (advance the character position) if they match. The examples
2203we have seen so far are the anchors. The anchor \f(CW\*(C`^\*(C'\fR matches the
2204beginning of the line, but doesn't eat any characters. Similarly, the
2205word boundary anchor \f(CW\*(C`\eb\*(C'\fR matches, e.g., if the character to the left
2206is a word character and the character to the right is a non-word
2207character, but it doesn't eat up any characters itself. Anchors are
2208examples of 'zero\-width assertions'. Zero\-width, because they consume
2209no characters, and assertions, because they test some property of the
2210string. In the context of our walk in the woods analogy to regexp
2211matching, most regexp elements move us along a trail, but anchors have
2212us stop a moment and check our surroundings. If the local environment
2213checks out, we can proceed forward. But if the local environment
2214doesn't satisfy us, we must backtrack.
2215.PP
2216Checking the environment entails either looking ahead on the trail,
2217looking behind, or both. \f(CW\*(C`^\*(C'\fR looks behind, to see that there are no
2218characters before. \f(CW\*(C`$\*(C'\fR looks ahead, to see that there are no
2219characters after. \f(CW\*(C`\eb\*(C'\fR looks both ahead and behind, to see if the
2220characters on either side differ in their 'word'\-ness.
2221.PP
2222The lookahead and lookbehind assertions are generalizations of the
2223anchor concept. Lookahead and lookbehind are zero-width assertions
2224that let us specify which characters we want to test for. The
2225lookahead assertion is denoted by \f(CW\*(C`(?=regexp)\*(C'\fR and the lookbehind
2226assertion is denoted by \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR. Some examples are
2227.PP
2228.Vb 8
2229\& $x = "I catch the housecat 'Tom-cat' with catnip";
2230\& $x =~ /cat(?=\es+)/; # matches 'cat' in 'housecat'
2231\& @catwords = ($x =~ /(?<=\es)cat\ew+/g); # matches,
2232\& # $catwords[0] = 'catch'
2233\& # $catwords[1] = 'catnip'
2234\& $x =~ /\ebcat\eb/; # matches 'cat' in 'Tom-cat'
2235\& $x =~ /(?<=\es)cat(?=\es)/; # doesn't match; no isolated 'cat' in
2236\& # middle of $x
2237.Ve
2238.PP
2239Note that the parentheses in \f(CW\*(C`(?=regexp)\*(C'\fR and \f(CW\*(C`(?<=regexp)\*(C'\fR are
2240non\-capturing, since these are zero-width assertions. Thus in the
2241second regexp, the substrings captured are those of the whole regexp
2242itself. Lookahead \f(CW\*(C`(?=regexp)\*(C'\fR can match arbitrary regexps, but
2243lookbehind \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR only works for regexps of fixed
2244width, i.e., a fixed number of characters long. Thus
2245\&\f(CW\*(C`(?<=(ab|bc))\*(C'\fR is fine, but \f(CW\*(C`(?<=(ab)*)\*(C'\fR is not. The
2246negated versions of the lookahead and lookbehind assertions are
2247denoted by \f(CW\*(C`(?!regexp)\*(C'\fR and \f(CW\*(C`(?<!fixed\-regexp)\*(C'\fR respectively.
2248They evaluate true if the regexps do \fInot\fR match:
2249.PP
2250.Vb 4
2251\& $x = "foobar";
2252\& $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo'
2253\& $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo'
2254\& $x =~ /(?<!\es)foo/; # matches, there is no \es before 'foo'
2255.Ve
2256.Sh "Using independent subexpressions to prevent backtracking"
2257.IX Subsection "Using independent subexpressions to prevent backtracking"
2258The last few extended patterns in this tutorial are experimental as of
22595.6.0. Play with them, use them in some code, but don't rely on them
2260just yet for production code.
2261.PP
2262\&\fBIndependent\ subexpressions\fR\ are regular expressions, in the
2263context of a larger regular expression, that function independently of
2264the larger regular expression. That is, they consume as much or as
2265little of the string as they wish without regard for the ability of
2266the larger regexp to match. Independent subexpressions are represented
2267by \f(CW\*(C`(?>regexp)\*(C'\fR. We can illustrate their behavior by first
2268considering an ordinary regexp:
2269.PP
2270.Vb 2
2271\& $x = "ab";
2272\& $x =~ /a*ab/; # matches
2273.Ve
2274.PP
2275This obviously matches, but in the process of matching, the
2276subexpression \f(CW\*(C`a*\*(C'\fR first grabbed the \f(CW\*(C`a\*(C'\fR. Doing so, however,
2277wouldn't allow the whole regexp to match, so after backtracking, \f(CW\*(C`a*\*(C'\fR
2278eventually gave back the \f(CW\*(C`a\*(C'\fR and matched the empty string. Here, what
2279\&\f(CW\*(C`a*\*(C'\fR matched was \fIdependent\fR on what the rest of the regexp matched.
2280.PP
2281Contrast that with an independent subexpression:
2282.PP
2283.Vb 1
2284\& $x =~ /(?>a*)ab/; # doesn't match!
2285.Ve
2286.PP
2287The independent subexpression \f(CW\*(C`(?>a*)\*(C'\fR doesn't care about the rest
2288of the regexp, so it sees an \f(CW\*(C`a\*(C'\fR and grabs it. Then the rest of the
2289regexp \f(CW\*(C`ab\*(C'\fR cannot match. Because \f(CW\*(C`(?>a*)\*(C'\fR is independent, there
2290is no backtracking and the independent subexpression does not give
2291up its \f(CW\*(C`a\*(C'\fR. Thus the match of the regexp as a whole fails. A similar
2292behavior occurs with completely independent regexps:
2293.PP
2294.Vb 3
2295\& $x = "ab";
2296\& $x =~ /a*/g; # matches, eats an 'a'
2297\& $x =~ /\eGab/g; # doesn't match, no 'a' available
2298.Ve
2299.PP
2300Here \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR create a 'tag team' handoff of the string from
2301one regexp to the other. Regexps with an independent subexpression are
2302much like this, with a handoff of the string to the independent
2303subexpression, and a handoff of the string back to the enclosing
2304regexp.
2305.PP
2306The ability of an independent subexpression to prevent backtracking
2307can be quite useful. Suppose we want to match a non-empty string
2308enclosed in parentheses up to two levels deep. Then the following
2309regexp matches:
2310.PP
2311.Vb 2
2312\& $x = "abc(de(fg)h"; # unbalanced parentheses
2313\& $x =~ /\e( ( [^()]+ | \e([^()]*\e) )+ \e)/x;
2314.Ve
2315.PP
2316The regexp matches an open parenthesis, one or more copies of an
2317alternation, and a close parenthesis. The alternation is two\-way, with
2318the first alternative \f(CW\*(C`[^()]+\*(C'\fR matching a substring with no
2319parentheses and the second alternative \f(CW\*(C`\e([^()]*\e)\*(C'\fR matching a
2320substring delimited by parentheses. The problem with this regexp is
2321that it is pathological: it has nested indeterminate quantifiers
2322of the form \f(CW\*(C`(a+|b)+\*(C'\fR. We discussed in Part 1 how nested quantifiers
2323like this could take an exponentially long time to execute if there
2324was no match possible. To prevent the exponential blowup, we need to
2325prevent useless backtracking at some point. This can be done by
2326enclosing the inner quantifier as an independent subexpression:
2327.PP
2328.Vb 1
2329\& $x =~ /\e( ( (?>[^()]+) | \e([^()]*\e) )+ \e)/x;
2330.Ve
2331.PP
2332Here, \f(CW\*(C`(?>[^()]+)\*(C'\fR breaks the degeneracy of string partitioning
2333by gobbling up as much of the string as possible and keeping it. Then
2334match failures fail much more quickly.
2335.Sh "Conditional expressions"
2336.IX Subsection "Conditional expressions"
2337A \fBconditional\ expression\fR\ is a form of if-then-else statement
2338that allows one to choose which patterns are to be matched, based on
2339some condition. There are two types of conditional expression:
2340\&\f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR and
2341\&\f(CW\*(C`(?(condition)yes\-regexp|no\-regexp)\*(C'\fR. \f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR is
2342like an \f(CW'if\ ()\ {}'\fR\ statement in Perl. If the \f(CW\*(C`condition\*(C'\fR is true,
2343the \f(CW\*(C`yes\-regexp\*(C'\fR will be matched. If the \f(CW\*(C`condition\*(C'\fR is false, the
2344\&\f(CW\*(C`yes\-regexp\*(C'\fR will be skipped and perl will move on to the next regexp
2345element. The second form is like an \f(CW'if\ ()\ {}\ else\ {}'\fR\ statement
2346in Perl. If the \f(CW\*(C`condition\*(C'\fR is true, the \f(CW\*(C`yes\-regexp\*(C'\fR will be
2347matched, otherwise the \f(CW\*(C`no\-regexp\*(C'\fR will be matched.
2348.PP
2349The \f(CW\*(C`condition\*(C'\fR can have two forms. The first form is simply an
2350integer in parentheses \f(CW\*(C`(integer)\*(C'\fR. It is true if the corresponding
2351backreference \f(CW\*(C`\einteger\*(C'\fR matched earlier in the regexp. The second
2352form is a bare zero width assertion \f(CW\*(C`(?...)\*(C'\fR, either a
2353lookahead, a lookbehind, or a code assertion (discussed in the next
2354section).
2355.PP
2356The integer form of the \f(CW\*(C`condition\*(C'\fR allows us to choose, with more
2357flexibility, what to match based on what matched earlier in the
2358regexp. This searches for words of the form \f(CW"$x$x"\fR or
2359\&\f(CW"$x$y$y$x"\fR:
2360.PP
2361.Vb 9
2362\& % simple_grep '^(\ew+)(\ew+)?(?(2)\e2\e1|\e1)$' /usr/dict/words
2363\& beriberi
2364\& coco
2365\& couscous
2366\& deed
2367\& ...
2368\& toot
2369\& toto
2370\& tutu
2371.Ve
2372.PP
2373The lookbehind \f(CW\*(C`condition\*(C'\fR allows, along with backreferences,
2374an earlier part of the match to influence a later part of the
2375match. For instance,
2376.PP
2377.Vb 1
2378\& /[ATGC]+(?(?<=AA)G|C)$/;
2379.Ve
2380.PP
2381matches a \s-1DNA\s0 sequence such that it either ends in \f(CW\*(C`AAG\*(C'\fR, or some
2382other base pair combination and \f(CW\*(C`C\*(C'\fR. Note that the form is
2383\&\f(CW\*(C`(?(?<=AA)G|C)\*(C'\fR and not \f(CW\*(C`(?((?<=AA))G|C)\*(C'\fR; for the
2384lookahead, lookbehind or code assertions, the parentheses around the
2385conditional are not needed.
2386.Sh "A bit of magic: executing Perl code in a regular expression"
2387.IX Subsection "A bit of magic: executing Perl code in a regular expression"
2388Normally, regexps are a part of Perl expressions.
2389\&\fBCode\ evaluation\fR\ expressions turn that around by allowing
2390arbitrary Perl code to be a part of a regexp. A code evaluation
2391expression is denoted \f(CW\*(C`(?{code})\*(C'\fR, with \f(CW\*(C`code\*(C'\fR a string of Perl
2392statements.
2393.PP
2394Code expressions are zero-width assertions, and the value they return
2395depends on their environment. There are two possibilities: either the
2396code expression is used as a conditional in a conditional expression
2397\&\f(CW\*(C`(?(condition)...)\*(C'\fR, or it is not. If the code expression is a
2398conditional, the code is evaluated and the result (i.e., the result of
2399the last statement) is used to determine truth or falsehood. If the
2400code expression is not used as a conditional, the assertion always
2401evaluates true and the result is put into the special variable
2402\&\f(CW$^R\fR. The variable \f(CW$^R\fR can then be used in code expressions later
2403in the regexp. Here are some silly examples:
2404.PP
2405.Vb 5
2406\& $x = "abcdef";
2407\& $x =~ /abc(?{print "Hi Mom!";})def/; # matches,
2408\& # prints 'Hi Mom!'
2409\& $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match,
2410\& # no 'Hi Mom!'
2411.Ve
2412.PP
2413Pay careful attention to the next example:
2414.PP
2415.Vb 3
2416\& $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match,
2417\& # no 'Hi Mom!'
2418\& # but why not?
2419.Ve
2420.PP
2421At first glance, you'd think that it shouldn't print, because obviously
2422the \f(CW\*(C`ddd\*(C'\fR isn't going to match the target string. But look at this
2423example:
2424.PP
2425.Vb 2
2426\& $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match,
2427\& # but _does_ print
2428.Ve
2429.PP
2430Hmm. What happened here? If you've been following along, you know that
2431the above pattern should be effectively the same as the last one \*(--
2432enclosing the d in a character class isn't going to change what it
2433matches. So why does the first not print while the second one does?
2434.PP
2435The answer lies in the optimizations the REx engine makes. In the first
2436case, all the engine sees are plain old characters (aside from the
2437\&\f(CW\*(C`?{}\*(C'\fR construct). It's smart enough to realize that the string 'ddd'
2438doesn't occur in our target string before actually running the pattern
2439through. But in the second case, we've tricked it into thinking that our
2440pattern is more complicated than it is. It takes a look, sees our
2441character class, and decides that it will have to actually run the
2442pattern to determine whether or not it matches, and in the process of
2443running it hits the print statement before it discovers that we don't
2444have a match.
2445.PP
2446To take a closer look at how the engine does optimizations, see the
2447section \*(L"Pragmas and debugging\*(R" below.
2448.PP
2449More fun with \f(CW\*(C`?{}\*(C'\fR:
2450.PP
2451.Vb 6
2452\& $x =~ /(?{print "Hi Mom!";})/; # matches,
2453\& # prints 'Hi Mom!'
2454\& $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches,
2455\& # prints '1'
2456\& $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches,
2457\& # prints '1'
2458.Ve
2459.PP
2460The bit of magic mentioned in the section title occurs when the regexp
2461backtracks in the process of searching for a match. If the regexp
2462backtracks over a code expression and if the variables used within are
2463localized using \f(CW\*(C`local\*(C'\fR, the changes in the variables produced by the
2464code expression are undone! Thus, if we wanted to count how many times
2465a character got matched inside a group, we could use, e.g.,
2466.PP
2467.Vb 11
2468\& $x = "aaaa";
2469\& $count = 0; # initialize 'a' count
2470\& $c = "bob"; # test if $c gets clobbered
2471\& $x =~ /(?{local $c = 0;}) # initialize count
2472\& ( a # match 'a'
2473\& (?{local $c = $c + 1;}) # increment count
2474\& )* # do this any number of times,
2475\& aa # but match 'aa' at the end
2476\& (?{$count = $c;}) # copy local $c var into $count
2477\& /x;
2478\& print "'a' count is $count, \e$c variable is '$c'\en";
2479.Ve
2480.PP
2481This prints
2482.PP
2483.Vb 1
2484\& 'a' count is 2, $c variable is 'bob'
2485.Ve
2486.PP
2487If we replace the \f(CW\*(C`\ (?{local\ $c\ =\ $c\ +\ 1;})\*(C'\fR\ with
2488\&\f(CW\*(C`\ (?{$c\ =\ $c\ +\ 1;})\*(C'\fR\ , the variable changes are \fInot\fR undone
2489during backtracking, and we get
2490.PP
2491.Vb 1
2492\& 'a' count is 4, $c variable is 'bob'
2493.Ve
2494.PP
2495Note that only localized variable changes are undone. Other side
2496effects of code expression execution are permanent. Thus
2497.PP
2498.Vb 2
2499\& $x = "aaaa";
2500\& $x =~ /(a(?{print "Yow\en";}))*aa/;
2501.Ve
2502.PP
2503produces
2504.PP
2505.Vb 4
2506\& Yow
2507\& Yow
2508\& Yow
2509\& Yow
2510.Ve
2511.PP
2512The result \f(CW$^R\fR is automatically localized, so that it will behave
2513properly in the presence of backtracking.
2514.PP
2515This example uses a code expression in a conditional to match the
2516article 'the' in either English or German:
2517.PP
2518.Vb 11
2519\& $lang = 'DE'; # use German
2520\& ...
2521\& $text = "das";
2522\& print "matched\en"
2523\& if $text =~ /(?(?{
2524\& $lang eq 'EN'; # is the language English?
2525\& })
2526\& the | # if so, then match 'the'
2527\& (die|das|der) # else, match 'die|das|der'
2528\& )
2529\& /xi;
2530.Ve
2531.PP
2532Note that the syntax here is \f(CW\*(C`(?(?{...})yes\-regexp|no\-regexp)\*(C'\fR, not
2533\&\f(CW\*(C`(?((?{...}))yes\-regexp|no\-regexp)\*(C'\fR. In other words, in the case of a
2534code expression, we don't need the extra parentheses around the
2535conditional.
2536.PP
2537If you try to use code expressions with interpolating variables, perl
2538may surprise you:
2539.PP
2540.Vb 5
2541\& $bar = 5;
2542\& $pat = '(?{ 1 })';
2543\& /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated
2544\& /foo(?{ 1 })$bar/; # compile error!
2545\& /foo${pat}bar/; # compile error!
2546.Ve
2547.PP
2548.Vb 2
2549\& $pat = qr/(?{ $foo = 1 })/; # precompile code regexp
2550\& /foo${pat}bar/; # compiles ok
2551.Ve
2552.PP
2553If a regexp has (1) code expressions and interpolating variables, or
2554(2) a variable that interpolates a code expression, perl treats the
2555regexp as an error. If the code expression is precompiled into a
2556variable, however, interpolating is ok. The question is, why is this
2557an error?
2558.PP
2559The reason is that variable interpolation and code expressions
2560together pose a security risk. The combination is dangerous because
2561many programmers who write search engines often take user input and
2562plug it directly into a regexp:
2563.PP
2564.Vb 3
2565\& $regexp = <>; # read user-supplied regexp
2566\& chomp $regexp; # get rid of possible newline
2567\& $text =~ /$regexp/; # search $text for the $regexp
2568.Ve
2569.PP
2570If the \f(CW$regexp\fR variable contains a code expression, the user could
2571then execute arbitrary Perl code. For instance, some joker could
2572search for \f(CW\*(C`system('rm\ \-rf\ *');\*(C'\fR\ to erase your files. In this
2573sense, the combination of interpolation and code expressions \fBtaints\fR
2574your regexp. So by default, using both interpolation and code
2575expressions in the same regexp is not allowed. If you're not
2576concerned about malicious users, it is possible to bypass this
2577security check by invoking \f(CW\*(C`use\ re\ 'eval'\*(C'\fR\ :
2578.PP
2579.Vb 5
2580\& use re 'eval'; # throw caution out the door
2581\& $bar = 5;
2582\& $pat = '(?{ 1 })';
2583\& /foo(?{ 1 })$bar/; # compiles ok
2584\& /foo${pat}bar/; # compiles ok
2585.Ve
2586.PP
2587Another form of code expression is the \fBpattern\ code\ expression\fR\ .
2588The pattern code expression is like a regular code expression, except
2589that the result of the code evaluation is treated as a regular
2590expression and matched immediately. A simple example is
2591.PP
2592.Vb 4
2593\& $length = 5;
2594\& $char = 'a';
2595\& $x = 'aaaaabb';
2596\& $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a'
2597.Ve
2598.PP
2599This final example contains both ordinary and pattern code
2600expressions. It detects if a binary string \f(CW1101010010001...\fR has a
2601Fibonacci spacing 0,1,1,2,3,5,... of the \f(CW1\fR's:
2602.PP
2603.Vb 17
2604\& $s0 = 0; $s1 = 1; # initial conditions
2605\& $x = "1101010010001000001";
2606\& print "It is a Fibonacci sequence\en"
2607\& if $x =~ /^1 # match an initial '1'
2608\& (
2609\& (??{'0' x $s0}) # match $s0 of '0'
2610\& 1 # and then a '1'
2611\& (?{
2612\& $largest = $s0; # largest seq so far
2613\& $s2 = $s1 + $s0; # compute next term
2614\& $s0 = $s1; # in Fibonacci sequence
2615\& $s1 = $s2;
2616\& })
2617\& )+ # repeat as needed
2618\& $ # that is all there is
2619\& /x;
2620\& print "Largest sequence matched was $largest\en";
2621.Ve
2622.PP
2623This prints
2624.PP
2625.Vb 2
2626\& It is a Fibonacci sequence
2627\& Largest sequence matched was 5
2628.Ve
2629.PP
2630Ha! Try that with your garden variety regexp package...
2631.PP
2632Note that the variables \f(CW$s0\fR and \f(CW$s1\fR are not substituted when the
2633regexp is compiled, as happens for ordinary variables outside a code
2634expression. Rather, the code expressions are evaluated when perl
2635encounters them during the search for a match.
2636.PP
2637The regexp without the \f(CW\*(C`//x\*(C'\fR modifier is
2638.PP
2639.Vb 1
2640\& /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/;
2641.Ve
2642.PP
2643and is a great start on an Obfuscated Perl entry :\-) When working with
2644code and conditional expressions, the extended form of regexps is
2645almost necessary in creating and debugging regexps.
2646.Sh "Pragmas and debugging"
2647.IX Subsection "Pragmas and debugging"
2648Speaking of debugging, there are several pragmas available to control
2649and debug regexps in Perl. We have already encountered one pragma in
2650the previous section, \f(CW\*(C`use\ re\ 'eval';\*(C'\fR\ , that allows variable
2651interpolation and code expressions to coexist in a regexp. The other
2652pragmas are
2653.PP
2654.Vb 3
2655\& use re 'taint';
2656\& $tainted = <>;
2657\& @parts = ($tainted =~ /(\ew+)\es+(\ew+)/); # @parts is now tainted
2658.Ve
2659.PP
2660The \f(CW\*(C`taint\*(C'\fR pragma causes any substrings from a match with a tainted
2661variable to be tainted as well. This is not normally the case, as
2662regexps are often used to extract the safe bits from a tainted
2663variable. Use \f(CW\*(C`taint\*(C'\fR when you are not extracting safe bits, but are
2664performing some other processing. Both \f(CW\*(C`taint\*(C'\fR and \f(CW\*(C`eval\*(C'\fR pragmas
2665are lexically scoped, which means they are in effect only until
2666the end of the block enclosing the pragmas.
2667.PP
2668.Vb 2
2669\& use re 'debug';
2670\& /^(.*)$/s; # output debugging info
2671.Ve
2672.PP
2673.Vb 2
2674\& use re 'debugcolor';
2675\& /^(.*)$/s; # output debugging info in living color
2676.Ve
2677.PP
2678The global \f(CW\*(C`debug\*(C'\fR and \f(CW\*(C`debugcolor\*(C'\fR pragmas allow one to get
2679detailed debugging info about regexp compilation and
2680execution. \f(CW\*(C`debugcolor\*(C'\fR is the same as debug, except the debugging
2681information is displayed in color on terminals that can display
2682termcap color sequences. Here is example output:
2683.PP
2684.Vb 25
2685\& % perl -e 'use re "debug"; "abc" =~ /a*b+c/;'
2686\& Compiling REx `a*b+c'
2687\& size 9 first at 1
2688\& 1: STAR(4)
2689\& 2: EXACT <a>(0)
2690\& 4: PLUS(7)
2691\& 5: EXACT <b>(0)
2692\& 7: EXACT <c>(9)
2693\& 9: END(0)
2694\& floating `bc' at 0..2147483647 (checking floating) minlen 2
2695\& Guessing start of match, REx `a*b+c' against `abc'...
2696\& Found floating substr `bc' at offset 1...
2697\& Guessed: match at offset 0
2698\& Matching REx `a*b+c' against `abc'
2699\& Setting an EVAL scope, savestack=3
2700\& 0 <> <abc> | 1: STAR
2701\& EXACT <a> can match 1 times out of 32767...
2702\& Setting an EVAL scope, savestack=3
2703\& 1 <a> <bc> | 4: PLUS
2704\& EXACT <b> can match 1 times out of 32767...
2705\& Setting an EVAL scope, savestack=3
2706\& 2 <ab> <c> | 7: EXACT <c>
2707\& 3 <abc> <> | 9: END
2708\& Match successful!
2709\& Freeing REx: `a*b+c'
2710.Ve
2711.PP
2712If you have gotten this far into the tutorial, you can probably guess
2713what the different parts of the debugging output tell you. The first
2714part
2715.PP
2716.Vb 8
2717\& Compiling REx `a*b+c'
2718\& size 9 first at 1
2719\& 1: STAR(4)
2720\& 2: EXACT <a>(0)
2721\& 4: PLUS(7)
2722\& 5: EXACT <b>(0)
2723\& 7: EXACT <c>(9)
2724\& 9: END(0)
2725.Ve
2726.PP
2727describes the compilation stage. \f(CWSTAR(4)\fR means that there is a
2728starred object, in this case \f(CW'a'\fR, and if it matches, goto line 4,
2729i.e., \f(CWPLUS(7)\fR. The middle lines describe some heuristics and
2730optimizations performed before a match:
2731.PP
2732.Vb 4
2733\& floating `bc' at 0..2147483647 (checking floating) minlen 2
2734\& Guessing start of match, REx `a*b+c' against `abc'...
2735\& Found floating substr `bc' at offset 1...
2736\& Guessed: match at offset 0
2737.Ve
2738.PP
2739Then the match is executed and the remaining lines describe the
2740process:
2741.PP
2742.Vb 12
2743\& Matching REx `a*b+c' against `abc'
2744\& Setting an EVAL scope, savestack=3
2745\& 0 <> <abc> | 1: STAR
2746\& EXACT <a> can match 1 times out of 32767...
2747\& Setting an EVAL scope, savestack=3
2748\& 1 <a> <bc> | 4: PLUS
2749\& EXACT <b> can match 1 times out of 32767...
2750\& Setting an EVAL scope, savestack=3
2751\& 2 <ab> <c> | 7: EXACT <c>
2752\& 3 <abc> <> | 9: END
2753\& Match successful!
2754\& Freeing REx: `a*b+c'
2755.Ve
2756.PP
2757Each step is of the form \f(CW\*(C`n\ <x>\ <y>\*(C'\fR\ , with \f(CW\*(C`<x>\*(C'\fR the
2758part of the string matched and \f(CW\*(C`<y>\*(C'\fR the part not yet
2759matched. The \f(CW\*(C`|\ 1:\ STAR\*(C'\fR\ says that perl is at line number 1
2760in the compilation list above. See
2761\&\*(L"Debugging regular expressions\*(R" in perldebguts for much more detail.
2762.PP
2763An alternative method of debugging regexps is to embed \f(CW\*(C`print\*(C'\fR
2764statements within the regexp. This provides a blow-by-blow account of
2765the backtracking in an alternation:
2766.PP
2767.Vb 12
2768\& "that this" =~ m@(?{print "Start at position ", pos, "\en";})
2769\& t(?{print "t1\en";})
2770\& h(?{print "h1\en";})
2771\& i(?{print "i1\en";})
2772\& s(?{print "s1\en";})
2773\& |
2774\& t(?{print "t2\en";})
2775\& h(?{print "h2\en";})
2776\& a(?{print "a2\en";})
2777\& t(?{print "t2\en";})
2778\& (?{print "Done at position ", pos, "\en";})
2779\& @x;
2780.Ve
2781.PP
2782prints
2783.PP
2784.Vb 8
2785\& Start at position 0
2786\& t1
2787\& h1
2788\& t2
2789\& h2
2790\& a2
2791\& t2
2792\& Done at position 4
2793.Ve
2794.SH "BUGS"
2795.IX Header "BUGS"
2796Code expressions, conditional expressions, and independent expressions
2797are \fBexperimental\fR. Don't use them in production code. Yet.
2798.SH "SEE ALSO"
2799.IX Header "SEE ALSO"
2800This is just a tutorial. For the full story on perl regular
2801expressions, see the perlre regular expressions reference page.
2802.PP
2803For more information on the matching \f(CW\*(C`m//\*(C'\fR and substitution \f(CW\*(C`s///\*(C'\fR
2804operators, see \*(L"Regexp Quote-Like Operators\*(R" in perlop. For
2805information on the \f(CW\*(C`split\*(C'\fR operation, see \*(L"split\*(R" in perlfunc.
2806.PP
2807For an excellent all-around resource on the care and feeding of
2808regular expressions, see the book \fIMastering Regular Expressions\fR by
2809Jeffrey Friedl (published by O'Reilly, \s-1ISBN\s0 1\-56592\-257\-3).
2810.SH "AUTHOR AND COPYRIGHT"
2811.IX Header "AUTHOR AND COPYRIGHT"
2812Copyright (c) 2000 Mark Kvale
2813All rights reserved.
2814.PP
2815This document may be distributed under the same terms as Perl itself.
2816.Sh "Acknowledgments"
2817.IX Subsection "Acknowledgments"
2818The inspiration for the stop codon \s-1DNA\s0 example came from the \s-1ZIP\s0
2819code example in chapter 7 of \fIMastering Regular Expressions\fR.
2820.PP
2821The author would like to thank Jeff Pinyan, Andrew Johnson, Peter
2822Haworth, Ronald J Kimball, and Joe Smith for all their helpful
2823comments.