Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "PERLRETUT 1" | |
132 | .TH PERLRETUT 1 "2002-06-08" "perl v5.8.0" "Perl Programmers Reference Guide" | |
133 | .SH "NAME" | |
134 | perlretut \- Perl regular expressions tutorial | |
135 | .SH "DESCRIPTION" | |
136 | .IX Header "DESCRIPTION" | |
137 | This page provides a basic tutorial on understanding, creating and | |
138 | using regular expressions in Perl. It serves as a complement to the | |
139 | reference page on regular expressions, perlre. Regular expressions | |
140 | are an integral part of the \f(CW\*(C`m//\*(C'\fR, \f(CW\*(C`s///\*(C'\fR, \f(CW\*(C`qr//\*(C'\fR and \f(CW\*(C`split\*(C'\fR | |
141 | operators and so this tutorial also overlaps with | |
142 | \&\*(L"Regexp Quote-Like Operators\*(R" in perlop and \*(L"split\*(R" in perlfunc. | |
143 | .PP | |
144 | Perl is widely renowned for excellence in text processing, and regular | |
145 | expressions are one of the big factors behind this fame. Perl regular | |
146 | expressions display an efficiency and flexibility unknown in most | |
147 | other computer languages. Mastering even the basics of regular | |
148 | expressions will allow you to manipulate text with surprising ease. | |
149 | .PP | |
150 | What is a regular expression? A regular expression is simply a string | |
151 | that describes a pattern. Patterns are in common use these days; | |
152 | examples are the patterns typed into a search engine to find web pages | |
153 | and the patterns used to list files in a directory, e.g., \f(CW\*(C`ls *.txt\*(C'\fR | |
154 | or \f(CW\*(C`dir *.*\*(C'\fR. In Perl, the patterns described by regular expressions | |
155 | are used to search strings, extract desired parts of strings, and to | |
156 | do search and replace operations. | |
157 | .PP | |
158 | Regular expressions have the undeserved reputation of being abstract | |
159 | and difficult to understand. Regular expressions are constructed using | |
160 | simple concepts like conditionals and loops and are no more difficult | |
161 | to understand than the corresponding \f(CW\*(C`if\*(C'\fR conditionals and \f(CW\*(C`while\*(C'\fR | |
162 | loops in the Perl language itself. In fact, the main challenge in | |
163 | learning regular expressions is just getting used to the terse | |
164 | notation used to express these concepts. | |
165 | .PP | |
166 | This tutorial flattens the learning curve by discussing regular | |
167 | expression concepts, along with their notation, one at a time and with | |
168 | many examples. The first part of the tutorial will progress from the | |
169 | simplest word searches to the basic regular expression concepts. If | |
170 | you master the first part, you will have all the tools needed to solve | |
171 | about 98% of your needs. The second part of the tutorial is for those | |
172 | comfortable with the basics and hungry for more power tools. It | |
173 | discusses the more advanced regular expression operators and | |
174 | introduces the latest cutting edge innovations in 5.6.0. | |
175 | .PP | |
176 | A note: to save time, 'regular expression' is often abbreviated as | |
177 | regexp or regex. Regexp is a more natural abbreviation than regex, but | |
178 | is harder to pronounce. The Perl pod documentation is evenly split on | |
179 | regexp vs regex; in Perl, there is more than one way to abbreviate it. | |
180 | We'll use regexp in this tutorial. | |
181 | .SH "Part 1: The basics" | |
182 | .IX Header "Part 1: The basics" | |
183 | .Sh "Simple word matching" | |
184 | .IX Subsection "Simple word matching" | |
185 | The simplest regexp is simply a word, or more generally, a string of | |
186 | characters. A regexp consisting of a word matches any string that | |
187 | contains that word: | |
188 | .PP | |
189 | .Vb 1 | |
190 | \& "Hello World" =~ /World/; # matches | |
191 | .Ve | |
192 | .PP | |
193 | What is this perl statement all about? \f(CW"Hello World"\fR is a simple | |
194 | double quoted string. \f(CW\*(C`World\*(C'\fR is the regular expression and the | |
195 | \&\f(CW\*(C`//\*(C'\fR enclosing \f(CW\*(C`/World/\*(C'\fR tells perl to search a string for a match. | |
196 | The operator \f(CW\*(C`=~\*(C'\fR associates the string with the regexp match and | |
197 | produces a true value if the regexp matched, or false if the regexp | |
198 | did not match. In our case, \f(CW\*(C`World\*(C'\fR matches the second word in | |
199 | \&\f(CW"Hello World"\fR, so the expression is true. Expressions like this | |
200 | are useful in conditionals: | |
201 | .PP | |
202 | .Vb 6 | |
203 | \& if ("Hello World" =~ /World/) { | |
204 | \& print "It matches\en"; | |
205 | \& } | |
206 | \& else { | |
207 | \& print "It doesn't match\en"; | |
208 | \& } | |
209 | .Ve | |
210 | .PP | |
211 | There are useful variations on this theme. The sense of the match can | |
212 | be reversed by using the \f(CW\*(C`!~\*(C'\fR operator: | |
213 | .PP | |
214 | .Vb 6 | |
215 | \& if ("Hello World" !~ /World/) { | |
216 | \& print "It doesn't match\en"; | |
217 | \& } | |
218 | \& else { | |
219 | \& print "It matches\en"; | |
220 | \& } | |
221 | .Ve | |
222 | .PP | |
223 | The literal string in the regexp can be replaced by a variable: | |
224 | .PP | |
225 | .Vb 7 | |
226 | \& $greeting = "World"; | |
227 | \& if ("Hello World" =~ /$greeting/) { | |
228 | \& print "It matches\en"; | |
229 | \& } | |
230 | \& else { | |
231 | \& print "It doesn't match\en"; | |
232 | \& } | |
233 | .Ve | |
234 | .PP | |
235 | If you're matching against the special default variable \f(CW$_\fR, the | |
236 | \&\f(CW\*(C`$_ =~\*(C'\fR part can be omitted: | |
237 | .PP | |
238 | .Vb 7 | |
239 | \& $_ = "Hello World"; | |
240 | \& if (/World/) { | |
241 | \& print "It matches\en"; | |
242 | \& } | |
243 | \& else { | |
244 | \& print "It doesn't match\en"; | |
245 | \& } | |
246 | .Ve | |
247 | .PP | |
248 | And finally, the \f(CW\*(C`//\*(C'\fR default delimiters for a match can be changed | |
249 | to arbitrary delimiters by putting an \f(CW'm'\fR out front: | |
250 | .PP | |
251 | .Vb 4 | |
252 | \& "Hello World" =~ m!World!; # matches, delimited by '!' | |
253 | \& "Hello World" =~ m{World}; # matches, note the matching '{}' | |
254 | \& "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', | |
255 | \& # '/' becomes an ordinary char | |
256 | .Ve | |
257 | .PP | |
258 | \&\f(CW\*(C`/World/\*(C'\fR, \f(CW\*(C`m!World!\*(C'\fR, and \f(CW\*(C`m{World}\*(C'\fR all represent the | |
259 | same thing. When, e.g., \f(CW""\fR is used as a delimiter, the forward | |
260 | slash \f(CW'/'\fR becomes an ordinary character and can be used in a regexp | |
261 | without trouble. | |
262 | .PP | |
263 | Let's consider how different regexps would match \f(CW"Hello World"\fR: | |
264 | .PP | |
265 | .Vb 4 | |
266 | \& "Hello World" =~ /world/; # doesn't match | |
267 | \& "Hello World" =~ /o W/; # matches | |
268 | \& "Hello World" =~ /oW/; # doesn't match | |
269 | \& "Hello World" =~ /World /; # doesn't match | |
270 | .Ve | |
271 | .PP | |
272 | The first regexp \f(CW\*(C`world\*(C'\fR doesn't match because regexps are | |
273 | case\-sensitive. The second regexp matches because the substring | |
274 | \&\f(CW'o\ W'\fR\ occurs in the string \f(CW"Hello\ World"\fR\ . The space | |
275 | character ' ' is treated like any other character in a regexp and is | |
276 | needed to match in this case. The lack of a space character is the | |
277 | reason the third regexp \f(CW'oW'\fR doesn't match. The fourth regexp | |
278 | \&\f(CW'World '\fR doesn't match because there is a space at the end of the | |
279 | regexp, but not at the end of the string. The lesson here is that | |
280 | regexps must match a part of the string \fIexactly\fR in order for the | |
281 | statement to be true. | |
282 | .PP | |
283 | If a regexp matches in more than one place in the string, perl will | |
284 | always match at the earliest possible point in the string: | |
285 | .PP | |
286 | .Vb 2 | |
287 | \& "Hello World" =~ /o/; # matches 'o' in 'Hello' | |
288 | \& "That hat is red" =~ /hat/; # matches 'hat' in 'That' | |
289 | .Ve | |
290 | .PP | |
291 | With respect to character matching, there are a few more points you | |
292 | need to know about. First of all, not all characters can be used 'as | |
293 | is' in a match. Some characters, called \fBmetacharacters\fR, are reserved | |
294 | for use in regexp notation. The metacharacters are | |
295 | .PP | |
296 | .Vb 1 | |
297 | \& {}[]()^$.|*+?\e | |
298 | .Ve | |
299 | .PP | |
300 | The significance of each of these will be explained | |
301 | in the rest of the tutorial, but for now, it is important only to know | |
302 | that a metacharacter can be matched by putting a backslash before it: | |
303 | .PP | |
304 | .Vb 5 | |
305 | \& "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter | |
306 | \& "2+2=4" =~ /2\e+2/; # matches, \e+ is treated like an ordinary + | |
307 | \& "The interval is [0,1)." =~ /[0,1)./ # is a syntax error! | |
308 | \& "The interval is [0,1)." =~ /\e[0,1\e)\e./ # matches | |
309 | \& "/usr/bin/perl" =~ /\e/usr\e/bin\e/perl/; # matches | |
310 | .Ve | |
311 | .PP | |
312 | In the last regexp, the forward slash \f(CW'/'\fR is also backslashed, | |
313 | because it is used to delimit the regexp. This can lead to \s-1LTS\s0 | |
314 | (leaning toothpick syndrome), however, and it is often more readable | |
315 | to change delimiters. | |
316 | .PP | |
317 | The backslash character \f(CW'\e'\fR is a metacharacter itself and needs to | |
318 | be backslashed: | |
319 | .PP | |
320 | .Vb 1 | |
321 | \& 'C:\eWIN32' =~ /C:\e\eWIN/; # matches | |
322 | .Ve | |
323 | .PP | |
324 | In addition to the metacharacters, there are some \s-1ASCII\s0 characters | |
325 | which don't have printable character equivalents and are instead | |
326 | represented by \fBescape sequences\fR. Common examples are \f(CW\*(C`\et\*(C'\fR for a | |
327 | tab, \f(CW\*(C`\en\*(C'\fR for a newline, \f(CW\*(C`\er\*(C'\fR for a carriage return and \f(CW\*(C`\ea\*(C'\fR for a | |
328 | bell. If your string is better thought of as a sequence of arbitrary | |
329 | bytes, the octal escape sequence, e.g., \f(CW\*(C`\e033\*(C'\fR, or hexadecimal escape | |
330 | sequence, e.g., \f(CW\*(C`\ex1B\*(C'\fR may be a more natural representation for your | |
331 | bytes. Here are some examples of escapes: | |
332 | .PP | |
333 | .Vb 4 | |
334 | \& "1000\et2000" =~ m(0\et2) # matches | |
335 | \& "1000\en2000" =~ /0\en20/ # matches | |
336 | \& "1000\et2000" =~ /\e000\et2/ # doesn't match, "0" ne "\e000" | |
337 | \& "cat" =~ /\e143\ex61\ex74/ # matches, but a weird way to spell cat | |
338 | .Ve | |
339 | .PP | |
340 | If you've been around Perl a while, all this talk of escape sequences | |
341 | may seem familiar. Similar escape sequences are used in double-quoted | |
342 | strings and in fact the regexps in Perl are mostly treated as | |
343 | double-quoted strings. This means that variables can be used in | |
344 | regexps as well. Just like double-quoted strings, the values of the | |
345 | variables in the regexp will be substituted in before the regexp is | |
346 | evaluated for matching purposes. So we have: | |
347 | .PP | |
348 | .Vb 4 | |
349 | \& $foo = 'house'; | |
350 | \& 'housecat' =~ /$foo/; # matches | |
351 | \& 'cathouse' =~ /cat$foo/; # matches | |
352 | \& 'housecat' =~ /${foo}cat/; # matches | |
353 | .Ve | |
354 | .PP | |
355 | So far, so good. With the knowledge above you can already perform | |
356 | searches with just about any literal string regexp you can dream up. | |
357 | Here is a \fIvery simple\fR emulation of the Unix grep program: | |
358 | .PP | |
359 | .Vb 7 | |
360 | \& % cat > simple_grep | |
361 | \& #!/usr/bin/perl | |
362 | \& $regexp = shift; | |
363 | \& while (<>) { | |
364 | \& print if /$regexp/; | |
365 | \& } | |
366 | \& ^D | |
367 | .Ve | |
368 | .PP | |
369 | .Vb 1 | |
370 | \& % chmod +x simple_grep | |
371 | .Ve | |
372 | .PP | |
373 | .Vb 10 | |
374 | \& % simple_grep abba /usr/dict/words | |
375 | \& Babbage | |
376 | \& cabbage | |
377 | \& cabbages | |
378 | \& sabbath | |
379 | \& Sabbathize | |
380 | \& Sabbathizes | |
381 | \& sabbatical | |
382 | \& scabbard | |
383 | \& scabbards | |
384 | .Ve | |
385 | .PP | |
386 | This program is easy to understand. \f(CW\*(C`#!/usr/bin/perl\*(C'\fR is the standard | |
387 | way to invoke a perl program from the shell. | |
388 | \&\f(CW\*(C`$regexp\ =\ shift;\*(C'\fR\ saves the first command line argument as the | |
389 | regexp to be used, leaving the rest of the command line arguments to | |
390 | be treated as files. \f(CW\*(C`while\ (<>)\*(C'\fR\ loops over all the lines in | |
391 | all the files. For each line, \f(CW\*(C`print\ if\ /$regexp/;\*(C'\fR\ prints the | |
392 | line if the regexp matches the line. In this line, both \f(CW\*(C`print\*(C'\fR and | |
393 | \&\f(CW\*(C`/$regexp/\*(C'\fR use the default variable \f(CW$_\fR implicitly. | |
394 | .PP | |
395 | With all of the regexps above, if the regexp matched anywhere in the | |
396 | string, it was considered a match. Sometimes, however, we'd like to | |
397 | specify \fIwhere\fR in the string the regexp should try to match. To do | |
398 | this, we would use the \fBanchor\fR metacharacters \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR. The | |
399 | anchor \f(CW\*(C`^\*(C'\fR means match at the beginning of the string and the anchor | |
400 | \&\f(CW\*(C`$\*(C'\fR means match at the end of the string, or before a newline at the | |
401 | end of the string. Here is how they are used: | |
402 | .PP | |
403 | .Vb 4 | |
404 | \& "housekeeper" =~ /keeper/; # matches | |
405 | \& "housekeeper" =~ /^keeper/; # doesn't match | |
406 | \& "housekeeper" =~ /keeper$/; # matches | |
407 | \& "housekeeper\en" =~ /keeper$/; # matches | |
408 | .Ve | |
409 | .PP | |
410 | The second regexp doesn't match because \f(CW\*(C`^\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to | |
411 | match only at the beginning of the string, but \f(CW"housekeeper"\fR has | |
412 | keeper starting in the middle. The third regexp does match, since the | |
413 | \&\f(CW\*(C`$\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to match only at the end of the string. | |
414 | .PP | |
415 | When both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are used at the same time, the regexp has to | |
416 | match both the beginning and the end of the string, i.e., the regexp | |
417 | matches the whole string. Consider | |
418 | .PP | |
419 | .Vb 3 | |
420 | \& "keeper" =~ /^keep$/; # doesn't match | |
421 | \& "keeper" =~ /^keeper$/; # matches | |
422 | \& "" =~ /^$/; # ^$ matches an empty string | |
423 | .Ve | |
424 | .PP | |
425 | The first regexp doesn't match because the string has more to it than | |
426 | \&\f(CW\*(C`keep\*(C'\fR. Since the second regexp is exactly the string, it | |
427 | matches. Using both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR in a regexp forces the complete | |
428 | string to match, so it gives you complete control over which strings | |
429 | match and which don't. Suppose you are looking for a fellow named | |
430 | bert, off in a string by himself: | |
431 | .PP | |
432 | .Vb 1 | |
433 | \& "dogbert" =~ /bert/; # matches, but not what you want | |
434 | .Ve | |
435 | .PP | |
436 | .Vb 2 | |
437 | \& "dilbert" =~ /^bert/; # doesn't match, but .. | |
438 | \& "bertram" =~ /^bert/; # matches, so still not good enough | |
439 | .Ve | |
440 | .PP | |
441 | .Vb 3 | |
442 | \& "bertram" =~ /^bert$/; # doesn't match, good | |
443 | \& "dilbert" =~ /^bert$/; # doesn't match, good | |
444 | \& "bert" =~ /^bert$/; # matches, perfect | |
445 | .Ve | |
446 | .PP | |
447 | Of course, in the case of a literal string, one could just as easily | |
448 | use the string equivalence \f(CW\*(C`$string\ eq\ 'bert'\*(C'\fR\ and it would be | |
449 | more efficient. The \f(CW\*(C`^...$\*(C'\fR regexp really becomes useful when we | |
450 | add in the more powerful regexp tools below. | |
451 | .Sh "Using character classes" | |
452 | .IX Subsection "Using character classes" | |
453 | Although one can already do quite a lot with the literal string | |
454 | regexps above, we've only scratched the surface of regular expression | |
455 | technology. In this and subsequent sections we will introduce regexp | |
456 | concepts (and associated metacharacter notations) that will allow a | |
457 | regexp to not just represent a single character sequence, but a \fIwhole | |
458 | class\fR of them. | |
459 | .PP | |
460 | One such concept is that of a \fBcharacter class\fR. A character class | |
461 | allows a set of possible characters, rather than just a single | |
462 | character, to match at a particular point in a regexp. Character | |
463 | classes are denoted by brackets \f(CW\*(C`[...]\*(C'\fR, with the set of characters | |
464 | to be possibly matched inside. Here are some examples: | |
465 | .PP | |
466 | .Vb 4 | |
467 | \& /cat/; # matches 'cat' | |
468 | \& /[bcr]at/; # matches 'bat', 'cat', or 'rat' | |
469 | \& /item[0123456789]/; # matches 'item0' or ... or 'item9' | |
470 | \& "abc" =~ /[cab]/; # matches 'a' | |
471 | .Ve | |
472 | .PP | |
473 | In the last statement, even though \f(CW'c'\fR is the first character in | |
474 | the class, \f(CW'a'\fR matches because the first character position in the | |
475 | string is the earliest point at which the regexp can match. | |
476 | .PP | |
477 | .Vb 2 | |
478 | \& /[yY][eE][sS]/; # match 'yes' in a case-insensitive way | |
479 | \& # 'yes', 'Yes', 'YES', etc. | |
480 | .Ve | |
481 | .PP | |
482 | This regexp displays a common task: perform a case-insensitive | |
483 | match. Perl provides a way of avoiding all those brackets by simply | |
484 | appending an \f(CW'i'\fR to the end of the match. Then \f(CW\*(C`/[yY][eE][sS]/;\*(C'\fR | |
485 | can be rewritten as \f(CW\*(C`/yes/i;\*(C'\fR. The \f(CW'i'\fR stands for | |
486 | case-insensitive and is an example of a \fBmodifier\fR of the matching | |
487 | operation. We will meet other modifiers later in the tutorial. | |
488 | .PP | |
489 | We saw in the section above that there were ordinary characters, which | |
490 | represented themselves, and special characters, which needed a | |
491 | backslash \f(CW\*(C`\e\*(C'\fR to represent themselves. The same is true in a | |
492 | character class, but the sets of ordinary and special characters | |
493 | inside a character class are different than those outside a character | |
494 | class. The special characters for a character class are \f(CW\*(C`\-]\e^$\*(C'\fR. \f(CW\*(C`]\*(C'\fR | |
495 | is special because it denotes the end of a character class. \f(CW\*(C`$\*(C'\fR is | |
496 | special because it denotes a scalar variable. \f(CW\*(C`\e\*(C'\fR is special because | |
497 | it is used in escape sequences, just like above. Here is how the | |
498 | special characters \f(CW\*(C`]$\e\*(C'\fR are handled: | |
499 | .PP | |
500 | .Vb 5 | |
501 | \& /[\e]c]def/; # matches ']def' or 'cdef' | |
502 | \& $x = 'bcr'; | |
503 | \& /[$x]at/; # matches 'bat', 'cat', or 'rat' | |
504 | \& /[\e$x]at/; # matches '$at' or 'xat' | |
505 | \& /[\e\e$x]at/; # matches '\eat', 'bat', 'cat', or 'rat' | |
506 | .Ve | |
507 | .PP | |
508 | The last two are a little tricky. In \f(CW\*(C`[\e$x]\*(C'\fR, the backslash protects | |
509 | the dollar sign, so the character class has two members \f(CW\*(C`$\*(C'\fR and \f(CW\*(C`x\*(C'\fR. | |
510 | In \f(CW\*(C`[\e\e$x]\*(C'\fR, the backslash is protected, so \f(CW$x\fR is treated as a | |
511 | variable and substituted in double quote fashion. | |
512 | .PP | |
513 | The special character \f(CW'\-'\fR acts as a range operator within character | |
514 | classes, so that a contiguous set of characters can be written as a | |
515 | range. With ranges, the unwieldy \f(CW\*(C`[0123456789]\*(C'\fR and \f(CW\*(C`[abc...xyz]\*(C'\fR | |
516 | become the svelte \f(CW\*(C`[0\-9]\*(C'\fR and \f(CW\*(C`[a\-z]\*(C'\fR. Some examples are | |
517 | .PP | |
518 | .Vb 6 | |
519 | \& /item[0-9]/; # matches 'item0' or ... or 'item9' | |
520 | \& /[0-9bx-z]aa/; # matches '0aa', ..., '9aa', | |
521 | \& # 'baa', 'xaa', 'yaa', or 'zaa' | |
522 | \& /[0-9a-fA-F]/; # matches a hexadecimal digit | |
523 | \& /[0-9a-zA-Z_]/; # matches a "word" character, | |
524 | \& # like those in a perl variable name | |
525 | .Ve | |
526 | .PP | |
527 | If \f(CW'\-'\fR is the first or last character in a character class, it is | |
528 | treated as an ordinary character; \f(CW\*(C`[\-ab]\*(C'\fR, \f(CW\*(C`[ab\-]\*(C'\fR and \f(CW\*(C`[a\e\-b]\*(C'\fR are | |
529 | all equivalent. | |
530 | .PP | |
531 | The special character \f(CW\*(C`^\*(C'\fR in the first position of a character class | |
532 | denotes a \fBnegated character class\fR, which matches any character but | |
533 | those in the brackets. Both \f(CW\*(C`[...]\*(C'\fR and \f(CW\*(C`[^...]\*(C'\fR must match a | |
534 | character, or the match fails. Then | |
535 | .PP | |
536 | .Vb 4 | |
537 | \& /[^a]at/; # doesn't match 'aat' or 'at', but matches | |
538 | \& # all other 'bat', 'cat', '0at', '%at', etc. | |
539 | \& /[^0-9]/; # matches a non-numeric character | |
540 | \& /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary | |
541 | .Ve | |
542 | .PP | |
543 | Now, even \f(CW\*(C`[0\-9]\*(C'\fR can be a bother to write multiple times, so in the | |
544 | interest of saving keystrokes and making regexps more readable, Perl | |
545 | has several abbreviations for common character classes: | |
546 | .IP "\(bu" 4 | |
547 | \&\ed is a digit and represents [0\-9] | |
548 | .IP "\(bu" 4 | |
549 | \&\es is a whitespace character and represents [\e \et\er\en\ef] | |
550 | .IP "\(bu" 4 | |
551 | \&\ew is a word character (alphanumeric or _) and represents [0\-9a\-zA\-Z_] | |
552 | .IP "\(bu" 4 | |
553 | \&\eD is a negated \ed; it represents any character but a digit [^0\-9] | |
554 | .IP "\(bu" 4 | |
555 | \&\eS is a negated \es; it represents any non-whitespace character [^\es] | |
556 | .IP "\(bu" 4 | |
557 | \&\eW is a negated \ew; it represents any non-word character [^\ew] | |
558 | .IP "\(bu" 4 | |
559 | The period '.' matches any character but \*(L"\en\*(R" | |
560 | .PP | |
561 | The \f(CW\*(C`\ed\es\ew\eD\eS\eW\*(C'\fR abbreviations can be used both inside and outside | |
562 | of character classes. Here are some in use: | |
563 | .PP | |
564 | .Vb 7 | |
565 | \& /\ed\ed:\ed\ed:\ed\ed/; # matches a hh:mm:ss time format | |
566 | \& /[\ed\es]/; # matches any digit or whitespace character | |
567 | \& /\ew\eW\ew/; # matches a word char, followed by a | |
568 | \& # non-word char, followed by a word char | |
569 | \& /..rt/; # matches any two chars, followed by 'rt' | |
570 | \& /end\e./; # matches 'end.' | |
571 | \& /end[.]/; # same thing, matches 'end.' | |
572 | .Ve | |
573 | .PP | |
574 | Because a period is a metacharacter, it needs to be escaped to match | |
575 | as an ordinary period. Because, for example, \f(CW\*(C`\ed\*(C'\fR and \f(CW\*(C`\ew\*(C'\fR are sets | |
576 | of characters, it is incorrect to think of \f(CW\*(C`[^\ed\ew]\*(C'\fR as \f(CW\*(C`[\eD\eW]\*(C'\fR; in | |
577 | fact \f(CW\*(C`[^\ed\ew]\*(C'\fR is the same as \f(CW\*(C`[^\ew]\*(C'\fR, which is the same as | |
578 | \&\f(CW\*(C`[\eW]\*(C'\fR. Think DeMorgan's laws. | |
579 | .PP | |
580 | An anchor useful in basic regexps is the \fBword\ anchor\fR\ | |
581 | \&\f(CW\*(C`\eb\*(C'\fR. This matches a boundary between a word character and a non-word | |
582 | character \f(CW\*(C`\ew\eW\*(C'\fR or \f(CW\*(C`\eW\ew\*(C'\fR: | |
583 | .PP | |
584 | .Vb 5 | |
585 | \& $x = "Housecat catenates house and cat"; | |
586 | \& $x =~ /cat/; # matches cat in 'Housecat' | |
587 | \& $x =~ /\ebcat/; # matches cat in 'catenates' | |
588 | \& $x =~ /cat\eb/; # matches cat in 'Housecat' | |
589 | \& $x =~ /\ebcat\eb/; # matches 'cat' at end of string | |
590 | .Ve | |
591 | .PP | |
592 | Note in the last example, the end of the string is considered a word | |
593 | boundary. | |
594 | .PP | |
595 | You might wonder why \f(CW'.'\fR matches everything but \f(CW"\en"\fR \- why not | |
596 | every character? The reason is that often one is matching against | |
597 | lines and would like to ignore the newline characters. For instance, | |
598 | while the string \f(CW"\en"\fR represents one line, we would like to think | |
599 | of it as empty. Then | |
600 | .PP | |
601 | .Vb 2 | |
602 | \& "" =~ /^$/; # matches | |
603 | \& "\en" =~ /^$/; # matches, "\en" is ignored | |
604 | .Ve | |
605 | .PP | |
606 | .Vb 5 | |
607 | \& "" =~ /./; # doesn't match; it needs a char | |
608 | \& "" =~ /^.$/; # doesn't match; it needs a char | |
609 | \& "\en" =~ /^.$/; # doesn't match; it needs a char other than "\en" | |
610 | \& "a" =~ /^.$/; # matches | |
611 | \& "a\en" =~ /^.$/; # matches, ignores the "\en" | |
612 | .Ve | |
613 | .PP | |
614 | This behavior is convenient, because we usually want to ignore | |
615 | newlines when we count and match characters in a line. Sometimes, | |
616 | however, we want to keep track of newlines. We might even want \f(CW\*(C`^\*(C'\fR | |
617 | and \f(CW\*(C`$\*(C'\fR to anchor at the beginning and end of lines within the | |
618 | string, rather than just the beginning and end of the string. Perl | |
619 | allows us to choose between ignoring and paying attention to newlines | |
620 | by using the \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR modifiers. \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR stand for | |
621 | single line and multi-line and they determine whether a string is to | |
622 | be treated as one continuous string, or as a set of lines. The two | |
623 | modifiers affect two aspects of how the regexp is interpreted: 1) how | |
624 | the \f(CW'.'\fR character class is defined, and 2) where the anchors \f(CW\*(C`^\*(C'\fR | |
625 | and \f(CW\*(C`$\*(C'\fR are able to match. Here are the four possible combinations: | |
626 | .IP "\(bu" 4 | |
627 | no modifiers (//): Default behavior. \f(CW'.'\fR matches any character | |
628 | except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of the string and | |
629 | \&\f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the end. | |
630 | .IP "\(bu" 4 | |
631 | s modifier (//s): Treat string as a single long line. \f(CW'.'\fR matches | |
632 | any character, even \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of | |
633 | the string and \f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the | |
634 | end. | |
635 | .IP "\(bu" 4 | |
636 | m modifier (//m): Treat string as a set of multiple lines. \f(CW'.'\fR | |
637 | matches any character except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are able to match | |
638 | at the start or end of \fIany\fR line within the string. | |
639 | .IP "\(bu" 4 | |
640 | both s and m modifiers (//sm): Treat string as a single long line, but | |
641 | detect multiple lines. \f(CW'.'\fR matches any character, even | |
642 | \&\f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR, however, are able to match at the start or end | |
643 | of \fIany\fR line within the string. | |
644 | .PP | |
645 | Here are examples of \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR in action: | |
646 | .PP | |
647 | .Vb 1 | |
648 | \& $x = "There once was a girl\enWho programmed in Perl\en"; | |
649 | .Ve | |
650 | .PP | |
651 | .Vb 4 | |
652 | \& $x =~ /^Who/; # doesn't match, "Who" not at start of string | |
653 | \& $x =~ /^Who/s; # doesn't match, "Who" not at start of string | |
654 | \& $x =~ /^Who/m; # matches, "Who" at start of second line | |
655 | \& $x =~ /^Who/sm; # matches, "Who" at start of second line | |
656 | .Ve | |
657 | .PP | |
658 | .Vb 4 | |
659 | \& $x =~ /girl.Who/; # doesn't match, "." doesn't match "\en" | |
660 | \& $x =~ /girl.Who/s; # matches, "." matches "\en" | |
661 | \& $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\en" | |
662 | \& $x =~ /girl.Who/sm; # matches, "." matches "\en" | |
663 | .Ve | |
664 | .PP | |
665 | Most of the time, the default behavior is what is wanted, but \f(CW\*(C`//s\*(C'\fR and | |
666 | \&\f(CW\*(C`//m\*(C'\fR are occasionally very useful. If \f(CW\*(C`//m\*(C'\fR is being used, the start | |
667 | of the string can still be matched with \f(CW\*(C`\eA\*(C'\fR and the end of string | |
668 | can still be matched with the anchors \f(CW\*(C`\eZ\*(C'\fR (matches both the end and | |
669 | the newline before, like \f(CW\*(C`$\*(C'\fR), and \f(CW\*(C`\ez\*(C'\fR (matches only the end): | |
670 | .PP | |
671 | .Vb 2 | |
672 | \& $x =~ /^Who/m; # matches, "Who" at start of second line | |
673 | \& $x =~ /\eAWho/m; # doesn't match, "Who" is not at start of string | |
674 | .Ve | |
675 | .PP | |
676 | .Vb 2 | |
677 | \& $x =~ /girl$/m; # matches, "girl" at end of first line | |
678 | \& $x =~ /girl\eZ/m; # doesn't match, "girl" is not at end of string | |
679 | .Ve | |
680 | .PP | |
681 | .Vb 2 | |
682 | \& $x =~ /Perl\eZ/m; # matches, "Perl" is at newline before end | |
683 | \& $x =~ /Perl\ez/m; # doesn't match, "Perl" is not at end of string | |
684 | .Ve | |
685 | .PP | |
686 | We now know how to create choices among classes of characters in a | |
687 | regexp. What about choices among words or character strings? Such | |
688 | choices are described in the next section. | |
689 | .Sh "Matching this or that" | |
690 | .IX Subsection "Matching this or that" | |
691 | Sometimes we would like our regexp to be able to match different | |
692 | possible words or character strings. This is accomplished by using | |
693 | the \fBalternation\fR metacharacter \f(CW\*(C`|\*(C'\fR. To match \f(CW\*(C`dog\*(C'\fR or \f(CW\*(C`cat\*(C'\fR, we | |
694 | form the regexp \f(CW\*(C`dog|cat\*(C'\fR. As before, perl will try to match the | |
695 | regexp at the earliest possible point in the string. At each | |
696 | character position, perl will first try to match the first | |
697 | alternative, \f(CW\*(C`dog\*(C'\fR. If \f(CW\*(C`dog\*(C'\fR doesn't match, perl will then try the | |
698 | next alternative, \f(CW\*(C`cat\*(C'\fR. If \f(CW\*(C`cat\*(C'\fR doesn't match either, then the | |
699 | match fails and perl moves to the next position in the string. Some | |
700 | examples: | |
701 | .PP | |
702 | .Vb 2 | |
703 | \& "cats and dogs" =~ /cat|dog|bird/; # matches "cat" | |
704 | \& "cats and dogs" =~ /dog|cat|bird/; # matches "cat" | |
705 | .Ve | |
706 | .PP | |
707 | Even though \f(CW\*(C`dog\*(C'\fR is the first alternative in the second regexp, | |
708 | \&\f(CW\*(C`cat\*(C'\fR is able to match earlier in the string. | |
709 | .PP | |
710 | .Vb 2 | |
711 | \& "cats" =~ /c|ca|cat|cats/; # matches "c" | |
712 | \& "cats" =~ /cats|cat|ca|c/; # matches "cats" | |
713 | .Ve | |
714 | .PP | |
715 | Here, all the alternatives match at the first string position, so the | |
716 | first alternative is the one that matches. If some of the | |
717 | alternatives are truncations of the others, put the longest ones first | |
718 | to give them a chance to match. | |
719 | .PP | |
720 | .Vb 2 | |
721 | \& "cab" =~ /a|b|c/ # matches "c" | |
722 | \& # /a|b|c/ == /[abc]/ | |
723 | .Ve | |
724 | .PP | |
725 | The last example points out that character classes are like | |
726 | alternations of characters. At a given character position, the first | |
727 | alternative that allows the regexp match to succeed will be the one | |
728 | that matches. | |
729 | .Sh "Grouping things and hierarchical matching" | |
730 | .IX Subsection "Grouping things and hierarchical matching" | |
731 | Alternation allows a regexp to choose among alternatives, but by | |
732 | itself it is unsatisfying. The reason is that each alternative is a whole | |
733 | regexp, but sometimes we want alternatives for just part of a | |
734 | regexp. For instance, suppose we want to search for housecats or | |
735 | housekeepers. The regexp \f(CW\*(C`housecat|housekeeper\*(C'\fR fits the bill, but is | |
736 | inefficient because we had to type \f(CW\*(C`house\*(C'\fR twice. It would be nice to | |
737 | have parts of the regexp be constant, like \f(CW\*(C`house\*(C'\fR, and some | |
738 | parts have alternatives, like \f(CW\*(C`cat|keeper\*(C'\fR. | |
739 | .PP | |
740 | The \fBgrouping\fR metacharacters \f(CW\*(C`()\*(C'\fR solve this problem. Grouping | |
741 | allows parts of a regexp to be treated as a single unit. Parts of a | |
742 | regexp are grouped by enclosing them in parentheses. Thus we could solve | |
743 | the \f(CW\*(C`housecat|housekeeper\*(C'\fR problem by forming the regexp as | |
744 | \&\f(CW\*(C`house(cat|keeper)\*(C'\fR. The regexp \f(CW\*(C`house(cat|keeper)\*(C'\fR means match | |
745 | \&\f(CW\*(C`house\*(C'\fR followed by either \f(CW\*(C`cat\*(C'\fR or \f(CW\*(C`keeper\*(C'\fR. Some more examples | |
746 | are | |
747 | .PP | |
748 | .Vb 4 | |
749 | \& /(a|b)b/; # matches 'ab' or 'bb' | |
750 | \& /(ac|b)b/; # matches 'acb' or 'bb' | |
751 | \& /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere | |
752 | \& /(a|[bc])d/; # matches 'ad', 'bd', or 'cd' | |
753 | .Ve | |
754 | .PP | |
755 | .Vb 3 | |
756 | \& /house(cat|)/; # matches either 'housecat' or 'house' | |
757 | \& /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or | |
758 | \& # 'house'. Note groups can be nested. | |
759 | .Ve | |
760 | .PP | |
761 | .Vb 3 | |
762 | \& /(19|20|)\ed\ed/; # match years 19xx, 20xx, or the Y2K problem, xx | |
763 | \& "20" =~ /(19|20|)\ed\ed/; # matches the null alternative '()\ed\ed', | |
764 | \& # because '20\ed\ed' can't match | |
765 | .Ve | |
766 | .PP | |
767 | Alternations behave the same way in groups as out of them: at a given | |
768 | string position, the leftmost alternative that allows the regexp to | |
769 | match is taken. So in the last example at the first string position, | |
770 | \&\f(CW"20"\fR matches the second alternative, but there is nothing left over | |
771 | to match the next two digits \f(CW\*(C`\ed\ed\*(C'\fR. So perl moves on to the next | |
772 | alternative, which is the null alternative and that works, since | |
773 | \&\f(CW"20"\fR is two digits. | |
774 | .PP | |
775 | The process of trying one alternative, seeing if it matches, and | |
776 | moving on to the next alternative if it doesn't, is called | |
777 | \&\fBbacktracking\fR. The term 'backtracking' comes from the idea that | |
778 | matching a regexp is like a walk in the woods. Successfully matching | |
779 | a regexp is like arriving at a destination. There are many possible | |
780 | trailheads, one for each string position, and each one is tried in | |
781 | order, left to right. From each trailhead there may be many paths, | |
782 | some of which get you there, and some of which are dead ends. When you | |
783 | walk along a trail and hit a dead end, you have to backtrack along the | |
784 | trail to an earlier point to try another trail. If you hit your | |
785 | destination, you stop immediately and forget about trying all the | |
786 | other trails. You are persistent, and only if you have tried all the | |
787 | trails from all the trailheads and not arrived at your destination, do | |
788 | you declare failure. To be concrete, here is a step-by-step analysis | |
789 | of what perl does when it tries to match the regexp | |
790 | .PP | |
791 | .Vb 1 | |
792 | \& "abcde" =~ /(abd|abc)(df|d|de)/; | |
793 | .Ve | |
794 | .IP "0" 4 | |
795 | Start with the first letter in the string 'a'. | |
796 | .IP "1" 4 | |
797 | .IX Item "1" | |
798 | Try the first alternative in the first group 'abd'. | |
799 | .IP "2" 4 | |
800 | .IX Item "2" | |
801 | Match 'a' followed by 'b'. So far so good. | |
802 | .IP "3" 4 | |
803 | .IX Item "3" | |
804 | \&'d' in the regexp doesn't match 'c' in the string \- a dead | |
805 | end. So backtrack two characters and pick the second alternative in | |
806 | the first group 'abc'. | |
807 | .IP "4" 4 | |
808 | .IX Item "4" | |
809 | Match 'a' followed by 'b' followed by 'c'. We are on a roll | |
810 | and have satisfied the first group. Set \f(CW$1\fR to 'abc'. | |
811 | .IP "5" 4 | |
812 | .IX Item "5" | |
813 | Move on to the second group and pick the first alternative | |
814 | \&'df'. | |
815 | .IP "6" 4 | |
816 | .IX Item "6" | |
817 | Match the 'd'. | |
818 | .IP "7" 4 | |
819 | .IX Item "7" | |
820 | \&'f' in the regexp doesn't match 'e' in the string, so a dead | |
821 | end. Backtrack one character and pick the second alternative in the | |
822 | second group 'd'. | |
823 | .IP "8" 4 | |
824 | .IX Item "8" | |
825 | \&'d' matches. The second grouping is satisfied, so set \f(CW$2\fR to | |
826 | \&'d'. | |
827 | .IP "9" 4 | |
828 | .IX Item "9" | |
829 | We are at the end of the regexp, so we are done! We have | |
830 | matched 'abcd' out of the string \*(L"abcde\*(R". | |
831 | .PP | |
832 | There are a couple of things to note about this analysis. First, the | |
833 | third alternative in the second group 'de' also allows a match, but we | |
834 | stopped before we got to it \- at a given character position, leftmost | |
835 | wins. Second, we were able to get a match at the first character | |
836 | position of the string 'a'. If there were no matches at the first | |
837 | position, perl would move to the second character position 'b' and | |
838 | attempt the match all over again. Only when all possible paths at all | |
839 | possible character positions have been exhausted does perl give | |
840 | up and declare \f(CW\*(C`$string\ =~\ /(abd|abc)(df|d|de)/;\*(C'\fR\ to be false. | |
841 | .PP | |
842 | Even with all this work, regexp matching happens remarkably fast. To | |
843 | speed things up, during the compilation stage, perl compiles the regexp | |
844 | into a compact sequence of opcodes that can often fit inside a | |
845 | processor cache. When the code is executed, these opcodes can then run | |
846 | at full throttle and search very quickly. | |
847 | .Sh "Extracting matches" | |
848 | .IX Subsection "Extracting matches" | |
849 | The grouping metacharacters \f(CW\*(C`()\*(C'\fR also serve another completely | |
850 | different function: they allow the extraction of the parts of a string | |
851 | that matched. This is very useful to find out what matched and for | |
852 | text processing in general. For each grouping, the part that matched | |
853 | inside goes into the special variables \f(CW$1\fR, \f(CW$2\fR, etc. They can be | |
854 | used just as ordinary variables: | |
855 | .PP | |
856 | .Vb 5 | |
857 | \& # extract hours, minutes, seconds | |
858 | \& $time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/; # match hh:mm:ss format | |
859 | \& $hours = $1; | |
860 | \& $minutes = $2; | |
861 | \& $seconds = $3; | |
862 | .Ve | |
863 | .PP | |
864 | Now, we know that in scalar context, | |
865 | \&\f(CW\*(C`$time\ =~\ /(\ed\ed):(\ed\ed):(\ed\ed)/\*(C'\fR\ returns a true or false | |
866 | value. In list context, however, it returns the list of matched values | |
867 | \&\f(CW\*(C`($1,$2,$3)\*(C'\fR. So we could write the code more compactly as | |
868 | .PP | |
869 | .Vb 2 | |
870 | \& # extract hours, minutes, seconds | |
871 | \& ($hours, $minutes, $second) = ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/); | |
872 | .Ve | |
873 | .PP | |
874 | If the groupings in a regexp are nested, \f(CW$1\fR gets the group with the | |
875 | leftmost opening parenthesis, \f(CW$2\fR the next opening parenthesis, | |
876 | etc. For example, here is a complex regexp and the matching variables | |
877 | indicated below it: | |
878 | .PP | |
879 | .Vb 2 | |
880 | \& /(ab(cd|ef)((gi)|j))/; | |
881 | \&  1  2      34 | |
882 | .Ve | |
883 | .PP | |
884 | so that if the regexp matched, e.g., \f(CW$2\fR would contain 'cd' or 'ef'. For | |
885 | convenience, perl sets \f(CW$+\fR to the string held by the highest numbered | |
886 | \&\f(CW$1\fR, \f(CW$2\fR, ... that got assigned (and, somewhat related, \f(CW$^N\fR to the | |
887 | value of the \f(CW$1\fR, \f(CW$2\fR, ... most-recently assigned; i.e. the \f(CW$1\fR, | |
888 | \&\f(CW$2\fR, ... associated with the rightmost closing parenthesis used in the | |
889 | match). | |
890 | .PP | |
891 | Closely associated with the matching variables \f(CW$1\fR, \f(CW$2\fR, ... are | |
892 | the \fBbackreferences\fR \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... . Backreferences are simply | |
893 | matching variables that can be used \fIinside\fR a regexp. This is a | |
894 | really nice feature \- what matches later in a regexp can depend on | |
895 | what matched earlier in the regexp. Suppose we wanted to look | |
896 | for doubled words in text, like 'the the'. The following regexp finds | |
897 | all 3\-letter doubles with a space in between: | |
898 | .PP | |
899 | .Vb 1 | |
900 | \& /(\ew\ew\ew)\es\e1/; | |
901 | .Ve | |
902 | .PP | |
903 | The grouping assigns a value to \e1, so that the same 3 letter sequence | |
904 | is used for both parts. Here are some words with repeated parts: | |
905 | .PP | |
906 | .Vb 7 | |
907 | \& % simple_grep '^(\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew)\e1$' /usr/dict/words | |
908 | \& beriberi | |
909 | \& booboo | |
910 | \& coco | |
911 | \& mama | |
912 | \& murmur | |
913 | \& papa | |
914 | .Ve | |
915 | .PP | |
916 | The regexp has a single grouping which considers 4\-letter | |
917 | combinations, then 3\-letter combinations, etc. and uses \f(CW\*(C`\e1\*(C'\fR to look for | |
918 | a repeat. Although \f(CW$1\fR and \f(CW\*(C`\e1\*(C'\fR represent the same thing, care should be | |
919 | taken to use matched variables \f(CW$1\fR, \f(CW$2\fR, ... only outside a regexp | |
920 | and backreferences \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... only inside a regexp; not doing | |
921 | so may lead to surprising and/or undefined results. | |
922 | .PP | |
923 | In addition to what was matched, Perl 5.6.0 also provides the | |
924 | positions of what was matched with the \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR | |
925 | arrays. \f(CW\*(C`$\-[0]\*(C'\fR is the position of the start of the entire match and | |
926 | \&\f(CW$+[0]\fR is the position of the end. Similarly, \f(CW\*(C`$\-[n]\*(C'\fR is the | |
927 | position of the start of the \f(CW$n\fR match and \f(CW$+[n]\fR is the position | |
928 | of the end. If \f(CW$n\fR is undefined, so are \f(CW\*(C`$\-[n]\*(C'\fR and \f(CW$+[n]\fR. Then | |
929 | this code | |
930 | .PP | |
931 | .Vb 5 | |
932 | \& $x = "Mmm...donut, thought Homer"; | |
933 | \& $x =~ /^(Mmm|Yech)\e.\e.\e.(donut|peas)/; # matches | |
934 | \& foreach $expr (1..$#-) { | |
935 | \& print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\en"; | |
936 | \& } | |
937 | .Ve | |
938 | .PP | |
939 | prints | |
940 | .PP | |
941 | .Vb 2 | |
942 | \& Match 1: 'Mmm' at position (0,3) | |
943 | \& Match 2: 'donut' at position (6,11) | |
944 | .Ve | |
945 | .PP | |
946 | Even if there are no groupings in a regexp, it is still possible to | |
947 | find out what exactly matched in a string. If you use them, perl | |
948 | will set \f(CW$`\fR to the part of the string before the match, will set \f(CW$&\fR | |
949 | to the part of the string that matched, and will set \f(CW$'\fR to the part | |
950 | of the string after the match. An example: | |
951 | .PP | |
952 | .Vb 3 | |
953 | \& $x = "the cat caught the mouse"; | |
954 | \& $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse' | |
955 | \& $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse' | |
956 | .Ve | |
957 | .PP | |
958 | In the second match, \f(CW\*(C`$`\ =\ ''\*(C'\fR\ because the regexp matched at the | |
959 | first character position in the string and stopped; it never saw the | |
960 | second 'the'. It is important to note that using \f(CW$`\fR and \f(CW$'\fR | |
961 | slows down regexp matching quite a bit, and \f(CW $& \fR slows it down to a | |
962 | lesser extent, because if they are used in one regexp in a program, | |
963 | they are generated for \fIall\fR regexps in the program. So if raw | |
964 | performance is a goal of your application, they should be avoided. | |
965 | If you need them, use \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR instead: | |
966 | .PP | |
967 | .Vb 3 | |
968 | \& $` is the same as substr( $x, 0, $-[0] ) | |
969 | \& $& is the same as substr( $x, $-[0], $+[0]-$-[0] ) | |
970 | \& $' is the same as substr( $x, $+[0] ) | |
971 | .Ve | |
972 | .Sh "Matching repetitions" | |
973 | .IX Subsection "Matching repetitions" | |
974 | The examples in the previous section display an annoying weakness. We | |
975 | were only matching 3\-letter words, or syllables of 4 letters or | |
976 | less. We'd like to be able to match words or syllables of any length, | |
977 | without writing out tedious alternatives like | |
978 | \&\f(CW\*(C`\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew\*(C'\fR. | |
979 | .PP | |
980 | This is exactly the problem the \fBquantifier\fR metacharacters \f(CW\*(C`?\*(C'\fR, | |
981 | \&\f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR, and \f(CW\*(C`{}\*(C'\fR were created for. They allow us to determine the | |
982 | number of repeats of a portion of a regexp we consider to be a | |
983 | match. Quantifiers are put immediately after the character, character | |
984 | class, or grouping that we want to specify. They have the following | |
985 | meanings: | |
986 | .IP "\(bu" 4 | |
987 | \&\f(CW\*(C`a?\*(C'\fR = match 'a' 1 or 0 times | |
988 | .IP "\(bu" 4 | |
989 | \&\f(CW\*(C`a*\*(C'\fR = match 'a' 0 or more times, i.e., any number of times | |
990 | .IP "\(bu" 4 | |
991 | \&\f(CW\*(C`a+\*(C'\fR = match 'a' 1 or more times, i.e., at least once | |
992 | .IP "\(bu" 4 | |
993 | \&\f(CW\*(C`a{n,m}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but not more than \f(CW\*(C`m\*(C'\fR | |
994 | times. | |
995 | .IP "\(bu" 4 | |
996 | \&\f(CW\*(C`a{n,}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times | |
997 | .IP "\(bu" 4 | |
998 | \&\f(CW\*(C`a{n}\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times | |
999 | .PP | |
1000 | Here are some examples: | |
1001 | .PP | |
1002 | .Vb 9 | |
1003 | \& /[a-z]+\es+\ed*/; # match a lowercase word, at least some space, and | |
1004 | \& # any number of digits | |
1005 | \& /(\ew+)\es+\e1/; # match doubled words of arbitrary length | |
1006 | \& /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes' | |
1007 | \& $year =~ /\ed{2,4}/; # make sure year is at least 2 but not more | |
1008 | \& # than 4 digits | |
1009 | \& $year =~ /\ed{4}|\ed{2}/; # better match; throw out 3 digit dates | |
1010 | \& $year =~ /\ed{2}(\ed{2})?/; # same thing written differently. However, | |
1011 | \& # this produces $1 and the other does not. | |
1012 | .Ve | |
1013 | .PP | |
1014 | .Vb 7 | |
1015 | \& % simple_grep '^(\ew+)\e1$' /usr/dict/words # isn't this easier? | |
1016 | \& beriberi | |
1017 | \& booboo | |
1018 | \& coco | |
1019 | \& mama | |
1020 | \& murmur | |
1021 | \& papa | |
1022 | .Ve | |
1023 | .PP | |
1024 | For all of these quantifiers, perl will try to match as much of the | |
1025 | string as possible, while still allowing the regexp to succeed. Thus | |
1026 | with \f(CW\*(C`/a?.../\*(C'\fR, perl will first try to match the regexp with the \f(CW\*(C`a\*(C'\fR | |
1027 | present; if that fails, perl will try to match the regexp without the | |
1028 | \&\f(CW\*(C`a\*(C'\fR present. For the quantifier \f(CW\*(C`*\*(C'\fR, we get the following: | |
1029 | .PP | |
1030 | .Vb 5 | |
1031 | \& $x = "the cat in the hat"; | |
1032 | \& $x =~ /^(.*)(cat)(.*)$/; # matches, | |
1033 | \& # $1 = 'the ' | |
1034 | \& # $2 = 'cat' | |
1035 | \& # $3 = ' in the hat' | |
1036 | .Ve | |
1037 | .PP | |
1038 | This is what we might expect: the match finds the only \f(CW\*(C`cat\*(C'\fR in the | |
1039 | string and locks onto it. Consider, however, this regexp: | |
1040 | .PP | |
1041 | .Vb 4 | |
1042 | \& $x =~ /^(.*)(at)(.*)$/; # matches, | |
1043 | \& # $1 = 'the cat in the h' | |
1044 | \& # $2 = 'at' | |
1045 | \& # $3 = '' (0 matches) | |
1046 | .Ve | |
1047 | .PP | |
1048 | One might initially guess that perl would find the \f(CW\*(C`at\*(C'\fR in \f(CW\*(C`cat\*(C'\fR and | |
1049 | stop there, but that wouldn't give the longest possible string to the | |
1050 | first quantifier \f(CW\*(C`.*\*(C'\fR. Instead, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs as | |
1051 | much of the string as possible while still having the regexp match. In | |
1052 | this example, that means matching the \f(CW\*(C`at\*(C'\fR element against the final \f(CW\*(C`at\*(C'\fR | |
1053 | in the string. The other important principle illustrated here is that | |
1054 | when there are two or more elements in a regexp, the \fIleftmost\fR | |
1055 | quantifier, if there is one, gets to grab as much of the string as | |
1056 | possible, leaving the rest of the regexp to fight over scraps. Thus in | |
1057 | our example, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs most of the string, while | |
1058 | the second quantifier \f(CW\*(C`.*\*(C'\fR gets the empty string. Quantifiers that | |
1059 | grab as much of the string as possible are called \fBmaximal match\fR or | |
1060 | \&\fBgreedy\fR quantifiers. | |
1061 | .PP | |
1062 | When a regexp can match a string in several different ways, we can use | |
1063 | the principles above to predict which way the regexp will match: | |
1064 | .IP "\(bu" 4 | |
1065 | Principle 0: Taken as a whole, any regexp will be matched at the | |
1066 | earliest possible position in the string. | |
1067 | .IP "\(bu" 4 | |
1068 | Principle 1: In an alternation \f(CW\*(C`a|b|c...\*(C'\fR, the leftmost alternative | |
1069 | that allows a match for the whole regexp will be the one used. | |
1070 | .IP "\(bu" 4 | |
1071 | Principle 2: The maximal matching quantifiers \f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR and | |
1072 | \&\f(CW\*(C`{n,m}\*(C'\fR will in general match as much of the string as possible while | |
1073 | still allowing the whole regexp to match. | |
1074 | .IP "\(bu" 4 | |
1075 | Principle 3: If there are two or more elements in a regexp, the | |
1076 | leftmost greedy quantifier, if any, will match as much of the string | |
1077 | as possible while still allowing the whole regexp to match. The next | |
1078 | leftmost greedy quantifier, if any, will try to match as much of the | |
1079 | string remaining available to it as possible, while still allowing the | |
1080 | whole regexp to match. And so on, until all the regexp elements are | |
1081 | satisfied. | |
1082 | .PP | |
1083 | As we have seen above, Principle 0 overrides the others \- the regexp | |
1084 | will be matched as early as possible, with the other principles | |
1085 | determining how the regexp matches at that earliest character | |
1086 | position. | |
1087 | .PP | |
1088 | Here is an example of these principles in action: | |
1089 | .PP | |
1090 | .Vb 5 | |
1091 | \& $x = "The programming republic of Perl"; | |
1092 | \& $x =~ /^(.+)(e|r)(.*)$/; # matches, | |
1093 | \& # $1 = 'The programming republic of Pe' | |
1094 | \& # $2 = 'r' | |
1095 | \& # $3 = 'l' | |
1096 | .Ve | |
1097 | .PP | |
1098 | This regexp matches at the earliest string position, \f(CW'T'\fR. One | |
1099 | might think that \f(CW\*(C`e\*(C'\fR, being leftmost in the alternation, would be | |
1100 | matched, but \f(CW\*(C`r\*(C'\fR produces the longest string in the first quantifier. | |
1101 | .PP | |
1102 | .Vb 3 | |
1103 | \& $x =~ /(m{1,2})(.*)$/; # matches, | |
1104 | \& # $1 = 'mm' | |
1105 | \& # $2 = 'ing republic of Perl' | |
1106 | .Ve | |
1107 | .PP | |
1108 | Here, the earliest possible match is at the first \f(CW'm'\fR in | |
1109 | \&\f(CW\*(C`programming\*(C'\fR. \f(CW\*(C`m{1,2}\*(C'\fR is the first quantifier, so it gets to match | |
1110 | a maximal \f(CW\*(C`mm\*(C'\fR. | |
1111 | .PP | |
1112 | .Vb 3 | |
1113 | \& $x =~ /.*(m{1,2})(.*)$/; # matches, | |
1114 | \& # $1 = 'm' | |
1115 | \& # $2 = 'ing republic of Perl' | |
1116 | .Ve | |
1117 | .PP | |
1118 | Here, the regexp matches at the start of the string. The first | |
1119 | quantifier \f(CW\*(C`.*\*(C'\fR grabs as much as possible, leaving just a single | |
1120 | \&\f(CW'm'\fR for the second quantifier \f(CW\*(C`m{1,2}\*(C'\fR. | |
1121 | .PP | |
1122 | .Vb 4 | |
1123 | \& $x =~ /(.?)(m{1,2})(.*)$/; # matches, | |
1124 | \& # $1 = 'a' | |
1125 | \& # $2 = 'mm' | |
1126 | \& # $3 = 'ing republic of Perl' | |
1127 | .Ve | |
1128 | .PP | |
1129 | Here, \f(CW\*(C`.?\*(C'\fR eats its maximal one character at the earliest possible | |
1130 | position in the string, \f(CW'a'\fR in \f(CW\*(C`programming\*(C'\fR, leaving \f(CW\*(C`m{1,2}\*(C'\fR | |
1131 | the opportunity to match both \f(CW\*(C`m\*(C'\fR's. Finally, | |
1132 | .PP | |
1133 | .Vb 1 | |
1134 | \& "aXXXb" =~ /(X*)/; # matches with $1 = '' | |
1135 | .Ve | |
1136 | .PP | |
1137 | because it can match zero copies of \f(CW'X'\fR at the beginning of the | |
1138 | string. If you definitely want to match at least one \f(CW'X'\fR, use | |
1139 | \&\f(CW\*(C`X+\*(C'\fR, not \f(CW\*(C`X*\*(C'\fR. | |
1140 | .PP | |
1141 | Sometimes greed is not good. At times, we would like quantifiers to | |
1142 | match a \fIminimal\fR piece of string, rather than a maximal piece. For | |
1143 | this purpose, Larry Wall created the \fBminimal\ match\fR\ or | |
1144 | \&\fBnon-greedy\fR quantifiers \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, and \f(CW\*(C`{}?\*(C'\fR. These are | |
1145 | the usual quantifiers with a \f(CW\*(C`?\*(C'\fR appended to them. They have the | |
1146 | following meanings: | |
1147 | .IP "\(bu" 4 | |
1148 | \&\f(CW\*(C`a??\*(C'\fR = match 'a' 0 or 1 times. Try 0 first, then 1. | |
1149 | .IP "\(bu" 4 | |
1150 | \&\f(CW\*(C`a*?\*(C'\fR = match 'a' 0 or more times, i.e., any number of times, | |
1151 | but as few times as possible | |
1152 | .IP "\(bu" 4 | |
1153 | \&\f(CW\*(C`a+?\*(C'\fR = match 'a' 1 or more times, i.e., at least once, but | |
1154 | as few times as possible | |
1155 | .IP "\(bu" 4 | |
1156 | \&\f(CW\*(C`a{n,m}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, not more than \f(CW\*(C`m\*(C'\fR | |
1157 | times, as few times as possible | |
1158 | .IP "\(bu" 4 | |
1159 | \&\f(CW\*(C`a{n,}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but as few times as | |
1160 | possible | |
1161 | .IP "\(bu" 4 | |
1162 | \&\f(CW\*(C`a{n}?\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times. Because we match exactly | |
1163 | \&\f(CW\*(C`n\*(C'\fR times, \f(CW\*(C`a{n}?\*(C'\fR is equivalent to \f(CW\*(C`a{n}\*(C'\fR and is just there for | |
1164 | notational consistency. | |
1165 | .PP | |
1166 | Let's look at the example above, but with minimal quantifiers: | |
1167 | .PP | |
1168 | .Vb 5 | |
1169 | \& $x = "The programming republic of Perl"; | |
1170 | \& $x =~ /^(.+?)(e|r)(.*)$/; # matches, | |
1171 | \& # $1 = 'Th' | |
1172 | \& # $2 = 'e' | |
1173 | \& # $3 = ' programming republic of Perl' | |
1174 | .Ve | |
1175 | .PP | |
1176 | The minimal string that will allow both the start of the string \f(CW\*(C`^\*(C'\fR | |
1177 | and the alternation to match is \f(CW\*(C`Th\*(C'\fR, with the alternation \f(CW\*(C`e|r\*(C'\fR | |
1178 | matching \f(CW\*(C`e\*(C'\fR. The second quantifier \f(CW\*(C`.*\*(C'\fR is free to gobble up the | |
1179 | rest of the string. | |
1180 | .PP | |
1181 | .Vb 3 | |
1182 | \& $x =~ /(m{1,2}?)(.*?)$/; # matches, | |
1183 | \& # $1 = 'm' | |
1184 | \& # $2 = 'ming republic of Perl' | |
1185 | .Ve | |
1186 | .PP | |
1187 | The first string position that this regexp can match is at the first | |
1188 | \&\f(CW'm'\fR in \f(CW\*(C`programming\*(C'\fR. At this position, the minimal \f(CW\*(C`m{1,2}?\*(C'\fR | |
1189 | matches just one \f(CW'm'\fR. Although the second quantifier \f(CW\*(C`.*?\*(C'\fR would | |
1190 | prefer to match no characters, it is constrained by the end-of-string | |
1191 | anchor \f(CW\*(C`$\*(C'\fR to match the rest of the string. | |
1192 | .PP | |
1193 | .Vb 4 | |
1194 | \& $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches, | |
1195 | \& # $1 = 'The progra' | |
1196 | \& # $2 = 'm' | |
1197 | \& # $3 = 'ming republic of Perl' | |
1198 | .Ve | |
1199 | .PP | |
1200 | In this regexp, you might expect the first minimal quantifier \f(CW\*(C`.*?\*(C'\fR | |
1201 | to match the empty string, because it is not constrained by a \f(CW\*(C`^\*(C'\fR | |
1202 | anchor to match the beginning of the string. Principle 0 applies here, | |
1203 | however. Because it is possible for the whole regexp to match at the | |
1204 | start of the string, it \fIwill\fR match at the start of the string. Thus | |
1205 | the first quantifier has to match everything up to the first \f(CW\*(C`m\*(C'\fR. The | |
1206 | second minimal quantifier matches just one \f(CW\*(C`m\*(C'\fR and the third | |
1207 | quantifier matches the rest of the string. | |
1208 | .PP | |
1209 | .Vb 4 | |
1210 | \& $x =~ /(.??)(m{1,2})(.*)$/; # matches, | |
1211 | \& # $1 = 'a' | |
1212 | \& # $2 = 'mm' | |
1213 | \& # $3 = 'ing republic of Perl' | |
1214 | .Ve | |
1215 | .PP | |
1216 | Just as in the previous regexp, the first quantifier \f(CW\*(C`.??\*(C'\fR can match | |
1217 | earliest at position \f(CW'a'\fR, so it does. The second quantifier is | |
1218 | greedy, so it matches \f(CW\*(C`mm\*(C'\fR, and the third matches the rest of the | |
1219 | string. | |
1220 | .PP | |
1221 | We can modify principle 3 above to take into account non-greedy | |
1222 | quantifiers: | |
1223 | .IP "\(bu" 4 | |
1224 | Principle 3: If there are two or more elements in a regexp, the | |
1225 | leftmost greedy (non\-greedy) quantifier, if any, will match as much | |
1226 | (little) of the string as possible while still allowing the whole | |
1227 | regexp to match. The next leftmost greedy (non\-greedy) quantifier, if | |
1228 | any, will try to match as much (little) of the string remaining | |
1229 | available to it as possible, while still allowing the whole regexp to | |
1230 | match. And so on, until all the regexp elements are satisfied. | |
1231 | .PP | |
1232 | Just like alternation, quantifiers are also susceptible to | |
1233 | backtracking. Here is a step-by-step analysis of the example | |
1234 | .PP | |
1235 | .Vb 5 | |
1236 | \& $x = "the cat in the hat"; | |
1237 | \& $x =~ /^(.*)(at)(.*)$/; # matches, | |
1238 | \& # $1 = 'the cat in the h' | |
1239 | \& # $2 = 'at' | |
1240 | \& # $3 = '' (0 matches) | |
1241 | .Ve | |
1242 | .IP "0" 4 | |
1243 | Start with the first letter in the string 't'. | |
1244 | .IP "1" 4 | |
1245 | .IX Item "1" | |
1246 | The first quantifier '.*' starts out by matching the whole | |
1247 | string 'the cat in the hat'. | |
1248 | .IP "2" 4 | |
1249 | .IX Item "2" | |
1250 | \&'a' in the regexp element 'at' doesn't match the end of the | |
1251 | string. Backtrack one character. | |
1252 | .IP "3" 4 | |
1253 | .IX Item "3" | |
1254 | \&'a' in the regexp element 'at' still doesn't match the last | |
1255 | letter of the string 't', so backtrack one more character. | |
1256 | .IP "4" 4 | |
1257 | .IX Item "4" | |
1258 | Now we can match the 'a' and the 't'. | |
1259 | .IP "5" 4 | |
1260 | .IX Item "5" | |
1261 | Move on to the third element '.*'. Since we are at the end of | |
1262 | the string and '.*' can match 0 times, assign it the empty string. | |
1263 | .IP "6" 4 | |
1264 | .IX Item "6" | |
1265 | We are done! | |
1266 | .PP | |
1267 | Most of the time, all this moving forward and backtracking happens | |
1268 | quickly and searching is fast. There are some pathological regexps, | |
1269 | however, whose execution time exponentially grows with the size of the | |
1270 | string. A typical structure that blows up in your face is of the form | |
1271 | .PP | |
1272 | .Vb 1 | |
1273 | \& /(a|b+)*/; | |
1274 | .Ve | |
1275 | .PP | |
1276 | The problem is the nested indeterminate quantifiers. There are many | |
1277 | different ways of partitioning a string of length n between the \f(CW\*(C`+\*(C'\fR | |
1278 | and \f(CW\*(C`*\*(C'\fR: one repetition with \f(CW\*(C`b+\*(C'\fR of length n, two repetitions with | |
1279 | the first \f(CW\*(C`b+\*(C'\fR length k and the second with length n\-k, m repetitions | |
1280 | whose bits add up to length n, etc. In fact there are an exponential | |
1281 | number of ways to partition a string as a function of length. A | |
1282 | regexp may get lucky and match early in the process, but if there is | |
1283 | no match, perl will try \fIevery\fR possibility before giving up. So be | |
1284 | careful with nested \f(CW\*(C`*\*(C'\fR's, \f(CW\*(C`{n,m}\*(C'\fR's, and \f(CW\*(C`+\*(C'\fR's. The book | |
1285 | \&\fIMastering regular expressions\fR by Jeffrey Friedl gives a wonderful | |
1286 | discussion of this and other efficiency issues. | |
1287 | .Sh "Building a regexp" | |
1288 | .IX Subsection "Building a regexp" | |
1289 | At this point, we have all the basic regexp concepts covered, so let's | |
1290 | give a more involved example of a regular expression. We will build a | |
1291 | regexp that matches numbers. | |
1292 | .PP | |
1293 | The first task in building a regexp is to decide what we want to match | |
1294 | and what we want to exclude. In our case, we want to match both | |
1295 | integers and floating point numbers and we want to reject any string | |
1296 | that isn't a number. | |
1297 | .PP | |
1298 | The next task is to break the problem down into smaller problems that | |
1299 | are easily converted into a regexp. | |
1300 | .PP | |
1301 | The simplest case is integers. These consist of a sequence of digits, | |
1302 | with an optional sign in front. The digits we can represent with | |
1303 | \&\f(CW\*(C`\ed+\*(C'\fR and the sign can be matched with \f(CW\*(C`[+\-]\*(C'\fR. Thus the integer | |
1304 | regexp is | |
1305 | .PP | |
1306 | .Vb 1 | |
1307 | \& /[+-]?\ed+/; # matches integers | |
1308 | .Ve | |
1309 | .PP | |
1310 | A floating point number potentially has a sign, an integral part, a | |
1311 | decimal point, a fractional part, and an exponent. One or more of these | |
1312 | parts is optional, so we need to check out the different | |
1313 | possibilities. Floating point numbers which are in proper form include | |
1314 | 123., 0.345, .34, \-1e6, and 25.4E\-72. As with integers, the sign out | |
1315 | front is completely optional and can be matched by \f(CW\*(C`[+\-]?\*(C'\fR. We can | |
1316 | see that if there is no exponent, floating point numbers must have a | |
1317 | decimal point, otherwise they are integers. We might be tempted to | |
1318 | model these with \f(CW\*(C`\ed*\e.\ed*\*(C'\fR, but this would also match just a single | |
1319 | decimal point, which is not a number. So the three cases of floating | |
1320 | point number sans exponent are | |
1321 | .PP | |
1322 | .Vb 3 | |
1323 | \& /[+-]?\ed+\e./; # 1., 321., etc. | |
1324 | \& /[+-]?\e.\ed+/; # .1, .234, etc. | |
1325 | \& /[+-]?\ed+\e.\ed+/; # 1.0, 30.56, etc. | |
1326 | .Ve | |
1327 | .PP | |
1328 | These can be combined into a single regexp with a three-way alternation: | |
1329 | .PP | |
1330 | .Vb 1 | |
1331 | \& /[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+)/; # floating point, no exponent | |
1332 | .Ve | |
1333 | .PP | |
1334 | In this alternation, it is important to put \f(CW'\ed+\e.\ed+'\fR before | |
1335 | \&\f(CW'\ed+\e.'\fR. If \f(CW'\ed+\e.'\fR were first, the regexp would happily match that | |
1336 | and ignore the fractional part of the number. | |
1337 | .PP | |
1338 | Now consider floating point numbers with exponents. The key | |
1339 | observation here is that \fIboth\fR integers and numbers with decimal | |
1340 | points are allowed in front of an exponent. Then exponents, like the | |
1341 | overall sign, are independent of whether we are matching numbers with | |
1342 | or without decimal points, and can be 'decoupled' from the | |
1343 | mantissa. The overall form of the regexp now becomes clear: | |
1344 | .PP | |
1345 | .Vb 1 | |
1346 | \& /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/; | |
1347 | .Ve | |
1348 | .PP | |
1349 | The exponent is an \f(CW\*(C`e\*(C'\fR or \f(CW\*(C`E\*(C'\fR, followed by an integer. So the | |
1350 | exponent regexp is | |
1351 | .PP | |
1352 | .Vb 1 | |
1353 | \& /[eE][+-]?\ed+/; # exponent | |
1354 | .Ve | |
1355 | .PP | |
1356 | Putting all the parts together, we get a regexp that matches numbers: | |
1357 | .PP | |
1358 | .Vb 1 | |
1359 | \& /^[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+|\ed+)([eE][+-]?\ed+)?$/; # Ta da! | |
1360 | .Ve | |
1361 | .PP | |
1362 | Long regexps like this may impress your friends, but can be hard to | |
1363 | decipher. In complex situations like this, the \f(CW\*(C`//x\*(C'\fR modifier for a | |
1364 | match is invaluable. It allows one to put nearly arbitrary whitespace | |
1365 | and comments into a regexp without affecting its meaning. Using it, | |
1366 | we can rewrite our 'extended' regexp in the more pleasing form | |
1367 | .PP | |
1368 | .Vb 10 | |
1369 | \& /^ | |
1370 | \& [+-]? # first, match an optional sign | |
1371 | \& ( # then match integers or f.p. mantissas: | |
1372 | \& \ed+\e.\ed+ # mantissa of the form a.b | |
1373 | \& |\ed+\e. # mantissa of the form a. | |
1374 | \& |\e.\ed+ # mantissa of the form .b | |
1375 | \& |\ed+ # integer of the form a | |
1376 | \& ) | |
1377 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent | |
1378 | \& $/x; | |
1379 | .Ve | |
1380 | .PP | |
1381 | If whitespace is mostly irrelevant, how does one include space | |
1382 | characters in an extended regexp? The answer is to backslash it | |
1383 | \&\f(CW'\e\ '\fR\ or put it in a character class \f(CW\*(C`[\ ]\*(C'\fR\ . The same thing | |
1384 | goes for pound signs, use \f(CW\*(C`\e#\*(C'\fR or \f(CW\*(C`[#]\*(C'\fR. For instance, Perl allows | |
1385 | a space between the sign and the mantissa/integer, and we could add | |
1386 | this to our regexp as follows: | |
1387 | .PP | |
1388 | .Vb 10 | |
1389 | \& /^ | |
1390 | \& [+-]?\e * # first, match an optional sign *and space* | |
1391 | \& ( # then match integers or f.p. mantissas: | |
1392 | \& \ed+\e.\ed+ # mantissa of the form a.b | |
1393 | \& |\ed+\e. # mantissa of the form a. | |
1394 | \& |\e.\ed+ # mantissa of the form .b | |
1395 | \& |\ed+ # integer of the form a | |
1396 | \& ) | |
1397 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent | |
1398 | \& $/x; | |
1399 | .Ve | |
1400 | .PP | |
1401 | In this form, it is easier to see a way to simplify the | |
1402 | alternation. Alternatives 1, 2, and 4 all start with \f(CW\*(C`\ed+\*(C'\fR, so it | |
1403 | could be factored out: | |
1404 | .PP | |
1405 | .Vb 11 | |
1406 | \& /^ | |
1407 | \& [+-]?\e * # first, match an optional sign | |
1408 | \& ( # then match integers or f.p. mantissas: | |
1409 | \& \ed+ # start out with a ... | |
1410 | \& ( | |
1411 | \& \e.\ed* # mantissa of the form a.b or a. | |
1412 | \& )? # ? takes care of integers of the form a | |
1413 | \& |\e.\ed+ # mantissa of the form .b | |
1414 | \& ) | |
1415 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent | |
1416 | \& $/x; | |
1417 | .Ve | |
1418 | .PP | |
1419 | or written in the compact form, | |
1420 | .PP | |
1421 | .Vb 1 | |
1422 | \& /^[+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?$/; | |
1423 | .Ve | |
1424 | .PP | |
1425 | This is our final regexp. To recap, we built a regexp by | |
1426 | .IP "\(bu" 4 | |
1427 | specifying the task in detail, | |
1428 | .IP "\(bu" 4 | |
1429 | breaking down the problem into smaller parts, | |
1430 | .IP "\(bu" 4 | |
1431 | translating the small parts into regexps, | |
1432 | .IP "\(bu" 4 | |
1433 | combining the regexps, | |
1434 | .IP "\(bu" 4 | |
1435 | and optimizing the final combined regexp. | |
1436 | .PP | |
1437 | These are also the typical steps involved in writing a computer | |
1438 | program. This makes perfect sense, because regular expressions are | |
1439 | essentially programs written in a little computer language that specifies | |
1440 | patterns. | |
1441 | .Sh "Using regular expressions in Perl" | |
1442 | .IX Subsection "Using regular expressions in Perl" | |
1443 | The last topic of Part 1 briefly covers how regexps are used in Perl | |
1444 | programs. Where do they fit into Perl syntax? | |
1445 | .PP | |
1446 | We have already introduced the matching operator in its default | |
1447 | \&\f(CW\*(C`/regexp/\*(C'\fR and arbitrary delimiter \f(CW\*(C`m!regexp!\*(C'\fR forms. We have used | |
1448 | the binding operator \f(CW\*(C`=~\*(C'\fR and its negation \f(CW\*(C`!~\*(C'\fR to test for string | |
1449 | matches. Associated with the matching operator, we have discussed the | |
1450 | single line \f(CW\*(C`//s\*(C'\fR, multi-line \f(CW\*(C`//m\*(C'\fR, case-insensitive \f(CW\*(C`//i\*(C'\fR and | |
1451 | extended \f(CW\*(C`//x\*(C'\fR modifiers. | |
1452 | .PP | |
1453 | There are a few more things you might want to know about matching | |
1454 | operators. First, we pointed out earlier that variables in regexps are | |
1455 | substituted before the regexp is evaluated: | |
1456 | .PP | |
1457 | .Vb 4 | |
1458 | \& $pattern = 'Seuss'; | |
1459 | \& while (<>) { | |
1460 | \& print if /$pattern/; | |
1461 | \& } | |
1462 | .Ve | |
1463 | .PP | |
1464 | This will print any lines containing the word \f(CW\*(C`Seuss\*(C'\fR. It is not as | |
1465 | efficient as it could be, however, because perl has to re-evaluate | |
1466 | \&\f(CW$pattern\fR each time through the loop. If \f(CW$pattern\fR won't be | |
1467 | changing over the lifetime of the script, we can add the \f(CW\*(C`//o\*(C'\fR | |
1468 | modifier, which directs perl to only perform variable substitutions | |
1469 | once: | |
1470 | .PP | |
1471 | .Vb 6 | |
1472 | \& #!/usr/bin/perl | |
1473 | \& # Improved simple_grep | |
1474 | \& $regexp = shift; | |
1475 | \& while (<>) { | |
1476 | \& print if /$regexp/o; # a good deal faster | |
1477 | \& } | |
1478 | .Ve | |
1479 | .PP | |
1480 | If you change \f(CW$pattern\fR after the first substitution happens, perl | |
1481 | will ignore it. If you don't want any substitutions at all, use the | |
1482 | special delimiter \f(CW\*(C`m''\*(C'\fR: | |
1483 | .PP | |
1484 | .Vb 4 | |
1485 | \& $pattern = 'Seuss'; | |
1486 | \& while (<>) { | |
1487 | \& print if m'$pattern'; # matches '$pattern', not 'Seuss' | |
1488 | \& } | |
1489 | .Ve | |
1490 | .PP | |
1491 | \&\f(CW\*(C`m''\*(C'\fR acts like single quotes on a regexp; all other \f(CW\*(C`m\*(C'\fR delimiters | |
1492 | act like double quotes. If the regexp evaluates to the empty string, | |
1493 | the regexp in the \fIlast successful match\fR is used instead. So we have | |
1494 | .PP | |
1495 | .Vb 2 | |
1496 | \& "dog" =~ /d/; # 'd' matches | |
1497 | \& "dogbert" =~ //; # this matches the 'd' regexp used before | |
1498 | .Ve | |
1499 | .PP | |
1500 | The final two modifiers \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`//c\*(C'\fR concern multiple matches. | |
1501 | The modifier \f(CW\*(C`//g\*(C'\fR stands for global matching and allows the | |
1502 | matching operator to match within a string as many times as possible. | |
1503 | In scalar context, successive invocations against a string will have | |
1504 | \&\f(CW\*(C`//g\*(C'\fR jump from match to match, keeping track of position in the | |
1505 | string as it goes along. You can get or set the position with the | |
1506 | \&\f(CW\*(C`pos()\*(C'\fR function. | |
1507 | .PP | |
1508 | The use of \f(CW\*(C`//g\*(C'\fR is shown in the following example. Suppose we have | |
1509 | a string that consists of words separated by spaces. If we know how | |
1510 | many words there are in advance, we could extract the words using | |
1511 | groupings: | |
1512 | .PP | |
1513 | .Vb 5 | |
1514 | \& $x = "cat dog house"; # 3 words | |
1515 | \& $x =~ /^\es*(\ew+)\es+(\ew+)\es+(\ew+)\es*$/; # matches, | |
1516 | \& # $1 = 'cat' | |
1517 | \& # $2 = 'dog' | |
1518 | \& # $3 = 'house' | |
1519 | .Ve | |
1520 | .PP | |
1521 | But what if we had an indeterminate number of words? This is the sort | |
1522 | of task \f(CW\*(C`//g\*(C'\fR was made for. To extract all words, form the simple | |
1523 | regexp \f(CW\*(C`(\ew+)\*(C'\fR and loop over all matches with \f(CW\*(C`/(\ew+)/g\*(C'\fR: | |
1524 | .PP | |
1525 | .Vb 3 | |
1526 | \& while ($x =~ /(\ew+)/g) { | |
1527 | \& print "Word is $1, ends at position ", pos $x, "\en"; | |
1528 | \& } | |
1529 | .Ve | |
1530 | .PP | |
1531 | prints | |
1532 | .PP | |
1533 | .Vb 3 | |
1534 | \& Word is cat, ends at position 3 | |
1535 | \& Word is dog, ends at position 7 | |
1536 | \& Word is house, ends at position 13 | |
1537 | .Ve | |
1538 | .PP | |
1539 | A failed match or changing the target string resets the position. If | |
1540 | you don't want the position reset after failure to match, add the | |
1541 | \&\f(CW\*(C`//c\*(C'\fR, as in \f(CW\*(C`/regexp/gc\*(C'\fR. The current position in the string is | |
1542 | associated with the string, not the regexp. This means that different | |
1543 | strings have different positions and their respective positions can be | |
1544 | set or read independently. | |
1545 | .PP | |
1546 | In list context, \f(CW\*(C`//g\*(C'\fR returns a list of matched groupings, or if | |
1547 | there are no groupings, a list of matches to the whole regexp. So if | |
1548 | we wanted just the words, we could use | |
1549 | .PP | |
1550 | .Vb 4 | |
1551 | \& @words = ($x =~ /(\ew+)/g); # matches, | |
1552 | \& # $words[0] = 'cat' | |
1553 | \& # $words[1] = 'dog' | |
1554 | \& # $words[2] = 'house' | |
1555 | .Ve | |
1556 | .PP | |
1557 | Closely associated with the \f(CW\*(C`//g\*(C'\fR modifier is the \f(CW\*(C`\eG\*(C'\fR anchor. The | |
1558 | \&\f(CW\*(C`\eG\*(C'\fR anchor matches at the point where the previous \f(CW\*(C`//g\*(C'\fR match left | |
1559 | off. \f(CW\*(C`\eG\*(C'\fR allows us to easily do context-sensitive matching: | |
1560 | .PP | |
1561 | .Vb 12 | |
1562 | \& $metric = 1; # use metric units | |
1563 | \& ... | |
1564 | \& $x = <FILE>; # read in measurement | |
1565 | \& $x =~ /^([+-]?\ed+)\es*/g; # get magnitude | |
1566 | \& $weight = $1; | |
1567 | \& if ($metric) { # error checking | |
1568 | \& print "Units error!" unless $x =~ /\eGkg\e./g; | |
1569 | \& } | |
1570 | \& else { | |
1571 | \& print "Units error!" unless $x =~ /\eGlbs\e./g; | |
1572 | \& } | |
1573 | \& $x =~ /\eG\es+(widget|sprocket)/g; # continue processing | |
1574 | .Ve | |
1575 | .PP | |
1576 | The combination of \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR allows us to process the string a | |
1577 | bit at a time and use arbitrary Perl logic to decide what to do next. | |
1578 | Currently, the \f(CW\*(C`\eG\*(C'\fR anchor is only fully supported when used to anchor | |
1579 | to the start of the pattern. | |
1580 | .PP | |
1581 | \&\f(CW\*(C`\eG\*(C'\fR is also invaluable in processing fixed length records with | |
1582 | regexps. Suppose we have a snippet of coding region \s-1DNA\s0, encoded as | |
1583 | base pair letters \f(CW\*(C`ATCGTTGAAT...\*(C'\fR and we want to find all the stop | |
1584 | codons \f(CW\*(C`TGA\*(C'\fR. In a coding region, codons are 3\-letter sequences, so | |
1585 | we can think of the \s-1DNA\s0 snippet as a sequence of 3\-letter records. The | |
1586 | naive regexp | |
1587 | .PP | |
1588 | .Vb 3 | |
1589 | \& # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC" | |
1590 | \& $dna = "ATCGTTGAATGCAAATGACATGAC"; | |
1591 | \& $dna =~ /TGA/; | |
1592 | .Ve | |
1593 | .PP | |
1594 | doesn't work; it may match a \f(CW\*(C`TGA\*(C'\fR, but there is no guarantee that | |
1595 | the match is aligned with codon boundaries, e.g., the substring | |
1596 | \&\f(CW\*(C`GTT\ GAA\*(C'\fR\ gives a match. A better solution is | |
1597 | .PP | |
1598 | .Vb 3 | |
1599 | \& while ($dna =~ /(\ew\ew\ew)*?TGA/g) { # note the minimal *? | |
1600 | \& print "Got a TGA stop codon at position ", pos $dna, "\en"; | |
1601 | \& } | |
1602 | .Ve | |
1603 | .PP | |
1604 | which prints | |
1605 | .PP | |
1606 | .Vb 2 | |
1607 | \& Got a TGA stop codon at position 18 | |
1608 | \& Got a TGA stop codon at position 23 | |
1609 | .Ve | |
1610 | .PP | |
1611 | Position 18 is good, but position 23 is bogus. What happened? | |
1612 | .PP | |
1613 | The answer is that our regexp works well until we get past the last | |
1614 | real match. Then the regexp will fail to match a synchronized \f(CW\*(C`TGA\*(C'\fR | |
1615 | and start stepping ahead one character position at a time, not what we | |
1616 | want. The solution is to use \f(CW\*(C`\eG\*(C'\fR to anchor the match to the codon | |
1617 | alignment: | |
1618 | .PP | |
1619 | .Vb 3 | |
1620 | \& while ($dna =~ /\eG(\ew\ew\ew)*?TGA/g) { | |
1621 | \& print "Got a TGA stop codon at position ", pos $dna, "\en"; | |
1622 | \& } | |
1623 | .Ve | |
1624 | .PP | |
1625 | This prints | |
1626 | .PP | |
1627 | .Vb 1 | |
1628 | \& Got a TGA stop codon at position 18 | |
1629 | .Ve | |
1630 | .PP | |
1631 | which is the correct answer. This example illustrates that it is | |
1632 | important not only to match what is desired, but to reject what is not | |
1633 | desired. | |
1634 | .PP | |
1635 | \&\fBsearch and replace\fR | |
1636 | .PP | |
1637 | Regular expressions also play a big role in \fBsearch and replace\fR | |
1638 | operations in Perl. Search and replace is accomplished with the | |
1639 | \&\f(CW\*(C`s///\*(C'\fR operator. The general form is | |
1640 | \&\f(CW\*(C`s/regexp/replacement/modifiers\*(C'\fR, with everything we know about | |
1641 | regexps and modifiers applying in this case as well. The | |
1642 | \&\f(CW\*(C`replacement\*(C'\fR is a Perl double quoted string that replaces in the | |
1643 | string whatever is matched with the \f(CW\*(C`regexp\*(C'\fR. The operator \f(CW\*(C`=~\*(C'\fR is | |
1644 | also used here to associate a string with \f(CW\*(C`s///\*(C'\fR. If matching | |
1645 | against \f(CW$_\fR, the \f(CW\*(C`$_\ =~\*(C'\fR\ can be dropped. If there is a match, | |
1646 | \&\f(CW\*(C`s///\*(C'\fR returns the number of substitutions made, otherwise it returns | |
1647 | false. Here are a few examples: | |
1648 | .PP | |
1649 | .Vb 8 | |
1650 | \& $x = "Time to feed the cat!"; | |
1651 | \& $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" | |
1652 | \& if ($x =~ s/^(Time.*hacker)!$/$1 now!/) { | |
1653 | \& $more_insistent = 1; | |
1654 | \& } | |
1655 | \& $y = "'quoted words'"; | |
1656 | \& $y =~ s/^'(.*)'$/$1/; # strip single quotes, | |
1657 | \& # $y contains "quoted words" | |
1658 | .Ve | |
1659 | .PP | |
1660 | In the last example, the whole string was matched, but only the part | |
1661 | inside the single quotes was grouped. With the \f(CW\*(C`s///\*(C'\fR operator, the | |
1662 | matched variables \f(CW$1\fR, \f(CW$2\fR, etc. are immediately available for use | |
1663 | in the replacement expression, so we use \f(CW$1\fR to replace the quoted | |
1664 | string with just what was quoted. With the global modifier, \f(CW\*(C`s///g\*(C'\fR | |
1665 | will search and replace all occurrences of the regexp in the string: | |
1666 | .PP | |
1667 | .Vb 6 | |
1668 | \& $x = "I batted 4 for 4"; | |
1669 | \& $x =~ s/4/four/; # doesn't do it all: | |
1670 | \& # $x contains "I batted four for 4" | |
1671 | \& $x = "I batted 4 for 4"; | |
1672 | \& $x =~ s/4/four/g; # does it all: | |
1673 | \& # $x contains "I batted four for four" | |
1674 | .Ve | |
1675 | .PP | |
1676 | If you prefer 'regex' over 'regexp' in this tutorial, you could use | |
1677 | the following program to replace it: | |
1678 | .PP | |
1679 | .Vb 9 | |
1680 | \& % cat > simple_replace | |
1681 | \& #!/usr/bin/perl | |
1682 | \& $regexp = shift; | |
1683 | \& $replacement = shift; | |
1684 | \& while (<>) { | |
1685 | \& s/$regexp/$replacement/go; | |
1686 | \& print; | |
1687 | \& } | |
1688 | \& ^D | |
1689 | .Ve | |
1690 | .PP | |
1691 | .Vb 1 | |
1692 | \& % simple_replace regexp regex perlretut.pod | |
1693 | .Ve | |
1694 | .PP | |
1695 | In \f(CW\*(C`simple_replace\*(C'\fR we used the \f(CW\*(C`s///g\*(C'\fR modifier to replace all | |
1696 | occurrences of the regexp on each line and the \f(CW\*(C`s///o\*(C'\fR modifier to | |
1697 | compile the regexp only once. As with \f(CW\*(C`simple_grep\*(C'\fR, both the | |
1698 | \&\f(CW\*(C`print\*(C'\fR and the \f(CW\*(C`s/$regexp/$replacement/go\*(C'\fR use \f(CW$_\fR implicitly. | |
1699 | .PP | |
1700 | A modifier available specifically to search and replace is the | |
1701 | \&\f(CW\*(C`s///e\*(C'\fR evaluation modifier. \f(CW\*(C`s///e\*(C'\fR wraps an \f(CW\*(C`eval{...}\*(C'\fR around | |
1702 | the replacement string and the evaluated result is substituted for the | |
1703 | matched substring. \f(CW\*(C`s///e\*(C'\fR is useful if you need to do a bit of | |
1704 | computation in the process of replacing text. This example counts | |
1705 | character frequencies in a line: | |
1706 | .PP | |
1707 | .Vb 4 | |
1708 | \& $x = "Bill the cat"; | |
1709 | \& $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself | |
1710 | \& print "frequency of '$_' is $chars{$_}\en" | |
1711 | \& foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars); | |
1712 | .Ve | |
1713 | .PP | |
1714 | This prints | |
1715 | .PP | |
1716 | .Vb 9 | |
1717 | \& frequency of ' ' is 2 | |
1718 | \& frequency of 't' is 2 | |
1719 | \& frequency of 'l' is 2 | |
1720 | \& frequency of 'B' is 1 | |
1721 | \& frequency of 'c' is 1 | |
1722 | \& frequency of 'e' is 1 | |
1723 | \& frequency of 'h' is 1 | |
1724 | \& frequency of 'i' is 1 | |
1725 | \& frequency of 'a' is 1 | |
1726 | .Ve | |
1727 | .PP | |
1728 | As with the match \f(CW\*(C`m//\*(C'\fR operator, \f(CW\*(C`s///\*(C'\fR can use other delimiters, | |
1729 | such as \f(CW\*(C`s!!!\*(C'\fR and \f(CW\*(C`s{}{}\*(C'\fR, and even \f(CW\*(C`s{}//\*(C'\fR. If single quotes are | |
1730 | used \f(CW\*(C`s'''\*(C'\fR, then the regexp and replacement are treated as single | |
1731 | quoted strings and there are no substitutions. \f(CW\*(C`s///\*(C'\fR in list context | |
1732 | returns the same thing as in scalar context, i.e., the number of | |
1733 | matches. | |
1734 | .PP | |
1735 | \&\fBThe split operator\fR | |
1736 | .PP | |
1737 | The \fB\f(CB\*(C`split\*(C'\fB \fR function can also optionally use a matching operator | |
1738 | \&\f(CW\*(C`m//\*(C'\fR to split a string. \f(CW\*(C`split /regexp/, string, limit\*(C'\fR splits | |
1739 | \&\f(CW\*(C`string\*(C'\fR into a list of substrings and returns that list. The regexp | |
1740 | is used to match the character sequence that the \f(CW\*(C`string\*(C'\fR is split | |
1741 | with respect to. The \f(CW\*(C`limit\*(C'\fR, if present, constrains splitting into | |
1742 | no more than \f(CW\*(C`limit\*(C'\fR number of strings. For example, to split a | |
1743 | string into words, use | |
1744 | .PP | |
1745 | .Vb 4 | |
1746 | \& $x = "Calvin and Hobbes"; | |
1747 | \& @words = split /\es+/, $x; # $words[0] = 'Calvin' | |
1748 | \& # $words[1] = 'and' | |
1749 | \& # $words[2] = 'Hobbes' | |
1750 | .Ve | |
1751 | .PP | |
1752 | If the empty regexp \f(CW\*(C`//\*(C'\fR is used, the regexp always matches and | |
1753 | the string is split into individual characters. If the regexp has | |
1754 | groupings, then the list produced contains the matched substrings from the | |
1755 | groupings as well. For instance, | |
1756 | .PP | |
1757 | .Vb 12 | |
1758 | \& $x = "/usr/bin/perl"; | |
1759 | \& @dirs = split m!/!, $x; # $dirs[0] = '' | |
1760 | \& # $dirs[1] = 'usr' | |
1761 | \& # $dirs[2] = 'bin' | |
1762 | \& # $dirs[3] = 'perl' | |
1763 | \& @parts = split m!(/)!, $x; # $parts[0] = '' | |
1764 | \& # $parts[1] = '/' | |
1765 | \& # $parts[2] = 'usr' | |
1766 | \& # $parts[3] = '/' | |
1767 | \& # $parts[4] = 'bin' | |
1768 | \& # $parts[5] = '/' | |
1769 | \& # $parts[6] = 'perl' | |
1770 | .Ve | |
1771 | .PP | |
1772 | Since the first character of \f(CW$x\fR matched the regexp, \f(CW\*(C`split\*(C'\fR prepended | |
1773 | an empty initial element to the list. | |
1774 | .PP | |
1775 | If you have read this far, congratulations! You now have all the basic | |
1776 | tools needed to use regular expressions to solve a wide range of text | |
1777 | processing problems. If this is your first time through the tutorial, | |
1778 | why not stop here and play around with regexps a while... Part\ 2 | |
1779 | concerns the more esoteric aspects of regular expressions and those | |
1780 | concepts certainly aren't needed right at the start. | |
1781 | .SH "Part 2: Power tools" | |
1782 | .IX Header "Part 2: Power tools" | |
1783 | \&\s-1OK\s0, you know the basics of regexps and you want to know more. If | |
1784 | matching regular expressions is analogous to a walk in the woods, then | |
1785 | the tools discussed in Part 1 are analogous to topo maps and a | |
1786 | compass, basic tools we use all the time. Most of the tools in Part 2 | |
1787 | are analogous to flare guns and satellite phones. They aren't used | |
1788 | too often on a hike, but when we are stuck, they can be invaluable. | |
1789 | .PP | |
1790 | What follows are the more advanced, less used, or sometimes esoteric | |
1791 | capabilities of perl regexps. In Part 2, we will assume you are | |
1792 | comfortable with the basics and concentrate on the new features. | |
1793 | .Sh "More on characters, strings, and character classes" | |
1794 | .IX Subsection "More on characters, strings, and character classes" | |
1795 | There are a number of escape sequences and character classes that we | |
1796 | haven't covered yet. | |
1797 | .PP | |
1798 | There are several escape sequences that convert characters or strings | |
1799 | between upper and lower case. \f(CW\*(C`\el\*(C'\fR and \f(CW\*(C`\eu\*(C'\fR convert the next | |
1800 | character to lower or upper case, respectively: | |
1801 | .PP | |
1802 | .Vb 4 | |
1803 | \& $x = "perl"; | |
1804 | \& $string =~ /\eu$x/; # matches 'Perl' in $string | |
1805 | \& $x = "M(rs?|s)\e\e."; # note the double backslash | |
1806 | \& $string =~ /\el$x/; # matches 'mr.', 'mrs.', and 'ms.' | |
1807 | .Ve | |
1808 | .PP | |
1809 | \&\f(CW\*(C`\eL\*(C'\fR and \f(CW\*(C`\eU\*(C'\fR convert a whole substring, delimited by \f(CW\*(C`\eL\*(C'\fR or | |
1810 | \&\f(CW\*(C`\eU\*(C'\fR and \f(CW\*(C`\eE\*(C'\fR, to lower or upper case: | |
1811 | .PP | |
1812 | .Vb 4 | |
1813 | \& $x = "This word is in lower case:\eL SHOUT\eE"; | |
1814 | \& $x =~ /shout/; # matches | |
1815 | \& $x = "I STILL KEYPUNCH CARDS FOR MY 360"; | |
1816 | \& $x =~ /\eUkeypunch/; # matches punch card string | |
1817 | .Ve | |
1818 | .PP | |
1819 | If there is no \f(CW\*(C`\eE\*(C'\fR, case is converted until the end of the | |
1820 | string. The regexps \f(CW\*(C`\eL\eu$word\*(C'\fR or \f(CW\*(C`\eu\eL$word\*(C'\fR convert the first | |
1821 | character of \f(CW$word\fR to uppercase and the rest of the characters to | |
1822 | lowercase. | |
1823 | .PP | |
1824 | Control characters can be escaped with \f(CW\*(C`\ec\*(C'\fR, so that a control-Z | |
1825 | character would be matched with \f(CW\*(C`\ecZ\*(C'\fR. The escape sequence | |
1826 | \&\f(CW\*(C`\eQ\*(C'\fR...\f(CW\*(C`\eE\*(C'\fR quotes, or protects most non-alphabetic characters. For | |
1827 | instance, | |
1828 | .PP | |
1829 | .Vb 2 | |
1830 | \& $x = "\eQThat !^*&%~& cat!"; | |
1831 | \& $x =~ /\eQ!^*&%~&\eE/; # check for rough language | |
1832 | .Ve | |
1833 | .PP | |
1834 | It does not protect \f(CW\*(C`$\*(C'\fR or \f(CW\*(C`@\*(C'\fR, so that variables can still be | |
1835 | substituted. | |
1836 | .PP | |
1837 | With the advent of 5.6.0, perl regexps can handle more than just the | |
1838 | standard \s-1ASCII\s0 character set. Perl now supports \fBUnicode\fR, a standard | |
1839 | for encoding the character sets from many of the world's written | |
1840 | languages. Unicode does this by allowing characters to be more than | |
1841 | one byte wide. Perl uses the \s-1UTF\-8\s0 encoding, in which \s-1ASCII\s0 characters | |
1842 | are still encoded as one byte, but characters greater than \f(CW\*(C`chr(127)\*(C'\fR | |
1843 | may be stored as two or more bytes. | |
1844 | .PP | |
1845 | What does this mean for regexps? Well, regexp users don't need to know | |
1846 | much about perl's internal representation of strings. But they do need | |
1847 | to know 1) how to represent Unicode characters in a regexp and 2) when | |
1848 | a matching operation will treat the string to be searched as a | |
1849 | sequence of bytes (the old way) or as a sequence of Unicode characters | |
1850 | (the new way). The answer to 1) is that Unicode characters greater | |
1851 | than \f(CW\*(C`chr(127)\*(C'\fR may be represented using the \f(CW\*(C`\ex{hex}\*(C'\fR notation, | |
1852 | with \f(CW\*(C`hex\*(C'\fR a hexadecimal integer: | |
1853 | .PP | |
1854 | .Vb 1 | |
1855 | \& /\ex{263a}/; # match a Unicode smiley face :) | |
1856 | .Ve | |
1857 | .PP | |
1858 | Unicode characters in the range of 128\-255 use two hexadecimal digits | |
1859 | with braces: \f(CW\*(C`\ex{ab}\*(C'\fR. Note that this is different than \f(CW\*(C`\exab\*(C'\fR, | |
1860 | which is just a hexadecimal byte with no Unicode significance. | |
1861 | .PP | |
1862 | \&\fB\s-1NOTE\s0\fR: in Perl 5.6.0 it used to be that one needed to say \f(CW\*(C`use | |
1863 | utf8\*(C'\fR to use any Unicode features. This is no longer the case: for | |
1864 | almost all Unicode processing, the explicit \f(CW\*(C`utf8\*(C'\fR pragma is not | |
1865 | needed. (The only case where it matters is if your Perl script is in | |
1866 | Unicode and encoded in \s-1UTF\-8\s0, then an explicit \f(CW\*(C`use utf8\*(C'\fR is needed.) | |
1867 | .PP | |
1868 | Figuring out the hexadecimal sequence of a Unicode character you want | |
1869 | or deciphering someone else's hexadecimal Unicode regexp is about as | |
1870 | much fun as programming in machine code. So another way to specify | |
1871 | Unicode characters is to use the \fBnamed\ character\fR\ escape | |
1872 | sequence \f(CW\*(C`\eN{name}\*(C'\fR. \f(CW\*(C`name\*(C'\fR is a name for the Unicode character, as | |
1873 | specified in the Unicode standard. For instance, if we wanted to | |
1874 | represent or match the astrological sign for the planet Mercury, we | |
1875 | could use | |
1876 | .PP | |
1877 | .Vb 3 | |
1878 | \& use charnames ":full"; # use named chars with Unicode full names | |
1879 | \& $x = "abc\eN{MERCURY}def"; | |
1880 | \& $x =~ /\eN{MERCURY}/; # matches | |
1881 | .Ve | |
1882 | .PP | |
1883 | One can also use short names or restrict names to a certain alphabet: | |
1884 | .PP | |
1885 | .Vb 2 | |
1886 | \& use charnames ':full'; | |
1887 | \& print "\eN{GREEK SMALL LETTER SIGMA} is called sigma.\en"; | |
1888 | .Ve | |
1889 | .PP | |
1890 | .Vb 2 | |
1891 | \& use charnames ":short"; | |
1892 | \& print "\eN{greek:Sigma} is an upper-case sigma.\en"; | |
1893 | .Ve | |
1894 | .PP | |
1895 | .Vb 2 | |
1896 | \& use charnames qw(greek); | |
1897 | \& print "\eN{sigma} is Greek sigma\en"; | |
1898 | .Ve | |
1899 | .PP | |
1900 | A list of full names is found in the file Names.txt in the | |
1901 | lib/perl5/5.X.X/unicore directory. | |
1902 | .PP | |
1903 | The answer to requirement 2), as of 5.6.0, is that if a regexp | |
1904 | contains Unicode characters, the string is searched as a sequence of | |
1905 | Unicode characters. Otherwise, the string is searched as a sequence of | |
1906 | bytes. If the string is being searched as a sequence of Unicode | |
1907 | characters, but matching a single byte is required, we can use the \f(CW\*(C`\eC\*(C'\fR | |
1908 | escape sequence. \f(CW\*(C`\eC\*(C'\fR is a character class akin to \f(CW\*(C`.\*(C'\fR except that | |
1909 | it matches \fIany\fR byte 0\-255. So | |
1910 | .PP | |
1911 | .Vb 7 | |
1912 | \& use charnames ":full"; # use named chars with Unicode full names | |
1913 | \& $x = "a"; | |
1914 | \& $x =~ /\eC/; # matches 'a', eats one byte | |
1915 | \& $x = ""; | |
1916 | \& $x =~ /\eC/; # doesn't match, no bytes to match | |
1917 | \& $x = "\eN{MERCURY}"; # two-byte Unicode character | |
1918 | \& $x =~ /\eC/; # matches, but dangerous! | |
1919 | .Ve | |
1920 | .PP | |
1921 | The last regexp matches, but is dangerous because the string | |
1922 | \&\fIcharacter\fR position is no longer synchronized to the string \fIbyte\fR | |
1923 | position. This generates the warning 'Malformed \s-1UTF\-8\s0 | |
1924 | character'. \f(CW\*(C`\eC\*(C'\fR is best used for matching the binary data in strings | |
1925 | with binary data intermixed with Unicode characters. | |
1926 | .PP | |
1927 | Let us now discuss the rest of the character classes. Just as with | |
1928 | Unicode characters, there are named Unicode character classes | |
1929 | represented by the \f(CW\*(C`\ep{name}\*(C'\fR escape sequence. Closely associated is | |
1930 | the \f(CW\*(C`\eP{name}\*(C'\fR character class, which is the negation of the | |
1931 | \&\f(CW\*(C`\ep{name}\*(C'\fR class. For example, to match lower and uppercase | |
1932 | characters, | |
1933 | .PP | |
1934 | .Vb 6 | |
1935 | \& use charnames ":full"; # use named chars with Unicode full names | |
1936 | \& $x = "BOB"; | |
1937 | \& $x =~ /^\ep{IsUpper}/; # matches, uppercase char class | |
1938 | \& $x =~ /^\eP{IsUpper}/; # doesn't match, char class sans uppercase | |
1939 | \& $x =~ /^\ep{IsLower}/; # doesn't match, lowercase char class | |
1940 | \& $x =~ /^\eP{IsLower}/; # matches, char class sans lowercase | |
1941 | .Ve | |
1942 | .PP | |
1943 | Here is the association between some Perl named classes and the | |
1944 | traditional Unicode classes: | |
1945 | .PP | |
1946 | .Vb 1 | |
1947 | \& Perl class name Unicode class name or regular expression | |
1948 | .Ve | |
1949 | .PP | |
1950 | .Vb 15 | |
1951 | \& IsAlpha /^[LM]/ | |
1952 | \& IsAlnum /^[LMN]/ | |
1953 | \& IsASCII $code <= 127 | |
1954 | \& IsCntrl /^C/ | |
1955 | \& IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/ | |
1956 | \& IsDigit Nd | |
1957 | \& IsGraph /^([LMNPS]|Co)/ | |
1958 | \& IsLower Ll | |
1959 | \& IsPrint /^([LMNPS]|Co|Zs)/ | |
1960 | \& IsPunct /^P/ | |
1961 | \& IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/) | |
1962 | \& IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D|0085|2028|2029)$/) | |
1963 | \& IsUpper /^L[ut]/ | |
1964 | \& IsWord /^[LMN]/ || $code eq "005F" | |
1965 | \& IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/ | |
1966 | .Ve | |
1967 | .PP | |
1968 | You can also use the official Unicode class names with the \f(CW\*(C`\ep\*(C'\fR and | |
1969 | \&\f(CW\*(C`\eP\*(C'\fR, like \f(CW\*(C`\ep{L}\*(C'\fR for Unicode 'letters', or \f(CW\*(C`\ep{Lu}\*(C'\fR for uppercase | |
1970 | letters, or \f(CW\*(C`\eP{Nd}\*(C'\fR for non\-digits. If a \f(CW\*(C`name\*(C'\fR is just one | |
1971 | letter, the braces can be dropped. For instance, \f(CW\*(C`\epM\*(C'\fR is the | |
1972 | character class of Unicode 'marks', for example accent marks. | |
1973 | For the full list see perlunicode. | |
1974 | .PP | |
1975 | Unicode has also been separated into various sets of characters | |
1976 | which you can test with \f(CW\*(C`\ep{In...}\*(C'\fR (in) and \f(CW\*(C`\eP{In...}\*(C'\fR (not in), | |
1977 | for example \f(CW\*(C`\ep{Latin}\*(C'\fR, \f(CW\*(C`\ep{Greek}\*(C'\fR, or \f(CW\*(C`\eP{Katakana}\*(C'\fR. | |
1978 | For the full list see perlunicode. | |
1979 | .PP | |
1980 | \&\f(CW\*(C`\eX\*(C'\fR is an abbreviation for a character class sequence that includes | |
1981 | the Unicode 'combining character sequences'. A 'combining character | |
1982 | sequence' is a base character followed by any number of combining | |
1983 | characters. An example of a combining character is an accent. Using | |
1984 | the Unicode full names, e.g., \f(CW\*(C`A\ +\ COMBINING\ RING\*(C'\fR\ is a combining | |
1985 | character sequence with base character \f(CW\*(C`A\*(C'\fR and combining character | |
1986 | \&\f(CW\*(C`COMBINING\ RING\*(C'\fR\ , which translates in Danish to A with the circle | |
1987 | atop it, as in the word Angstrom. \f(CW\*(C`\eX\*(C'\fR is equivalent to \f(CW\*(C`\ePM\epM*\*(C'\fR, | |
1988 | i.e., a non-mark followed by zero or more marks. | |
1989 | .PP | |
1990 | For the full and latest information about Unicode see the latest | |
1991 | Unicode standard, or the Unicode Consortium's website http://www.unicode.org/ | |
1992 | .PP | |
1993 | As if all those classes weren't enough, Perl also defines \s-1POSIX\s0 style | |
1994 | character classes. These have the form \f(CW\*(C`[:name:]\*(C'\fR, with \f(CW\*(C`name\*(C'\fR the | |
1995 | name of the \s-1POSIX\s0 class. The \s-1POSIX\s0 classes are \f(CW\*(C`alpha\*(C'\fR, \f(CW\*(C`alnum\*(C'\fR, | |
1996 | \&\f(CW\*(C`ascii\*(C'\fR, \f(CW\*(C`cntrl\*(C'\fR, \f(CW\*(C`digit\*(C'\fR, \f(CW\*(C`graph\*(C'\fR, \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`print\*(C'\fR, \f(CW\*(C`punct\*(C'\fR, | |
1997 | \&\f(CW\*(C`space\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, and \f(CW\*(C`xdigit\*(C'\fR, and two extensions, \f(CW\*(C`word\*(C'\fR (a Perl | |
1998 | extension to match \f(CW\*(C`\ew\*(C'\fR), and \f(CW\*(C`blank\*(C'\fR (a \s-1GNU\s0 extension). If \f(CW\*(C`utf8\*(C'\fR | |
1999 | is being used, then these classes are defined the same as their | |
2000 | corresponding perl Unicode classes: \f(CW\*(C`[:upper:]\*(C'\fR is the same as | |
2001 | \&\f(CW\*(C`\ep{IsUpper}\*(C'\fR, etc. The \s-1POSIX\s0 character classes, however, don't | |
2002 | require using \f(CW\*(C`utf8\*(C'\fR. The \f(CW\*(C`[:digit:]\*(C'\fR, \f(CW\*(C`[:word:]\*(C'\fR, and | |
2003 | \&\f(CW\*(C`[:space:]\*(C'\fR correspond to the familiar \f(CW\*(C`\ed\*(C'\fR, \f(CW\*(C`\ew\*(C'\fR, and \f(CW\*(C`\es\*(C'\fR | |
2004 | character classes. To negate a \s-1POSIX\s0 class, put a \f(CW\*(C`^\*(C'\fR in front of | |
2005 | the name, so that, e.g., \f(CW\*(C`[:^digit:]\*(C'\fR corresponds to \f(CW\*(C`\eD\*(C'\fR and under | |
2006 | \&\f(CW\*(C`utf8\*(C'\fR, \f(CW\*(C`\eP{IsDigit}\*(C'\fR. The Unicode and \s-1POSIX\s0 character classes can | |
2007 | be used just like \f(CW\*(C`\ed\*(C'\fR, with the exception that \s-1POSIX\s0 character | |
2008 | classes can only be used inside of a character class: | |
2009 | .PP | |
2010 | .Vb 7 | |
2011 | \& /\es+[abc[:digit:]xyz]\es*/; # match a,b,c,x,y,z, or a digit | |
2012 | \& /^=item\es[[:digit:]]/; # match '=item', | |
2013 | \& # followed by a space and a digit | |
2014 | \& use charnames ":full"; | |
2015 | \& /\es+[abc\ep{IsDigit}xyz]\es+/; # match a,b,c,x,y,z, or a digit | |
2016 | \& /^=item\es\ep{IsDigit}/; # match '=item', | |
2017 | \& # followed by a space and a digit | |
2018 | .Ve | |
2019 | .PP | |
2020 | Whew! That is all the rest of the characters and character classes. | |
2021 | .Sh "Compiling and saving regular expressions" | |
2022 | .IX Subsection "Compiling and saving regular expressions" | |
2023 | In Part 1 we discussed the \f(CW\*(C`//o\*(C'\fR modifier, which compiles a regexp | |
2024 | just once. This suggests that a compiled regexp is some data structure | |
2025 | that can be stored once and used again and again. The regexp quote | |
2026 | \&\f(CW\*(C`qr//\*(C'\fR does exactly that: \f(CW\*(C`qr/string/\*(C'\fR compiles the \f(CW\*(C`string\*(C'\fR as a | |
2027 | regexp and transforms the result into a form that can be assigned to a | |
2028 | variable: | |
2029 | .PP | |
2030 | .Vb 1 | |
2031 | \& $reg = qr/foo+bar?/; # reg contains a compiled regexp | |
2032 | .Ve | |
2033 | .PP | |
2034 | Then \f(CW$reg\fR can be used as a regexp: | |
2035 | .PP | |
2036 | .Vb 3 | |
2037 | \& $x = "fooooba"; | |
2038 | \& $x =~ $reg; # matches, just like /foo+bar?/ | |
2039 | \& $x =~ /$reg/; # same thing, alternate form | |
2040 | .Ve | |
2041 | .PP | |
2042 | \&\f(CW$reg\fR can also be interpolated into a larger regexp: | |
2043 | .PP | |
2044 | .Vb 1 | |
2045 | \& $x =~ /(abc)?$reg/; # still matches | |
2046 | .Ve | |
2047 | .PP | |
2048 | As with the matching operator, the regexp quote can use different | |
2049 | delimiters, e.g., \f(CW\*(C`qr!!\*(C'\fR, \f(CW\*(C`qr{}\*(C'\fR and \f(CW\*(C`qr~~\*(C'\fR. The single quote | |
2050 | delimiters \f(CW\*(C`qr''\*(C'\fR prevent any interpolation from taking place. | |
2051 | .PP | |
2052 | Pre-compiled regexps are useful for creating dynamic matches that | |
2053 | don't need to be recompiled each time they are encountered. Using | |
2054 | pre-compiled regexps, the \f(CW\*(C`simple_grep\*(C'\fR program can be expanded into a | |
2055 | program that matches multiple patterns: | |
2056 | .PP | |
2057 | .Vb 4 | |
2058 | \& % cat > multi_grep | |
2059 | \& #!/usr/bin/perl | |
2060 | \& # multi_grep - match any of <number> regexps | |
2061 | \& # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ... | |
2062 | .Ve | |
2063 | .PP | |
2064 | .Vb 12 | |
2065 | \& $number = shift; | |
2066 | \& $regexp[$_] = shift foreach (0..$number-1); | |
2067 | \& @compiled = map qr/$_/, @regexp; | |
2068 | \& while ($line = <>) { | |
2069 | \& foreach $pattern (@compiled) { | |
2070 | \& if ($line =~ /$pattern/) { | |
2071 | \& print $line; | |
2072 | \& last; # we matched, so move onto the next line | |
2073 | \& } | |
2074 | \& } | |
2075 | \& } | |
2076 | \& ^D | |
2077 | .Ve | |
2078 | .PP | |
2079 | .Vb 4 | |
2080 | \& % multi_grep 2 last for multi_grep | |
2081 | \& $regexp[$_] = shift foreach (0..$number-1); | |
2082 | \& foreach $pattern (@compiled) { | |
2083 | \& last; | |
2084 | .Ve | |
2085 | .PP | |
2086 | Storing pre-compiled regexps in an array \f(CW@compiled\fR allows us to | |
2087 | simply loop through the regexps without any recompilation, thus gaining | |
2088 | flexibility without sacrificing speed. | |
2089 | .Sh "Embedding comments and modifiers in a regular expression" | |
2090 | .IX Subsection "Embedding comments and modifiers in a regular expression" | |
2091 | Starting with this section, we will be discussing Perl's set of | |
2092 | \&\fBextended patterns\fR. These are extensions to the traditional regular | |
2093 | expression syntax that provide powerful new tools for pattern | |
2094 | matching. We have already seen extensions in the form of the minimal | |
2095 | matching constructs \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{n,m}?\*(C'\fR, and \f(CW\*(C`{n,}?\*(C'\fR. The | |
2096 | rest of the extensions below have the form \f(CW\*(C`(?char...)\*(C'\fR, where the | |
2097 | \&\f(CW\*(C`char\*(C'\fR is a character that determines the type of extension. | |
2098 | .PP | |
2099 | The first extension is an embedded comment \f(CW\*(C`(?#text)\*(C'\fR. This embeds a | |
2100 | comment into the regular expression without affecting its meaning. The | |
2101 | comment should not have any closing parentheses in the text. An | |
2102 | example is | |
2103 | .PP | |
2104 | .Vb 1 | |
2105 | \& /(?# Match an integer:)[+-]?\ed+/; | |
2106 | .Ve | |
2107 | .PP | |
2108 | This style of commenting has been largely superseded by the raw, | |
2109 | freeform commenting that is allowed with the \f(CW\*(C`//x\*(C'\fR modifier. | |
2110 | .PP | |
2111 | The modifiers \f(CW\*(C`//i\*(C'\fR, \f(CW\*(C`//m\*(C'\fR, \f(CW\*(C`//s\*(C'\fR, and \f(CW\*(C`//x\*(C'\fR can also be embedded in | |
2112 | a regexp using \f(CW\*(C`(?i)\*(C'\fR, \f(CW\*(C`(?m)\*(C'\fR, \f(CW\*(C`(?s)\*(C'\fR, and \f(CW\*(C`(?x)\*(C'\fR. For instance, | |
2113 | .PP | |
2114 | .Vb 7 | |
2115 | \& /(?i)yes/; # match 'yes' case insensitively | |
2116 | \& /yes/i; # same thing | |
2117 | \& /(?x)( # freeform version of an integer regexp | |
2118 | \& [+-]? # match an optional sign | |
2119 | \& \ed+ # match a sequence of digits | |
2120 | \& ) | |
2121 | \& /x; | |
2122 | .Ve | |
2123 | .PP | |
2124 | Embedded modifiers can have two important advantages over the usual | |
2125 | modifiers. Embedded modifiers allow a custom set of modifiers to | |
2126 | for \fIeach\fR regexp pattern. This is great for matching an array of regexps | |
2127 | that must have different modifiers: | |
2128 | .PP | |
2129 | .Vb 8 | |
2130 | \& $pattern[0] = '(?i)doctor'; | |
2131 | \& $pattern[1] = 'Johnson'; | |
2132 | \& ... | |
2133 | \& while (<>) { | |
2134 | \& foreach $patt (@pattern) { | |
2135 | \& print if /$patt/; | |
2136 | \& } | |
2137 | \& } | |
2138 | .Ve | |
2139 | .PP | |
2140 | The second advantage is that embedded modifiers only affect the regexp | |
2141 | inside the group the embedded modifier is contained in. So grouping | |
2142 | can be used to localize the modifier's effects: | |
2143 | .PP | |
2144 | .Vb 1 | |
2145 | \& /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc. | |
2146 | .Ve | |
2147 | .PP | |
2148 | Embedded modifiers can also turn off any modifiers already present | |
2149 | by using, e.g., \f(CW\*(C`(?\-i)\*(C'\fR. Modifiers can also be combined into | |
2150 | a single expression, e.g., \f(CW\*(C`(?s\-i)\*(C'\fR turns on single line mode and | |
2151 | turns off case insensitivity. | |
2152 | .Sh "Non-capturing groupings" | |
2153 | .IX Subsection "Non-capturing groupings" | |
2154 | We noted in Part 1 that groupings \f(CW\*(C`()\*(C'\fR had two distinct functions: 1) | |
2155 | group regexp elements together as a single unit, and 2) extract, or | |
2156 | capture, substrings that matched the regexp in the | |
2157 | grouping. Non-capturing groupings, denoted by \f(CW\*(C`(?:regexp)\*(C'\fR, allow the | |
2158 | regexp to be treated as a single unit, but don't extract substrings or | |
2159 | set matching variables \f(CW$1\fR, etc. Both capturing and non-capturing | |
2160 | groupings are allowed to co-exist in the same regexp. Because there is | |
2161 | no extraction, non-capturing groupings are faster than capturing | |
2162 | groupings. Non-capturing groupings are also handy for choosing exactly | |
2163 | which parts of a regexp are to be extracted to matching variables: | |
2164 | .PP | |
2165 | .Vb 2 | |
2166 | \& # match a number, $1-$4 are set, but we only want $1 | |
2167 | \& /([+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?)/; | |
2168 | .Ve | |
2169 | .PP | |
2170 | .Vb 2 | |
2171 | \& # match a number faster, only $1 is set | |
2172 | \& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE][+-]?\ed+)?)/; | |
2173 | .Ve | |
2174 | .PP | |
2175 | .Vb 2 | |
2176 | \& # match a number, get $1 = whole number, $2 = exponent | |
2177 | \& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE]([+-]?\ed+))?)/; | |
2178 | .Ve | |
2179 | .PP | |
2180 | Non-capturing groupings are also useful for removing nuisance | |
2181 | elements gathered from a split operation: | |
2182 | .PP | |
2183 | .Vb 3 | |
2184 | \& $x = '12a34b5'; | |
2185 | \& @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5') | |
2186 | \& @num = split /(?:a|b)/, $x; # @num = ('12','34','5') | |
2187 | .Ve | |
2188 | .PP | |
2189 | Non-capturing groupings may also have embedded modifiers: | |
2190 | \&\f(CW\*(C`(?i\-m:regexp)\*(C'\fR is a non-capturing grouping that matches \f(CW\*(C`regexp\*(C'\fR | |
2191 | case insensitively and turns off multi-line mode. | |
2192 | .Sh "Looking ahead and looking behind" | |
2193 | .IX Subsection "Looking ahead and looking behind" | |
2194 | This section concerns the lookahead and lookbehind assertions. First, | |
2195 | a little background. | |
2196 | .PP | |
2197 | In Perl regular expressions, most regexp elements 'eat up' a certain | |
2198 | amount of string when they match. For instance, the regexp element | |
2199 | \&\f(CW\*(C`[abc]\*(C'\fR eats up one character of the string when it matches, in the | |
2200 | sense that perl moves to the next character position in the string | |
2201 | after the match. There are some elements, however, that don't eat up | |
2202 | characters (advance the character position) if they match. The examples | |
2203 | we have seen so far are the anchors. The anchor \f(CW\*(C`^\*(C'\fR matches the | |
2204 | beginning of the line, but doesn't eat any characters. Similarly, the | |
2205 | word boundary anchor \f(CW\*(C`\eb\*(C'\fR matches, e.g., if the character to the left | |
2206 | is a word character and the character to the right is a non-word | |
2207 | character, but it doesn't eat up any characters itself. Anchors are | |
2208 | examples of 'zero\-width assertions'. Zero\-width, because they consume | |
2209 | no characters, and assertions, because they test some property of the | |
2210 | string. In the context of our walk in the woods analogy to regexp | |
2211 | matching, most regexp elements move us along a trail, but anchors have | |
2212 | us stop a moment and check our surroundings. If the local environment | |
2213 | checks out, we can proceed forward. But if the local environment | |
2214 | doesn't satisfy us, we must backtrack. | |
2215 | .PP | |
2216 | Checking the environment entails either looking ahead on the trail, | |
2217 | looking behind, or both. \f(CW\*(C`^\*(C'\fR looks behind, to see that there are no | |
2218 | characters before. \f(CW\*(C`$\*(C'\fR looks ahead, to see that there are no | |
2219 | characters after. \f(CW\*(C`\eb\*(C'\fR looks both ahead and behind, to see if the | |
2220 | characters on either side differ in their 'word'\-ness. | |
2221 | .PP | |
2222 | The lookahead and lookbehind assertions are generalizations of the | |
2223 | anchor concept. Lookahead and lookbehind are zero-width assertions | |
2224 | that let us specify which characters we want to test for. The | |
2225 | lookahead assertion is denoted by \f(CW\*(C`(?=regexp)\*(C'\fR and the lookbehind | |
2226 | assertion is denoted by \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR. Some examples are | |
2227 | .PP | |
2228 | .Vb 8 | |
2229 | \& $x = "I catch the housecat 'Tom-cat' with catnip"; | |
2230 | \& $x =~ /cat(?=\es+)/; # matches 'cat' in 'housecat' | |
2231 | \& @catwords = ($x =~ /(?<=\es)cat\ew+/g); # matches, | |
2232 | \& # $catwords[0] = 'catch' | |
2233 | \& # $catwords[1] = 'catnip' | |
2234 | \& $x =~ /\ebcat\eb/; # matches 'cat' in 'Tom-cat' | |
2235 | \& $x =~ /(?<=\es)cat(?=\es)/; # doesn't match; no isolated 'cat' in | |
2236 | \& # middle of $x | |
2237 | .Ve | |
2238 | .PP | |
2239 | Note that the parentheses in \f(CW\*(C`(?=regexp)\*(C'\fR and \f(CW\*(C`(?<=regexp)\*(C'\fR are | |
2240 | non\-capturing, since these are zero-width assertions. Thus in the | |
2241 | second regexp, the substrings captured are those of the whole regexp | |
2242 | itself. Lookahead \f(CW\*(C`(?=regexp)\*(C'\fR can match arbitrary regexps, but | |
2243 | lookbehind \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR only works for regexps of fixed | |
2244 | width, i.e., a fixed number of characters long. Thus | |
2245 | \&\f(CW\*(C`(?<=(ab|bc))\*(C'\fR is fine, but \f(CW\*(C`(?<=(ab)*)\*(C'\fR is not. The | |
2246 | negated versions of the lookahead and lookbehind assertions are | |
2247 | denoted by \f(CW\*(C`(?!regexp)\*(C'\fR and \f(CW\*(C`(?<!fixed\-regexp)\*(C'\fR respectively. | |
2248 | They evaluate true if the regexps do \fInot\fR match: | |
2249 | .PP | |
2250 | .Vb 4 | |
2251 | \& $x = "foobar"; | |
2252 | \& $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo' | |
2253 | \& $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo' | |
2254 | \& $x =~ /(?<!\es)foo/; # matches, there is no \es before 'foo' | |
2255 | .Ve | |
2256 | .Sh "Using independent subexpressions to prevent backtracking" | |
2257 | .IX Subsection "Using independent subexpressions to prevent backtracking" | |
2258 | The last few extended patterns in this tutorial are experimental as of | |
2259 | 5.6.0. Play with them, use them in some code, but don't rely on them | |
2260 | just yet for production code. | |
2261 | .PP | |
2262 | \&\fBIndependent\ subexpressions\fR\ are regular expressions, in the | |
2263 | context of a larger regular expression, that function independently of | |
2264 | the larger regular expression. That is, they consume as much or as | |
2265 | little of the string as they wish without regard for the ability of | |
2266 | the larger regexp to match. Independent subexpressions are represented | |
2267 | by \f(CW\*(C`(?>regexp)\*(C'\fR. We can illustrate their behavior by first | |
2268 | considering an ordinary regexp: | |
2269 | .PP | |
2270 | .Vb 2 | |
2271 | \& $x = "ab"; | |
2272 | \& $x =~ /a*ab/; # matches | |
2273 | .Ve | |
2274 | .PP | |
2275 | This obviously matches, but in the process of matching, the | |
2276 | subexpression \f(CW\*(C`a*\*(C'\fR first grabbed the \f(CW\*(C`a\*(C'\fR. Doing so, however, | |
2277 | wouldn't allow the whole regexp to match, so after backtracking, \f(CW\*(C`a*\*(C'\fR | |
2278 | eventually gave back the \f(CW\*(C`a\*(C'\fR and matched the empty string. Here, what | |
2279 | \&\f(CW\*(C`a*\*(C'\fR matched was \fIdependent\fR on what the rest of the regexp matched. | |
2280 | .PP | |
2281 | Contrast that with an independent subexpression: | |
2282 | .PP | |
2283 | .Vb 1 | |
2284 | \& $x =~ /(?>a*)ab/; # doesn't match! | |
2285 | .Ve | |
2286 | .PP | |
2287 | The independent subexpression \f(CW\*(C`(?>a*)\*(C'\fR doesn't care about the rest | |
2288 | of the regexp, so it sees an \f(CW\*(C`a\*(C'\fR and grabs it. Then the rest of the | |
2289 | regexp \f(CW\*(C`ab\*(C'\fR cannot match. Because \f(CW\*(C`(?>a*)\*(C'\fR is independent, there | |
2290 | is no backtracking and the independent subexpression does not give | |
2291 | up its \f(CW\*(C`a\*(C'\fR. Thus the match of the regexp as a whole fails. A similar | |
2292 | behavior occurs with completely independent regexps: | |
2293 | .PP | |
2294 | .Vb 3 | |
2295 | \& $x = "ab"; | |
2296 | \& $x =~ /a*/g; # matches, eats an 'a' | |
2297 | \& $x =~ /\eGab/g; # doesn't match, no 'a' available | |
2298 | .Ve | |
2299 | .PP | |
2300 | Here \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR create a 'tag team' handoff of the string from | |
2301 | one regexp to the other. Regexps with an independent subexpression are | |
2302 | much like this, with a handoff of the string to the independent | |
2303 | subexpression, and a handoff of the string back to the enclosing | |
2304 | regexp. | |
2305 | .PP | |
2306 | The ability of an independent subexpression to prevent backtracking | |
2307 | can be quite useful. Suppose we want to match a non-empty string | |
2308 | enclosed in parentheses up to two levels deep. Then the following | |
2309 | regexp matches: | |
2310 | .PP | |
2311 | .Vb 2 | |
2312 | \& $x = "abc(de(fg)h"; # unbalanced parentheses | |
2313 | \& $x =~ /\e( ( [^()]+ | \e([^()]*\e) )+ \e)/x; | |
2314 | .Ve | |
2315 | .PP | |
2316 | The regexp matches an open parenthesis, one or more copies of an | |
2317 | alternation, and a close parenthesis. The alternation is two\-way, with | |
2318 | the first alternative \f(CW\*(C`[^()]+\*(C'\fR matching a substring with no | |
2319 | parentheses and the second alternative \f(CW\*(C`\e([^()]*\e)\*(C'\fR matching a | |
2320 | substring delimited by parentheses. The problem with this regexp is | |
2321 | that it is pathological: it has nested indeterminate quantifiers | |
2322 | of the form \f(CW\*(C`(a+|b)+\*(C'\fR. We discussed in Part 1 how nested quantifiers | |
2323 | like this could take an exponentially long time to execute if there | |
2324 | was no match possible. To prevent the exponential blowup, we need to | |
2325 | prevent useless backtracking at some point. This can be done by | |
2326 | enclosing the inner quantifier as an independent subexpression: | |
2327 | .PP | |
2328 | .Vb 1 | |
2329 | \& $x =~ /\e( ( (?>[^()]+) | \e([^()]*\e) )+ \e)/x; | |
2330 | .Ve | |
2331 | .PP | |
2332 | Here, \f(CW\*(C`(?>[^()]+)\*(C'\fR breaks the degeneracy of string partitioning | |
2333 | by gobbling up as much of the string as possible and keeping it. Then | |
2334 | match failures fail much more quickly. | |
2335 | .Sh "Conditional expressions" | |
2336 | .IX Subsection "Conditional expressions" | |
2337 | A \fBconditional\ expression\fR\ is a form of if-then-else statement | |
2338 | that allows one to choose which patterns are to be matched, based on | |
2339 | some condition. There are two types of conditional expression: | |
2340 | \&\f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR and | |
2341 | \&\f(CW\*(C`(?(condition)yes\-regexp|no\-regexp)\*(C'\fR. \f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR is | |
2342 | like an \f(CW'if\ ()\ {}'\fR\ statement in Perl. If the \f(CW\*(C`condition\*(C'\fR is true, | |
2343 | the \f(CW\*(C`yes\-regexp\*(C'\fR will be matched. If the \f(CW\*(C`condition\*(C'\fR is false, the | |
2344 | \&\f(CW\*(C`yes\-regexp\*(C'\fR will be skipped and perl will move onto the next regexp | |
2345 | element. The second form is like an \f(CW'if\ ()\ {}\ else\ {}'\fR\ statement | |
2346 | in Perl. If the \f(CW\*(C`condition\*(C'\fR is true, the \f(CW\*(C`yes\-regexp\*(C'\fR will be | |
2347 | matched, otherwise the \f(CW\*(C`no\-regexp\*(C'\fR will be matched. | |
2348 | .PP | |
2349 | The \f(CW\*(C`condition\*(C'\fR can have two forms. The first form is simply an | |
2350 | integer in parentheses \f(CW\*(C`(integer)\*(C'\fR. It is true if the corresponding | |
2351 | backreference \f(CW\*(C`\einteger\*(C'\fR matched earlier in the regexp. The second | |
2352 | form is a bare zero width assertion \f(CW\*(C`(?...)\*(C'\fR, either a | |
2353 | lookahead, a lookbehind, or a code assertion (discussed in the next | |
2354 | section). | |
2355 | .PP | |
2356 | The integer form of the \f(CW\*(C`condition\*(C'\fR allows us to choose, with more | |
2357 | flexibility, what to match based on what matched earlier in the | |
2358 | regexp. This searches for words of the form \f(CW"$x$x"\fR or | |
2359 | \&\f(CW"$x$y$y$x"\fR: | |
2360 | .PP | |
2361 | .Vb 9 | |
2362 | \& % simple_grep '^(\ew+)(\ew+)?(?(2)\e2\e1|\e1)$' /usr/dict/words | |
2363 | \& beriberi | |
2364 | \& coco | |
2365 | \& couscous | |
2366 | \& deed | |
2367 | \& ... | |
2368 | \& toot | |
2369 | \& toto | |
2370 | \& tutu | |
2371 | .Ve | |
2372 | .PP | |
2373 | The lookbehind \f(CW\*(C`condition\*(C'\fR allows, along with backreferences, | |
2374 | an earlier part of the match to influence a later part of the | |
2375 | match. For instance, | |
2376 | .PP | |
2377 | .Vb 1 | |
2378 | \& /[ATGC]+(?(?<=AA)G|C)$/; | |
2379 | .Ve | |
2380 | .PP | |
2381 | matches a \s-1DNA\s0 sequence such that it either ends in \f(CW\*(C`AAG\*(C'\fR, or some | |
2382 | other base pair combination and \f(CW\*(C`C\*(C'\fR. Note that the form is | |
2383 | \&\f(CW\*(C`(?(?<=AA)G|C)\*(C'\fR and not \f(CW\*(C`(?((?<=AA))G|C)\*(C'\fR; for the | |
2384 | lookahead, lookbehind or code assertions, the parentheses around the | |
2385 | conditional are not needed. | |
2386 | .Sh "A bit of magic: executing Perl code in a regular expression" | |
2387 | .IX Subsection "A bit of magic: executing Perl code in a regular expression" | |
2388 | Normally, regexps are a part of Perl expressions. | |
2389 | \&\fBCode\ evaluation\fR\ expressions turn that around by allowing | |
2390 | arbitrary Perl code to be a part of a regexp. A code evaluation | |
2391 | expression is denoted \f(CW\*(C`(?{code})\*(C'\fR, with \f(CW\*(C`code\*(C'\fR a string of Perl | |
2392 | statements. | |
2393 | .PP | |
2394 | Code expressions are zero-width assertions, and the value they return | |
2395 | depends on their environment. There are two possibilities: either the | |
2396 | code expression is used as a conditional in a conditional expression | |
2397 | \&\f(CW\*(C`(?(condition)...)\*(C'\fR, or it is not. If the code expression is a | |
2398 | conditional, the code is evaluated and the result (i.e., the result of | |
2399 | the last statement) is used to determine truth or falsehood. If the | |
2400 | code expression is not used as a conditional, the assertion always | |
2401 | evaluates true and the result is put into the special variable | |
2402 | \&\f(CW$^R\fR. The variable \f(CW$^R\fR can then be used in code expressions later | |
2403 | in the regexp. Here are some silly examples: | |
2404 | .PP | |
2405 | .Vb 5 | |
2406 | \& $x = "abcdef"; | |
2407 | \& $x =~ /abc(?{print "Hi Mom!";})def/; # matches, | |
2408 | \& # prints 'Hi Mom!' | |
2409 | \& $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match, | |
2410 | \& # no 'Hi Mom!' | |
2411 | .Ve | |
2412 | .PP | |
2413 | Pay careful attention to the next example: | |
2414 | .PP | |
2415 | .Vb 3 | |
2416 | \& $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match, | |
2417 | \& # no 'Hi Mom!' | |
2418 | \& # but why not? | |
2419 | .Ve | |
2420 | .PP | |
2421 | At first glance, you'd think that it shouldn't print, because obviously | |
2422 | the \f(CW\*(C`ddd\*(C'\fR isn't going to match the target string. But look at this | |
2423 | example: | |
2424 | .PP | |
2425 | .Vb 2 | |
2426 | \& $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match, | |
2427 | \& # but _does_ print | |
2428 | .Ve | |
2429 | .PP | |
2430 | Hmm. What happened here? If you've been following along, you know that | |
2431 | the above pattern should be effectively the same as the last one \*(-- | |
2432 | enclosing the d in a character class isn't going to change what it | |
2433 | matches. So why does the first not print while the second one does? | |
2434 | .PP | |
2435 | The answer lies in the optimizations the REx engine makes. In the first | |
2436 | case, all the engine sees are plain old characters (aside from the | |
2437 | \&\f(CW\*(C`?{}\*(C'\fR construct). It's smart enough to realize that the string 'ddd' | |
2438 | doesn't occur in our target string before actually running the pattern | |
2439 | through. But in the second case, we've tricked it into thinking that our | |
2440 | pattern is more complicated than it is. It takes a look, sees our | |
2441 | character class, and decides that it will have to actually run the | |
2442 | pattern to determine whether or not it matches, and in the process of | |
2443 | running it hits the print statement before it discovers that we don't | |
2444 | have a match. | |
2445 | .PP | |
2446 | To take a closer look at how the engine does optimizations, see the | |
2447 | section \*(L"Pragmas and debugging\*(R" below. | |
2448 | .PP | |
2449 | More fun with \f(CW\*(C`?{}\*(C'\fR: | |
2450 | .PP | |
2451 | .Vb 6 | |
2452 | \& $x =~ /(?{print "Hi Mom!";})/; # matches, | |
2453 | \& # prints 'Hi Mom!' | |
2454 | \& $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches, | |
2455 | \& # prints '1' | |
2456 | \& $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches, | |
2457 | \& # prints '1' | |
2458 | .Ve | |
2459 | .PP | |
2460 | The bit of magic mentioned in the section title occurs when the regexp | |
2461 | backtracks in the process of searching for a match. If the regexp | |
2462 | backtracks over a code expression and if the variables used within are | |
2463 | localized using \f(CW\*(C`local\*(C'\fR, the changes in the variables produced by the | |
2464 | code expression are undone! Thus, if we wanted to count how many times | |
2465 | a character got matched inside a group, we could use, e.g., | |
2466 | .PP | |
2467 | .Vb 11 | |
2468 | \& $x = "aaaa"; | |
2469 | \& $count = 0; # initialize 'a' count | |
2470 | \& $c = "bob"; # test if $c gets clobbered | |
2471 | \& $x =~ /(?{local $c = 0;}) # initialize count | |
2472 | \& ( a # match 'a' | |
2473 | \& (?{local $c = $c + 1;}) # increment count | |
2474 | \& )* # do this any number of times, | |
2475 | \& aa # but match 'aa' at the end | |
2476 | \& (?{$count = $c;}) # copy local $c var into $count | |
2477 | \& /x; | |
2478 | \& print "'a' count is $count, \e$c variable is '$c'\en"; | |
2479 | .Ve | |
2480 | .PP | |
2481 | This prints | |
2482 | .PP | |
2483 | .Vb 1 | |
2484 | \& 'a' count is 2, $c variable is 'bob' | |
2485 | .Ve | |
2486 | .PP | |
2487 | If we replace the \f(CW\*(C`\ (?{local\ $c\ =\ $c\ +\ 1;})\*(C'\fR\ with | |
2488 | \&\f(CW\*(C`\ (?{$c\ =\ $c\ +\ 1;})\*(C'\fR\ , the variable changes are \fInot\fR undone | |
2489 | during backtracking, and we get | |
2490 | .PP | |
2491 | .Vb 1 | |
2492 | \& 'a' count is 4, $c variable is 'bob' | |
2493 | .Ve | |
2494 | .PP | |
2495 | Note that only localized variable changes are undone. Other side | |
2496 | effects of code expression execution are permanent. Thus | |
2497 | .PP | |
2498 | .Vb 2 | |
2499 | \& $x = "aaaa"; | |
2500 | \& $x =~ /(a(?{print "Yow\en";}))*aa/; | |
2501 | .Ve | |
2502 | .PP | |
2503 | produces | |
2504 | .PP | |
2505 | .Vb 4 | |
2506 | \& Yow | |
2507 | \& Yow | |
2508 | \& Yow | |
2509 | \& Yow | |
2510 | .Ve | |
2511 | .PP | |
2512 | The result \f(CW$^R\fR is automatically localized, so that it will behave | |
2513 | properly in the presence of backtracking. | |
2514 | .PP | |
2515 | This example uses a code expression in a conditional to match the | |
2516 | article 'the' in either English or German: | |
2517 | .PP | |
2518 | .Vb 11 | |
2519 | \& $lang = 'DE'; # use German | |
2520 | \& ... | |
2521 | \& $text = "das"; | |
2522 | \& print "matched\en" | |
2523 | \& if $text =~ /(?(?{ | |
2524 | \& $lang eq 'EN'; # is the language English? | |
2525 | \& }) | |
2526 | \& the | # if so, then match 'the' | |
2527 | \& (die|das|der) # else, match 'die|das|der' | |
2528 | \& ) | |
2529 | \& /xi; | |
2530 | .Ve | |
2531 | .PP | |
2532 | Note that the syntax here is \f(CW\*(C`(?(?{...})yes\-regexp|no\-regexp)\*(C'\fR, not | |
2533 | \&\f(CW\*(C`(?((?{...}))yes\-regexp|no\-regexp)\*(C'\fR. In other words, in the case of a | |
2534 | code expression, we don't need the extra parentheses around the | |
2535 | conditional. | |
2536 | .PP | |
2537 | If you try to use code expressions with interpolating variables, perl | |
2538 | may surprise you: | |
2539 | .PP | |
2540 | .Vb 5 | |
2541 | \& $bar = 5; | |
2542 | \& $pat = '(?{ 1 })'; | |
2543 | \& /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated | |
2544 | \& /foo(?{ 1 })$bar/; # compile error! | |
2545 | \& /foo${pat}bar/; # compile error! | |
2546 | .Ve | |
2547 | .PP | |
2548 | .Vb 2 | |
2549 | \& $pat = qr/(?{ $foo = 1 })/; # precompile code regexp | |
2550 | \& /foo${pat}bar/; # compiles ok | |
2551 | .Ve | |
2552 | .PP | |
2553 | If a regexp has (1) code expressions and interpolating variables, or | |
2554 | (2) a variable that interpolates a code expression, perl treats the | |
2555 | regexp as an error. If the code expression is precompiled into a | |
2556 | variable, however, interpolating is ok. The question is, why is this | |
2557 | an error? | |
2558 | .PP | |
2559 | The reason is that variable interpolation and code expressions | |
2560 | together pose a security risk. The combination is dangerous because | |
2561 | many programmers who write search engines often take user input and | |
2562 | plug it directly into a regexp: | |
2563 | .PP | |
2564 | .Vb 3 | |
2565 | \& $regexp = <>; # read user-supplied regexp | |
2566 | \& chomp $regexp; # get rid of possible newline | |
2567 | \& $text =~ /$regexp/; # search $text for the $regexp | |
2568 | .Ve | |
2569 | .PP | |
2570 | If the \f(CW$regexp\fR variable contains a code expression, the user could | |
2571 | then execute arbitrary Perl code. For instance, some joker could | |
2572 | search for \f(CW\*(C`system('rm\ \-rf\ *');\*(C'\fR\ to erase your files. In this | |
2573 | sense, the combination of interpolation and code expressions \fBtaints\fR | |
2574 | your regexp. So by default, using both interpolation and code | |
2575 | expressions in the same regexp is not allowed. If you're not | |
2576 | concerned about malicious users, it is possible to bypass this | |
2577 | security check by invoking \f(CW\*(C`use\ re\ 'eval'\*(C'\fR\ : | |
2578 | .PP | |
2579 | .Vb 5 | |
2580 | \& use re 'eval'; # throw caution out the door | |
2581 | \& $bar = 5; | |
2582 | \& $pat = '(?{ 1 })'; | |
2583 | \& /foo(?{ 1 })$bar/; # compiles ok | |
2584 | \& /foo${pat}bar/; # compiles ok | |
2585 | .Ve | |
2586 | .PP | |
2587 | Another form of code expression is the \fBpattern\ code\ expression\fR\ . | |
2588 | The pattern code expression is like a regular code expression, except | |
2589 | that the result of the code evaluation is treated as a regular | |
2590 | expression and matched immediately. A simple example is | |
2591 | .PP | |
2592 | .Vb 4 | |
2593 | \& $length = 5; | |
2594 | \& $char = 'a'; | |
2595 | \& $x = 'aaaaabb'; | |
2596 | \& $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a' | |
2597 | .Ve | |
2598 | .PP | |
2599 | This final example contains both ordinary and pattern code | |
2600 | expressions. It detects if a binary string \f(CW1101010010001...\fR has a | |
2601 | Fibonacci spacing 0,1,1,2,3,5,... of the \f(CW1\fR's: | |
2602 | .PP | |
2603 | .Vb 17 | |
2604 | \& $s0 = 0; $s1 = 1; # initial conditions | |
2605 | \& $x = "1101010010001000001"; | |
2606 | \& print "It is a Fibonacci sequence\en" | |
2607 | \& if $x =~ /^1 # match an initial '1' | |
2608 | \& ( | |
2609 | \& (??{'0' x $s0}) # match $s0 of '0' | |
2610 | \& 1 # and then a '1' | |
2611 | \& (?{ | |
2612 | \& $largest = $s0; # largest seq so far | |
2613 | \& $s2 = $s1 + $s0; # compute next term | |
2614 | \& $s0 = $s1; # in Fibonacci sequence | |
2615 | \& $s1 = $s2; | |
2616 | \& }) | |
2617 | \& )+ # repeat as needed | |
2618 | \& $ # that is all there is | |
2619 | \& /x; | |
2620 | \& print "Largest sequence matched was $largest\en"; | |
2621 | .Ve | |
2622 | .PP | |
2623 | This prints | |
2624 | .PP | |
2625 | .Vb 2 | |
2626 | \& It is a Fibonacci sequence | |
2627 | \& Largest sequence matched was 5 | |
2628 | .Ve | |
2629 | .PP | |
2630 | Ha! Try that with your garden variety regexp package... | |
2631 | .PP | |
2632 | Note that the variables \f(CW$s0\fR and \f(CW$s1\fR are not substituted when the | |
2633 | regexp is compiled, as happens for ordinary variables outside a code | |
2634 | expression. Rather, the code expressions are evaluated when perl | |
2635 | encounters them during the search for a match. | |
2636 | .PP | |
2637 | The regexp without the \f(CW\*(C`//x\*(C'\fR modifier is | |
2638 | .PP | |
2639 | .Vb 1 | |
2640 | \& /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/; | |
2641 | .Ve | |
2642 | .PP | |
2643 | and is a great start on an Obfuscated Perl entry :\-) When working with | |
2644 | code and conditional expressions, the extended form of regexps is | |
2645 | almost necessary in creating and debugging regexps. | |
2646 | .Sh "Pragmas and debugging" | |
2647 | .IX Subsection "Pragmas and debugging" | |
2648 | Speaking of debugging, there are several pragmas available to control | |
2649 | and debug regexps in Perl. We have already encountered one pragma in | |
2650 | the previous section, \f(CW\*(C`use\ re\ 'eval';\*(C'\fR\ , that allows variable | |
2651 | interpolation and code expressions to coexist in a regexp. The other | |
2652 | pragmas are | |
2653 | .PP | |
2654 | .Vb 3 | |
2655 | \& use re 'taint'; | |
2656 | \& $tainted = <>; | |
2657 | \& @parts = ($tainted =~ /(\ew+)\es+(\ew+)/); # @parts is now tainted | |
2658 | .Ve | |
2659 | .PP | |
2660 | The \f(CW\*(C`taint\*(C'\fR pragma causes any substrings from a match with a tainted | |
2661 | variable to be tainted as well. This is not normally the case, as | |
2662 | regexps are often used to extract the safe bits from a tainted | |
2663 | variable. Use \f(CW\*(C`taint\*(C'\fR when you are not extracting safe bits, but are | |
2664 | performing some other processing. Both \f(CW\*(C`taint\*(C'\fR and \f(CW\*(C`eval\*(C'\fR pragmas | |
2665 | are lexically scoped, which means they are in effect only until | |
2666 | the end of the block enclosing the pragmas. | |
2667 | .PP | |
2668 | .Vb 2 | |
2669 | \& use re 'debug'; | |
2670 | \& /^(.*)$/s; # output debugging info | |
2671 | .Ve | |
2672 | .PP | |
2673 | .Vb 2 | |
2674 | \& use re 'debugcolor'; | |
2675 | \& /^(.*)$/s; # output debugging info in living color | |
2676 | .Ve | |
2677 | .PP | |
2678 | The global \f(CW\*(C`debug\*(C'\fR and \f(CW\*(C`debugcolor\*(C'\fR pragmas allow one to get | |
2679 | detailed debugging info about regexp compilation and | |
2680 | execution. \f(CW\*(C`debugcolor\*(C'\fR is the same as debug, except the debugging | |
2681 | information is displayed in color on terminals that can display | |
2682 | termcap color sequences. Here is example output: | |
2683 | .PP | |
2684 | .Vb 25 | |
2685 | \& % perl -e 'use re "debug"; "abc" =~ /a*b+c/;' | |
2686 | \& Compiling REx `a*b+c' | |
2687 | \& size 9 first at 1 | |
2688 | \& 1: STAR(4) | |
2689 | \& 2: EXACT <a>(0) | |
2690 | \& 4: PLUS(7) | |
2691 | \& 5: EXACT <b>(0) | |
2692 | \& 7: EXACT <c>(9) | |
2693 | \& 9: END(0) | |
2694 | \& floating `bc' at 0..2147483647 (checking floating) minlen 2 | |
2695 | \& Guessing start of match, REx `a*b+c' against `abc'... | |
2696 | \& Found floating substr `bc' at offset 1... | |
2697 | \& Guessed: match at offset 0 | |
2698 | \& Matching REx `a*b+c' against `abc' | |
2699 | \& Setting an EVAL scope, savestack=3 | |
2700 | \& 0 <> <abc> | 1: STAR | |
2701 | \& EXACT <a> can match 1 times out of 32767... | |
2702 | \& Setting an EVAL scope, savestack=3 | |
2703 | \& 1 <a> <bc> | 4: PLUS | |
2704 | \& EXACT <b> can match 1 times out of 32767... | |
2705 | \& Setting an EVAL scope, savestack=3 | |
2706 | \& 2 <ab> <c> | 7: EXACT <c> | |
2707 | \& 3 <abc> <> | 9: END | |
2708 | \& Match successful! | |
2709 | \& Freeing REx: `a*b+c' | |
2710 | .Ve | |
2711 | .PP | |
2712 | If you have gotten this far into the tutorial, you can probably guess | |
2713 | what the different parts of the debugging output tell you. The first | |
2714 | part | |
2715 | .PP | |
2716 | .Vb 8 | |
2717 | \& Compiling REx `a*b+c' | |
2718 | \& size 9 first at 1 | |
2719 | \& 1: STAR(4) | |
2720 | \& 2: EXACT <a>(0) | |
2721 | \& 4: PLUS(7) | |
2722 | \& 5: EXACT <b>(0) | |
2723 | \& 7: EXACT <c>(9) | |
2724 | \& 9: END(0) | |
2725 | .Ve | |
2726 | .PP | |
2727 | describes the compilation stage. \f(CWSTAR(4)\fR means that there is a | |
2728 | starred object, in this case \f(CW'a'\fR, and if it matches, goto line 4, | |
2729 | i.e., \f(CWPLUS(7)\fR. The middle lines describe some heuristics and | |
2730 | optimizations performed before a match: | |
2731 | .PP | |
2732 | .Vb 4 | |
2733 | \& floating `bc' at 0..2147483647 (checking floating) minlen 2 | |
2734 | \& Guessing start of match, REx `a*b+c' against `abc'... | |
2735 | \& Found floating substr `bc' at offset 1... | |
2736 | \& Guessed: match at offset 0 | |
2737 | .Ve | |
2738 | .PP | |
2739 | Then the match is executed and the remaining lines describe the | |
2740 | process: | |
2741 | .PP | |
2742 | .Vb 12 | |
2743 | \& Matching REx `a*b+c' against `abc' | |
2744 | \& Setting an EVAL scope, savestack=3 | |
2745 | \& 0 <> <abc> | 1: STAR | |
2746 | \& EXACT <a> can match 1 times out of 32767... | |
2747 | \& Setting an EVAL scope, savestack=3 | |
2748 | \& 1 <a> <bc> | 4: PLUS | |
2749 | \& EXACT <b> can match 1 times out of 32767... | |
2750 | \& Setting an EVAL scope, savestack=3 | |
2751 | \& 2 <ab> <c> | 7: EXACT <c> | |
2752 | \& 3 <abc> <> | 9: END | |
2753 | \& Match successful! | |
2754 | \& Freeing REx: `a*b+c' | |
2755 | .Ve | |
2756 | .PP | |
2757 | Each step is of the form \f(CW\*(C`n\ <x>\ <y>\*(C'\fR\ , with \f(CW\*(C`<x>\*(C'\fR the | |
2758 | part of the string matched and \f(CW\*(C`<y>\*(C'\fR the part not yet | |
2759 | matched. The \f(CW\*(C`|\ 1:\ STAR\*(C'\fR\ says that perl is at line number 1 | |
2760 | in the compilation list above. See | |
2761 | \&\*(L"Debugging regular expressions\*(R" in perldebguts for much more detail. | |
2762 | .PP | |
2763 | An alternative method of debugging regexps is to embed \f(CW\*(C`print\*(C'\fR | |
2764 | statements within the regexp. This provides a blow-by-blow account of | |
2765 | the backtracking in an alternation: | |
2766 | .PP | |
2767 | .Vb 12 | |
2768 | \& "that this" =~ m@(?{print "Start at position ", pos, "\en";}) | |
2769 | \& t(?{print "t1\en";}) | |
2770 | \& h(?{print "h1\en";}) | |
2771 | \& i(?{print "i1\en";}) | |
2772 | \& s(?{print "s1\en";}) | |
2773 | \& | | |
2774 | \& t(?{print "t2\en";}) | |
2775 | \& h(?{print "h2\en";}) | |
2776 | \& a(?{print "a2\en";}) | |
2777 | \& t(?{print "t2\en";}) | |
2778 | \& (?{print "Done at position ", pos, "\en";}) | |
2779 | \& @x; | |
2780 | .Ve | |
2781 | .PP | |
2782 | prints | |
2783 | .PP | |
2784 | .Vb 8 | |
2785 | \& Start at position 0 | |
2786 | \& t1 | |
2787 | \& h1 | |
2788 | \& t2 | |
2789 | \& h2 | |
2790 | \& a2 | |
2791 | \& t2 | |
2792 | \& Done at position 4 | |
2793 | .Ve | |
2794 | .SH "BUGS" | |
2795 | .IX Header "BUGS" | |
2796 | Code expressions, conditional expressions, and independent expressions | |
2797 | are \fBexperimental\fR. Don't use them in production code. Yet. | |
2798 | .SH "SEE ALSO" | |
2799 | .IX Header "SEE ALSO" | |
2800 | This is just a tutorial. For the full story on perl regular | |
2801 | expressions, see the perlre regular expressions reference page. | |
2802 | .PP | |
2803 | For more information on the matching \f(CW\*(C`m//\*(C'\fR and substitution \f(CW\*(C`s///\*(C'\fR | |
2804 | operators, see \*(L"Regexp Quote-Like Operators\*(R" in perlop. For | |
2805 | information on the \f(CW\*(C`split\*(C'\fR operation, see \*(L"split\*(R" in perlfunc. | |
2806 | .PP | |
2807 | For an excellent all-around resource on the care and feeding of | |
2808 | regular expressions, see the book \fIMastering Regular Expressions\fR by | |
2809 | Jeffrey Friedl (published by O'Reilly, \s-1ISBN\s0 1\-56592\-257\-3). | |
2810 | .SH "AUTHOR AND COPYRIGHT" | |
2811 | .IX Header "AUTHOR AND COPYRIGHT" | |
2812 | Copyright (c) 2000 Mark Kvale | |
2813 | All rights reserved. | |
2814 | .PP | |
2815 | This document may be distributed under the same terms as Perl itself. | |
2816 | .Sh "Acknowledgments" | |
2817 | .IX Subsection "Acknowledgments" | |
2818 | The inspiration for the stop codon \s-1DNA\s0 example came from the \s-1ZIP\s0 | |
2819 | code example in chapter 7 of \fIMastering Regular Expressions\fR. | |
2820 | .PP | |
2821 | The author would like to thank Jeff Pinyan, Andrew Johnson, Peter | |
2822 | Haworth, Ronald J Kimball, and Joe Smith for all their helpful | |
2823 | comments. |