| 1 | .\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32 |
| 2 | .\" |
| 3 | .\" Standard preamble: |
| 4 | .\" ======================================================================== |
| 5 | .de Sh \" Subsection heading |
| 6 | .br |
| 7 | .if t .Sp |
| 8 | .ne 5 |
| 9 | .PP |
| 10 | \fB\\$1\fR |
| 11 | .PP |
| 12 | .. |
| 13 | .de Sp \" Vertical space (when we can't use .PP) |
| 14 | .if t .sp .5v |
| 15 | .if n .sp |
| 16 | .. |
| 17 | .de Vb \" Begin verbatim text |
| 18 | .ft CW |
| 19 | .nf |
| 20 | .ne \\$1 |
| 21 | .. |
| 22 | .de Ve \" End verbatim text |
| 23 | .ft R |
| 24 | .fi |
| 25 | .. |
| 26 | .\" Set up some character translations and predefined strings. \*(-- will |
| 27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
| 28 | .\" double quote, and \*(R" will give a right double quote. | will give a |
| 29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to |
| 30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' |
| 31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. |
| 32 | .tr \(*W-|\(bv\*(Tr |
| 33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
| 34 | .ie n \{\ |
| 35 | . ds -- \(*W- |
| 36 | . ds PI pi |
| 37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
| 38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
| 39 | . ds L" "" |
| 40 | . ds R" "" |
| 41 | . ds C` "" |
| 42 | . ds C' "" |
| 43 | 'br\} |
| 44 | .el\{\ |
| 45 | . ds -- \|\(em\| |
| 46 | . ds PI \(*p |
| 47 | . ds L" `` |
| 48 | . ds R" '' |
| 49 | 'br\} |
| 50 | .\" |
| 51 | .\" If the F register is turned on, we'll generate index entries on stderr for |
| 52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index |
| 53 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
| 54 | .\" output yourself in some meaningful fashion. |
| 55 | .if \nF \{\ |
| 56 | . de IX |
| 57 | . tm Index:\\$1\t\\n%\t"\\$2" |
| 58 | .. |
| 59 | . nr % 0 |
| 60 | . rr F |
| 61 | .\} |
| 62 | .\" |
| 63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
| 64 | .\" way too many mistakes in technical documents. |
| 65 | .hy 0 |
| 66 | .if n .na |
| 67 | .\" |
| 68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
| 69 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
| 70 | . \" fudge factors for nroff and troff |
| 71 | .if n \{\ |
| 72 | . ds #H 0 |
| 73 | . ds #V .8m |
| 74 | . ds #F .3m |
| 75 | . ds #[ \f1 |
| 76 | . ds #] \fP |
| 77 | .\} |
| 78 | .if t \{\ |
| 79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
| 80 | . ds #V .6m |
| 81 | . ds #F 0 |
| 82 | . ds #[ \& |
| 83 | . ds #] \& |
| 84 | .\} |
| 85 | . \" simple accents for nroff and troff |
| 86 | .if n \{\ |
| 87 | . ds ' \& |
| 88 | . ds ` \& |
| 89 | . ds ^ \& |
| 90 | . ds , \& |
| 91 | . ds ~ ~ |
| 92 | . ds / |
| 93 | .\} |
| 94 | .if t \{\ |
| 95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
| 96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
| 97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
| 98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
| 99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
| 100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
| 101 | .\} |
| 102 | . \" troff and (daisy-wheel) nroff accents |
| 103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
| 104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
| 105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
| 106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
| 107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
| 108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
| 109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
| 110 | .ds ae a\h'-(\w'a'u*4/10)'e |
| 111 | .ds Ae A\h'-(\w'A'u*4/10)'E |
| 112 | . \" corrections for vroff |
| 113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
| 114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
| 115 | . \" for low resolution devices (crt and lpr) |
| 116 | .if \n(.H>23 .if \n(.V>19 \ |
| 117 | \{\ |
| 118 | . ds : e |
| 119 | . ds 8 ss |
| 120 | . ds o a |
| 121 | . ds d- d\h'-1'\(ga |
| 122 | . ds D- D\h'-1'\(hy |
| 123 | . ds th \o'bp' |
| 124 | . ds Th \o'LP' |
| 125 | . ds ae ae |
| 126 | . ds Ae AE |
| 127 | .\} |
| 128 | .rm #[ #] #H #V #F C |
| 129 | .\" ======================================================================== |
| 130 | .\" |
| 131 | .IX Title "PERLRETUT 1" |
| 132 | .TH PERLRETUT 1 "2006-01-07" "perl v5.8.8" "Perl Programmers Reference Guide" |
| 133 | .SH "NAME" |
| 134 | perlretut \- Perl regular expressions tutorial |
| 135 | .SH "DESCRIPTION" |
| 136 | .IX Header "DESCRIPTION" |
| 137 | This page provides a basic tutorial on understanding, creating and |
| 138 | using regular expressions in Perl. It serves as a complement to the |
| 139 | reference page on regular expressions perlre. Regular expressions |
| 140 | are an integral part of the \f(CW\*(C`m//\*(C'\fR, \f(CW\*(C`s///\*(C'\fR, \f(CW\*(C`qr//\*(C'\fR and \f(CW\*(C`split\*(C'\fR |
| 141 | operators and so this tutorial also overlaps with |
| 142 | \&\*(L"Regexp Quote-Like Operators\*(R" in perlop and \*(L"split\*(R" in perlfunc. |
| 143 | .PP |
| 144 | Perl is widely renowned for excellence in text processing, and regular |
| 145 | expressions are one of the big factors behind this fame. Perl regular |
| 146 | expressions display an efficiency and flexibility unknown in most |
| 147 | other computer languages. Mastering even the basics of regular |
| 148 | expressions will allow you to manipulate text with surprising ease. |
| 149 | .PP |
| 150 | What is a regular expression? A regular expression is simply a string |
| 151 | that describes a pattern. Patterns are in common use these days; |
| 152 | examples are the patterns typed into a search engine to find web pages |
| 153 | and the patterns used to list files in a directory, e.g., \f(CW\*(C`ls *.txt\*(C'\fR |
| 154 | or \f(CW\*(C`dir *.*\*(C'\fR. In Perl, the patterns described by regular expressions |
| 155 | are used to search strings, extract desired parts of strings, and to |
| 156 | do search and replace operations. |
| 157 | .PP |
| 158 | Regular expressions have the undeserved reputation of being abstract |
| 159 | and difficult to understand. Regular expressions are constructed using |
| 160 | simple concepts like conditionals and loops and are no more difficult |
| 161 | to understand than the corresponding \f(CW\*(C`if\*(C'\fR conditionals and \f(CW\*(C`while\*(C'\fR |
| 162 | loops in the Perl language itself. In fact, the main challenge in |
| 163 | learning regular expressions is just getting used to the terse |
| 164 | notation used to express these concepts. |
| 165 | .PP |
| 166 | This tutorial flattens the learning curve by discussing regular |
| 167 | expression concepts, along with their notation, one at a time and with |
| 168 | many examples. The first part of the tutorial will progress from the |
| 169 | simplest word searches to the basic regular expression concepts. If |
| 170 | you master the first part, you will have all the tools needed to solve |
| 171 | about 98% of your needs. The second part of the tutorial is for those |
| 172 | comfortable with the basics and hungry for more power tools. It |
| 173 | discusses the more advanced regular expression operators and |
| 174 | introduces the latest cutting edge innovations in 5.6.0. |
| 175 | .PP |
| 176 | A note: to save time, 'regular expression' is often abbreviated as |
| 177 | regexp or regex. Regexp is a more natural abbreviation than regex, but |
| 178 | is harder to pronounce. The Perl pod documentation is evenly split on |
| 179 | regexp vs regex; in Perl, there is more than one way to abbreviate it. |
| 180 | We'll use regexp in this tutorial. |
| 181 | .SH "Part 1: The basics" |
| 182 | .IX Header "Part 1: The basics" |
| 183 | .Sh "Simple word matching" |
| 184 | .IX Subsection "Simple word matching" |
| 185 | The simplest regexp is simply a word, or more generally, a string of |
| 186 | characters. A regexp consisting of a word matches any string that |
| 187 | contains that word: |
| 188 | .PP |
| 189 | .Vb 1 |
| 190 | \& "Hello World" =~ /World/; # matches |
| 191 | .Ve |
| 192 | .PP |
| 193 | What is this perl statement all about? \f(CW"Hello World"\fR is a simple |
| 194 | double quoted string. \f(CW\*(C`World\*(C'\fR is the regular expression and the |
| 195 | \&\f(CW\*(C`//\*(C'\fR enclosing \f(CW\*(C`/World/\*(C'\fR tells perl to search a string for a match. |
| 196 | The operator \f(CW\*(C`=~\*(C'\fR associates the string with the regexp match and |
| 197 | produces a true value if the regexp matched, or false if the regexp |
| 198 | did not match. In our case, \f(CW\*(C`World\*(C'\fR matches the second word in |
| 199 | \&\f(CW"Hello World"\fR, so the expression is true. Expressions like this |
| 200 | are useful in conditionals: |
| 201 | .PP |
| 202 | .Vb 6 |
| 203 | \& if ("Hello World" =~ /World/) { |
| 204 | \& print "It matches\en"; |
| 205 | \& } |
| 206 | \& else { |
| 207 | \& print "It doesn't match\en"; |
| 208 | \& } |
| 209 | .Ve |
| 210 | .PP |
| 211 | There are useful variations on this theme. The sense of the match can |
| 212 | be reversed by using the \f(CW\*(C`!~\*(C'\fR operator: |
| 213 | .PP |
| 214 | .Vb 6 |
| 215 | \& if ("Hello World" !~ /World/) { |
| 216 | \& print "It doesn't match\en"; |
| 217 | \& } |
| 218 | \& else { |
| 219 | \& print "It matches\en"; |
| 220 | \& } |
| 221 | .Ve |
| 222 | .PP |
| 223 | The literal string in the regexp can be replaced by a variable: |
| 224 | .PP |
| 225 | .Vb 7 |
| 226 | \& $greeting = "World"; |
| 227 | \& if ("Hello World" =~ /$greeting/) { |
| 228 | \& print "It matches\en"; |
| 229 | \& } |
| 230 | \& else { |
| 231 | \& print "It doesn't match\en"; |
| 232 | \& } |
| 233 | .Ve |
| 234 | .PP |
| 235 | If you're matching against the special default variable \f(CW$_\fR, the |
| 236 | \&\f(CW\*(C`$_ =~\*(C'\fR part can be omitted: |
| 237 | .PP |
| 238 | .Vb 7 |
| 239 | \& $_ = "Hello World"; |
| 240 | \& if (/World/) { |
| 241 | \& print "It matches\en"; |
| 242 | \& } |
| 243 | \& else { |
| 244 | \& print "It doesn't match\en"; |
| 245 | \& } |
| 246 | .Ve |
| 247 | .PP |
| 248 | And finally, the \f(CW\*(C`//\*(C'\fR default delimiters for a match can be changed |
| 249 | to arbitrary delimiters by putting an \f(CW'm'\fR out front: |
| 250 | .PP |
| 251 | .Vb 4 |
| 252 | \& "Hello World" =~ m!World!; # matches, delimited by '!' |
| 253 | \& "Hello World" =~ m{World}; # matches, note the matching '{}' |
| 254 | \& "/usr/bin/perl" =~ m"/perl"; # matches after '/usr/bin', |
| 255 | \& # '/' becomes an ordinary char |
| 256 | .Ve |
| 257 | .PP |
| 258 | \&\f(CW\*(C`/World/\*(C'\fR, \f(CW\*(C`m!World!\*(C'\fR, and \f(CW\*(C`m{World}\*(C'\fR all represent the |
| 259 | same thing. When, e.g., \f(CW""\fR is used as a delimiter, the forward |
| 260 | slash \f(CW'/'\fR becomes an ordinary character and can be used in a regexp |
| 261 | without trouble. |
| 262 | .PP |
| 263 | Let's consider how different regexps would match \f(CW"Hello World"\fR: |
| 264 | .PP |
| 265 | .Vb 4 |
| 266 | \& "Hello World" =~ /world/; # doesn't match |
| 267 | \& "Hello World" =~ /o W/; # matches |
| 268 | \& "Hello World" =~ /oW/; # doesn't match |
| 269 | \& "Hello World" =~ /World /; # doesn't match |
| 270 | .Ve |
| 271 | .PP |
| 272 | The first regexp \f(CW\*(C`world\*(C'\fR doesn't match because regexps are |
| 273 | case\-sensitive. The second regexp matches because the substring |
| 274 | \&\f(CW'o\ W'\fR\ occurs in the string \f(CW"Hello\ World"\fR\ . The space |
| 275 | character ' ' is treated like any other character in a regexp and is |
| 276 | needed to match in this case. The lack of a space character is the |
| 277 | reason the third regexp \f(CW'oW'\fR doesn't match. The fourth regexp |
| 278 | \&\f(CW'World '\fR doesn't match because there is a space at the end of the |
| 279 | regexp, but not at the end of the string. The lesson here is that |
| 280 | regexps must match a part of the string \fIexactly\fR in order for the |
| 281 | statement to be true. |
| 282 | .PP |
| 283 | If a regexp matches in more than one place in the string, perl will |
| 284 | always match at the earliest possible point in the string: |
| 285 | .PP |
| 286 | .Vb 2 |
| 287 | \& "Hello World" =~ /o/; # matches 'o' in 'Hello' |
| 288 | \& "That hat is red" =~ /hat/; # matches 'hat' in 'That' |
| 289 | .Ve |
| 290 | .PP |
| 291 | With respect to character matching, there are a few more points you |
| 292 | need to know about. First of all, not all characters can be used 'as |
| 293 | is' in a match. Some characters, called \fBmetacharacters\fR, are reserved |
| 294 | for use in regexp notation. The metacharacters are |
| 295 | .PP |
| 296 | .Vb 1 |
| 297 | \& {}[]()^$.|*+?\e |
| 298 | .Ve |
| 299 | .PP |
| 300 | The significance of each of these will be explained |
| 301 | in the rest of the tutorial, but for now, it is important only to know |
| 302 | that a metacharacter can be matched by putting a backslash before it: |
| 303 | .PP |
| 304 | .Vb 5 |
| 305 | \& "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter |
| 306 | \& "2+2=4" =~ /2\e+2/; # matches, \e+ is treated like an ordinary + |
| 307 | \& "The interval is [0,1)." =~ /[0,1)./ # is a syntax error! |
| 308 | \& "The interval is [0,1)." =~ /\e[0,1\e)\e./ # matches |
| 309 | \& "/usr/bin/perl" =~ /\e/usr\e/bin\e/perl/; # matches |
| 310 | .Ve |
| 311 | .PP |
| 312 | In the last regexp, the forward slash \f(CW'/'\fR is also backslashed, |
| 313 | because it is used to delimit the regexp. This can lead to \s-1LTS\s0 |
| 314 | (leaning toothpick syndrome), however, and it is often more readable |
| 315 | to change delimiters. |
| 316 | .PP |
| 317 | .Vb 1 |
| 318 | \& "/usr/bin/perl" =~ m!/usr/bin/perl!; # easier to read |
| 319 | .Ve |
| 320 | .PP |
| 321 | The backslash character \f(CW'\e'\fR is a metacharacter itself and needs to |
| 322 | be backslashed: |
| 323 | .PP |
| 324 | .Vb 1 |
| 325 | \& 'C:\eWIN32' =~ /C:\e\eWIN/; # matches |
| 326 | .Ve |
| 327 | .PP |
| 328 | In addition to the metacharacters, there are some \s-1ASCII\s0 characters |
| 329 | which don't have printable character equivalents and are instead |
| 330 | represented by \fBescape sequences\fR. Common examples are \f(CW\*(C`\et\*(C'\fR for a |
| 331 | tab, \f(CW\*(C`\en\*(C'\fR for a newline, \f(CW\*(C`\er\*(C'\fR for a carriage return and \f(CW\*(C`\ea\*(C'\fR for a |
| 332 | bell. If your string is better thought of as a sequence of arbitrary |
| 333 | bytes, the octal escape sequence, e.g., \f(CW\*(C`\e033\*(C'\fR, or hexadecimal escape |
| 334 | sequence, e.g., \f(CW\*(C`\ex1B\*(C'\fR may be a more natural representation for your |
| 335 | bytes. Here are some examples of escapes: |
| 336 | .PP |
| 337 | .Vb 4 |
| 338 | \& "1000\et2000" =~ m(0\et2) # matches |
| 339 | \& "1000\en2000" =~ /0\en20/ # matches |
| 340 | \& "1000\et2000" =~ /\e000\et2/ # doesn't match, "0" ne "\e000" |
| 341 | \& "cat" =~ /\e143\ex61\ex74/ # matches, but a weird way to spell cat |
| 342 | .Ve |
| 343 | .PP |
| 344 | If you've been around Perl a while, all this talk of escape sequences |
| 345 | may seem familiar. Similar escape sequences are used in double-quoted |
| 346 | strings and in fact the regexps in Perl are mostly treated as |
| 347 | double-quoted strings. This means that variables can be used in |
| 348 | regexps as well. Just like double-quoted strings, the values of the |
| 349 | variables in the regexp will be substituted in before the regexp is |
| 350 | evaluated for matching purposes. So we have: |
| 351 | .PP |
| 352 | .Vb 4 |
| 353 | \& $foo = 'house'; |
| 354 | \& 'housecat' =~ /$foo/; # matches |
| 355 | \& 'cathouse' =~ /cat$foo/; # matches |
| 356 | \& 'housecat' =~ /${foo}cat/; # matches |
| 357 | .Ve |
| 358 | .PP |
| 359 | So far, so good. With the knowledge above you can already perform |
| 360 | searches with just about any literal string regexp you can dream up. |
| 361 | Here is a \fIvery simple\fR emulation of the Unix grep program: |
| 362 | .PP |
| 363 | .Vb 7 |
| 364 | \& % cat > simple_grep |
| 365 | \& #!/usr/bin/perl |
| 366 | \& $regexp = shift; |
| 367 | \& while (<>) { |
| 368 | \& print if /$regexp/; |
| 369 | \& } |
| 370 | \& ^D |
| 371 | .Ve |
| 372 | .PP |
| 373 | .Vb 1 |
| 374 | \& % chmod +x simple_grep |
| 375 | .Ve |
| 376 | .PP |
| 377 | .Vb 10 |
| 378 | \& % simple_grep abba /usr/dict/words |
| 379 | \& Babbage |
| 380 | \& cabbage |
| 381 | \& cabbages |
| 382 | \& sabbath |
| 383 | \& Sabbathize |
| 384 | \& Sabbathizes |
| 385 | \& sabbatical |
| 386 | \& scabbard |
| 387 | \& scabbards |
| 388 | .Ve |
| 389 | .PP |
| 390 | This program is easy to understand. \f(CW\*(C`#!/usr/bin/perl\*(C'\fR is the standard |
| 391 | way to invoke a perl program from the shell. |
| 392 | \&\f(CW\*(C`$regexp\ =\ shift;\*(C'\fR\ saves the first command line argument as the |
| 393 | regexp to be used, leaving the rest of the command line arguments to |
| 394 | be treated as files. \f(CW\*(C`while\ (<>)\*(C'\fR\ loops over all the lines in |
| 395 | all the files. For each line, \f(CW\*(C`print\ if\ /$regexp/;\*(C'\fR\ prints the |
| 396 | line if the regexp matches the line. In this line, both \f(CW\*(C`print\*(C'\fR and |
| 397 | \&\f(CW\*(C`/$regexp/\*(C'\fR use the default variable \f(CW$_\fR implicitly. |
| 398 | .PP |
| 399 | With all of the regexps above, if the regexp matched anywhere in the |
| 400 | string, it was considered a match. Sometimes, however, we'd like to |
| 401 | specify \fIwhere\fR in the string the regexp should try to match. To do |
| 402 | this, we would use the \fBanchor\fR metacharacters \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR. The |
| 403 | anchor \f(CW\*(C`^\*(C'\fR means match at the beginning of the string and the anchor |
| 404 | \&\f(CW\*(C`$\*(C'\fR means match at the end of the string, or before a newline at the |
| 405 | end of the string. Here is how they are used: |
| 406 | .PP |
| 407 | .Vb 4 |
| 408 | \& "housekeeper" =~ /keeper/; # matches |
| 409 | \& "housekeeper" =~ /^keeper/; # doesn't match |
| 410 | \& "housekeeper" =~ /keeper$/; # matches |
| 411 | \& "housekeeper\en" =~ /keeper$/; # matches |
| 412 | .Ve |
| 413 | .PP |
| 414 | The second regexp doesn't match because \f(CW\*(C`^\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to |
| 415 | match only at the beginning of the string, but \f(CW"housekeeper"\fR has |
| 416 | keeper starting in the middle. The third regexp does match, since the |
| 417 | \&\f(CW\*(C`$\*(C'\fR constrains \f(CW\*(C`keeper\*(C'\fR to match only at the end of the string. |
| 418 | .PP |
| 419 | When both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are used at the same time, the regexp has to |
| 420 | match both the beginning and the end of the string, i.e., the regexp |
| 421 | matches the whole string. Consider |
| 422 | .PP |
| 423 | .Vb 3 |
| 424 | \& "keeper" =~ /^keep$/; # doesn't match |
| 425 | \& "keeper" =~ /^keeper$/; # matches |
| 426 | \& "" =~ /^$/; # ^$ matches an empty string |
| 427 | .Ve |
| 428 | .PP |
| 429 | The first regexp doesn't match because the string has more to it than |
| 430 | \&\f(CW\*(C`keep\*(C'\fR. Since the second regexp is exactly the string, it |
| 431 | matches. Using both \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR in a regexp forces the complete |
| 432 | string to match, so it gives you complete control over which strings |
| 433 | match and which don't. Suppose you are looking for a fellow named |
| 434 | bert, off in a string by himself: |
| 435 | .PP |
| 436 | .Vb 1 |
| 437 | \& "dogbert" =~ /bert/; # matches, but not what you want |
| 438 | .Ve |
| 439 | .PP |
| 440 | .Vb 2 |
| 441 | \& "dilbert" =~ /^bert/; # doesn't match, but .. |
| 442 | \& "bertram" =~ /^bert/; # matches, so still not good enough |
| 443 | .Ve |
| 444 | .PP |
| 445 | .Vb 3 |
| 446 | \& "bertram" =~ /^bert$/; # doesn't match, good |
| 447 | \& "dilbert" =~ /^bert$/; # doesn't match, good |
| 448 | \& "bert" =~ /^bert$/; # matches, perfect |
| 449 | .Ve |
| 450 | .PP |
| 451 | Of course, in the case of a literal string, one could just as easily |
| 452 | use the string equivalence \f(CW\*(C`$string\ eq\ 'bert'\*(C'\fR\ and it would be |
| 453 | more efficient. The \f(CW\*(C`^...$\*(C'\fR regexp really becomes useful when we |
| 454 | add in the more powerful regexp tools below. |
| 455 | .Sh "Using character classes" |
| 456 | .IX Subsection "Using character classes" |
| 457 | Although one can already do quite a lot with the literal string |
| 458 | regexps above, we've only scratched the surface of regular expression |
| 459 | technology. In this and subsequent sections we will introduce regexp |
| 460 | concepts (and associated metacharacter notations) that will allow a |
| 461 | regexp to not just represent a single character sequence, but a \fIwhole |
| 462 | class\fR of them. |
| 463 | .PP |
| 464 | One such concept is that of a \fBcharacter class\fR. A character class |
| 465 | allows a set of possible characters, rather than just a single |
| 466 | character, to match at a particular point in a regexp. Character |
| 467 | classes are denoted by brackets \f(CW\*(C`[...]\*(C'\fR, with the set of characters |
| 468 | to be possibly matched inside. Here are some examples: |
| 469 | .PP |
| 470 | .Vb 4 |
| 471 | \& /cat/; # matches 'cat' |
| 472 | \& /[bcr]at/; # matches 'bat', 'cat', or 'rat' |
| 473 | \& /item[0123456789]/; # matches 'item0' or ... or 'item9' |
| 474 | \& "abc" =~ /[cab]/; # matches 'a' |
| 475 | .Ve |
| 476 | .PP |
| 477 | In the last statement, even though \f(CW'c'\fR is the first character in |
| 478 | the class, \f(CW'a'\fR matches because the first character position in the |
| 479 | string is the earliest point at which the regexp can match. |
| 480 | .PP |
| 481 | .Vb 2 |
| 482 | \& /[yY][eE][sS]/; # match 'yes' in a case-insensitive way |
| 483 | \& # 'yes', 'Yes', 'YES', etc. |
| 484 | .Ve |
| 485 | .PP |
| 486 | This regexp displays a common task: perform a case-insensitive |
| 487 | match. Perl provides a way of avoiding all those brackets by simply |
| 488 | appending an \f(CW'i'\fR to the end of the match. Then \f(CW\*(C`/[yY][eE][sS]/;\*(C'\fR |
| 489 | can be rewritten as \f(CW\*(C`/yes/i;\*(C'\fR. The \f(CW'i'\fR stands for |
| 490 | case-insensitive and is an example of a \fBmodifier\fR of the matching |
| 491 | operation. We will meet other modifiers later in the tutorial. |
| 492 | .PP |
| 493 | We saw in the section above that there were ordinary characters, which |
| 494 | represented themselves, and special characters, which needed a |
| 495 | backslash \f(CW\*(C`\e\*(C'\fR to represent themselves. The same is true in a |
| 496 | character class, but the sets of ordinary and special characters |
| 497 | inside a character class are different than those outside a character |
| 498 | class. The special characters for a character class are \f(CW\*(C`\-]\e^$\*(C'\fR. \f(CW\*(C`]\*(C'\fR |
| 499 | is special because it denotes the end of a character class. \f(CW\*(C`$\*(C'\fR is |
| 500 | special because it denotes a scalar variable. \f(CW\*(C`\e\*(C'\fR is special because |
| 501 | it is used in escape sequences, just like above. Here is how the |
| 502 | special characters \f(CW\*(C`]$\e\*(C'\fR are handled: |
| 503 | .PP |
| 504 | .Vb 5 |
| 505 | \& /[\e]c]def/; # matches ']def' or 'cdef' |
| 506 | \& $x = 'bcr'; |
| 507 | \& /[$x]at/; # matches 'bat', 'cat', or 'rat' |
| 508 | \& /[\e$x]at/; # matches '$at' or 'xat' |
| 509 | \& /[\e\e$x]at/; # matches '\eat', 'bat', 'cat', or 'rat' |
| 510 | .Ve |
| 511 | .PP |
| 512 | The last two are a little tricky. In \f(CW\*(C`[\e$x]\*(C'\fR, the backslash protects |
| 513 | the dollar sign, so the character class has two members \f(CW\*(C`$\*(C'\fR and \f(CW\*(C`x\*(C'\fR. |
| 514 | In \f(CW\*(C`[\e\e$x]\*(C'\fR, the backslash is protected, so \f(CW$x\fR is treated as a |
| 515 | variable and substituted in double quote fashion. |
| 516 | .PP |
| 517 | The special character \f(CW'\-'\fR acts as a range operator within character |
| 518 | classes, so that a contiguous set of characters can be written as a |
| 519 | range. With ranges, the unwieldy \f(CW\*(C`[0123456789]\*(C'\fR and \f(CW\*(C`[abc...xyz]\*(C'\fR |
| 520 | become the svelte \f(CW\*(C`[0\-9]\*(C'\fR and \f(CW\*(C`[a\-z]\*(C'\fR. Some examples are |
| 521 | .PP |
| 522 | .Vb 6 |
| 523 | \& /item[0-9]/; # matches 'item0' or ... or 'item9' |
| 524 | \& /[0-9bx-z]aa/; # matches '0aa', ..., '9aa', |
| 525 | \& # 'baa', 'xaa', 'yaa', or 'zaa' |
| 526 | \& /[0-9a-fA-F]/; # matches a hexadecimal digit |
| 527 | \& /[0-9a-zA-Z_]/; # matches a "word" character, |
| 528 | \& # like those in a perl variable name |
| 529 | .Ve |
| 530 | .PP |
| 531 | If \f(CW'\-'\fR is the first or last character in a character class, it is |
| 532 | treated as an ordinary character; \f(CW\*(C`[\-ab]\*(C'\fR, \f(CW\*(C`[ab\-]\*(C'\fR and \f(CW\*(C`[a\e\-b]\*(C'\fR are |
| 533 | all equivalent. |
| 534 | .PP |
| 535 | The special character \f(CW\*(C`^\*(C'\fR in the first position of a character class |
| 536 | denotes a \fBnegated character class\fR, which matches any character but |
| 537 | those in the brackets. Both \f(CW\*(C`[...]\*(C'\fR and \f(CW\*(C`[^...]\*(C'\fR must match a |
| 538 | character, or the match fails. Then |
| 539 | .PP |
| 540 | .Vb 4 |
| 541 | \& /[^a]at/; # doesn't match 'aat' or 'at', but matches |
| 542 | \& # all other 'bat', 'cat', '0at', '%at', etc. |
| 543 | \& /[^0-9]/; # matches a non-numeric character |
| 544 | \& /[a^]at/; # matches 'aat' or '^at'; here '^' is ordinary |
| 545 | .Ve |
| 546 | .PP |
| 547 | Now, even \f(CW\*(C`[0\-9]\*(C'\fR can be a bother to write multiple times, so in the |
| 548 | interest of saving keystrokes and making regexps more readable, Perl |
| 549 | has several abbreviations for common character classes: |
| 550 | .IP "\(bu" 4 |
| 551 | \&\ed is a digit and represents [0\-9] |
| 552 | .IP "\(bu" 4 |
| 553 | \&\es is a whitespace character and represents [\e \et\er\en\ef] |
| 554 | .IP "\(bu" 4 |
| 555 | \&\ew is a word character (alphanumeric or _) and represents [0\-9a\-zA\-Z_] |
| 556 | .IP "\(bu" 4 |
| 557 | \&\eD is a negated \ed; it represents any character but a digit [^0\-9] |
| 558 | .IP "\(bu" 4 |
| 559 | \&\eS is a negated \es; it represents any non-whitespace character [^\es] |
| 560 | .IP "\(bu" 4 |
| 561 | \&\eW is a negated \ew; it represents any non-word character [^\ew] |
| 562 | .IP "\(bu" 4 |
| 563 | The period '.' matches any character but \*(L"\en\*(R" |
| 564 | .PP |
| 565 | The \f(CW\*(C`\ed\es\ew\eD\eS\eW\*(C'\fR abbreviations can be used both inside and outside |
| 566 | of character classes. Here are some in use: |
| 567 | .PP |
| 568 | .Vb 7 |
| 569 | \& /\ed\ed:\ed\ed:\ed\ed/; # matches a hh:mm:ss time format |
| 570 | \& /[\ed\es]/; # matches any digit or whitespace character |
| 571 | \& /\ew\eW\ew/; # matches a word char, followed by a |
| 572 | \& # non-word char, followed by a word char |
| 573 | \& /..rt/; # matches any two chars, followed by 'rt' |
| 574 | \& /end\e./; # matches 'end.' |
| 575 | \& /end[.]/; # same thing, matches 'end.' |
| 576 | .Ve |
| 577 | .PP |
| 578 | Because a period is a metacharacter, it needs to be escaped to match |
| 579 | as an ordinary period. Because, for example, \f(CW\*(C`\ed\*(C'\fR and \f(CW\*(C`\ew\*(C'\fR are sets |
| 580 | of characters, it is incorrect to think of \f(CW\*(C`[^\ed\ew]\*(C'\fR as \f(CW\*(C`[\eD\eW]\*(C'\fR; in |
| 581 | fact \f(CW\*(C`[^\ed\ew]\*(C'\fR is the same as \f(CW\*(C`[^\ew]\*(C'\fR, which is the same as |
| 582 | \&\f(CW\*(C`[\eW]\*(C'\fR. Think DeMorgan's laws. |
| 583 | .PP |
| 584 | An anchor useful in basic regexps is the \fBword\ anchor\fR\ |
| 585 | \&\f(CW\*(C`\eb\*(C'\fR. This matches a boundary between a word character and a non-word |
| 586 | character \f(CW\*(C`\ew\eW\*(C'\fR or \f(CW\*(C`\eW\ew\*(C'\fR: |
| 587 | .PP |
| 588 | .Vb 5 |
| 589 | \& $x = "Housecat catenates house and cat"; |
| 590 | \& $x =~ /cat/; # matches cat in 'Housecat' |
| 591 | \& $x =~ /\ebcat/; # matches cat in 'catenates' |
| 592 | \& $x =~ /cat\eb/; # matches cat in 'Housecat' |
| 593 | \& $x =~ /\ebcat\eb/; # matches 'cat' at end of string |
| 594 | .Ve |
| 595 | .PP |
| 596 | Note in the last example, the end of the string is considered a word |
| 597 | boundary. |
| 598 | .PP |
| 599 | You might wonder why \f(CW'.'\fR matches everything but \f(CW"\en"\fR \- why not |
| 600 | every character? The reason is that often one is matching against |
| 601 | lines and would like to ignore the newline characters. For instance, |
| 602 | while the string \f(CW"\en"\fR represents one line, we would like to think |
| 603 | of it as empty. Then |
| 604 | .PP |
| 605 | .Vb 2 |
| 606 | \& "" =~ /^$/; # matches |
| 607 | \& "\en" =~ /^$/; # matches, "\en" is ignored |
| 608 | .Ve |
| 609 | .PP |
| 610 | .Vb 5 |
| 611 | \& "" =~ /./; # doesn't match; it needs a char |
| 612 | \& "" =~ /^.$/; # doesn't match; it needs a char |
| 613 | \& "\en" =~ /^.$/; # doesn't match; it needs a char other than "\en" |
| 614 | \& "a" =~ /^.$/; # matches |
| 615 | \& "a\en" =~ /^.$/; # matches, ignores the "\en" |
| 616 | .Ve |
| 617 | .PP |
| 618 | This behavior is convenient, because we usually want to ignore |
| 619 | newlines when we count and match characters in a line. Sometimes, |
| 620 | however, we want to keep track of newlines. We might even want \f(CW\*(C`^\*(C'\fR |
| 621 | and \f(CW\*(C`$\*(C'\fR to anchor at the beginning and end of lines within the |
| 622 | string, rather than just the beginning and end of the string. Perl |
| 623 | allows us to choose between ignoring and paying attention to newlines |
| 624 | by using the \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR modifiers. \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR stand for |
| 625 | single line and multi-line and they determine whether a string is to |
| 626 | be treated as one continuous string, or as a set of lines. The two |
| 627 | modifiers affect two aspects of how the regexp is interpreted: 1) how |
| 628 | the \f(CW'.'\fR character class is defined, and 2) where the anchors \f(CW\*(C`^\*(C'\fR |
| 629 | and \f(CW\*(C`$\*(C'\fR are able to match. Here are the four possible combinations: |
| 630 | .IP "\(bu" 4 |
| 631 | no modifiers (//): Default behavior. \f(CW'.'\fR matches any character |
| 632 | except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of the string and |
| 633 | \&\f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the end. |
| 634 | .IP "\(bu" 4 |
| 635 | s modifier (//s): Treat string as a single long line. \f(CW'.'\fR matches |
| 636 | any character, even \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR matches only at the beginning of |
| 637 | the string and \f(CW\*(C`$\*(C'\fR matches only at the end or before a newline at the |
| 638 | end. |
| 639 | .IP "\(bu" 4 |
| 640 | m modifier (//m): Treat string as a set of multiple lines. \f(CW'.'\fR |
| 641 | matches any character except \f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR are able to match |
| 642 | at the start or end of \fIany\fR line within the string. |
| 643 | .IP "\(bu" 4 |
| 644 | both s and m modifiers (//sm): Treat string as a single long line, but |
| 645 | detect multiple lines. \f(CW'.'\fR matches any character, even |
| 646 | \&\f(CW"\en"\fR. \f(CW\*(C`^\*(C'\fR and \f(CW\*(C`$\*(C'\fR, however, are able to match at the start or end |
| 647 | of \fIany\fR line within the string. |
| 648 | .PP |
| 649 | Here are examples of \f(CW\*(C`//s\*(C'\fR and \f(CW\*(C`//m\*(C'\fR in action: |
| 650 | .PP |
| 651 | .Vb 1 |
| 652 | \& $x = "There once was a girl\enWho programmed in Perl\en"; |
| 653 | .Ve |
| 654 | .PP |
| 655 | .Vb 4 |
| 656 | \& $x =~ /^Who/; # doesn't match, "Who" not at start of string |
| 657 | \& $x =~ /^Who/s; # doesn't match, "Who" not at start of string |
| 658 | \& $x =~ /^Who/m; # matches, "Who" at start of second line |
| 659 | \& $x =~ /^Who/sm; # matches, "Who" at start of second line |
| 660 | .Ve |
| 661 | .PP |
| 662 | .Vb 4 |
| 663 | \& $x =~ /girl.Who/; # doesn't match, "." doesn't match "\en" |
| 664 | \& $x =~ /girl.Who/s; # matches, "." matches "\en" |
| 665 | \& $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\en" |
| 666 | \& $x =~ /girl.Who/sm; # matches, "." matches "\en" |
| 667 | .Ve |
| 668 | .PP |
| 669 | Most of the time, the default behavior is what is wanted, but \f(CW\*(C`//s\*(C'\fR and |
| 670 | \&\f(CW\*(C`//m\*(C'\fR are occasionally very useful. If \f(CW\*(C`//m\*(C'\fR is being used, the start |
| 671 | of the string can still be matched with \f(CW\*(C`\eA\*(C'\fR and the end of string |
| 672 | can still be matched with the anchors \f(CW\*(C`\eZ\*(C'\fR (matches both the end and |
| 673 | the newline before, like \f(CW\*(C`$\*(C'\fR), and \f(CW\*(C`\ez\*(C'\fR (matches only the end): |
| 674 | .PP |
| 675 | .Vb 2 |
| 676 | \& $x =~ /^Who/m; # matches, "Who" at start of second line |
| 677 | \& $x =~ /\eAWho/m; # doesn't match, "Who" is not at start of string |
| 678 | .Ve |
| 679 | .PP |
| 680 | .Vb 2 |
| 681 | \& $x =~ /girl$/m; # matches, "girl" at end of first line |
| 682 | \& $x =~ /girl\eZ/m; # doesn't match, "girl" is not at end of string |
| 683 | .Ve |
| 684 | .PP |
| 685 | .Vb 2 |
| 686 | \& $x =~ /Perl\eZ/m; # matches, "Perl" is at newline before end |
| 687 | \& $x =~ /Perl\ez/m; # doesn't match, "Perl" is not at end of string |
| 688 | .Ve |
| 689 | .PP |
| 690 | We now know how to create choices among classes of characters in a |
| 691 | regexp. What about choices among words or character strings? Such |
| 692 | choices are described in the next section. |
| 693 | .Sh "Matching this or that" |
| 694 | .IX Subsection "Matching this or that" |
| 695 | Sometimes we would like our regexp to be able to match different |
| 696 | possible words or character strings. This is accomplished by using |
| 697 | the \fBalternation\fR metacharacter \f(CW\*(C`|\*(C'\fR. To match \f(CW\*(C`dog\*(C'\fR or \f(CW\*(C`cat\*(C'\fR, we |
| 698 | form the regexp \f(CW\*(C`dog|cat\*(C'\fR. As before, perl will try to match the |
| 699 | regexp at the earliest possible point in the string. At each |
| 700 | character position, perl will first try to match the first |
| 701 | alternative, \f(CW\*(C`dog\*(C'\fR. If \f(CW\*(C`dog\*(C'\fR doesn't match, perl will then try the |
| 702 | next alternative, \f(CW\*(C`cat\*(C'\fR. If \f(CW\*(C`cat\*(C'\fR doesn't match either, then the |
| 703 | match fails and perl moves to the next position in the string. Some |
| 704 | examples: |
| 705 | .PP |
| 706 | .Vb 2 |
| 707 | \& "cats and dogs" =~ /cat|dog|bird/; # matches "cat" |
| 708 | \& "cats and dogs" =~ /dog|cat|bird/; # matches "cat" |
| 709 | .Ve |
| 710 | .PP |
| 711 | Even though \f(CW\*(C`dog\*(C'\fR is the first alternative in the second regexp, |
| 712 | \&\f(CW\*(C`cat\*(C'\fR is able to match earlier in the string. |
| 713 | .PP |
| 714 | .Vb 2 |
| 715 | \& "cats" =~ /c|ca|cat|cats/; # matches "c" |
| 716 | \& "cats" =~ /cats|cat|ca|c/; # matches "cats" |
| 717 | .Ve |
| 718 | .PP |
| 719 | Here, all the alternatives match at the first string position, so the |
| 720 | first alternative is the one that matches. If some of the |
| 721 | alternatives are truncations of the others, put the longest ones first |
| 722 | to give them a chance to match. |
| 723 | .PP |
| 724 | .Vb 2 |
| 725 | \& "cab" =~ /a|b|c/ # matches "c" |
| 726 | \& # /a|b|c/ == /[abc]/ |
| 727 | .Ve |
| 728 | .PP |
| 729 | The last example points out that character classes are like |
| 730 | alternations of characters. At a given character position, the first |
| 731 | alternative that allows the regexp match to succeed will be the one |
| 732 | that matches. |
| 733 | .Sh "Grouping things and hierarchical matching" |
| 734 | .IX Subsection "Grouping things and hierarchical matching" |
| 735 | Alternation allows a regexp to choose among alternatives, but by |
| 736 | itself it is unsatisfying. The reason is that each alternative is a whole |
| 737 | regexp, but sometimes we want alternatives for just part of a |
| 738 | regexp. For instance, suppose we want to search for housecats or |
| 739 | housekeepers. The regexp \f(CW\*(C`housecat|housekeeper\*(C'\fR fits the bill, but is |
| 740 | inefficient because we had to type \f(CW\*(C`house\*(C'\fR twice. It would be nice to |
| 741 | have parts of the regexp be constant, like \f(CW\*(C`house\*(C'\fR, and some |
| 742 | parts have alternatives, like \f(CW\*(C`cat|keeper\*(C'\fR. |
| 743 | .PP |
| 744 | The \fBgrouping\fR metacharacters \f(CW\*(C`()\*(C'\fR solve this problem. Grouping |
| 745 | allows parts of a regexp to be treated as a single unit. Parts of a |
| 746 | regexp are grouped by enclosing them in parentheses. Thus we could solve |
| 747 | the \f(CW\*(C`housecat|housekeeper\*(C'\fR problem by forming the regexp as |
| 748 | \&\f(CW\*(C`house(cat|keeper)\*(C'\fR. The regexp \f(CW\*(C`house(cat|keeper)\*(C'\fR means match |
| 749 | \&\f(CW\*(C`house\*(C'\fR followed by either \f(CW\*(C`cat\*(C'\fR or \f(CW\*(C`keeper\*(C'\fR. Some more examples |
| 750 | are |
| 751 | .PP |
| 752 | .Vb 4 |
| 753 | \& /(a|b)b/; # matches 'ab' or 'bb' |
| 754 | \& /(ac|b)b/; # matches 'acb' or 'bb' |
| 755 | \& /(^a|b)c/; # matches 'ac' at start of string or 'bc' anywhere |
| 756 | \& /(a|[bc])d/; # matches 'ad', 'bd', or 'cd' |
| 757 | .Ve |
| 758 | .PP |
| 759 | .Vb 3 |
| 760 | \& /house(cat|)/; # matches either 'housecat' or 'house' |
| 761 | \& /house(cat(s|)|)/; # matches either 'housecats' or 'housecat' or |
| 762 | \& # 'house'. Note groups can be nested. |
| 763 | .Ve |
| 764 | .PP |
| 765 | .Vb 3 |
| 766 | \& /(19|20|)\ed\ed/; # match years 19xx, 20xx, or the Y2K problem, xx |
| 767 | \& "20" =~ /(19|20|)\ed\ed/; # matches the null alternative '()\ed\ed', |
| 768 | \& # because '20\ed\ed' can't match |
| 769 | .Ve |
| 770 | .PP |
| 771 | Alternations behave the same way in groups as out of them: at a given |
| 772 | string position, the leftmost alternative that allows the regexp to |
| 773 | match is taken. So in the last example at the first string position, |
| 774 | \&\f(CW"20"\fR matches the second alternative, but there is nothing left over |
| 775 | to match the next two digits \f(CW\*(C`\ed\ed\*(C'\fR. So perl moves on to the next |
| 776 | alternative, which is the null alternative and that works, since |
| 777 | \&\f(CW"20"\fR is two digits. |
| 778 | .PP |
| 779 | The process of trying one alternative, seeing if it matches, and |
| 780 | moving on to the next alternative if it doesn't, is called |
| 781 | \&\fBbacktracking\fR. The term 'backtracking' comes from the idea that |
| 782 | matching a regexp is like a walk in the woods. Successfully matching |
| 783 | a regexp is like arriving at a destination. There are many possible |
| 784 | trailheads, one for each string position, and each one is tried in |
| 785 | order, left to right. From each trailhead there may be many paths, |
| 786 | some of which get you there, and some of which are dead ends. When you |
| 787 | walk along a trail and hit a dead end, you have to backtrack along the |
| 788 | trail to an earlier point to try another trail. If you hit your |
| 789 | destination, you stop immediately and forget about trying all the |
| 790 | other trails. You are persistent, and only if you have tried all the |
| 791 | trails from all the trailheads and not arrived at your destination, do |
| 792 | you declare failure. To be concrete, here is a step-by-step analysis |
| 793 | of what perl does when it tries to match the regexp |
| 794 | .PP |
| 795 | .Vb 1 |
| 796 | \& "abcde" =~ /(abd|abc)(df|d|de)/; |
| 797 | .Ve |
| 798 | .IP "0" 4 |
| 799 | Start with the first letter in the string 'a'. |
| 800 | .IP "1" 4 |
| 801 | .IX Item "1" |
| 802 | Try the first alternative in the first group 'abd'. |
| 803 | .IP "2" 4 |
| 804 | .IX Item "2" |
| 805 | Match 'a' followed by 'b'. So far so good. |
| 806 | .IP "3" 4 |
| 807 | .IX Item "3" |
| 808 | \&'d' in the regexp doesn't match 'c' in the string \- a dead |
| 809 | end. So backtrack two characters and pick the second alternative in |
| 810 | the first group 'abc'. |
| 811 | .IP "4" 4 |
| 812 | .IX Item "4" |
| 813 | Match 'a' followed by 'b' followed by 'c'. We are on a roll |
| 814 | and have satisfied the first group. Set \f(CW$1\fR to 'abc'. |
| 815 | .IP "5" 4 |
| 816 | .IX Item "5" |
| 817 | Move on to the second group and pick the first alternative |
| 818 | \&'df'. |
| 819 | .IP "6" 4 |
| 820 | .IX Item "6" |
| 821 | Match the 'd'. |
| 822 | .IP "7" 4 |
| 823 | .IX Item "7" |
| 824 | \&'f' in the regexp doesn't match 'e' in the string, so a dead |
| 825 | end. Backtrack one character and pick the second alternative in the |
| 826 | second group 'd'. |
| 827 | .IP "8" 4 |
| 828 | .IX Item "8" |
| 829 | \&'d' matches. The second grouping is satisfied, so set \f(CW$2\fR to |
| 830 | \&'d'. |
| 831 | .IP "9" 4 |
| 832 | .IX Item "9" |
| 833 | We are at the end of the regexp, so we are done! We have |
| 834 | matched 'abcd' out of the string \*(L"abcde\*(R". |
| 835 | .PP |
| 836 | There are a couple of things to note about this analysis. First, the |
| 837 | third alternative in the second group 'de' also allows a match, but we |
| 838 | stopped before we got to it \- at a given character position, leftmost |
| 839 | wins. Second, we were able to get a match at the first character |
| 840 | position of the string 'a'. If there were no matches at the first |
| 841 | position, perl would move to the second character position 'b' and |
| 842 | attempt the match all over again. Only when all possible paths at all |
| 843 | possible character positions have been exhausted does perl give |
| 844 | up and declare \f(CW\*(C`$string\ =~\ /(abd|abc)(df|d|de)/;\*(C'\fR\ to be false. |
| 845 | .PP |
| 846 | Even with all this work, regexp matching happens remarkably fast. To |
| 847 | speed things up, during compilation stage, perl compiles the regexp |
| 848 | into a compact sequence of opcodes that can often fit inside a |
| 849 | processor cache. When the code is executed, these opcodes can then run |
| 850 | at full throttle and search very quickly. |
| 851 | .Sh "Extracting matches" |
| 852 | .IX Subsection "Extracting matches" |
| 853 | The grouping metacharacters \f(CW\*(C`()\*(C'\fR also serve another completely |
| 854 | different function: they allow the extraction of the parts of a string |
| 855 | that matched. This is very useful to find out what matched and for |
| 856 | text processing in general. For each grouping, the part that matched |
| 857 | inside goes into the special variables \f(CW$1\fR, \f(CW$2\fR, etc. They can be |
| 858 | used just as ordinary variables: |
| 859 | .PP |
| 860 | .Vb 6 |
| 861 | \& # extract hours, minutes, seconds |
| 862 | \& if ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/) { # match hh:mm:ss format |
| 863 | \& $hours = $1; |
| 864 | \& $minutes = $2; |
| 865 | \& $seconds = $3; |
| 866 | \& } |
| 867 | .Ve |
| 868 | .PP |
| 869 | Now, we know that in scalar context, |
| 870 | \&\f(CW\*(C`$time\ =~\ /(\ed\ed):(\ed\ed):(\ed\ed)/\*(C'\fR\ returns a true or false |
| 871 | value. In list context, however, it returns the list of matched values |
| 872 | \&\f(CW\*(C`($1,$2,$3)\*(C'\fR. So we could write the code more compactly as |
| 873 | .PP |
| 874 | .Vb 2 |
| 875 | \& # extract hours, minutes, seconds |
| 876 | \& ($hours, $minutes, $seconds) = ($time =~ /(\ed\ed):(\ed\ed):(\ed\ed)/); |
| 877 | .Ve |
| 878 | .PP |
| 879 | If the groupings in a regexp are nested, \f(CW$1\fR gets the group with the |
| 880 | leftmost opening parenthesis, \f(CW$2\fR the next opening parenthesis, |
| 881 | etc. For example, here is a complex regexp and the matching variables |
| 882 | indicated below it: |
| 883 | .PP |
| 884 | .Vb 2 |
| 885 | \& /(ab(cd|ef)((gi)|j))/; |
| 886 | \&  1  2      34 |
| 887 | .Ve |
| 888 | .PP |
| 889 | so that if the regexp matched, e.g., \f(CW$2\fR would contain 'cd' or 'ef'. For |
| 890 | convenience, perl sets \f(CW$+\fR to the string held by the highest numbered |
| 891 | \&\f(CW$1\fR, \f(CW$2\fR, ... that got assigned (and, somewhat related, \f(CW$^N\fR to the |
| 892 | value of the \f(CW$1\fR, \f(CW$2\fR, ... most-recently assigned; i.e. the \f(CW$1\fR, |
| 893 | \&\f(CW$2\fR, ... associated with the rightmost closing parenthesis used in the |
| 894 | match). |
| 895 | .PP |
| 896 | Closely associated with the matching variables \f(CW$1\fR, \f(CW$2\fR, ... are |
| 897 | the \fBbackreferences\fR \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... . Backreferences are simply |
| 898 | matching variables that can be used \fIinside\fR a regexp. This is a |
| 899 | really nice feature \- what matches later in a regexp can depend on |
| 900 | what matched earlier in the regexp. Suppose we wanted to look |
| 901 | for doubled words in text, like 'the the'. The following regexp finds |
| 902 | all 3\-letter doubles with a space in between: |
| 903 | .PP |
| 904 | .Vb 1 |
| 905 | \& /(\ew\ew\ew)\es\e1/; |
| 906 | .Ve |
| 907 | .PP |
| 908 | The grouping assigns a value to \e1, so that the same 3 letter sequence |
| 909 | is used for both parts. Here are some words with repeated parts: |
| 910 | .PP |
| 911 | .Vb 7 |
| 912 | \& % simple_grep '^(\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew)\e1$' /usr/dict/words |
| 913 | \& beriberi |
| 914 | \& booboo |
| 915 | \& coco |
| 916 | \& mama |
| 917 | \& murmur |
| 918 | \& papa |
| 919 | .Ve |
| 920 | .PP |
| 921 | The regexp has a single grouping which considers 4\-letter |
| 922 | combinations, then 3\-letter combinations, etc. and uses \f(CW\*(C`\e1\*(C'\fR to look for |
| 923 | a repeat. Although \f(CW$1\fR and \f(CW\*(C`\e1\*(C'\fR represent the same thing, care should be |
| 924 | taken to use matched variables \f(CW$1\fR, \f(CW$2\fR, ... only outside a regexp |
| 925 | and backreferences \f(CW\*(C`\e1\*(C'\fR, \f(CW\*(C`\e2\*(C'\fR, ... only inside a regexp; not doing |
| 926 | so may lead to surprising and/or undefined results. |
| 927 | .PP |
| 928 | In addition to what was matched, Perl 5.6.0 also provides the |
| 929 | positions of what was matched with the \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR |
| 930 | arrays. \f(CW\*(C`$\-[0]\*(C'\fR is the position of the start of the entire match and |
| 931 | \&\f(CW$+[0]\fR is the position of the end. Similarly, \f(CW\*(C`$\-[n]\*(C'\fR is the |
| 932 | position of the start of the \f(CW$n\fR match and \f(CW$+[n]\fR is the position |
| 933 | of the end. If \f(CW$n\fR is undefined, so are \f(CW\*(C`$\-[n]\*(C'\fR and \f(CW$+[n]\fR. Then |
| 934 | this code |
| 935 | .PP |
| 936 | .Vb 5 |
| 937 | \& $x = "Mmm...donut, thought Homer"; |
| 938 | \& $x =~ /^(Mmm|Yech)\e.\e.\e.(donut|peas)/; # matches |
| 939 | \& foreach $expr (1..$#-) { |
| 940 | \& print "Match $expr: '${$expr}' at position ($-[$expr],$+[$expr])\en"; |
| 941 | \& } |
| 942 | .Ve |
| 943 | .PP |
| 944 | prints |
| 945 | .PP |
| 946 | .Vb 2 |
| 947 | \& Match 1: 'Mmm' at position (0,3) |
| 948 | \& Match 2: 'donut' at position (6,11) |
| 949 | .Ve |
| 950 | .PP |
| 951 | Even if there are no groupings in a regexp, it is still possible to |
| 952 | find out what exactly matched in a string. If you use them, perl |
| 953 | will set \f(CW$`\fR to the part of the string before the match, will set \f(CW$&\fR |
| 954 | to the part of the string that matched, and will set \f(CW$'\fR to the part |
| 955 | of the string after the match. An example: |
| 956 | .PP |
| 957 | .Vb 3 |
| 958 | \& $x = "the cat caught the mouse"; |
| 959 | \& $x =~ /cat/; # $` = 'the ', $& = 'cat', $' = ' caught the mouse' |
| 960 | \& $x =~ /the/; # $` = '', $& = 'the', $' = ' cat caught the mouse' |
| 961 | .Ve |
| 962 | .PP |
| 963 | In the second match, \f(CW\*(C`$`\ =\ ''\*(C'\fR\ because the regexp matched at the |
| 964 | first character position in the string and stopped; it never saw the |
| 965 | second 'the'. It is important to note that using \f(CW$`\fR and \f(CW$'\fR |
| 966 | slows down regexp matching quite a bit, and \f(CW $& \fR slows it down to a |
| 967 | lesser extent, because if they are used in one regexp in a program, |
| 968 | they are generated for \fIall\fR regexps in the program. So if raw |
| 969 | performance is a goal of your application, they should be avoided. |
| 970 | If you need them, use \f(CW\*(C`@\-\*(C'\fR and \f(CW\*(C`@+\*(C'\fR instead: |
| 971 | .PP |
| 972 | .Vb 3 |
| 973 | \& $` is the same as substr( $x, 0, $-[0] ) |
| 974 | \& $& is the same as substr( $x, $-[0], $+[0]-$-[0] ) |
| 975 | \& $' is the same as substr( $x, $+[0] ) |
| 976 | .Ve |
| 977 | .Sh "Matching repetitions" |
| 978 | .IX Subsection "Matching repetitions" |
| 979 | The examples in the previous section display an annoying weakness. We |
| 980 | were only matching 3\-letter words, or syllables of 4 letters or |
| 981 | less. We'd like to be able to match words or syllables of any length, |
| 982 | without writing out tedious alternatives like |
| 983 | \&\f(CW\*(C`\ew\ew\ew\ew|\ew\ew\ew|\ew\ew|\ew\*(C'\fR. |
| 984 | .PP |
| 985 | This is exactly the problem the \fBquantifier\fR metacharacters \f(CW\*(C`?\*(C'\fR, |
| 986 | \&\f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR, and \f(CW\*(C`{}\*(C'\fR were created for. They allow us to determine the |
| 987 | number of repeats of a portion of a regexp we consider to be a |
| 988 | match. Quantifiers are put immediately after the character, character |
| 989 | class, or grouping that we want to specify. They have the following |
| 990 | meanings: |
| 991 | .IP "\(bu" 4 |
| 992 | \&\f(CW\*(C`a?\*(C'\fR = match 'a' 1 or 0 times |
| 993 | .IP "\(bu" 4 |
| 994 | \&\f(CW\*(C`a*\*(C'\fR = match 'a' 0 or more times, i.e., any number of times |
| 995 | .IP "\(bu" 4 |
| 996 | \&\f(CW\*(C`a+\*(C'\fR = match 'a' 1 or more times, i.e., at least once |
| 997 | .IP "\(bu" 4 |
| 998 | \&\f(CW\*(C`a{n,m}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but not more than \f(CW\*(C`m\*(C'\fR |
| 999 | times. |
| 1000 | .IP "\(bu" 4 |
| 1001 | \&\f(CW\*(C`a{n,}\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times |
| 1002 | .IP "\(bu" 4 |
| 1003 | \&\f(CW\*(C`a{n}\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times |
| 1004 | .PP |
| 1005 | Here are some examples: |
| 1006 | .PP |
| 1007 | .Vb 9 |
| 1008 | \& /[a-z]+\es+\ed*/; # match a lowercase word, at least some space, and |
| 1009 | \& # any number of digits |
| 1010 | \& /(\ew+)\es+\e1/; # match doubled words of arbitrary length |
| 1011 | \& /y(es)?/i; # matches 'y', 'Y', or a case-insensitive 'yes' |
| 1012 | \& $year =~ /\ed{2,4}/; # make sure year is at least 2 but not more |
| 1013 | \& # than 4 digits |
| 1014 | \& $year =~ /\ed{4}|\ed{2}/; # better match; throw out 3 digit dates |
| 1015 | \& $year =~ /\ed{2}(\ed{2})?/; # same thing written differently. However, |
| 1016 | \& # this produces $1 and the other does not. |
| 1017 | .Ve |
| 1018 | .PP |
| 1019 | .Vb 7 |
| 1020 | \& % simple_grep '^(\ew+)\e1$' /usr/dict/words # isn't this easier? |
| 1021 | \& beriberi |
| 1022 | \& booboo |
| 1023 | \& coco |
| 1024 | \& mama |
| 1025 | \& murmur |
| 1026 | \& papa |
| 1027 | .Ve |
| 1028 | .PP |
| 1029 | For all of these quantifiers, perl will try to match as much of the |
| 1030 | string as possible, while still allowing the regexp to succeed. Thus |
| 1031 | with \f(CW\*(C`/a?.../\*(C'\fR, perl will first try to match the regexp with the \f(CW\*(C`a\*(C'\fR |
| 1032 | present; if that fails, perl will try to match the regexp without the |
| 1033 | \&\f(CW\*(C`a\*(C'\fR present. For the quantifier \f(CW\*(C`*\*(C'\fR, we get the following: |
| 1034 | .PP |
| 1035 | .Vb 5 |
| 1036 | \& $x = "the cat in the hat"; |
| 1037 | \& $x =~ /^(.*)(cat)(.*)$/; # matches, |
| 1038 | \& # $1 = 'the ' |
| 1039 | \& # $2 = 'cat' |
| 1040 | \& # $3 = ' in the hat' |
| 1041 | .Ve |
| 1042 | .PP |
| 1043 | This is what we might expect: the match finds the only \f(CW\*(C`cat\*(C'\fR in the |
| 1044 | string and locks onto it. Consider, however, this regexp: |
| 1045 | .PP |
| 1046 | .Vb 4 |
| 1047 | \& $x =~ /^(.*)(at)(.*)$/; # matches, |
| 1048 | \& # $1 = 'the cat in the h' |
| 1049 | \& # $2 = 'at' |
| 1050 | \& # $3 = '' (0 matches) |
| 1051 | .Ve |
| 1052 | .PP |
| 1053 | One might initially guess that perl would find the \f(CW\*(C`at\*(C'\fR in \f(CW\*(C`cat\*(C'\fR and |
| 1054 | stop there, but that wouldn't give the longest possible string to the |
| 1055 | first quantifier \f(CW\*(C`.*\*(C'\fR. Instead, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs as |
| 1056 | much of the string as possible while still having the regexp match. In |
| 1057 | this example, that means having the \f(CW\*(C`at\*(C'\fR sequence with the final \f(CW\*(C`at\*(C'\fR |
| 1058 | in the string. The other important principle illustrated here is that |
| 1059 | when there are two or more elements in a regexp, the \fIleftmost\fR |
| 1060 | quantifier, if there is one, gets to grab as much of the string as |
| 1061 | possible, leaving the rest of the regexp to fight over scraps. Thus in |
| 1062 | our example, the first quantifier \f(CW\*(C`.*\*(C'\fR grabs most of the string, while |
| 1063 | the second quantifier \f(CW\*(C`.*\*(C'\fR gets the empty string. Quantifiers that |
| 1064 | grab as much of the string as possible are called \fBmaximal match\fR or |
| 1065 | \&\fBgreedy\fR quantifiers. |
| 1066 | .PP |
| 1067 | When a regexp can match a string in several different ways, we can use |
| 1068 | the principles above to predict which way the regexp will match: |
| 1069 | .IP "\(bu" 4 |
| 1070 | Principle 0: Taken as a whole, any regexp will be matched at the |
| 1071 | earliest possible position in the string. |
| 1072 | .IP "\(bu" 4 |
| 1073 | Principle 1: In an alternation \f(CW\*(C`a|b|c...\*(C'\fR, the leftmost alternative |
| 1074 | that allows a match for the whole regexp will be the one used. |
| 1075 | .IP "\(bu" 4 |
| 1076 | Principle 2: The maximal matching quantifiers \f(CW\*(C`?\*(C'\fR, \f(CW\*(C`*\*(C'\fR, \f(CW\*(C`+\*(C'\fR and |
| 1077 | \&\f(CW\*(C`{n,m}\*(C'\fR will in general match as much of the string as possible while |
| 1078 | still allowing the whole regexp to match. |
| 1079 | .IP "\(bu" 4 |
| 1080 | Principle 3: If there are two or more elements in a regexp, the |
| 1081 | leftmost greedy quantifier, if any, will match as much of the string |
| 1082 | as possible while still allowing the whole regexp to match. The next |
| 1083 | leftmost greedy quantifier, if any, will try to match as much of the |
| 1084 | string remaining available to it as possible, while still allowing the |
| 1085 | whole regexp to match. And so on, until all the regexp elements are |
| 1086 | satisfied. |
| 1087 | .PP |
| 1088 | As we have seen above, Principle 0 overrides the others \- the regexp |
| 1089 | will be matched as early as possible, with the other principles |
| 1090 | determining how the regexp matches at that earliest character |
| 1091 | position. |
| 1092 | .PP |
| 1093 | Here is an example of these principles in action: |
| 1094 | .PP |
| 1095 | .Vb 5 |
| 1096 | \& $x = "The programming republic of Perl"; |
| 1097 | \& $x =~ /^(.+)(e|r)(.*)$/; # matches, |
| 1098 | \& # $1 = 'The programming republic of Pe' |
| 1099 | \& # $2 = 'r' |
| 1100 | \& # $3 = 'l' |
| 1101 | .Ve |
| 1102 | .PP |
| 1103 | This regexp matches at the earliest string position, \f(CW'T'\fR. One |
| 1104 | might think that \f(CW\*(C`e\*(C'\fR, being leftmost in the alternation, would be |
| 1105 | matched, but \f(CW\*(C`r\*(C'\fR produces the longest string in the first quantifier. |
| 1106 | .PP |
| 1107 | .Vb 3 |
| 1108 | \& $x =~ /(m{1,2})(.*)$/; # matches, |
| 1109 | \& # $1 = 'mm' |
| 1110 | \& # $2 = 'ing republic of Perl' |
| 1111 | .Ve |
| 1112 | .PP |
| 1113 | Here, the earliest possible match is at the first \f(CW'm'\fR in |
| 1114 | \&\f(CW\*(C`programming\*(C'\fR. \f(CW\*(C`m{1,2}\*(C'\fR is the first quantifier, so it gets to match |
| 1115 | a maximal \f(CW\*(C`mm\*(C'\fR. |
| 1116 | .PP |
| 1117 | .Vb 3 |
| 1118 | \& $x =~ /.*(m{1,2})(.*)$/; # matches, |
| 1119 | \& # $1 = 'm' |
| 1120 | \& # $2 = 'ing republic of Perl' |
| 1121 | .Ve |
| 1122 | .PP |
| 1123 | Here, the regexp matches at the start of the string. The first |
| 1124 | quantifier \f(CW\*(C`.*\*(C'\fR grabs as much as possible, leaving just a single |
| 1125 | \&\f(CW'm'\fR for the second quantifier \f(CW\*(C`m{1,2}\*(C'\fR. |
| 1126 | .PP |
| 1127 | .Vb 4 |
| 1128 | \& $x =~ /(.?)(m{1,2})(.*)$/; # matches, |
| 1129 | \& # $1 = 'a' |
| 1130 | \& # $2 = 'mm' |
| 1131 | \& # $3 = 'ing republic of Perl' |
| 1132 | .Ve |
| 1133 | .PP |
| 1134 | Here, \f(CW\*(C`.?\*(C'\fR eats its maximal one character at the earliest possible |
| 1135 | position in the string, \f(CW'a'\fR in \f(CW\*(C`programming\*(C'\fR, leaving \f(CW\*(C`m{1,2}\*(C'\fR |
| 1136 | the opportunity to match both \f(CW\*(C`m\*(C'\fR's. Finally, |
| 1137 | .PP |
| 1138 | .Vb 1 |
| 1139 | \& "aXXXb" =~ /(X*)/; # matches with $1 = '' |
| 1140 | .Ve |
| 1141 | .PP |
| 1142 | because it can match zero copies of \f(CW'X'\fR at the beginning of the |
| 1143 | string. If you definitely want to match at least one \f(CW'X'\fR, use |
| 1144 | \&\f(CW\*(C`X+\*(C'\fR, not \f(CW\*(C`X*\*(C'\fR. |
| 1145 | .PP |
| 1146 | Sometimes greed is not good. At times, we would like quantifiers to |
| 1147 | match a \fIminimal\fR piece of string, rather than a maximal piece. For |
| 1148 | this purpose, Larry Wall created the \fBminimal\ match\fR\ or |
| 1149 | \&\fBnon-greedy\fR quantifiers \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, and \f(CW\*(C`{}?\*(C'\fR. These are |
| 1150 | the usual quantifiers with a \f(CW\*(C`?\*(C'\fR appended to them. They have the |
| 1151 | following meanings: |
| 1152 | .IP "\(bu" 4 |
| 1153 | \&\f(CW\*(C`a??\*(C'\fR = match 'a' 0 or 1 times. Try 0 first, then 1. |
| 1154 | .IP "\(bu" 4 |
| 1155 | \&\f(CW\*(C`a*?\*(C'\fR = match 'a' 0 or more times, i.e., any number of times, |
| 1156 | but as few times as possible |
| 1157 | .IP "\(bu" 4 |
| 1158 | \&\f(CW\*(C`a+?\*(C'\fR = match 'a' 1 or more times, i.e., at least once, but |
| 1159 | as few times as possible |
| 1160 | .IP "\(bu" 4 |
| 1161 | \&\f(CW\*(C`a{n,m}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, not more than \f(CW\*(C`m\*(C'\fR |
| 1162 | times, as few times as possible |
| 1163 | .IP "\(bu" 4 |
| 1164 | \&\f(CW\*(C`a{n,}?\*(C'\fR = match at least \f(CW\*(C`n\*(C'\fR times, but as few times as |
| 1165 | possible |
| 1166 | .IP "\(bu" 4 |
| 1167 | \&\f(CW\*(C`a{n}?\*(C'\fR = match exactly \f(CW\*(C`n\*(C'\fR times. Because we match exactly |
| 1168 | \&\f(CW\*(C`n\*(C'\fR times, \f(CW\*(C`a{n}?\*(C'\fR is equivalent to \f(CW\*(C`a{n}\*(C'\fR and is just there for |
| 1169 | notational consistency. |
| 1170 | .PP |
| 1171 | Let's look at the example above, but with minimal quantifiers: |
| 1172 | .PP |
| 1173 | .Vb 5 |
| 1174 | \& $x = "The programming republic of Perl"; |
| 1175 | \& $x =~ /^(.+?)(e|r)(.*)$/; # matches, |
| 1176 | \& # $1 = 'Th' |
| 1177 | \& # $2 = 'e' |
| 1178 | \& # $3 = ' programming republic of Perl' |
| 1179 | .Ve |
| 1180 | .PP |
| 1181 | The minimal string that will allow both the start of the string \f(CW\*(C`^\*(C'\fR |
| 1182 | and the alternation to match is \f(CW\*(C`Th\*(C'\fR, with the alternation \f(CW\*(C`e|r\*(C'\fR |
| 1183 | matching \f(CW\*(C`e\*(C'\fR. The second quantifier \f(CW\*(C`.*\*(C'\fR is free to gobble up the |
| 1184 | rest of the string. |
| 1185 | .PP |
| 1186 | .Vb 3 |
| 1187 | \& $x =~ /(m{1,2}?)(.*?)$/; # matches, |
| 1188 | \& # $1 = 'm' |
| 1189 | \& # $2 = 'ming republic of Perl' |
| 1190 | .Ve |
| 1191 | .PP |
| 1192 | The first string position that this regexp can match is at the first |
| 1193 | \&\f(CW'm'\fR in \f(CW\*(C`programming\*(C'\fR. At this position, the minimal \f(CW\*(C`m{1,2}?\*(C'\fR |
| 1194 | matches just one \f(CW'm'\fR. Although the second quantifier \f(CW\*(C`.*?\*(C'\fR would |
| 1195 | prefer to match no characters, it is constrained by the end-of-string |
| 1196 | anchor \f(CW\*(C`$\*(C'\fR to match the rest of the string. |
| 1197 | .PP |
| 1198 | .Vb 4 |
| 1199 | \& $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches, |
| 1200 | \& # $1 = 'The progra' |
| 1201 | \& # $2 = 'm' |
| 1202 | \& # $3 = 'ming republic of Perl' |
| 1203 | .Ve |
| 1204 | .PP |
| 1205 | In this regexp, you might expect the first minimal quantifier \f(CW\*(C`.*?\*(C'\fR |
| 1206 | to match the empty string, because it is not constrained by a \f(CW\*(C`^\*(C'\fR |
| 1207 | anchor to match the beginning of the word. Principle 0 applies here, |
| 1208 | however. Because it is possible for the whole regexp to match at the |
| 1209 | start of the string, it \fIwill\fR match at the start of the string. Thus |
| 1210 | the first quantifier has to match everything up to the first \f(CW\*(C`m\*(C'\fR. The |
| 1211 | second minimal quantifier matches just one \f(CW\*(C`m\*(C'\fR and the third |
| 1212 | quantifier matches the rest of the string. |
| 1213 | .PP |
| 1214 | .Vb 4 |
| 1215 | \& $x =~ /(.??)(m{1,2})(.*)$/; # matches, |
| 1216 | \& # $1 = 'a' |
| 1217 | \& # $2 = 'mm' |
| 1218 | \& # $3 = 'ing republic of Perl' |
| 1219 | .Ve |
| 1220 | .PP |
| 1221 | Just as in the previous regexp, the first quantifier \f(CW\*(C`.??\*(C'\fR can match |
| 1222 | earliest at position \f(CW'a'\fR, so it does. The second quantifier is |
| 1223 | greedy, so it matches \f(CW\*(C`mm\*(C'\fR, and the third matches the rest of the |
| 1224 | string. |
| 1225 | .PP |
| 1226 | We can modify principle 3 above to take into account non-greedy |
| 1227 | quantifiers: |
| 1228 | .IP "\(bu" 4 |
| 1229 | Principle 3: If there are two or more elements in a regexp, the |
| 1230 | leftmost greedy (non\-greedy) quantifier, if any, will match as much |
| 1231 | (little) of the string as possible while still allowing the whole |
| 1232 | regexp to match. The next leftmost greedy (non\-greedy) quantifier, if |
| 1233 | any, will try to match as much (little) of the string remaining |
| 1234 | available to it as possible, while still allowing the whole regexp to |
| 1235 | match. And so on, until all the regexp elements are satisfied. |
| 1236 | .PP |
| 1237 | Just like alternation, quantifiers are also susceptible to |
| 1238 | backtracking. Here is a step-by-step analysis of the example |
| 1239 | .PP |
| 1240 | .Vb 5 |
| 1241 | \& $x = "the cat in the hat"; |
| 1242 | \& $x =~ /^(.*)(at)(.*)$/; # matches, |
| 1243 | \& # $1 = 'the cat in the h' |
| 1244 | \& # $2 = 'at' |
| 1245 | \& # $3 = '' (0 matches) |
| 1246 | .Ve |
| 1247 | .IP "0" 4 |
| 1248 | Start with the first letter in the string 't'. |
| 1249 | .IP "1" 4 |
| 1250 | .IX Item "1" |
| 1251 | The first quantifier '.*' starts out by matching the whole |
| 1252 | string 'the cat in the hat'. |
| 1253 | .IP "2" 4 |
| 1254 | .IX Item "2" |
| 1255 | \&'a' in the regexp element 'at' doesn't match the end of the |
| 1256 | string. Backtrack one character. |
| 1257 | .IP "3" 4 |
| 1258 | .IX Item "3" |
| 1259 | \&'a' in the regexp element 'at' still doesn't match the last |
| 1260 | letter of the string 't', so backtrack one more character. |
| 1261 | .IP "4" 4 |
| 1262 | .IX Item "4" |
| 1263 | Now we can match the 'a' and the 't'. |
| 1264 | .IP "5" 4 |
| 1265 | .IX Item "5" |
| 1266 | Move on to the third element '.*'. Since we are at the end of |
| 1267 | the string and '.*' can match 0 times, assign it the empty string. |
| 1268 | .IP "6" 4 |
| 1269 | .IX Item "6" |
| 1270 | We are done! |
| 1271 | .PP |
| 1272 | Most of the time, all this moving forward and backtracking happens |
| 1273 | quickly and searching is fast. There are some pathological regexps, |
| 1274 | however, whose execution time exponentially grows with the size of the |
| 1275 | string. A typical structure that blows up in your face is of the form |
| 1276 | .PP |
| 1277 | .Vb 1 |
| 1278 | \& /(a|b+)*/; |
| 1279 | .Ve |
| 1280 | .PP |
| 1281 | The problem is the nested indeterminate quantifiers. There are many |
| 1282 | different ways of partitioning a string of length n between the \f(CW\*(C`+\*(C'\fR |
| 1283 | and \f(CW\*(C`*\*(C'\fR: one repetition with \f(CW\*(C`b+\*(C'\fR of length n, two repetitions with |
| 1284 | the first \f(CW\*(C`b+\*(C'\fR length k and the second with length n\-k, m repetitions |
| 1285 | whose bits add up to length n, etc. In fact there are an exponential |
| 1286 | number of ways to partition a string as a function of length. A |
| 1287 | regexp may get lucky and match early in the process, but if there is |
| 1288 | no match, perl will try \fIevery\fR possibility before giving up. So be |
| 1289 | careful with nested \f(CW\*(C`*\*(C'\fR's, \f(CW\*(C`{n,m}\*(C'\fR's, and \f(CW\*(C`+\*(C'\fR's. The book |
| 1290 | \&\fIMastering Regular Expressions\fR by Jeffrey Friedl gives a wonderful |
| 1291 | discussion of this and other efficiency issues. |
| 1292 | .Sh "Building a regexp" |
| 1293 | .IX Subsection "Building a regexp" |
| 1294 | At this point, we have all the basic regexp concepts covered, so let's |
| 1295 | give a more involved example of a regular expression. We will build a |
| 1296 | regexp that matches numbers. |
| 1297 | .PP |
| 1298 | The first task in building a regexp is to decide what we want to match |
| 1299 | and what we want to exclude. In our case, we want to match both |
| 1300 | integers and floating point numbers and we want to reject any string |
| 1301 | that isn't a number. |
| 1302 | .PP |
| 1303 | The next task is to break the problem down into smaller problems that |
| 1304 | are easily converted into a regexp. |
| 1305 | .PP |
| 1306 | The simplest case is integers. These consist of a sequence of digits, |
| 1307 | with an optional sign in front. The digits we can represent with |
| 1308 | \&\f(CW\*(C`\ed+\*(C'\fR and the sign can be matched with \f(CW\*(C`[+\-]\*(C'\fR. Thus the integer |
| 1309 | regexp is |
| 1310 | .PP |
| 1311 | .Vb 1 |
| 1312 | \& /[+-]?\ed+/; # matches integers |
| 1313 | .Ve |
| 1314 | .PP |
| 1315 | A floating point number potentially has a sign, an integral part, a |
| 1316 | decimal point, a fractional part, and an exponent. One or more of these |
| 1317 | parts is optional, so we need to check out the different |
| 1318 | possibilities. Floating point numbers which are in proper form include |
| 1319 | 123., 0.345, .34, \-1e6, and 25.4E\-72. As with integers, the sign out |
| 1320 | front is completely optional and can be matched by \f(CW\*(C`[+\-]?\*(C'\fR. We can |
| 1321 | see that if there is no exponent, floating point numbers must have a |
| 1322 | decimal point, otherwise they are integers. We might be tempted to |
| 1323 | model these with \f(CW\*(C`\ed*\e.\ed*\*(C'\fR, but this would also match just a single |
| 1324 | decimal point, which is not a number. So the three cases of floating |
| 1325 | point number sans exponent are |
| 1326 | .PP |
| 1327 | .Vb 3 |
| 1328 | \& /[+-]?\ed+\e./; # 1., 321., etc. |
| 1329 | \& /[+-]?\e.\ed+/; # .1, .234, etc. |
| 1330 | \& /[+-]?\ed+\e.\ed+/; # 1.0, 30.56, etc. |
| 1331 | .Ve |
| 1332 | .PP |
| 1333 | These can be combined into a single regexp with a three-way alternation: |
| 1334 | .PP |
| 1335 | .Vb 1 |
| 1336 | \& /[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+)/; # floating point, no exponent |
| 1337 | .Ve |
| 1338 | .PP |
| 1339 | In this alternation, it is important to put \f(CW'\ed+\e.\ed+'\fR before |
| 1340 | \&\f(CW'\ed+\e.'\fR. If \f(CW'\ed+\e.'\fR were first, the regexp would happily match that |
| 1341 | and ignore the fractional part of the number. |
| 1342 | .PP |
| 1343 | Now consider floating point numbers with exponents. The key |
| 1344 | observation here is that \fIboth\fR integers and numbers with decimal |
| 1345 | points are allowed in front of an exponent. Then exponents, like the |
| 1346 | overall sign, are independent of whether we are matching numbers with |
| 1347 | or without decimal points, and can be 'decoupled' from the |
| 1348 | mantissa. The overall form of the regexp now becomes clear: |
| 1349 | .PP |
| 1350 | .Vb 1 |
| 1351 | \& /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/; |
| 1352 | .Ve |
| 1353 | .PP |
| 1354 | The exponent is an \f(CW\*(C`e\*(C'\fR or \f(CW\*(C`E\*(C'\fR, followed by an integer. So the |
| 1355 | exponent regexp is |
| 1356 | .PP |
| 1357 | .Vb 1 |
| 1358 | \& /[eE][+-]?\ed+/; # exponent |
| 1359 | .Ve |
| 1360 | .PP |
| 1361 | Putting all the parts together, we get a regexp that matches numbers: |
| 1362 | .PP |
| 1363 | .Vb 1 |
| 1364 | \& /^[+-]?(\ed+\e.\ed+|\ed+\e.|\e.\ed+|\ed+)([eE][+-]?\ed+)?$/; # Ta da! |
| 1365 | .Ve |
| 1366 | .PP |
| 1367 | Long regexps like this may impress your friends, but can be hard to |
| 1368 | decipher. In complex situations like this, the \f(CW\*(C`//x\*(C'\fR modifier for a |
| 1369 | match is invaluable. It allows one to put nearly arbitrary whitespace |
| 1370 | and comments into a regexp without affecting its meaning. Using it, |
| 1371 | we can rewrite our 'extended' regexp in the more pleasing form |
| 1372 | .PP |
| 1373 | .Vb 10 |
| 1374 | \& /^ |
| 1375 | \& [+-]? # first, match an optional sign |
| 1376 | \& ( # then match integers or f.p. mantissas: |
| 1377 | \& \ed+\e.\ed+ # mantissa of the form a.b |
| 1378 | \& |\ed+\e. # mantissa of the form a. |
| 1379 | \& |\e.\ed+ # mantissa of the form .b |
| 1380 | \& |\ed+ # integer of the form a |
| 1381 | \& ) |
| 1382 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent |
| 1383 | \& $/x; |
| 1384 | .Ve |
| 1385 | .PP |
| 1386 | If whitespace is mostly irrelevant, how does one include space |
| 1387 | characters in an extended regexp? The answer is to backslash it |
| 1388 | \&\f(CW'\e\ '\fR\ or put it in a character class \f(CW\*(C`[\ ]\*(C'\fR\ . The same thing |
| 1389 | goes for pound signs, use \f(CW\*(C`\e#\*(C'\fR or \f(CW\*(C`[#]\*(C'\fR. For instance, Perl allows |
| 1390 | a space between the sign and the mantissa/integer, and we could add |
| 1391 | this to our regexp as follows: |
| 1392 | .PP |
| 1393 | .Vb 10 |
| 1394 | \& /^ |
| 1395 | \& [+-]?\e * # first, match an optional sign *and space* |
| 1396 | \& ( # then match integers or f.p. mantissas: |
| 1397 | \& \ed+\e.\ed+ # mantissa of the form a.b |
| 1398 | \& |\ed+\e. # mantissa of the form a. |
| 1399 | \& |\e.\ed+ # mantissa of the form .b |
| 1400 | \& |\ed+ # integer of the form a |
| 1401 | \& ) |
| 1402 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent |
| 1403 | \& $/x; |
| 1404 | .Ve |
| 1405 | .PP |
| 1406 | In this form, it is easier to see a way to simplify the |
| 1407 | alternation. Alternatives 1, 2, and 4 all start with \f(CW\*(C`\ed+\*(C'\fR, so it |
| 1408 | could be factored out: |
| 1409 | .PP |
| 1410 | .Vb 11 |
| 1411 | \& /^ |
| 1412 | \& [+-]?\e * # first, match an optional sign |
| 1413 | \& ( # then match integers or f.p. mantissas: |
| 1414 | \& \ed+ # start out with a ... |
| 1415 | \& ( |
| 1416 | \& \e.\ed* # mantissa of the form a.b or a. |
| 1417 | \& )? # ? takes care of integers of the form a |
| 1418 | \& |\e.\ed+ # mantissa of the form .b |
| 1419 | \& ) |
| 1420 | \& ([eE][+-]?\ed+)? # finally, optionally match an exponent |
| 1421 | \& $/x; |
| 1422 | .Ve |
| 1423 | .PP |
| 1424 | or written in the compact form, |
| 1425 | .PP |
| 1426 | .Vb 1 |
| 1427 | \& /^[+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?$/; |
| 1428 | .Ve |
| 1429 | .PP |
| 1430 | This is our final regexp. To recap, we built a regexp by |
| 1431 | .IP "\(bu" 4 |
| 1432 | specifying the task in detail, |
| 1433 | .IP "\(bu" 4 |
| 1434 | breaking down the problem into smaller parts, |
| 1435 | .IP "\(bu" 4 |
| 1436 | translating the small parts into regexps, |
| 1437 | .IP "\(bu" 4 |
| 1438 | combining the regexps, |
| 1439 | .IP "\(bu" 4 |
| 1440 | and optimizing the final combined regexp. |
| 1441 | .PP |
| 1442 | These are also the typical steps involved in writing a computer |
| 1443 | program. This makes perfect sense, because regular expressions are |
| 1444 | essentially programs written in a little computer language that specifies |
| 1445 | patterns. |
| 1446 | .Sh "Using regular expressions in Perl" |
| 1447 | .IX Subsection "Using regular expressions in Perl" |
| 1448 | The last topic of Part 1 briefly covers how regexps are used in Perl |
| 1449 | programs. Where do they fit into Perl syntax? |
| 1450 | .PP |
| 1451 | We have already introduced the matching operator in its default |
| 1452 | \&\f(CW\*(C`/regexp/\*(C'\fR and arbitrary delimiter \f(CW\*(C`m!regexp!\*(C'\fR forms. We have used |
| 1453 | the binding operator \f(CW\*(C`=~\*(C'\fR and its negation \f(CW\*(C`!~\*(C'\fR to test for string |
| 1454 | matches. Associated with the matching operator, we have discussed the |
| 1455 | single line \f(CW\*(C`//s\*(C'\fR, multi-line \f(CW\*(C`//m\*(C'\fR, case-insensitive \f(CW\*(C`//i\*(C'\fR and |
| 1456 | extended \f(CW\*(C`//x\*(C'\fR modifiers. |
| 1457 | .PP |
| 1458 | There are a few more things you might want to know about matching |
| 1459 | operators. First, we pointed out earlier that variables in regexps are |
| 1460 | substituted before the regexp is evaluated: |
| 1461 | .PP |
| 1462 | .Vb 4 |
| 1463 | \& $pattern = 'Seuss'; |
| 1464 | \& while (<>) { |
| 1465 | \& print if /$pattern/; |
| 1466 | \& } |
| 1467 | .Ve |
| 1468 | .PP |
| 1469 | This will print any lines containing the word \f(CW\*(C`Seuss\*(C'\fR. It is not as |
| 1470 | efficient as it could be, however, because perl has to re-evaluate |
| 1471 | \&\f(CW$pattern\fR each time through the loop. If \f(CW$pattern\fR won't be |
| 1472 | changing over the lifetime of the script, we can add the \f(CW\*(C`//o\*(C'\fR |
| 1473 | modifier, which directs perl to only perform variable substitutions |
| 1474 | once: |
| 1475 | .PP |
| 1476 | .Vb 6 |
| 1477 | \& #!/usr/bin/perl |
| 1478 | \& # Improved simple_grep |
| 1479 | \& $regexp = shift; |
| 1480 | \& while (<>) { |
| 1481 | \& print if /$regexp/o; # a good deal faster |
| 1482 | \& } |
| 1483 | .Ve |
| 1484 | .PP |
| 1485 | If you change \f(CW$pattern\fR after the first substitution happens, perl |
| 1486 | will ignore it. If you don't want any substitutions at all, use the |
| 1487 | special delimiter \f(CW\*(C`m''\*(C'\fR: |
| 1488 | .PP |
| 1489 | .Vb 4 |
| 1490 | \& @pattern = ('Seuss'); |
| 1491 | \& while (<>) { |
| 1492 | \& print if m'@pattern'; # matches literal '@pattern', not 'Seuss' |
| 1493 | \& } |
| 1494 | .Ve |
| 1495 | .PP |
| 1496 | \&\f(CW\*(C`m''\*(C'\fR acts like single quotes on a regexp; all other \f(CW\*(C`m\*(C'\fR delimiters |
| 1497 | act like double quotes. If the regexp evaluates to the empty string, |
| 1498 | the regexp in the \fIlast successful match\fR is used instead. So we have |
| 1499 | .PP |
| 1500 | .Vb 2 |
| 1501 | \& "dog" =~ /d/; # 'd' matches |
| 1502 | \& "dogbert" =~ //; # this matches the 'd' regexp used before |
| 1503 | .Ve |
| 1504 | .PP |
| 1505 | The final two modifiers \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`//c\*(C'\fR concern multiple matches. |
| 1506 | The modifier \f(CW\*(C`//g\*(C'\fR stands for global matching and allows the |
| 1507 | matching operator to match within a string as many times as possible. |
| 1508 | In scalar context, successive invocations against a string will have |
| 1509 | \&\f(CW\*(C`//g\*(C'\fR jump from match to match, keeping track of position in the |
| 1510 | string as it goes along. You can get or set the position with the |
| 1511 | \&\f(CW\*(C`pos()\*(C'\fR function. |
| 1512 | .PP |
| 1513 | The use of \f(CW\*(C`//g\*(C'\fR is shown in the following example. Suppose we have |
| 1514 | a string that consists of words separated by spaces. If we know how |
| 1515 | many words there are in advance, we could extract the words using |
| 1516 | groupings: |
| 1517 | .PP |
| 1518 | .Vb 5 |
| 1519 | \& $x = "cat dog house"; # 3 words |
| 1520 | \& $x =~ /^\es*(\ew+)\es+(\ew+)\es+(\ew+)\es*$/; # matches, |
| 1521 | \& # $1 = 'cat' |
| 1522 | \& # $2 = 'dog' |
| 1523 | \& # $3 = 'house' |
| 1524 | .Ve |
| 1525 | .PP |
| 1526 | But what if we had an indeterminate number of words? This is the sort |
| 1527 | of task \f(CW\*(C`//g\*(C'\fR was made for. To extract all words, form the simple |
| 1528 | regexp \f(CW\*(C`(\ew+)\*(C'\fR and loop over all matches with \f(CW\*(C`/(\ew+)/g\*(C'\fR: |
| 1529 | .PP |
| 1530 | .Vb 3 |
| 1531 | \& while ($x =~ /(\ew+)/g) { |
| 1532 | \& print "Word is $1, ends at position ", pos $x, "\en"; |
| 1533 | \& } |
| 1534 | .Ve |
| 1535 | .PP |
| 1536 | prints |
| 1537 | .PP |
| 1538 | .Vb 3 |
| 1539 | \& Word is cat, ends at position 3 |
| 1540 | \& Word is dog, ends at position 7 |
| 1541 | \& Word is house, ends at position 13 |
| 1542 | .Ve |
| 1543 | .PP |
| 1544 | A failed match or changing the target string resets the position. If |
| 1545 | you don't want the position reset after failure to match, add the |
| 1546 | \&\f(CW\*(C`//c\*(C'\fR modifier, as in \f(CW\*(C`/regexp/gc\*(C'\fR. The current position in the string is |
| 1547 | associated with the string, not the regexp. This means that different |
| 1548 | strings have different positions and their respective positions can be |
| 1549 | set or read independently. |
| 1550 | .PP |
| 1551 | In list context, \f(CW\*(C`//g\*(C'\fR returns a list of matched groupings, or if |
| 1552 | there are no groupings, a list of matches to the whole regexp. So if |
| 1553 | we wanted just the words, we could use |
| 1554 | .PP |
| 1555 | .Vb 4 |
| 1556 | \& @words = ($x =~ /(\ew+)/g); # matches, |
| 1557 | \& # $words[0] = 'cat' |
| 1558 | \& # $words[1] = 'dog' |
| 1559 | \& # $words[2] = 'house' |
| 1560 | .Ve |
| 1561 | .PP |
| 1562 | Closely associated with the \f(CW\*(C`//g\*(C'\fR modifier is the \f(CW\*(C`\eG\*(C'\fR anchor. The |
| 1563 | \&\f(CW\*(C`\eG\*(C'\fR anchor matches at the point where the previous \f(CW\*(C`//g\*(C'\fR match left |
| 1564 | off. \f(CW\*(C`\eG\*(C'\fR allows us to easily do context-sensitive matching: |
| 1565 | .PP |
| 1566 | .Vb 12 |
| 1567 | \& $metric = 1; # use metric units |
| 1568 | \& ... |
| 1569 | \& $x = <FILE>; # read in measurement |
| 1570 | \& $x =~ /^([+-]?\ed+)\es*/g; # get magnitude |
| 1571 | \& $weight = $1; |
| 1572 | \& if ($metric) { # error checking |
| 1573 | \& print "Units error!" unless $x =~ /\eGkg\e./g; |
| 1574 | \& } |
| 1575 | \& else { |
| 1576 | \& print "Units error!" unless $x =~ /\eGlbs\e./g; |
| 1577 | \& } |
| 1578 | \& $x =~ /\eG\es+(widget|sprocket)/g; # continue processing |
| 1579 | .Ve |
| 1580 | .PP |
| 1581 | The combination of \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR allows us to process the string a |
| 1582 | bit at a time and use arbitrary Perl logic to decide what to do next. |
| 1583 | Currently, the \f(CW\*(C`\eG\*(C'\fR anchor is only fully supported when used to anchor |
| 1584 | to the start of the pattern. |
| 1585 | .PP |
| 1586 | \&\f(CW\*(C`\eG\*(C'\fR is also invaluable in processing fixed length records with |
| 1587 | regexps. Suppose we have a snippet of coding region \s-1DNA\s0, encoded as |
| 1588 | base pair letters \f(CW\*(C`ATCGTTGAAT...\*(C'\fR and we want to find all the stop |
| 1589 | codons \f(CW\*(C`TGA\*(C'\fR. In a coding region, codons are 3\-letter sequences, so |
| 1590 | we can think of the \s-1DNA\s0 snippet as a sequence of 3\-letter records. The |
| 1591 | naive regexp |
| 1592 | .PP |
| 1593 | .Vb 3 |
| 1594 | \& # expanded, this is "ATC GTT GAA TGC AAA TGA CAT GAC" |
| 1595 | \& $dna = "ATCGTTGAATGCAAATGACATGAC"; |
| 1596 | \& $dna =~ /TGA/; |
| 1597 | .Ve |
| 1598 | .PP |
| 1599 | doesn't work; it may match a \f(CW\*(C`TGA\*(C'\fR, but there is no guarantee that |
| 1600 | the match is aligned with codon boundaries, e.g., the substring |
| 1601 | \&\f(CW\*(C`GTT\ GAA\*(C'\fR\ gives a match. A better solution is |
| 1602 | .PP |
| 1603 | .Vb 3 |
| 1604 | \& while ($dna =~ /(\ew\ew\ew)*?TGA/g) { # note the minimal *? |
| 1605 | \& print "Got a TGA stop codon at position ", pos $dna, "\en"; |
| 1606 | \& } |
| 1607 | .Ve |
| 1608 | .PP |
| 1609 | which prints |
| 1610 | .PP |
| 1611 | .Vb 2 |
| 1612 | \& Got a TGA stop codon at position 18 |
| 1613 | \& Got a TGA stop codon at position 23 |
| 1614 | .Ve |
| 1615 | .PP |
| 1616 | Position 18 is good, but position 23 is bogus. What happened? |
| 1617 | .PP |
| 1618 | The answer is that our regexp works well until we get past the last |
| 1619 | real match. Then the regexp will fail to match a synchronized \f(CW\*(C`TGA\*(C'\fR |
| 1620 | and start stepping ahead one character position at a time, not what we |
| 1621 | want. The solution is to use \f(CW\*(C`\eG\*(C'\fR to anchor the match to the codon |
| 1622 | alignment: |
| 1623 | .PP |
| 1624 | .Vb 3 |
| 1625 | \& while ($dna =~ /\eG(\ew\ew\ew)*?TGA/g) { |
| 1626 | \& print "Got a TGA stop codon at position ", pos $dna, "\en"; |
| 1627 | \& } |
| 1628 | .Ve |
| 1629 | .PP |
| 1630 | This prints |
| 1631 | .PP |
| 1632 | .Vb 1 |
| 1633 | \& Got a TGA stop codon at position 18 |
| 1634 | .Ve |
| 1635 | .PP |
| 1636 | which is the correct answer. This example illustrates that it is |
| 1637 | important not only to match what is desired, but to reject what is not |
| 1638 | desired. |
| 1639 | .PP |
| 1640 | \&\fBsearch and replace\fR |
| 1641 | .PP |
| 1642 | Regular expressions also play a big role in \fBsearch and replace\fR |
| 1643 | operations in Perl. Search and replace is accomplished with the |
| 1644 | \&\f(CW\*(C`s///\*(C'\fR operator. The general form is |
| 1645 | \&\f(CW\*(C`s/regexp/replacement/modifiers\*(C'\fR, with everything we know about |
| 1646 | regexps and modifiers applying in this case as well. The |
| 1647 | \&\f(CW\*(C`replacement\*(C'\fR is a Perl double quoted string that replaces in the |
| 1648 | string whatever is matched with the \f(CW\*(C`regexp\*(C'\fR. The operator \f(CW\*(C`=~\*(C'\fR is |
| 1649 | also used here to associate a string with \f(CW\*(C`s///\*(C'\fR. If matching |
| 1650 | against \f(CW$_\fR, the \f(CW\*(C`$_\ =~\*(C'\fR\ can be dropped. If there is a match, |
| 1651 | \&\f(CW\*(C`s///\*(C'\fR returns the number of substitutions made, otherwise it returns |
| 1652 | false. Here are a few examples: |
| 1653 | .PP |
| 1654 | .Vb 8 |
| 1655 | \& $x = "Time to feed the cat!"; |
| 1656 | \& $x =~ s/cat/hacker/; # $x contains "Time to feed the hacker!" |
| 1657 | \& if ($x =~ s/^(Time.*hacker)!$/$1 now!/) { |
| 1658 | \& $more_insistent = 1; |
| 1659 | \& } |
| 1660 | \& $y = "'quoted words'"; |
| 1661 | \& $y =~ s/^'(.*)'$/$1/; # strip single quotes, |
| 1662 | \& # $y contains "quoted words" |
| 1663 | .Ve |
| 1664 | .PP |
| 1665 | In the last example, the whole string was matched, but only the part |
| 1666 | inside the single quotes was grouped. With the \f(CW\*(C`s///\*(C'\fR operator, the |
| 1667 | matched variables \f(CW$1\fR, \f(CW$2\fR, etc. are immediately available for use |
| 1668 | in the replacement expression, so we use \f(CW$1\fR to replace the quoted |
| 1669 | string with just what was quoted. With the global modifier, \f(CW\*(C`s///g\*(C'\fR |
| 1670 | will search and replace all occurrences of the regexp in the string: |
| 1671 | .PP |
| 1672 | .Vb 6 |
| 1673 | \& $x = "I batted 4 for 4"; |
| 1674 | \& $x =~ s/4/four/; # doesn't do it all: |
| 1675 | \& # $x contains "I batted four for 4" |
| 1676 | \& $x = "I batted 4 for 4"; |
| 1677 | \& $x =~ s/4/four/g; # does it all: |
| 1678 | \& # $x contains "I batted four for four" |
| 1679 | .Ve |
| 1680 | .PP |
| 1681 | If you prefer 'regex' over 'regexp' in this tutorial, you could use |
| 1682 | the following program to replace it: |
| 1683 | .PP |
| 1684 | .Vb 9 |
| 1685 | \& % cat > simple_replace |
| 1686 | \& #!/usr/bin/perl |
| 1687 | \& $regexp = shift; |
| 1688 | \& $replacement = shift; |
| 1689 | \& while (<>) { |
| 1690 | \& s/$regexp/$replacement/go; |
| 1691 | \& print; |
| 1692 | \& } |
| 1693 | \& ^D |
| 1694 | .Ve |
| 1695 | .PP |
| 1696 | .Vb 1 |
| 1697 | \& % simple_replace regexp regex perlretut.pod |
| 1698 | .Ve |
| 1699 | .PP |
| 1700 | In \f(CW\*(C`simple_replace\*(C'\fR we used the \f(CW\*(C`s///g\*(C'\fR modifier to replace all |
| 1701 | occurrences of the regexp on each line and the \f(CW\*(C`s///o\*(C'\fR modifier to |
| 1702 | compile the regexp only once. As with \f(CW\*(C`simple_grep\*(C'\fR, both the |
| 1703 | \&\f(CW\*(C`print\*(C'\fR and the \f(CW\*(C`s/$regexp/$replacement/go\*(C'\fR use \f(CW$_\fR implicitly. |
| 1704 | .PP |
| 1705 | A modifier available specifically to search and replace is the |
| 1706 | \&\f(CW\*(C`s///e\*(C'\fR evaluation modifier. \f(CW\*(C`s///e\*(C'\fR wraps an \f(CW\*(C`eval{...}\*(C'\fR around |
| 1707 | the replacement string and the evaluated result is substituted for the |
| 1708 | matched substring. \f(CW\*(C`s///e\*(C'\fR is useful if you need to do a bit of |
| 1709 | computation in the process of replacing text. This example counts |
| 1710 | character frequencies in a line: |
| 1711 | .PP |
| 1712 | .Vb 4 |
| 1713 | \& $x = "Bill the cat"; |
| 1714 | \& $x =~ s/(.)/$chars{$1}++;$1/eg; # final $1 replaces char with itself |
| 1715 | \& print "frequency of '$_' is $chars{$_}\en" |
| 1716 | \& foreach (sort {$chars{$b} <=> $chars{$a}} keys %chars); |
| 1717 | .Ve |
| 1718 | .PP |
| 1719 | This prints |
| 1720 | .PP |
| 1721 | .Vb 9 |
| 1722 | \& frequency of ' ' is 2 |
| 1723 | \& frequency of 't' is 2 |
| 1724 | \& frequency of 'l' is 2 |
| 1725 | \& frequency of 'B' is 1 |
| 1726 | \& frequency of 'c' is 1 |
| 1727 | \& frequency of 'e' is 1 |
| 1728 | \& frequency of 'h' is 1 |
| 1729 | \& frequency of 'i' is 1 |
| 1730 | \& frequency of 'a' is 1 |
| 1731 | .Ve |
| 1732 | .PP |
| 1733 | As with the match \f(CW\*(C`m//\*(C'\fR operator, \f(CW\*(C`s///\*(C'\fR can use other delimiters, |
| 1734 | such as \f(CW\*(C`s!!!\*(C'\fR and \f(CW\*(C`s{}{}\*(C'\fR, and even \f(CW\*(C`s{}//\*(C'\fR. If single quotes are |
| 1735 | used \f(CW\*(C`s'''\*(C'\fR, then the regexp and replacement are treated as single |
| 1736 | quoted strings and there are no substitutions. \f(CW\*(C`s///\*(C'\fR in list context |
| 1737 | returns the same thing as in scalar context, i.e., the number of |
| 1738 | matches. |
| 1739 | .PP |
| 1740 | \&\fBThe split operator\fR |
| 1741 | .PP |
| 1742 | The \fB\f(CB\*(C`split\*(C'\fB \fR function can also optionally use a matching operator |
| 1743 | \&\f(CW\*(C`m//\*(C'\fR to split a string. \f(CW\*(C`split /regexp/, string, limit\*(C'\fR splits |
| 1744 | \&\f(CW\*(C`string\*(C'\fR into a list of substrings and returns that list. The regexp |
| 1745 | is used to match the character sequence that the \f(CW\*(C`string\*(C'\fR is split |
| 1746 | with respect to. The \f(CW\*(C`limit\*(C'\fR, if present, constrains splitting into |
| 1747 | no more than \f(CW\*(C`limit\*(C'\fR number of strings. For example, to split a |
| 1748 | string into words, use |
| 1749 | .PP |
| 1750 | .Vb 4 |
| 1751 | \& $x = "Calvin and Hobbes"; |
| 1752 | \& @words = split /\es+/, $x; # $words[0] = 'Calvin' |
| 1753 | \& # $words[1] = 'and' |
| 1754 | \& # $words[2] = 'Hobbes' |
| 1755 | .Ve |
| 1756 | .PP |
| 1757 | If the empty regexp \f(CW\*(C`//\*(C'\fR is used, the regexp always matches and |
| 1758 | the string is split into individual characters. If the regexp has |
| 1759 | groupings, then the list produced contains the matched substrings from the |
| 1760 | groupings as well. For instance, |
| 1761 | .PP |
| 1762 | .Vb 12 |
| 1763 | \& $x = "/usr/bin/perl"; |
| 1764 | \& @dirs = split m!/!, $x; # $dirs[0] = '' |
| 1765 | \& # $dirs[1] = 'usr' |
| 1766 | \& # $dirs[2] = 'bin' |
| 1767 | \& # $dirs[3] = 'perl' |
| 1768 | \& @parts = split m!(/)!, $x; # $parts[0] = '' |
| 1769 | \& # $parts[1] = '/' |
| 1770 | \& # $parts[2] = 'usr' |
| 1771 | \& # $parts[3] = '/' |
| 1772 | \& # $parts[4] = 'bin' |
| 1773 | \& # $parts[5] = '/' |
| 1774 | \& # $parts[6] = 'perl' |
| 1775 | .Ve |
| 1776 | .PP |
| 1777 | Since the first character of \f(CW$x\fR matched the regexp, \f(CW\*(C`split\*(C'\fR prepended |
| 1778 | an empty initial element to the list. |
| 1779 | .PP |
| 1780 | If you have read this far, congratulations! You now have all the basic |
| 1781 | tools needed to use regular expressions to solve a wide range of text |
| 1782 | processing problems. If this is your first time through the tutorial, |
| 1783 | why not stop here and play around with regexps a while... Part\ 2 |
| 1784 | concerns the more esoteric aspects of regular expressions and those |
| 1785 | concepts certainly aren't needed right at the start. |
| 1786 | .SH "Part 2: Power tools" |
| 1787 | .IX Header "Part 2: Power tools" |
| 1788 | \&\s-1OK\s0, you know the basics of regexps and you want to know more. If |
| 1789 | matching regular expressions is analogous to a walk in the woods, then |
| 1790 | the tools discussed in Part 1 are analogous to topo maps and a |
| 1791 | compass, basic tools we use all the time. Most of the tools in part 2 |
| 1792 | are analogous to flare guns and satellite phones. They aren't used |
| 1793 | too often on a hike, but when we are stuck, they can be invaluable. |
| 1794 | .PP |
| 1795 | What follows are the more advanced, less used, or sometimes esoteric |
| 1796 | capabilities of perl regexps. In Part 2, we will assume you are |
| 1797 | comfortable with the basics and concentrate on the new features. |
| 1798 | .Sh "More on characters, strings, and character classes" |
| 1799 | .IX Subsection "More on characters, strings, and character classes" |
| 1800 | There are a number of escape sequences and character classes that we |
| 1801 | haven't covered yet. |
| 1802 | .PP |
| 1803 | There are several escape sequences that convert characters or strings |
| 1804 | between upper and lower case. \f(CW\*(C`\el\*(C'\fR and \f(CW\*(C`\eu\*(C'\fR convert the next |
| 1805 | character to lower or upper case, respectively: |
| 1806 | .PP |
| 1807 | .Vb 4 |
| 1808 | \& $x = "perl"; |
| 1809 | \& $string =~ /\eu$x/; # matches 'Perl' in $string |
| 1810 | \& $x = "M(rs?|s)\e\e."; # note the double backslash |
| 1811 | \& $string =~ /\el$x/; # matches 'mr.', 'mrs.', and 'ms.' |
| 1812 | .Ve |
| 1813 | .PP |
| 1814 | \&\f(CW\*(C`\eL\*(C'\fR and \f(CW\*(C`\eU\*(C'\fR convert a whole substring, delimited by \f(CW\*(C`\eL\*(C'\fR or |
| 1815 | \&\f(CW\*(C`\eU\*(C'\fR and \f(CW\*(C`\eE\*(C'\fR, to lower or upper case: |
| 1816 | .PP |
| 1817 | .Vb 4 |
| 1818 | \& $x = "This word is in lower case:\eL SHOUT\eE"; |
| 1819 | \& $x =~ /shout/; # matches |
| 1820 | \& $x = "I STILL KEYPUNCH CARDS FOR MY 360"; |
| 1821 | \& $x =~ /\eUkeypunch/; # matches punch card string |
| 1822 | .Ve |
| 1823 | .PP |
| 1824 | If there is no \f(CW\*(C`\eE\*(C'\fR, case is converted until the end of the |
| 1825 | string. The regexps \f(CW\*(C`\eL\eu$word\*(C'\fR or \f(CW\*(C`\eu\eL$word\*(C'\fR convert the first |
| 1826 | character of \f(CW$word\fR to uppercase and the rest of the characters to |
| 1827 | lowercase. |
| 1828 | .PP |
| 1829 | Control characters can be escaped with \f(CW\*(C`\ec\*(C'\fR, so that a control-Z |
| 1830 | character would be matched with \f(CW\*(C`\ecZ\*(C'\fR. The escape sequence |
| 1831 | \&\f(CW\*(C`\eQ\*(C'\fR...\f(CW\*(C`\eE\*(C'\fR quotes, or protects most non-alphabetic characters. For |
| 1832 | instance, |
| 1833 | .PP |
| 1834 | .Vb 2 |
| 1835 | \& $x = "\eQThat !^*&%~& cat!"; |
| 1836 | \& $x =~ /\eQ!^*&%~&\eE/; # check for rough language |
| 1837 | .Ve |
| 1838 | .PP |
| 1839 | It does not protect \f(CW\*(C`$\*(C'\fR or \f(CW\*(C`@\*(C'\fR, so that variables can still be |
| 1840 | substituted. |
| 1841 | .PP |
| 1842 | With the advent of 5.6.0, perl regexps can handle more than just the |
| 1843 | standard \s-1ASCII\s0 character set. Perl now supports \fBUnicode\fR, a standard |
| 1844 | for encoding the character sets from many of the world's written |
| 1845 | languages. Unicode does this by allowing characters to be more than |
| 1846 | one byte wide. Perl uses the \s-1UTF\-8\s0 encoding, in which \s-1ASCII\s0 characters |
| 1847 | are still encoded as one byte, but characters greater than \f(CW\*(C`chr(127)\*(C'\fR |
| 1848 | may be stored as two or more bytes. |
| 1849 | .PP |
| 1850 | What does this mean for regexps? Well, regexp users don't need to know |
| 1851 | much about perl's internal representation of strings. But they do need |
| 1852 | to know 1) how to represent Unicode characters in a regexp and 2) when |
| 1853 | a matching operation will treat the string to be searched as a |
| 1854 | sequence of bytes (the old way) or as a sequence of Unicode characters |
| 1855 | (the new way). The answer to 1) is that Unicode characters greater |
| 1856 | than \f(CW\*(C`chr(127)\*(C'\fR may be represented using the \f(CW\*(C`\ex{hex}\*(C'\fR notation, |
| 1857 | with \f(CW\*(C`hex\*(C'\fR a hexadecimal integer: |
| 1858 | .PP |
| 1859 | .Vb 1 |
| 1860 | \& /\ex{263a}/; # match a Unicode smiley face :) |
| 1861 | .Ve |
| 1862 | .PP |
| 1863 | Unicode characters in the range of 128\-255 use two hexadecimal digits |
| 1864 | with braces: \f(CW\*(C`\ex{ab}\*(C'\fR. Note that this is different than \f(CW\*(C`\exab\*(C'\fR, |
| 1865 | which is just a hexadecimal byte with no Unicode significance. |
| 1866 | .PP |
| 1867 | \&\fB\s-1NOTE\s0\fR: in Perl 5.6.0 it used to be that one needed to say \f(CW\*(C`use |
| 1868 | utf8\*(C'\fR to use any Unicode features. This is no longer the case: for |
| 1869 | almost all Unicode processing, the explicit \f(CW\*(C`utf8\*(C'\fR pragma is not |
| 1870 | needed. (The only case where it matters is if your Perl script is in |
| 1871 | Unicode and encoded in \s-1UTF\-8\s0, then an explicit \f(CW\*(C`use utf8\*(C'\fR is needed.) |
| 1872 | .PP |
| 1873 | Figuring out the hexadecimal sequence of a Unicode character you want |
| 1874 | or deciphering someone else's hexadecimal Unicode regexp is about as |
| 1875 | much fun as programming in machine code. So another way to specify |
| 1876 | Unicode characters is to use the \fBnamed\ character\fR\ escape |
| 1877 | sequence \f(CW\*(C`\eN{name}\*(C'\fR. \f(CW\*(C`name\*(C'\fR is a name for the Unicode character, as |
| 1878 | specified in the Unicode standard. For instance, if we wanted to |
| 1879 | represent or match the astrological sign for the planet Mercury, we |
| 1880 | could use |
| 1881 | .PP |
| 1882 | .Vb 3 |
| 1883 | \& use charnames ":full"; # use named chars with Unicode full names |
| 1884 | \& $x = "abc\eN{MERCURY}def"; |
| 1885 | \& $x =~ /\eN{MERCURY}/; # matches |
| 1886 | .Ve |
| 1887 | .PP |
| 1888 | One can also use short names or restrict names to a certain alphabet: |
| 1889 | .PP |
| 1890 | .Vb 2 |
| 1891 | \& use charnames ':full'; |
| 1892 | \& print "\eN{GREEK SMALL LETTER SIGMA} is called sigma.\en"; |
| 1893 | .Ve |
| 1894 | .PP |
| 1895 | .Vb 2 |
| 1896 | \& use charnames ":short"; |
| 1897 | \& print "\eN{greek:Sigma} is an upper-case sigma.\en"; |
| 1898 | .Ve |
| 1899 | .PP |
| 1900 | .Vb 2 |
| 1901 | \& use charnames qw(greek); |
| 1902 | \& print "\eN{sigma} is Greek sigma\en"; |
| 1903 | .Ve |
| 1904 | .PP |
| 1905 | A list of full names is found in the file Names.txt in the |
| 1906 | lib/perl5/5.X.X/unicore directory. |
| 1907 | .PP |
| 1908 | The answer to requirement 2), as of 5.6.0, is that if a regexp |
| 1909 | contains Unicode characters, the string is searched as a sequence of |
| 1910 | Unicode characters. Otherwise, the string is searched as a sequence of |
| 1911 | bytes. If the string is being searched as a sequence of Unicode |
| 1912 | characters, but matching a single byte is required, we can use the \f(CW\*(C`\eC\*(C'\fR |
| 1913 | escape sequence. \f(CW\*(C`\eC\*(C'\fR is a character class akin to \f(CW\*(C`.\*(C'\fR except that |
| 1914 | it matches \fIany\fR byte 0\-255. So |
| 1915 | .PP |
| 1916 | .Vb 7 |
| 1917 | \& use charnames ":full"; # use named chars with Unicode full names |
| 1918 | \& $x = "a"; |
| 1919 | \& $x =~ /\eC/; # matches 'a', eats one byte |
| 1920 | \& $x = ""; |
| 1921 | \& $x =~ /\eC/; # doesn't match, no bytes to match |
| 1922 | \& $x = "\eN{MERCURY}"; # three-byte UTF-8 character |
| 1923 | \& $x =~ /\eC/; # matches, but dangerous! |
| 1924 | .Ve |
| 1925 | .PP |
| 1926 | The last regexp matches, but is dangerous because the string |
| 1927 | \&\fIcharacter\fR position is no longer synchronized to the string \fIbyte\fR |
| 1928 | position. This generates the warning 'Malformed \s-1UTF\-8\s0 |
| 1929 | character'. The \f(CW\*(C`\eC\*(C'\fR is best used for matching the binary data in strings |
| 1930 | with binary data intermixed with Unicode characters. |
| 1931 | .PP |
| 1932 | Let us now discuss the rest of the character classes. Just as with |
| 1933 | Unicode characters, there are named Unicode character classes |
| 1934 | represented by the \f(CW\*(C`\ep{name}\*(C'\fR escape sequence. Closely associated is |
| 1935 | the \f(CW\*(C`\eP{name}\*(C'\fR character class, which is the negation of the |
| 1936 | \&\f(CW\*(C`\ep{name}\*(C'\fR class. For example, to match lower and uppercase |
| 1937 | characters, |
| 1938 | .PP |
| 1939 | .Vb 6 |
| 1940 | \& use charnames ":full"; # use named chars with Unicode full names |
| 1941 | \& $x = "BOB"; |
| 1942 | \& $x =~ /^\ep{IsUpper}/; # matches, uppercase char class |
| 1943 | \& $x =~ /^\eP{IsUpper}/; # doesn't match, char class sans uppercase |
| 1944 | \& $x =~ /^\ep{IsLower}/; # doesn't match, lowercase char class |
| 1945 | \& $x =~ /^\eP{IsLower}/; # matches, char class sans lowercase |
| 1946 | .Ve |
| 1947 | .PP |
| 1948 | Here is the association between some Perl named classes and the |
| 1949 | traditional Unicode classes: |
| 1950 | .PP |
| 1951 | .Vb 1 |
| 1952 | \& Perl class name Unicode class name or regular expression |
| 1953 | .Ve |
| 1954 | .PP |
| 1955 | .Vb 15 |
| 1956 | \& IsAlpha /^[LM]/ |
| 1957 | \& IsAlnum /^[LMN]/ |
| 1958 | \& IsASCII $code <= 127 |
| 1959 | \& IsCntrl /^C/ |
| 1960 | \& IsBlank $code =~ /^(0020|0009)$/ || /^Z[^lp]/ |
| 1961 | \& IsDigit Nd |
| 1962 | \& IsGraph /^([LMNPS]|Co)/ |
| 1963 | \& IsLower Ll |
| 1964 | \& IsPrint /^([LMNPS]|Co|Zs)/ |
| 1965 | \& IsPunct /^P/ |
| 1966 | \& IsSpace /^Z/ || ($code =~ /^(0009|000A|000B|000C|000D)$/) |
| 1967 | \& IsSpacePerl /^Z/ || ($code =~ /^(0009|000A|000C|000D|0085|2028|2029)$/) |
| 1968 | \& IsUpper /^L[ut]/ |
| 1969 | \& IsWord /^[LMN]/ || $code eq "005F" |
| 1970 | \& IsXDigit $code =~ /^00(3[0-9]|[46][1-6])$/ |
| 1971 | .Ve |
| 1972 | .PP |
| 1973 | You can also use the official Unicode class names with the \f(CW\*(C`\ep\*(C'\fR and |
| 1974 | \&\f(CW\*(C`\eP\*(C'\fR, like \f(CW\*(C`\ep{L}\*(C'\fR for Unicode 'letters', or \f(CW\*(C`\ep{Lu}\*(C'\fR for uppercase |
| 1975 | letters, or \f(CW\*(C`\eP{Nd}\*(C'\fR for non\-digits. If a \f(CW\*(C`name\*(C'\fR is just one |
| 1976 | letter, the braces can be dropped. For instance, \f(CW\*(C`\epM\*(C'\fR is the |
| 1977 | character class of Unicode 'marks', for example accent marks. |
| 1978 | For the full list see perlunicode. |
| 1979 | .PP |
| 1980 | The Unicode character set has also been separated into various sets of characters |
| 1981 | which you can test with \f(CW\*(C`\ep{In...}\*(C'\fR (in) and \f(CW\*(C`\eP{In...}\*(C'\fR (not in), |
| 1982 | for example \f(CW\*(C`\ep{Latin}\*(C'\fR, \f(CW\*(C`\ep{Greek}\*(C'\fR, or \f(CW\*(C`\eP{Katakana}\*(C'\fR. |
| 1983 | For the full list see perlunicode. |
| 1984 | .PP |
| 1985 | \&\f(CW\*(C`\eX\*(C'\fR is an abbreviation for a character class sequence that includes |
| 1986 | the Unicode 'combining character sequences'. A 'combining character |
| 1987 | sequence' is a base character followed by any number of combining |
| 1988 | characters. An example of a combining character is an accent. Using |
| 1989 | the Unicode full names, e.g., \f(CW\*(C`A\ +\ COMBINING\ RING\*(C'\fR\ is a combining |
| 1990 | character sequence with base character \f(CW\*(C`A\*(C'\fR and combining character |
| 1991 | \&\f(CW\*(C`COMBINING\ RING\*(C'\fR\ , which translates in Danish to A with the circle |
| 1992 | atop it, as in the word Angstrom. \f(CW\*(C`\eX\*(C'\fR is equivalent to \f(CW\*(C`\ePM\epM*\*(C'\fR, |
| 1993 | i.e., a non-mark followed by zero or more marks. |
| 1994 | .PP |
| 1995 | For the full and latest information about Unicode see the latest |
| 1996 | Unicode standard, or the Unicode Consortium's website http://www.unicode.org/ |
| 1997 | .PP |
| 1998 | As if all those classes weren't enough, Perl also defines \s-1POSIX\s0 style |
| 1999 | character classes. These have the form \f(CW\*(C`[:name:]\*(C'\fR, with \f(CW\*(C`name\*(C'\fR the |
| 2000 | name of the \s-1POSIX\s0 class. The \s-1POSIX\s0 classes are \f(CW\*(C`alpha\*(C'\fR, \f(CW\*(C`alnum\*(C'\fR, |
| 2001 | \&\f(CW\*(C`ascii\*(C'\fR, \f(CW\*(C`cntrl\*(C'\fR, \f(CW\*(C`digit\*(C'\fR, \f(CW\*(C`graph\*(C'\fR, \f(CW\*(C`lower\*(C'\fR, \f(CW\*(C`print\*(C'\fR, \f(CW\*(C`punct\*(C'\fR, |
| 2002 | \&\f(CW\*(C`space\*(C'\fR, \f(CW\*(C`upper\*(C'\fR, and \f(CW\*(C`xdigit\*(C'\fR, and two extensions, \f(CW\*(C`word\*(C'\fR (a Perl |
| 2003 | extension to match \f(CW\*(C`\ew\*(C'\fR), and \f(CW\*(C`blank\*(C'\fR (a \s-1GNU\s0 extension). If \f(CW\*(C`utf8\*(C'\fR |
| 2004 | is being used, then these classes are defined the same as their |
| 2005 | corresponding perl Unicode classes: \f(CW\*(C`[:upper:]\*(C'\fR is the same as |
| 2006 | \&\f(CW\*(C`\ep{IsUpper}\*(C'\fR, etc. The \s-1POSIX\s0 character classes, however, don't |
| 2007 | require using \f(CW\*(C`utf8\*(C'\fR. The \f(CW\*(C`[:digit:]\*(C'\fR, \f(CW\*(C`[:word:]\*(C'\fR, and |
| 2008 | \&\f(CW\*(C`[:space:]\*(C'\fR correspond to the familiar \f(CW\*(C`\ed\*(C'\fR, \f(CW\*(C`\ew\*(C'\fR, and \f(CW\*(C`\es\*(C'\fR |
| 2009 | character classes. To negate a \s-1POSIX\s0 class, put a \f(CW\*(C`^\*(C'\fR in front of |
| 2010 | the name, so that, e.g., \f(CW\*(C`[:^digit:]\*(C'\fR corresponds to \f(CW\*(C`\eD\*(C'\fR and, under |
| 2011 | \&\f(CW\*(C`utf8\*(C'\fR, to \f(CW\*(C`\eP{IsDigit}\*(C'\fR. The Unicode and \s-1POSIX\s0 character classes can |
| 2012 | be used just like \f(CW\*(C`\ed\*(C'\fR, with the exception that \s-1POSIX\s0 character |
| 2013 | classes can only be used inside of a character class: |
| 2014 | .PP |
| 2015 | .Vb 7 |
| 2016 | \& /\es+[abc[:digit:]xyz]\es*/; # match a,b,c,x,y,z, or a digit |
| 2017 | \& /^=item\es[[:digit:]]/; # match '=item', |
| 2018 | \& # followed by a space and a digit |
| 2019 | \& use charnames ":full"; |
| 2020 | \& /\es+[abc\ep{IsDigit}xyz]\es+/; # match a,b,c,x,y,z, or a digit |
| 2021 | \& /^=item\es\ep{IsDigit}/; # match '=item', |
| 2022 | \& # followed by a space and a digit |
| 2023 | .Ve |
| 2024 | .PP |
| 2025 | Whew! That is all the rest of the characters and character classes. |
| 2026 | .Sh "Compiling and saving regular expressions" |
| 2027 | .IX Subsection "Compiling and saving regular expressions" |
| 2028 | In Part 1 we discussed the \f(CW\*(C`//o\*(C'\fR modifier, which compiles a regexp |
| 2029 | just once. This suggests that a compiled regexp is some data structure |
| 2030 | that can be stored once and used again and again. The regexp quote |
| 2031 | \&\f(CW\*(C`qr//\*(C'\fR does exactly that: \f(CW\*(C`qr/string/\*(C'\fR compiles the \f(CW\*(C`string\*(C'\fR as a |
| 2032 | regexp and transforms the result into a form that can be assigned to a |
| 2033 | variable: |
| 2034 | .PP |
| 2035 | .Vb 1 |
| 2036 | \& $reg = qr/foo+bar?/; # reg contains a compiled regexp |
| 2037 | .Ve |
| 2038 | .PP |
| 2039 | Then \f(CW$reg\fR can be used as a regexp: |
| 2040 | .PP |
| 2041 | .Vb 3 |
| 2042 | \& $x = "fooooba"; |
| 2043 | \& $x =~ $reg; # matches, just like /foo+bar?/ |
| 2044 | \& $x =~ /$reg/; # same thing, alternate form |
| 2045 | .Ve |
| 2046 | .PP |
| 2047 | \&\f(CW$reg\fR can also be interpolated into a larger regexp: |
| 2048 | .PP |
| 2049 | .Vb 1 |
| 2050 | \& $x =~ /(abc)?$reg/; # still matches |
| 2051 | .Ve |
| 2052 | .PP |
| 2053 | As with the matching operator, the regexp quote can use different |
| 2054 | delimiters, e.g., \f(CW\*(C`qr!!\*(C'\fR, \f(CW\*(C`qr{}\*(C'\fR and \f(CW\*(C`qr~~\*(C'\fR. The single quote |
| 2055 | delimiters \f(CW\*(C`qr''\*(C'\fR prevent any interpolation from taking place. |
| 2056 | .PP |
| 2057 | Pre-compiled regexps are useful for creating dynamic matches that |
| 2058 | don't need to be recompiled each time they are encountered. Using |
| 2059 | pre-compiled regexps, the \f(CW\*(C`simple_grep\*(C'\fR program can be expanded into a |
| 2060 | program that matches multiple patterns: |
| 2061 | .PP |
| 2062 | .Vb 4 |
| 2063 | \& % cat > multi_grep |
| 2064 | \& #!/usr/bin/perl |
| 2065 | \& # multi_grep - match any of <number> regexps |
| 2066 | \& # usage: multi_grep <number> regexp1 regexp2 ... file1 file2 ... |
| 2067 | .Ve |
| 2068 | .PP |
| 2069 | .Vb 12 |
| 2070 | \& $number = shift; |
| 2071 | \& $regexp[$_] = shift foreach (0..$number-1); |
| 2072 | \& @compiled = map qr/$_/, @regexp; |
| 2073 | \& while ($line = <>) { |
| 2074 | \& foreach $pattern (@compiled) { |
| 2075 | \& if ($line =~ /$pattern/) { |
| 2076 | \& print $line; |
| 2077 | \& last; # we matched, so move onto the next line |
| 2078 | \& } |
| 2079 | \& } |
| 2080 | \& } |
| 2081 | \& ^D |
| 2082 | .Ve |
| 2083 | .PP |
| 2084 | .Vb 4 |
| 2085 | \& % multi_grep 2 last for multi_grep |
| 2086 | \& $regexp[$_] = shift foreach (0..$number-1); |
| 2087 | \& foreach $pattern (@compiled) { |
| 2088 | \& last; |
| 2089 | .Ve |
| 2090 | .PP |
| 2091 | Storing pre-compiled regexps in an array \f(CW@compiled\fR allows us to |
| 2092 | simply loop through the regexps without any recompilation, thus gaining |
| 2093 | flexibility without sacrificing speed. |
| 2094 | .Sh "Embedding comments and modifiers in a regular expression" |
| 2095 | .IX Subsection "Embedding comments and modifiers in a regular expression" |
| 2096 | Starting with this section, we will be discussing Perl's set of |
| 2097 | \&\fBextended patterns\fR. These are extensions to the traditional regular |
| 2098 | expression syntax that provide powerful new tools for pattern |
| 2099 | matching. We have already seen extensions in the form of the minimal |
| 2100 | matching constructs \f(CW\*(C`??\*(C'\fR, \f(CW\*(C`*?\*(C'\fR, \f(CW\*(C`+?\*(C'\fR, \f(CW\*(C`{n,m}?\*(C'\fR, and \f(CW\*(C`{n,}?\*(C'\fR. The |
| 2101 | rest of the extensions below have the form \f(CW\*(C`(?char...)\*(C'\fR, where the |
| 2102 | \&\f(CW\*(C`char\*(C'\fR is a character that determines the type of extension. |
| 2103 | .PP |
| 2104 | The first extension is an embedded comment \f(CW\*(C`(?#text)\*(C'\fR. This embeds a |
| 2105 | comment into the regular expression without affecting its meaning. The |
| 2106 | comment should not have any closing parentheses in the text. An |
| 2107 | example is |
| 2108 | .PP |
| 2109 | .Vb 1 |
| 2110 | \& /(?# Match an integer:)[+-]?\ed+/; |
| 2111 | .Ve |
| 2112 | .PP |
| 2113 | This style of commenting has been largely superseded by the raw, |
| 2114 | freeform commenting that is allowed with the \f(CW\*(C`//x\*(C'\fR modifier. |
| 2115 | .PP |
| 2116 | The modifiers \f(CW\*(C`//i\*(C'\fR, \f(CW\*(C`//m\*(C'\fR, \f(CW\*(C`//s\*(C'\fR, and \f(CW\*(C`//x\*(C'\fR can also be embedded in |
| 2117 | a regexp using \f(CW\*(C`(?i)\*(C'\fR, \f(CW\*(C`(?m)\*(C'\fR, \f(CW\*(C`(?s)\*(C'\fR, and \f(CW\*(C`(?x)\*(C'\fR. For instance, |
| 2118 | .PP |
| 2119 | .Vb 7 |
| 2120 | \& /(?i)yes/; # match 'yes' case insensitively |
| 2121 | \& /yes/i; # same thing |
| 2122 | \& /(?x)( # freeform version of an integer regexp |
| 2123 | \& [+-]? # match an optional sign |
| 2124 | \& \ed+ # match a sequence of digits |
| 2125 | \& ) |
| 2126 | \& /x; |
| 2127 | .Ve |
| 2128 | .PP |
| 2129 | Embedded modifiers can have two important advantages over the usual |
| 2130 | modifiers. Embedded modifiers allow a custom set of modifiers for |
| 2131 | \&\fIeach\fR regexp pattern. This is great for matching an array of regexps |
| 2132 | that must have different modifiers: |
| 2133 | .PP |
| 2134 | .Vb 8 |
| 2135 | \& $pattern[0] = '(?i)doctor'; |
| 2136 | \& $pattern[1] = 'Johnson'; |
| 2137 | \& ... |
| 2138 | \& while (<>) { |
| 2139 | \& foreach $patt (@pattern) { |
| 2140 | \& print if /$patt/; |
| 2141 | \& } |
| 2142 | \& } |
| 2143 | .Ve |
| 2144 | .PP |
| 2145 | The second advantage is that embedded modifiers only affect the regexp |
| 2146 | inside the group the embedded modifier is contained in. So grouping |
| 2147 | can be used to localize the modifier's effects: |
| 2148 | .PP |
| 2149 | .Vb 1 |
| 2150 | \& /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc. |
| 2151 | .Ve |
| 2152 | .PP |
| 2153 | Embedded modifiers can also turn off any modifiers already present |
| 2154 | by using, e.g., \f(CW\*(C`(?\-i)\*(C'\fR. Modifiers can also be combined into |
| 2155 | a single expression, e.g., \f(CW\*(C`(?s\-i)\*(C'\fR turns on single line mode and |
| 2156 | turns off case insensitivity. |
| 2157 | .Sh "Non-capturing groupings" |
| 2158 | .IX Subsection "Non-capturing groupings" |
| 2159 | We noted in Part 1 that groupings \f(CW\*(C`()\*(C'\fR had two distinct functions: 1) |
| 2160 | group regexp elements together as a single unit, and 2) extract, or |
| 2161 | capture, substrings that matched the regexp in the |
| 2162 | grouping. Non-capturing groupings, denoted by \f(CW\*(C`(?:regexp)\*(C'\fR, allow the |
| 2163 | regexp to be treated as a single unit, but don't extract substrings or |
| 2164 | set matching variables \f(CW$1\fR, etc. Both capturing and non-capturing |
| 2165 | groupings are allowed to co-exist in the same regexp. Because there is |
| 2166 | no extraction, non-capturing groupings are faster than capturing |
| 2167 | groupings. Non-capturing groupings are also handy for choosing exactly |
| 2168 | which parts of a regexp are to be extracted to matching variables: |
| 2169 | .PP |
| 2170 | .Vb 2 |
| 2171 | \& # match a number, $1-$4 are set, but we only want $1 |
| 2172 | \& /([+-]?\e *(\ed+(\e.\ed*)?|\e.\ed+)([eE][+-]?\ed+)?)/; |
| 2173 | .Ve |
| 2174 | .PP |
| 2175 | .Vb 2 |
| 2176 | \& # match a number faster, only $1 is set |
| 2177 | \& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE][+-]?\ed+)?)/; |
| 2178 | .Ve |
| 2179 | .PP |
| 2180 | .Vb 2 |
| 2181 | \& # match a number, get $1 = whole number, $2 = exponent |
| 2182 | \& /([+-]?\e *(?:\ed+(?:\e.\ed*)?|\e.\ed+)(?:[eE]([+-]?\ed+))?)/; |
| 2183 | .Ve |
| 2184 | .PP |
| 2185 | Non-capturing groupings are also useful for removing nuisance |
| 2186 | elements gathered from a split operation: |
| 2187 | .PP |
| 2188 | .Vb 3 |
| 2189 | \& $x = '12a34b5'; |
| 2190 | \& @num = split /(a|b)/, $x; # @num = ('12','a','34','b','5') |
| 2191 | \& @num = split /(?:a|b)/, $x; # @num = ('12','34','5') |
| 2192 | .Ve |
| 2193 | .PP |
| 2194 | Non-capturing groupings may also have embedded modifiers: |
| 2195 | \&\f(CW\*(C`(?i\-m:regexp)\*(C'\fR is a non-capturing grouping that matches \f(CW\*(C`regexp\*(C'\fR |
| 2196 | case insensitively and turns off multi-line mode. |
| 2197 | .Sh "Looking ahead and looking behind" |
| 2198 | .IX Subsection "Looking ahead and looking behind" |
| 2199 | This section concerns the lookahead and lookbehind assertions. First, |
| 2200 | a little background. |
| 2201 | .PP |
| 2202 | In Perl regular expressions, most regexp elements 'eat up' a certain |
| 2203 | amount of string when they match. For instance, the regexp element |
| 2204 | \&\f(CW\*(C`[abc]\*(C'\fR eats up one character of the string when it matches, in the |
| 2205 | sense that perl moves to the next character position in the string |
| 2206 | after the match. There are some elements, however, that don't eat up |
| 2207 | characters (advance the character position) if they match. The examples |
| 2208 | we have seen so far are the anchors. The anchor \f(CW\*(C`^\*(C'\fR matches the |
| 2209 | beginning of the line, but doesn't eat any characters. Similarly, the |
| 2210 | word boundary anchor \f(CW\*(C`\eb\*(C'\fR matches, e.g., if the character to the left |
| 2211 | is a word character and the character to the right is a non-word |
| 2212 | character, but it doesn't eat up any characters itself. Anchors are |
| 2213 | examples of 'zero\-width assertions'. Zero\-width, because they consume |
| 2214 | no characters, and assertions, because they test some property of the |
| 2215 | string. In the context of our walk in the woods analogy to regexp |
| 2216 | matching, most regexp elements move us along a trail, but anchors have |
| 2217 | us stop a moment and check our surroundings. If the local environment |
| 2218 | checks out, we can proceed forward. But if the local environment |
| 2219 | doesn't satisfy us, we must backtrack. |
| 2220 | .PP |
| 2221 | Checking the environment entails either looking ahead on the trail, |
| 2222 | looking behind, or both. \f(CW\*(C`^\*(C'\fR looks behind, to see that there are no |
| 2223 | characters before. \f(CW\*(C`$\*(C'\fR looks ahead, to see that there are no |
| 2224 | characters after. \f(CW\*(C`\eb\*(C'\fR looks both ahead and behind, to see if the |
| 2225 | characters on either side differ in their 'word'\-ness. |
| 2226 | .PP |
| 2227 | The lookahead and lookbehind assertions are generalizations of the |
| 2228 | anchor concept. Lookahead and lookbehind are zero-width assertions |
| 2229 | that let us specify which characters we want to test for. The |
| 2230 | lookahead assertion is denoted by \f(CW\*(C`(?=regexp)\*(C'\fR and the lookbehind |
| 2231 | assertion is denoted by \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR. Some examples are |
| 2232 | .PP |
| 2233 | .Vb 8 |
| 2234 | \& $x = "I catch the housecat 'Tom-cat' with catnip"; |
| 2235 | \& $x =~ /cat(?=\es+)/; # matches 'cat' in 'housecat' |
| 2236 | \& @catwords = ($x =~ /(?<=\es)cat\ew+/g); # matches, |
| 2237 | \& # $catwords[0] = 'catch' |
| 2238 | \& # $catwords[1] = 'catnip' |
| 2239 | \& $x =~ /\ebcat\eb/; # matches 'cat' in 'Tom-cat' |
| 2240 | \& $x =~ /(?<=\es)cat(?=\es)/; # doesn't match; no isolated 'cat' in |
| 2241 | \& # middle of $x |
| 2242 | .Ve |
| 2243 | .PP |
| 2244 | Note that the parentheses in \f(CW\*(C`(?=regexp)\*(C'\fR and \f(CW\*(C`(?<=regexp)\*(C'\fR are |
| 2245 | non\-capturing, since these are zero-width assertions. Thus in the |
| 2246 | second regexp, the substrings captured are those of the whole regexp |
| 2247 | itself. Lookahead \f(CW\*(C`(?=regexp)\*(C'\fR can match arbitrary regexps, but |
| 2248 | lookbehind \f(CW\*(C`(?<=fixed\-regexp)\*(C'\fR only works for regexps of fixed |
| 2249 | width, i.e., a fixed number of characters long. Thus |
| 2250 | \&\f(CW\*(C`(?<=(ab|bc))\*(C'\fR is fine, but \f(CW\*(C`(?<=(ab)*)\*(C'\fR is not. The |
| 2251 | negated versions of the lookahead and lookbehind assertions are |
| 2252 | denoted by \f(CW\*(C`(?!regexp)\*(C'\fR and \f(CW\*(C`(?<!fixed\-regexp)\*(C'\fR respectively. |
| 2253 | They evaluate true if the regexps do \fInot\fR match: |
| 2254 | .PP |
| 2255 | .Vb 4 |
| 2256 | \& $x = "foobar"; |
| 2257 | \& $x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo' |
| 2258 | \& $x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo' |
| 2259 | \& $x =~ /(?<!\es)foo/; # matches, there is no \es before 'foo' |
| 2260 | .Ve |
| 2261 | .PP |
| 2262 | The \f(CW\*(C`\eC\*(C'\fR is unsupported in lookbehind, because the already |
| 2263 | treacherous definition of \f(CW\*(C`\eC\*(C'\fR would become even more so |
| 2264 | when going backwards. |
| 2265 | .Sh "Using independent subexpressions to prevent backtracking" |
| 2266 | .IX Subsection "Using independent subexpressions to prevent backtracking" |
| 2267 | The last few extended patterns in this tutorial are experimental as of |
| 2268 | 5.6.0. Play with them, use them in some code, but don't rely on them |
| 2269 | just yet for production code. |
| 2270 | .PP |
| 2271 | \&\fBIndependent\ subexpressions\fR\ are regular expressions, in the |
| 2272 | context of a larger regular expression, that function independently of |
| 2273 | the larger regular expression. That is, they consume as much or as |
| 2274 | little of the string as they wish without regard for the ability of |
| 2275 | the larger regexp to match. Independent subexpressions are represented |
| 2276 | by \f(CW\*(C`(?>regexp)\*(C'\fR. We can illustrate their behavior by first |
| 2277 | considering an ordinary regexp: |
| 2278 | .PP |
| 2279 | .Vb 2 |
| 2280 | \& $x = "ab"; |
| 2281 | \& $x =~ /a*ab/; # matches |
| 2282 | .Ve |
| 2283 | .PP |
| 2284 | This obviously matches, but in the process of matching, the |
| 2285 | subexpression \f(CW\*(C`a*\*(C'\fR first grabbed the \f(CW\*(C`a\*(C'\fR. Doing so, however, |
| 2286 | wouldn't allow the whole regexp to match, so after backtracking, \f(CW\*(C`a*\*(C'\fR |
| 2287 | eventually gave back the \f(CW\*(C`a\*(C'\fR and matched the empty string. Here, what |
| 2288 | \&\f(CW\*(C`a*\*(C'\fR matched was \fIdependent\fR on what the rest of the regexp matched. |
| 2289 | .PP |
| 2290 | Contrast that with an independent subexpression: |
| 2291 | .PP |
| 2292 | .Vb 1 |
| 2293 | \& $x =~ /(?>a*)ab/; # doesn't match! |
| 2294 | .Ve |
| 2295 | .PP |
| 2296 | The independent subexpression \f(CW\*(C`(?>a*)\*(C'\fR doesn't care about the rest |
| 2297 | of the regexp, so it sees an \f(CW\*(C`a\*(C'\fR and grabs it. Then the rest of the |
| 2298 | regexp \f(CW\*(C`ab\*(C'\fR cannot match. Because \f(CW\*(C`(?>a*)\*(C'\fR is independent, there |
| 2299 | is no backtracking and the independent subexpression does not give |
| 2300 | up its \f(CW\*(C`a\*(C'\fR. Thus the match of the regexp as a whole fails. A similar |
| 2301 | behavior occurs with completely independent regexps: |
| 2302 | .PP |
| 2303 | .Vb 3 |
| 2304 | \& $x = "ab"; |
| 2305 | \& $x =~ /a*/g; # matches, eats an 'a' |
| 2306 | \& $x =~ /\eGab/g; # doesn't match, no 'a' available |
| 2307 | .Ve |
| 2308 | .PP |
| 2309 | Here \f(CW\*(C`//g\*(C'\fR and \f(CW\*(C`\eG\*(C'\fR create a 'tag team' handoff of the string from |
| 2310 | one regexp to the other. Regexps with an independent subexpression are |
| 2311 | much like this, with a handoff of the string to the independent |
| 2312 | subexpression, and a handoff of the string back to the enclosing |
| 2313 | regexp. |
| 2314 | .PP |
| 2315 | The ability of an independent subexpression to prevent backtracking |
| 2316 | can be quite useful. Suppose we want to match a non-empty string |
| 2317 | enclosed in parentheses up to two levels deep. Then the following |
| 2318 | regexp matches: |
| 2319 | .PP |
| 2320 | .Vb 2 |
| 2321 | \& $x = "abc(de(fg)h"; # unbalanced parentheses |
| 2322 | \& $x =~ /\e( ( [^()]+ | \e([^()]*\e) )+ \e)/x; |
| 2323 | .Ve |
| 2324 | .PP |
| 2325 | The regexp matches an open parenthesis, one or more copies of an |
| 2326 | alternation, and a close parenthesis. The alternation is two\-way, with |
| 2327 | the first alternative \f(CW\*(C`[^()]+\*(C'\fR matching a substring with no |
| 2328 | parentheses and the second alternative \f(CW\*(C`\e([^()]*\e)\*(C'\fR matching a |
| 2329 | substring delimited by parentheses. The problem with this regexp is |
| 2330 | that it is pathological: it has nested indeterminate quantifiers |
| 2331 | of the form \f(CW\*(C`(a+|b)+\*(C'\fR. We discussed in Part 1 how nested quantifiers |
| 2332 | like this could take an exponentially long time to execute if there |
| 2333 | was no match possible. To prevent the exponential blowup, we need to |
| 2334 | prevent useless backtracking at some point. This can be done by |
| 2335 | enclosing the inner quantifier as an independent subexpression: |
| 2336 | .PP |
| 2337 | .Vb 1 |
| 2338 | \& $x =~ /\e( ( (?>[^()]+) | \e([^()]*\e) )+ \e)/x; |
| 2339 | .Ve |
| 2340 | .PP |
| 2341 | Here, \f(CW\*(C`(?>[^()]+)\*(C'\fR breaks the degeneracy of string partitioning |
| 2342 | by gobbling up as much of the string as possible and keeping it. Then |
| 2343 | match failures fail much more quickly. |
| 2344 | .Sh "Conditional expressions" |
| 2345 | .IX Subsection "Conditional expressions" |
| 2346 | A \fBconditional\ expression\fR\ is a form of if-then-else statement |
| 2347 | that allows one to choose which patterns are to be matched, based on |
| 2348 | some condition. There are two types of conditional expression: |
| 2349 | \&\f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR and |
| 2350 | \&\f(CW\*(C`(?(condition)yes\-regexp|no\-regexp)\*(C'\fR. \f(CW\*(C`(?(condition)yes\-regexp)\*(C'\fR is |
| 2351 | like an \f(CW'if\ ()\ {}'\fR\ statement in Perl. If the \f(CW\*(C`condition\*(C'\fR is true, |
| 2352 | the \f(CW\*(C`yes\-regexp\*(C'\fR will be matched. If the \f(CW\*(C`condition\*(C'\fR is false, the |
| 2353 | \&\f(CW\*(C`yes\-regexp\*(C'\fR will be skipped and perl will move on to the next regexp |
| 2354 | element. The second form is like an \f(CW'if\ ()\ {}\ else\ {}'\fR\ statement |
| 2355 | in Perl. If the \f(CW\*(C`condition\*(C'\fR is true, the \f(CW\*(C`yes\-regexp\*(C'\fR will be |
| 2356 | matched, otherwise the \f(CW\*(C`no\-regexp\*(C'\fR will be matched. |
| 2357 | .PP |
| 2358 | The \f(CW\*(C`condition\*(C'\fR can have two forms. The first form is simply an |
| 2359 | integer in parentheses \f(CW\*(C`(integer)\*(C'\fR. It is true if the corresponding |
| 2360 | backreference \f(CW\*(C`\einteger\*(C'\fR matched earlier in the regexp. The second |
| 2361 | form is a bare zero width assertion \f(CW\*(C`(?...)\*(C'\fR, either a |
| 2362 | lookahead, a lookbehind, or a code assertion (discussed in the next |
| 2363 | section). |
| 2364 | .PP |
| 2365 | The integer form of the \f(CW\*(C`condition\*(C'\fR allows us to choose, with more |
| 2366 | flexibility, what to match based on what matched earlier in the |
| 2367 | regexp. This searches for words of the form \f(CW"$x$x"\fR or |
| 2368 | \&\f(CW"$x$y$y$x"\fR: |
| 2369 | .PP |
| 2370 | .Vb 9 |
| 2371 | \& % simple_grep '^(\ew+)(\ew+)?(?(2)\e2\e1|\e1)$' /usr/dict/words |
| 2372 | \& beriberi |
| 2373 | \& coco |
| 2374 | \& couscous |
| 2375 | \& deed |
| 2376 | \& ... |
| 2377 | \& toot |
| 2378 | \& toto |
| 2379 | \& tutu |
| 2380 | .Ve |
| 2381 | .PP |
| 2382 | The lookbehind \f(CW\*(C`condition\*(C'\fR allows, along with backreferences, |
| 2383 | an earlier part of the match to influence a later part of the |
| 2384 | match. For instance, |
| 2385 | .PP |
| 2386 | .Vb 1 |
| 2387 | \& /[ATGC]+(?(?<=AA)G|C)$/; |
| 2388 | .Ve |
| 2389 | .PP |
| 2390 | matches a \s-1DNA\s0 sequence such that it either ends in \f(CW\*(C`AAG\*(C'\fR, or some |
| 2391 | other base pair combination and \f(CW\*(C`C\*(C'\fR. Note that the form is |
| 2392 | \&\f(CW\*(C`(?(?<=AA)G|C)\*(C'\fR and not \f(CW\*(C`(?((?<=AA))G|C)\*(C'\fR; for the |
| 2393 | lookahead, lookbehind or code assertions, the parentheses around the |
| 2394 | conditional are not needed. |
| 2395 | .Sh "A bit of magic: executing Perl code in a regular expression" |
| 2396 | .IX Subsection "A bit of magic: executing Perl code in a regular expression" |
| 2397 | Normally, regexps are a part of Perl expressions. |
| 2398 | \&\fBCode\ evaluation\fR\ expressions turn that around by allowing |
| 2399 | arbitrary Perl code to be a part of a regexp. A code evaluation |
| 2400 | expression is denoted \f(CW\*(C`(?{code})\*(C'\fR, with \f(CW\*(C`code\*(C'\fR a string of Perl |
| 2401 | statements. |
| 2402 | .PP |
| 2403 | Code expressions are zero-width assertions, and the value they return |
| 2404 | depends on their environment. There are two possibilities: either the |
| 2405 | code expression is used as a conditional in a conditional expression |
| 2406 | \&\f(CW\*(C`(?(condition)...)\*(C'\fR, or it is not. If the code expression is a |
| 2407 | conditional, the code is evaluated and the result (i.e., the result of |
| 2408 | the last statement) is used to determine truth or falsehood. If the |
| 2409 | code expression is not used as a conditional, the assertion always |
| 2410 | evaluates true and the result is put into the special variable |
| 2411 | \&\f(CW$^R\fR. The variable \f(CW$^R\fR can then be used in code expressions later |
| 2412 | in the regexp. Here are some silly examples: |
| 2413 | .PP |
| 2414 | .Vb 5 |
| 2415 | \& $x = "abcdef"; |
| 2416 | \& $x =~ /abc(?{print "Hi Mom!";})def/; # matches, |
| 2417 | \& # prints 'Hi Mom!' |
| 2418 | \& $x =~ /aaa(?{print "Hi Mom!";})def/; # doesn't match, |
| 2419 | \& # no 'Hi Mom!' |
| 2420 | .Ve |
| 2421 | .PP |
| 2422 | Pay careful attention to the next example: |
| 2423 | .PP |
| 2424 | .Vb 3 |
| 2425 | \& $x =~ /abc(?{print "Hi Mom!";})ddd/; # doesn't match, |
| 2426 | \& # no 'Hi Mom!' |
| 2427 | \& # but why not? |
| 2428 | .Ve |
| 2429 | .PP |
| 2430 | At first glance, you'd think that it shouldn't print, because obviously |
| 2431 | the \f(CW\*(C`ddd\*(C'\fR isn't going to match the target string. But look at this |
| 2432 | example: |
| 2433 | .PP |
| 2434 | .Vb 2 |
| 2435 | \& $x =~ /abc(?{print "Hi Mom!";})[d]dd/; # doesn't match, |
| 2436 | \& # but _does_ print |
| 2437 | .Ve |
| 2438 | .PP |
| 2439 | Hmm. What happened here? If you've been following along, you know that |
| 2440 | the above pattern should be effectively the same as the last one \*(-- |
| 2441 | enclosing the d in a character class isn't going to change what it |
| 2442 | matches. So why does the first not print while the second one does? |
| 2443 | .PP |
| 2444 | The answer lies in the optimizations the REx engine makes. In the first |
| 2445 | case, all the engine sees are plain old characters (aside from the |
| 2446 | \&\f(CW\*(C`?{}\*(C'\fR construct). It's smart enough to realize that the string 'ddd' |
| 2447 | doesn't occur in our target string before actually running the pattern |
| 2448 | through. But in the second case, we've tricked it into thinking that our |
| 2449 | pattern is more complicated than it is. It takes a look, sees our |
| 2450 | character class, and decides that it will have to actually run the |
| 2451 | pattern to determine whether or not it matches, and in the process of |
| 2452 | running it hits the print statement before it discovers that we don't |
| 2453 | have a match. |
| 2454 | .PP |
| 2455 | To take a closer look at how the engine does optimizations, see the |
| 2456 | section \*(L"Pragmas and debugging\*(R" below. |
| 2457 | .PP |
| 2458 | More fun with \f(CW\*(C`?{}\*(C'\fR: |
| 2459 | .PP |
| 2460 | .Vb 6 |
| 2461 | \& $x =~ /(?{print "Hi Mom!";})/; # matches, |
| 2462 | \& # prints 'Hi Mom!' |
| 2463 | \& $x =~ /(?{$c = 1;})(?{print "$c";})/; # matches, |
| 2464 | \& # prints '1' |
| 2465 | \& $x =~ /(?{$c = 1;})(?{print "$^R";})/; # matches, |
| 2466 | \& # prints '1' |
| 2467 | .Ve |
| 2468 | .PP |
| 2469 | The bit of magic mentioned in the section title occurs when the regexp |
| 2470 | backtracks in the process of searching for a match. If the regexp |
| 2471 | backtracks over a code expression and if the variables used within are |
| 2472 | localized using \f(CW\*(C`local\*(C'\fR, the changes in the variables produced by the |
| 2473 | code expression are undone! Thus, if we wanted to count how many times |
| 2474 | a character got matched inside a group, we could use, e.g., |
| 2475 | .PP |
| 2476 | .Vb 11 |
| 2477 | \& $x = "aaaa"; |
| 2478 | \& $count = 0; # initialize 'a' count |
| 2479 | \& $c = "bob"; # test if $c gets clobbered |
| 2480 | \& $x =~ /(?{local $c = 0;}) # initialize count |
| 2481 | \& ( a # match 'a' |
| 2482 | \& (?{local $c = $c + 1;}) # increment count |
| 2483 | \& )* # do this any number of times, |
| 2484 | \& aa # but match 'aa' at the end |
| 2485 | \& (?{$count = $c;}) # copy local $c var into $count |
| 2486 | \& /x; |
| 2487 | \& print "'a' count is $count, \e$c variable is '$c'\en"; |
| 2488 | .Ve |
| 2489 | .PP |
| 2490 | This prints |
| 2491 | .PP |
| 2492 | .Vb 1 |
| 2493 | \& 'a' count is 2, $c variable is 'bob' |
| 2494 | .Ve |
| 2495 | .PP |
| 2496 | If we replace the \f(CW\*(C`\ (?{local\ $c\ =\ $c\ +\ 1;})\*(C'\fR\ with |
| 2497 | \&\f(CW\*(C`\ (?{$c\ =\ $c\ +\ 1;})\*(C'\fR\ , the variable changes are \fInot\fR undone |
| 2498 | during backtracking, and we get |
| 2499 | .PP |
| 2500 | .Vb 1 |
| 2501 | \& 'a' count is 4, $c variable is 'bob' |
| 2502 | .Ve |
| 2503 | .PP |
| 2504 | Note that only localized variable changes are undone. Other side |
| 2505 | effects of code expression execution are permanent. Thus |
| 2506 | .PP |
| 2507 | .Vb 2 |
| 2508 | \& $x = "aaaa"; |
| 2509 | \& $x =~ /(a(?{print "Yow\en";}))*aa/; |
| 2510 | .Ve |
| 2511 | .PP |
| 2512 | produces |
| 2513 | .PP |
| 2514 | .Vb 4 |
| 2515 | \& Yow |
| 2516 | \& Yow |
| 2517 | \& Yow |
| 2518 | \& Yow |
| 2519 | .Ve |
| 2520 | .PP |
| 2521 | The result \f(CW$^R\fR is automatically localized, so that it will behave |
| 2522 | properly in the presence of backtracking. |
| 2523 | .PP |
| 2524 | This example uses a code expression in a conditional to match the |
| 2525 | article 'the' in either English or German: |
| 2526 | .PP |
| 2527 | .Vb 11 |
| 2528 | \& $lang = 'DE'; # use German |
| 2529 | \& ... |
| 2530 | \& $text = "das"; |
| 2531 | \& print "matched\en" |
| 2532 | \& if $text =~ /(?(?{ |
| 2533 | \& $lang eq 'EN'; # is the language English? |
| 2534 | \& }) |
| 2535 | \& the | # if so, then match 'the' |
| 2536 | \& (die|das|der) # else, match 'die|das|der' |
| 2537 | \& ) |
| 2538 | \& /xi; |
| 2539 | .Ve |
| 2540 | .PP |
| 2541 | Note that the syntax here is \f(CW\*(C`(?(?{...})yes\-regexp|no\-regexp)\*(C'\fR, not |
| 2542 | \&\f(CW\*(C`(?((?{...}))yes\-regexp|no\-regexp)\*(C'\fR. In other words, in the case of a |
| 2543 | code expression, we don't need the extra parentheses around the |
| 2544 | conditional. |
| 2545 | .PP |
| 2546 | If you try to use code expressions with interpolating variables, perl |
| 2547 | may surprise you: |
| 2548 | .PP |
| 2549 | .Vb 5 |
| 2550 | \& $bar = 5; |
| 2551 | \& $pat = '(?{ 1 })'; |
| 2552 | \& /foo(?{ $bar })bar/; # compiles ok, $bar not interpolated |
| 2553 | \& /foo(?{ 1 })$bar/; # compile error! |
| 2554 | \& /foo${pat}bar/; # compile error! |
| 2555 | .Ve |
| 2556 | .PP |
| 2557 | .Vb 2 |
| 2558 | \& $pat = qr/(?{ $foo = 1 })/; # precompile code regexp |
| 2559 | \& /foo${pat}bar/; # compiles ok |
| 2560 | .Ve |
| 2561 | .PP |
| 2562 | If a regexp has (1) code expressions and interpolating variables, or |
| 2563 | (2) a variable that interpolates a code expression, perl treats the |
| 2564 | regexp as an error. If the code expression is precompiled into a |
| 2565 | variable, however, interpolating is ok. The question is, why is this |
| 2566 | an error? |
| 2567 | .PP |
| 2568 | The reason is that variable interpolation and code expressions |
| 2569 | together pose a security risk. The combination is dangerous because |
| 2570 | many programmers who write search engines often take user input and |
| 2571 | plug it directly into a regexp: |
| 2572 | .PP |
| 2573 | .Vb 3 |
| 2574 | \& $regexp = <>; # read user-supplied regexp |
| 2575 | \& chomp $regexp; # get rid of possible newline |
| 2576 | \& $text =~ /$regexp/; # search $text for the $regexp |
| 2577 | .Ve |
| 2578 | .PP |
| 2579 | If the \f(CW$regexp\fR variable contains a code expression, the user could |
| 2580 | then execute arbitrary Perl code. For instance, some joker could |
| 2581 | search for \f(CW\*(C`system('rm\ \-rf\ *');\*(C'\fR\ to erase your files. In this |
| 2582 | sense, the combination of interpolation and code expressions \fBtaints\fR |
| 2583 | your regexp. So by default, using both interpolation and code |
| 2584 | expressions in the same regexp is not allowed. If you're not |
| 2585 | concerned about malicious users, it is possible to bypass this |
| 2586 | security check by invoking \f(CW\*(C`use\ re\ 'eval'\*(C'\fR\ : |
| 2587 | .PP |
| 2588 | .Vb 5 |
| 2589 | \& use re 'eval'; # throw caution out the door |
| 2590 | \& $bar = 5; |
| 2591 | \& $pat = '(?{ 1 })'; |
| 2592 | \& /foo(?{ 1 })$bar/; # compiles ok |
| 2593 | \& /foo${pat}bar/; # compiles ok |
| 2594 | .Ve |
| 2595 | .PP |
| 2596 | Another form of code expression is the \fBpattern\ code\ expression\fR\ . |
| 2597 | The pattern code expression is like a regular code expression, except |
| 2598 | that the result of the code evaluation is treated as a regular |
| 2599 | expression and matched immediately. A simple example is |
| 2600 | .PP |
| 2601 | .Vb 4 |
| 2602 | \& $length = 5; |
| 2603 | \& $char = 'a'; |
| 2604 | \& $x = 'aaaaabb'; |
| 2605 | \& $x =~ /(??{$char x $length})/x; # matches, there are 5 of 'a' |
| 2606 | .Ve |
| 2607 | .PP |
| 2608 | This final example contains both ordinary and pattern code |
| 2609 | expressions. It detects if a binary string \f(CW1101010010001...\fR has a |
| 2610 | Fibonacci spacing 0,1,1,2,3,5,... of the \f(CW1\fR's: |
| 2611 | .PP |
| 2612 | .Vb 17 |
| 2613 | \& $s0 = 0; $s1 = 1; # initial conditions |
| 2614 | \& $x = "1101010010001000001"; |
| 2615 | \& print "It is a Fibonacci sequence\en" |
| 2616 | \& if $x =~ /^1 # match an initial '1' |
| 2617 | \& ( |
| 2618 | \& (??{'0' x $s0}) # match $s0 of '0' |
| 2619 | \& 1 # and then a '1' |
| 2620 | \& (?{ |
| 2621 | \& $largest = $s0; # largest seq so far |
| 2622 | \& $s2 = $s1 + $s0; # compute next term |
| 2623 | \& $s0 = $s1; # in Fibonacci sequence |
| 2624 | \& $s1 = $s2; |
| 2625 | \& }) |
| 2626 | \& )+ # repeat as needed |
| 2627 | \& $ # that is all there is |
| 2628 | \& /x; |
| 2629 | \& print "Largest sequence matched was $largest\en"; |
| 2630 | .Ve |
| 2631 | .PP |
| 2632 | This prints |
| 2633 | .PP |
| 2634 | .Vb 2 |
| 2635 | \& It is a Fibonacci sequence |
| 2636 | \& Largest sequence matched was 5 |
| 2637 | .Ve |
| 2638 | .PP |
| 2639 | Ha! Try that with your garden variety regexp package... |
| 2640 | .PP |
| 2641 | Note that the variables \f(CW$s0\fR and \f(CW$s1\fR are not substituted when the |
| 2642 | regexp is compiled, as happens for ordinary variables outside a code |
| 2643 | expression. Rather, the code expressions are evaluated when perl |
| 2644 | encounters them during the search for a match. |
| 2645 | .PP |
| 2646 | The regexp without the \f(CW\*(C`//x\*(C'\fR modifier is |
| 2647 | .PP |
| 2648 | .Vb 1 |
| 2649 | \& /^1((??{'0'x$s0})1(?{$largest=$s0;$s2=$s1+$s0;$s0=$s1;$s1=$s2;}))+$/; |
| 2650 | .Ve |
| 2651 | .PP |
| 2652 | and is a great start on an Obfuscated Perl entry :\-) When working with |
| 2653 | code and conditional expressions, the extended form of regexps is |
| 2654 | almost necessary in creating and debugging regexps. |
| 2655 | .Sh "Pragmas and debugging" |
| 2656 | .IX Subsection "Pragmas and debugging" |
| 2657 | Speaking of debugging, there are several pragmas available to control |
| 2658 | and debug regexps in Perl. We have already encountered one pragma in |
| 2659 | the previous section, \f(CW\*(C`use\ re\ 'eval';\*(C'\fR\ , that allows variable |
| 2660 | interpolation and code expressions to coexist in a regexp. The other |
| 2661 | pragmas are |
| 2662 | .PP |
| 2663 | .Vb 3 |
| 2664 | \& use re 'taint'; |
| 2665 | \& $tainted = <>; |
| 2666 | \& @parts = ($tainted =~ /(\ew+)\es+(\ew+)/); # @parts is now tainted |
| 2667 | .Ve |
| 2668 | .PP |
| 2669 | The \f(CW\*(C`taint\*(C'\fR pragma causes any substrings from a match with a tainted |
| 2670 | variable to be tainted as well. This is not normally the case, as |
| 2671 | regexps are often used to extract the safe bits from a tainted |
| 2672 | variable. Use \f(CW\*(C`taint\*(C'\fR when you are not extracting safe bits, but are |
| 2673 | performing some other processing. Both \f(CW\*(C`taint\*(C'\fR and \f(CW\*(C`eval\*(C'\fR pragmas |
| 2674 | are lexically scoped, which means they are in effect only until |
| 2675 | the end of the block enclosing the pragmas. |
| 2676 | .PP |
| 2677 | .Vb 2 |
| 2678 | \& use re 'debug'; |
| 2679 | \& /^(.*)$/s; # output debugging info |
| 2680 | .Ve |
| 2681 | .PP |
| 2682 | .Vb 2 |
| 2683 | \& use re 'debugcolor'; |
| 2684 | \& /^(.*)$/s; # output debugging info in living color |
| 2685 | .Ve |
| 2686 | .PP |
| 2687 | The global \f(CW\*(C`debug\*(C'\fR and \f(CW\*(C`debugcolor\*(C'\fR pragmas allow one to get |
| 2688 | detailed debugging info about regexp compilation and |
| 2689 | execution. \f(CW\*(C`debugcolor\*(C'\fR is the same as debug, except the debugging |
| 2690 | information is displayed in color on terminals that can display |
| 2691 | termcap color sequences. Here is example output: |
| 2692 | .PP |
| 2693 | .Vb 25 |
| 2694 | \& % perl -e 'use re "debug"; "abc" =~ /a*b+c/;' |
| 2695 | \& Compiling REx `a*b+c' |
| 2696 | \& size 9 first at 1 |
| 2697 | \& 1: STAR(4) |
| 2698 | \& 2: EXACT <a>(0) |
| 2699 | \& 4: PLUS(7) |
| 2700 | \& 5: EXACT <b>(0) |
| 2701 | \& 7: EXACT <c>(9) |
| 2702 | \& 9: END(0) |
| 2703 | \& floating `bc' at 0..2147483647 (checking floating) minlen 2 |
| 2704 | \& Guessing start of match, REx `a*b+c' against `abc'... |
| 2705 | \& Found floating substr `bc' at offset 1... |
| 2706 | \& Guessed: match at offset 0 |
| 2707 | \& Matching REx `a*b+c' against `abc' |
| 2708 | \& Setting an EVAL scope, savestack=3 |
| 2709 | \& 0 <> <abc> | 1: STAR |
| 2710 | \& EXACT <a> can match 1 times out of 32767... |
| 2711 | \& Setting an EVAL scope, savestack=3 |
| 2712 | \& 1 <a> <bc> | 4: PLUS |
| 2713 | \& EXACT <b> can match 1 times out of 32767... |
| 2714 | \& Setting an EVAL scope, savestack=3 |
| 2715 | \& 2 <ab> <c> | 7: EXACT <c> |
| 2716 | \& 3 <abc> <> | 9: END |
| 2717 | \& Match successful! |
| 2718 | \& Freeing REx: `a*b+c' |
| 2719 | .Ve |
| 2720 | .PP |
| 2721 | If you have gotten this far into the tutorial, you can probably guess |
| 2722 | what the different parts of the debugging output tell you. The first |
| 2723 | part |
| 2724 | .PP |
| 2725 | .Vb 8 |
| 2726 | \& Compiling REx `a*b+c' |
| 2727 | \& size 9 first at 1 |
| 2728 | \& 1: STAR(4) |
| 2729 | \& 2: EXACT <a>(0) |
| 2730 | \& 4: PLUS(7) |
| 2731 | \& 5: EXACT <b>(0) |
| 2732 | \& 7: EXACT <c>(9) |
| 2733 | \& 9: END(0) |
| 2734 | .Ve |
| 2735 | .PP |
| 2736 | describes the compilation stage. \f(CWSTAR(4)\fR means that there is a |
| 2737 | starred object, in this case \f(CW'a'\fR, and if it matches, goto line 4, |
| 2738 | i.e., \f(CWPLUS(7)\fR. The middle lines describe some heuristics and |
| 2739 | optimizations performed before a match: |
| 2740 | .PP |
| 2741 | .Vb 4 |
| 2742 | \& floating `bc' at 0..2147483647 (checking floating) minlen 2 |
| 2743 | \& Guessing start of match, REx `a*b+c' against `abc'... |
| 2744 | \& Found floating substr `bc' at offset 1... |
| 2745 | \& Guessed: match at offset 0 |
| 2746 | .Ve |
| 2747 | .PP |
| 2748 | Then the match is executed and the remaining lines describe the |
| 2749 | process: |
| 2750 | .PP |
| 2751 | .Vb 12 |
| 2752 | \& Matching REx `a*b+c' against `abc' |
| 2753 | \& Setting an EVAL scope, savestack=3 |
| 2754 | \& 0 <> <abc> | 1: STAR |
| 2755 | \& EXACT <a> can match 1 times out of 32767... |
| 2756 | \& Setting an EVAL scope, savestack=3 |
| 2757 | \& 1 <a> <bc> | 4: PLUS |
| 2758 | \& EXACT <b> can match 1 times out of 32767... |
| 2759 | \& Setting an EVAL scope, savestack=3 |
| 2760 | \& 2 <ab> <c> | 7: EXACT <c> |
| 2761 | \& 3 <abc> <> | 9: END |
| 2762 | \& Match successful! |
| 2763 | \& Freeing REx: `a*b+c' |
| 2764 | .Ve |
| 2765 | .PP |
| 2766 | Each step is of the form \f(CW\*(C`n\ <x>\ <y>\*(C'\fR\ , with \f(CW\*(C`<x>\*(C'\fR the |
| 2767 | part of the string matched and \f(CW\*(C`<y>\*(C'\fR the part not yet |
| 2768 | matched. The \f(CW\*(C`|\ 1:\ STAR\*(C'\fR\ says that perl is at line number 1 |
| 2769 | in the compilation list above. See |
| 2770 | \&\*(L"Debugging regular expressions\*(R" in perldebguts for much more detail. |
| 2771 | .PP |
| 2772 | An alternative method of debugging regexps is to embed \f(CW\*(C`print\*(C'\fR |
| 2773 | statements within the regexp. This provides a blow-by-blow account of |
| 2774 | the backtracking in an alternation: |
| 2775 | .PP |
| 2776 | .Vb 12 |
| 2777 | \& "that this" =~ m@(?{print "Start at position ", pos, "\en";}) |
| 2778 | \& t(?{print "t1\en";}) |
| 2779 | \& h(?{print "h1\en";}) |
| 2780 | \& i(?{print "i1\en";}) |
| 2781 | \& s(?{print "s1\en";}) |
| 2782 | \& | |
| 2783 | \& t(?{print "t2\en";}) |
| 2784 | \& h(?{print "h2\en";}) |
| 2785 | \& a(?{print "a2\en";}) |
| 2786 | \& t(?{print "t2\en";}) |
| 2787 | \& (?{print "Done at position ", pos, "\en";}) |
| 2788 | \& @x; |
| 2789 | .Ve |
| 2790 | .PP |
| 2791 | prints |
| 2792 | .PP |
| 2793 | .Vb 8 |
| 2794 | \& Start at position 0 |
| 2795 | \& t1 |
| 2796 | \& h1 |
| 2797 | \& t2 |
| 2798 | \& h2 |
| 2799 | \& a2 |
| 2800 | \& t2 |
| 2801 | \& Done at position 4 |
| 2802 | .Ve |
| 2803 | .SH "BUGS" |
| 2804 | .IX Header "BUGS" |
| 2805 | Code expressions, conditional expressions, and independent expressions |
| 2806 | are \fBexperimental\fR. Don't use them in production code. Yet. |
| 2807 | .SH "SEE ALSO" |
| 2808 | .IX Header "SEE ALSO" |
| 2809 | This is just a tutorial. For the full story on perl regular |
| 2810 | expressions, see the perlre regular expressions reference page. |
| 2811 | .PP |
| 2812 | For more information on the matching \f(CW\*(C`m//\*(C'\fR and substitution \f(CW\*(C`s///\*(C'\fR |
| 2813 | operators, see \*(L"Regexp Quote-Like Operators\*(R" in perlop. For |
| 2814 | information on the \f(CW\*(C`split\*(C'\fR operation, see \*(L"split\*(R" in perlfunc. |
| 2815 | .PP |
| 2816 | For an excellent all-around resource on the care and feeding of |
| 2817 | regular expressions, see the book \fIMastering Regular Expressions\fR by |
| 2818 | Jeffrey Friedl (published by O'Reilly, \s-1ISBN\s0 1\-56592\-257\-3). |
| 2819 | .SH "AUTHOR AND COPYRIGHT" |
| 2820 | .IX Header "AUTHOR AND COPYRIGHT" |
| 2821 | Copyright (c) 2000 Mark Kvale |
| 2822 | All rights reserved. |
| 2823 | .PP |
| 2824 | This document may be distributed under the same terms as Perl itself. |
| 2825 | .Sh "Acknowledgments" |
| 2826 | .IX Subsection "Acknowledgments" |
| 2827 | The inspiration for the stop codon \s-1DNA\s0 example came from the \s-1ZIP\s0 |
| 2828 | code example in chapter 7 of \fIMastering Regular Expressions\fR. |
| 2829 | .PP |
| 2830 | The author would like to thank Jeff Pinyan, Andrew Johnson, Peter |
| 2831 | Haworth, Ronald J Kimball, and Joe Smith for all their helpful |
| 2832 | comments. |