| 1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
| 2 | .\" |
| 3 | .\" Standard preamble: |
| 4 | .\" ======================================================================== |
| 5 | .de Sh \" Subsection heading |
| 6 | .br |
| 7 | .if t .Sp |
| 8 | .ne 5 |
| 9 | .PP |
| 10 | \fB\\$1\fR |
| 11 | .PP |
| 12 | .. |
| 13 | .de Sp \" Vertical space (when we can't use .PP) |
| 14 | .if t .sp .5v |
| 15 | .if n .sp |
| 16 | .. |
| 17 | .de Vb \" Begin verbatim text |
| 18 | .ft CW |
| 19 | .nf |
| 20 | .ne \\$1 |
| 21 | .. |
| 22 | .de Ve \" End verbatim text |
| 23 | .ft R |
| 24 | .fi |
| 25 | .. |
| 26 | .\" Set up some character translations and predefined strings. \*(-- will |
| 27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left |
| 28 | .\" double quote, and \*(R" will give a right double quote. | will give a |
| 29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to |
| 30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' |
| 31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. |
| 32 | .tr \(*W-|\(bv\*(Tr |
| 33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' |
| 34 | .ie n \{\ |
| 35 | . ds -- \(*W- |
| 36 | . ds PI pi |
| 37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch |
| 38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch |
| 39 | . ds L" "" |
| 40 | . ds R" "" |
| 41 | . ds C` "" |
| 42 | . ds C' "" |
| 43 | 'br\} |
| 44 | .el\{\ |
| 45 | . ds -- \|\(em\| |
| 46 | . ds PI \(*p |
| 47 | . ds L" `` |
| 48 | . ds R" '' |
| 49 | 'br\} |
| 50 | .\" |
| 51 | .\" If the F register is turned on, we'll generate index entries on stderr for |
| 52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index |
| 53 | .\" entries marked with X<> in POD. Of course, you'll have to process the |
| 54 | .\" output yourself in some meaningful fashion. |
| 55 | .if \nF \{\ |
| 56 | . de IX |
| 57 | . tm Index:\\$1\t\\n%\t"\\$2" |
| 58 | .. |
| 59 | . nr % 0 |
| 60 | . rr F |
| 61 | .\} |
| 62 | .\" |
| 63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes |
| 64 | .\" way too many mistakes in technical documents. |
| 65 | .hy 0 |
| 66 | .if n .na |
| 67 | .\" |
| 68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). |
| 69 | .\" Fear. Run. Save yourself. No user-serviceable parts. |
| 70 | . \" fudge factors for nroff and troff |
| 71 | .if n \{\ |
| 72 | . ds #H 0 |
| 73 | . ds #V .8m |
| 74 | . ds #F .3m |
| 75 | . ds #[ \f1 |
| 76 | . ds #] \fP |
| 77 | .\} |
| 78 | .if t \{\ |
| 79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) |
| 80 | . ds #V .6m |
| 81 | . ds #F 0 |
| 82 | . ds #[ \& |
| 83 | . ds #] \& |
| 84 | .\} |
| 85 | . \" simple accents for nroff and troff |
| 86 | .if n \{\ |
| 87 | . ds ' \& |
| 88 | . ds ` \& |
| 89 | . ds ^ \& |
| 90 | . ds , \& |
| 91 | . ds ~ ~ |
| 92 | . ds / |
| 93 | .\} |
| 94 | .if t \{\ |
| 95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" |
| 96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' |
| 97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' |
| 98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' |
| 99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' |
| 100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' |
| 101 | .\} |
| 102 | . \" troff and (daisy-wheel) nroff accents |
| 103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' |
| 104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' |
| 105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] |
| 106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' |
| 107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' |
| 108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] |
| 109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] |
| 110 | .ds ae a\h'-(\w'a'u*4/10)'e |
| 111 | .ds Ae A\h'-(\w'A'u*4/10)'E |
| 112 | . \" corrections for vroff |
| 113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' |
| 114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' |
| 115 | . \" for low resolution devices (crt and lpr) |
| 116 | .if \n(.H>23 .if \n(.V>19 \ |
| 117 | \{\ |
| 118 | . ds : e |
| 119 | . ds 8 ss |
| 120 | . ds o a |
| 121 | . ds d- d\h'-1'\(ga |
| 122 | . ds D- D\h'-1'\(hy |
| 123 | . ds th \o'bp' |
| 124 | . ds Th \o'LP' |
| 125 | . ds ae ae |
| 126 | . ds Ae AE |
| 127 | .\} |
| 128 | .rm #[ #] #H #V #F C |
| 129 | .\" ======================================================================== |
| 130 | .\" |
| 131 | .IX Title "RECDESCENT 1" |
| 132 | .TH RECDESCENT 1 "2000-08-20" "perl v5.8.0" "User Contributed Perl Documentation" |
| 133 | .SH "NAME" |
| 134 | Parse::RecDescent \- Generate Recursive\-Descent Parsers |
| 135 | .SH "VERSION" |
| 136 | .IX Header "VERSION" |
| 137 | This document describes version 1.79 of Parse::RecDescent, |
| 138 | released August 21, 2000. |
| 139 | .SH "SYNOPSIS" |
| 140 | .IX Header "SYNOPSIS" |
| 141 | .Vb 1 |
| 142 | \& use Parse::RecDescent; |
| 143 | .Ve |
| 144 | .PP |
| 145 | .Vb 1 |
| 146 | \& # Generate a parser from the specification in $grammar: |
| 147 | .Ve |
| 148 | .PP |
| 149 | .Vb 1 |
| 150 | \& $parser = new Parse::RecDescent ($grammar); |
| 151 | .Ve |
| 152 | .PP |
| 153 | .Vb 1 |
| 154 | \& # Generate a parser from the specification in $othergrammar |
| 155 | .Ve |
| 156 | .PP |
| 157 | .Vb 1 |
| 158 | \& $anotherparser = new Parse::RecDescent ($othergrammar); |
| 159 | .Ve |
| 160 | .PP |
| 161 | .Vb 2 |
| 162 | \& # Parse $text using rule 'startrule' (which must be |
| 163 | \& # defined in $grammar): |
| 164 | .Ve |
| 165 | .PP |
| 166 | .Vb 1 |
| 167 | \& $parser->startrule($text); |
| 168 | .Ve |
| 169 | .PP |
| 170 | .Vb 2 |
| 171 | \& # Parse $text using rule 'otherrule' (which must also |
| 172 | \& # be defined in $grammar): |
| 173 | .Ve |
| 174 | .PP |
| 175 | .Vb 1 |
| 176 | \& $parser->otherrule($text); |
| 177 | .Ve |
| 178 | .PP |
| 179 | .Vb 2 |
| 180 | \& # Change the universal token prefix pattern |
| 181 | \& # (the default is: '\es*'): |
| 182 | .Ve |
| 183 | .PP |
| 184 | .Vb 1 |
| 185 | \& $Parse::RecDescent::skip = '[ \et]+'; |
| 186 | .Ve |
| 187 | .PP |
| 188 | .Vb 2 |
| 189 | \& # Replace productions of existing rules (or create new ones) |
| 190 | \& # with the productions defined in $newgrammar: |
| 191 | .Ve |
| 192 | .PP |
| 193 | .Vb 1 |
| 194 | \& $parser->Replace($newgrammar); |
| 195 | .Ve |
| 196 | .PP |
| 197 | .Vb 2 |
| 198 | \& # Extend existing rules (or create new ones) |
| 199 | \& # by adding extra productions defined in $moregrammar: |
| 200 | .Ve |
| 201 | .PP |
| 202 | .Vb 1 |
| 203 | \& $parser->Extend($moregrammar); |
| 204 | .Ve |
| 205 | .PP |
| 206 | .Vb 1 |
| 207 | \& # Global flags (useful as command line arguments under -s): |
| 208 | .Ve |
| 209 | .PP |
| 210 | .Vb 6 |
| 211 | \& $::RD_ERRORS # unless undefined, report fatal errors |
| 212 | \& $::RD_WARN # unless undefined, also report non-fatal problems |
| 213 | \& $::RD_HINT # if defined, also suggest remedies |
| 214 | \& $::RD_TRACE # if defined, also trace parsers' behaviour |
| 215 | \& $::RD_AUTOSTUB # if defined, generates "stubs" for undefined rules |
| 216 | \& $::RD_AUTOACTION # if defined, appends specified action to productions |
| 217 | .Ve |
| 218 | .SH "DESCRIPTION" |
| 219 | .IX Header "DESCRIPTION" |
| 220 | .Sh "Overview" |
| 221 | .IX Subsection "Overview" |
| 222 | Parse::RecDescent incrementally generates top-down recursive-descent text |
| 223 | parsers from simple \fIyacc\fR\-like grammar specifications. It provides: |
| 224 | .IP "\(bu" 4 |
| 225 | Regular expressions or literal strings as terminals (tokens), |
| 226 | .IP "\(bu" 4 |
| 227 | Multiple (non\-contiguous) productions for any rule, |
| 228 | .IP "\(bu" 4 |
| 229 | Repeated and optional subrules within productions, |
| 230 | .IP "\(bu" 4 |
| 231 | Full access to Perl within actions specified as part of the grammar, |
| 232 | .IP "\(bu" 4 |
| 233 | Simple automated error reporting during parser generation and parsing, |
| 234 | .IP "\(bu" 4 |
| 235 | The ability to commit to, uncommit to, or reject particular |
| 236 | productions during a parse, |
| 237 | .IP "\(bu" 4 |
| 238 | The ability to pass data up and down the parse tree (\*(L"down\*(R" via subrule |
| 239 | argument lists, \*(L"up\*(R" via subrule return values) |
| 240 | .IP "\(bu" 4 |
| 241 | Incremental extension of the parsing grammar (even during a parse), |
| 242 | .IP "\(bu" 4 |
| 243 | Precompilation of parser objects, |
| 244 | .IP "\(bu" 4 |
| 245 | User-definable reduce-reduce conflict resolution via |
| 246 | \&\*(L"scoring\*(R" of matching productions. |
| 247 | .ie n .Sh "Using ""Parse::RecDescent""" |
| 248 | .el .Sh "Using \f(CWParse::RecDescent\fP" |
| 249 | .IX Subsection "Using Parse::RecDescent" |
| 250 | Parser objects are created by calling \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, passing in a |
| 251 | grammar specification (see the following subsections). If the grammar is |
| 252 | correct, \f(CW\*(C`new\*(C'\fR returns a blessed reference which can then be used to initiate |
| 253 | parsing through any rule specified in the original grammar. A typical sequence |
| 254 | looks like this: |
| 255 | .PP |
| 256 | .Vb 3 |
| 257 | \& $grammar = q { |
| 258 | \& # GRAMMAR SPECIFICATION HERE |
| 259 | \& }; |
| 260 | .Ve |
| 261 | .PP |
| 262 | .Vb 1 |
| 263 | \& $parser = new Parse::RecDescent ($grammar) or die "Bad grammar!\en"; |
| 264 | .Ve |
| 265 | .PP |
| 266 | .Vb 1 |
| 267 | \& # acquire $text |
| 268 | .Ve |
| 269 | .PP |
| 270 | .Vb 1 |
| 271 | \& defined $parser->startrule($text) or print "Bad text!\en"; |
| 272 | .Ve |
| 273 | .PP |
| 274 | The rule through which parsing is initiated must be explicitly defined |
| 275 | in the grammar (i.e. for the above example, the grammar must include a |
| 276 | rule of the form: \*(L"startrule: <subrules>\*(R"). |
| 277 | .PP |
| 278 | If the starting rule succeeds, its value (see below) |
| 279 | is returned. Failure to generate the original parser or failure to match a text |
| 280 | is indicated by returning \f(CW\*(C`undef\*(C'\fR. Note that it's easy to set up grammars |
| 281 | that can succeed, but which return a value of 0, \*(L"0\*(R", or "". So don't be |
| 282 | tempted to write: |
| 283 | .PP |
| 284 | .Vb 1 |
| 285 | \& $parser->startrule($text) or print "Bad text!\en"; |
| 286 | .Ve |
| 287 | .PP |
| 288 | Normally, the parser has no effect on the original text. So in the |
| 289 | previous example the value of \f(CW$text\fR would be unchanged after having |
| 290 | been parsed. |
| 291 | .PP |
| 292 | If, however, the text to be matched is passed by reference: |
| 293 | .PP |
| 294 | .Vb 1 |
| 295 | \& $parser->startrule(\e$text) |
| 296 | .Ve |
| 297 | .PP |
| 298 | then any text which was consumed during the match will be removed from the |
| 299 | start of \f(CW$text\fR. |
| 300 | .Sh "Rules" |
| 301 | .IX Subsection "Rules" |
| 302 | In the grammar from which the parser is built, rules are specified by |
| 303 | giving an identifier (which must satisfy /[A\-Za\-z]\ew*/), followed by a |
| 304 | colon \fIon the same line\fR, followed by one or more productions, |
| 305 | separated by single vertical bars. The layout of the productions |
| 306 | is entirely free\-format: |
| 307 | .PP |
| 308 | .Vb 3 |
| 309 | \& rule1: production1 |
| 310 | \& | production2 | |
| 311 | \& production3 | production4 |
| 312 | .Ve |
| 313 | .PP |
| 314 | At any point in the grammar previously defined rules may be extended with |
| 315 | additional productions. This is achieved by redeclaring the rule with the new |
| 316 | productions. Thus: |
| 317 | .PP |
| 318 | .Vb 3 |
| 319 | \& rule1: a | b | c |
| 320 | \& rule2: d | e | f |
| 321 | \& rule1: g | h |
| 322 | .Ve |
| 323 | .PP |
| 324 | is exactly equivalent to: |
| 325 | .PP |
| 326 | .Vb 2 |
| 327 | \& rule1: a | b | c | g | h |
| 328 | \& rule2: d | e | f |
| 329 | .Ve |
| 330 | .PP |
| 331 | Each production in a rule consists of zero or more items, each of which |
| 332 | may be either: the name of another rule to be matched (a \*(L"subrule\*(R"), |
| 333 | a pattern or string literal to be matched directly (a \*(L"token\*(R"), a |
| 334 | block of Perl code to be executed (an \*(L"action\*(R"), a special instruction |
| 335 | to the parser (a \*(L"directive\*(R"), or a standard Perl comment (which is |
| 336 | ignored). |
| 337 | .PP |
| 338 | A rule matches a text if one of its productions matches. A production |
| 339 | matches if each of its items match consecutive substrings of the |
| 340 | text. The productions of a rule being matched are tried in the same |
| 341 | order that they appear in the original grammar, and the first matching |
| 342 | production terminates the match attempt (successfully). If all |
| 343 | productions are tried and none matches, the match attempt fails. |
| 344 | .PP |
| 345 | Note that this behaviour is quite different from the \*(L"prefer the longer match\*(R" |
| 346 | behaviour of \fIyacc\fR. For example, if \fIyacc\fR were parsing the rule: |
| 347 | .PP |
| 348 | .Vb 2 |
| 349 | \& seq : 'A' 'B' |
| 350 | \& | 'A' 'B' 'C' |
| 351 | .Ve |
| 352 | .PP |
| 353 | upon matching \*(L"\s-1AB\s0\*(R" it would look ahead to see if a 'C' is next and, if |
| 354 | so, will match the second production in preference to the first. In |
| 355 | other words, \fIyacc\fR effectively tries all the productions of a rule |
| 356 | breadth-first in parallel, and selects the \*(L"best\*(R" match, where \*(L"best\*(R" |
| 357 | means longest (note that this is a gross simplification of the true |
| 358 | behaviour of \fIyacc\fR but it will do for our purposes). |
| 359 | .PP |
| 360 | In contrast, \f(CW\*(C`Parse::RecDescent\*(C'\fR tries each production depth-first in |
| 361 | sequence, and selects the \*(L"best\*(R" match, where \*(L"best\*(R" means first. This is |
| 362 | the fundamental difference between \*(L"bottom\-up\*(R" and \*(L"recursive descent\*(R" |
| 363 | parsing. |
| 364 | .PP |
| 365 | Each successfully matched item in a production is assigned a value, |
| 366 | which can be accessed in subsequent actions within the same |
| 367 | production (or, in some cases, as the return value of a successful |
| 368 | subrule call). Unsuccessful items don't have an associated value, |
| 369 | since the failure of an item causes the entire surrounding production |
| 370 | to immediately fail. The following sections describe the various types |
| 371 | of items and their success values. |
| 372 | .Sh "Subrules" |
| 373 | .IX Subsection "Subrules" |
| 374 | A subrule which appears in a production is an instruction to the parser to |
| 375 | attempt to match the named rule at that point in the text being |
| 376 | parsed. If the named subrule is not defined when requested the |
| 377 | production containing it immediately fails (unless it was \*(L"autostubbed\*(R" \- see |
| 378 | Autostubbing). |
| 379 | .PP |
| 380 | A rule may (recursively) call itself as a subrule, but \fInot\fR as the |
| 381 | left-most item in any of its productions (since such recursions are usually |
| 382 | non\-terminating). |
| 383 | .PP |
| 384 | The value associated with a subrule is the value associated with its |
| 385 | \&\f(CW$return\fR variable (see \*(L"Actions\*(R" below), or with the last successfully |
| 386 | matched item in the subrule match. |
| 387 | .PP |
| 388 | Subrules may also be specified with a trailing repetition specifier, |
| 389 | indicating that they are to be (greedily) matched the specified number |
| 390 | of times. The available specifiers are: |
| 391 | .PP |
| 392 | .Vb 7 |
| 393 | \& subrule(?) # Match one-or-zero times |
| 394 | \& subrule(s) # Match one-or-more times |
| 395 | \& subrule(s?) # Match zero-or-more times |
| 396 | \& subrule(N) # Match exactly N times for integer N > 0 |
| 397 | \& subrule(N..M) # Match between N and M times |
| 398 | \& subrule(..M) # Match between 1 and M times |
| 399 | \& subrule(N..) # Match at least N times |
| 400 | .Ve |
| 401 | .PP |
| 402 | Repeated subrules keep matching until either the subrule fails to |
| 403 | match, or it has matched the minimal number of times but fails to |
| 404 | consume any of the parsed text (this second condition prevents the |
| 405 | subrule matching forever in some cases). |
| 406 | .PP |
| 407 | Since a repeated subrule may match many instances of the subrule itself, the |
| 408 | value associated with it is not a simple scalar, but rather a reference to a |
| 409 | list of scalars, each of which is the value associated with one of the |
| 410 | individual subrule matches. In other words in the rule: |
| 411 | .PP |
| 412 | .Vb 1 |
| 413 | \& program: statement(s) |
| 414 | .Ve |
| 415 | .PP |
| 416 | the value associated with the repeated subrule \*(L"statement(s)\*(R" is a reference |
| 417 | to an array containing the values matched by each call to the individual |
| 418 | subrule \*(L"statement\*(R". |
| 419 | .PP |
| 420 | Repetition modifiers may include a separator pattern: |
| 421 | .PP |
| 422 | .Vb 1 |
| 423 | \& program: statement(s /;/) |
| 424 | .Ve |
| 425 | .PP |
| 426 | specifying some sequence of characters to be skipped between each repetition. |
| 427 | This is really just a shorthand for the <leftop:...> directive |
| 428 | (see below). |
| 429 | .Sh "Tokens" |
| 430 | .IX Subsection "Tokens" |
| 431 | If a quote-delimited string or a Perl regex appears in a production, |
| 432 | the parser attempts to match that string or pattern at that point in |
| 433 | the text. For example: |
| 434 | .PP |
| 435 | .Vb 1 |
| 436 | \& typedef: "typedef" typename identifier ';' |
| 437 | .Ve |
| 438 | .PP |
| 439 | .Vb 1 |
| 440 | \& identifier: /[A-Za-z_][A-Za-z0-9_]*/ |
| 441 | .Ve |
| 442 | .PP |
| 443 | As in regular Perl, a single quoted string is uninterpolated, whilst |
| 444 | a double-quoted string or a pattern is interpolated (at the time |
| 445 | of matching, \fInot\fR when the parser is constructed). Hence, it is |
| 446 | possible to define rules in which tokens can be set at run\-time: |
| 447 | .PP |
| 448 | .Vb 1 |
| 449 | \& typedef: "$::typedefkeyword" typename identifier ';' |
| 450 | .Ve |
| 451 | .PP |
| 452 | .Vb 1 |
| 453 | \& identifier: /$::identpat/ |
| 454 | .Ve |
| 455 | .PP |
| 456 | Note that, since each rule is implemented inside a special namespace |
| 457 | belonging to its parser, it is necessary to explicitly qualify |
| 458 | variables from the main package. |
| 459 | .PP |
| 460 | Regex tokens can be specified using just slashes as delimiters |
| 461 | or with the explicit \f(CW\*(C`m<delimiter>......<delimiter>\*(C'\fR syntax: |
| 462 | .PP |
| 463 | .Vb 1 |
| 464 | \& typedef: "typedef" typename identifier ';' |
| 465 | .Ve |
| 466 | .PP |
| 467 | .Vb 1 |
| 468 | \& typename: /[A-Za-z_][A-Za-z0-9_]*/ |
| 469 | .Ve |
| 470 | .PP |
| 471 | .Vb 1 |
| 472 | \& identifier: m{[A-Za-z_][A-Za-z0-9_]*} |
| 473 | .Ve |
| 474 | .PP |
| 475 | A regex of either type can also have any valid trailing parameter(s) |
| 476 | (that is, any of [cgimsox]): |
| 477 | .PP |
| 478 | .Vb 1 |
| 479 | \& typedef: "typedef" typename identifier ';' |
| 480 | .Ve |
| 481 | .PP |
| 482 | .Vb 3 |
| 483 | \& identifier: / [a-z_] # LEADING ALPHA OR UNDERSCORE |
| 484 | \& [a-z0-9_]* # THEN DIGITS ALSO ALLOWED |
| 485 | \& /ix # CASE/SPACE/COMMENT INSENSITIVE |
| 486 | .Ve |
| 487 | .PP |
| 488 | The value associated with any successfully matched token is a string |
| 489 | containing the actual text which was matched by the token. |
| 490 | .PP |
| 491 | It is important to remember that, since each grammar is specified in a |
| 492 | Perl string, all instances of the universal escape character '\e' within |
| 493 | a grammar must be \*(L"doubled\*(R", so that they interpolate to single '\e's when |
| 494 | the string is compiled. For example, to use the grammar: |
| 495 | .PP |
| 496 | .Vb 3 |
| 497 | \& word: /\eS+/ | backslash |
| 498 | \& line: prefix word(s) "\en" |
| 499 | \& backslash: '\e\e' |
| 500 | .Ve |
| 501 | .PP |
| 502 | the following code is required: |
| 503 | .PP |
| 504 | .Vb 1 |
| 505 | \& $parser = new Parse::RecDescent (q{ |
| 506 | .Ve |
| 507 | .PP |
| 508 | .Vb 3 |
| 509 | \& word: /\e\eS+/ | backslash |
| 510 | \& line: prefix word(s) "\e\en" |
| 511 | \& backslash: '\e\e\e\e' |
| 512 | .Ve |
| 513 | .PP |
| 514 | .Vb 1 |
| 515 | \& }); |
| 516 | .Ve |
| 517 | .Sh "Terminal Separators" |
| 518 | .IX Subsection "Terminal Separators" |
| 519 | For the purpose of matching, each terminal in a production is considered |
| 520 | to be preceded by a \*(L"prefix\*(R" \- a pattern which must be |
| 521 | matched before a token match is attempted. By default, the |
| 522 | prefix is optional whitespace (which always matches, at |
| 523 | least trivially), but this default may be reset in any production. |
| 524 | .PP |
| 525 | The variable \f(CW$Parse::RecDescent::skip\fR stores the universal |
| 526 | prefix, which is the default for all terminal matches in all parsers |
| 527 | built with \f(CW\*(C`Parse::RecDescent\*(C'\fR. |
| 528 | .PP |
| 529 | The prefix for an individual production can be altered |
| 530 | by using the \f(CW\*(C`<skip:...>\*(C'\fR directive (see below). |
| 531 | .Sh "Actions" |
| 532 | .IX Subsection "Actions" |
| 533 | An action is a block of Perl code which is to be executed (as the |
| 534 | block of a \f(CW\*(C`do\*(C'\fR statement) when the parser reaches that point in a |
| 535 | production. The action executes within a special namespace belonging to |
| 536 | the active parser, so care must be taken in correctly qualifying variable |
| 537 | names (see also \*(L"Start\-up Actions\*(R" below). |
| 538 | .PP |
| 539 | The action is considered to succeed if the final value of the block |
| 540 | is defined (that is, if the implied \f(CW\*(C`do\*(C'\fR statement evaluates to a |
| 541 | defined value \- \fIeven one which would be treated as \*(L"false\*(R"\fR). Note |
| 542 | that the value associated with a successful action is also the final |
| 543 | value in the block. |
| 544 | .PP |
| 545 | An action will \fIfail\fR if its last evaluated value is \f(CW\*(C`undef\*(C'\fR. This is |
| 546 | surprisingly easy to accomplish by accident. For instance, here's an |
| 547 | infuriating case of an action that makes its production fail, but only |
| 548 | when debugging \fIisn't\fR activated: |
| 549 | .PP |
| 550 | .Vb 4 |
| 551 | \& description: name rank serial_number |
| 552 | \& { print "Got $item[2] $item[1] ($item[3])\en" |
| 553 | \& if $::debugging |
| 554 | \& } |
| 555 | .Ve |
| 556 | .PP |
| 557 | If \f(CW$debugging\fR is false, no statement in the block is executed, so |
| 558 | the final value is \f(CW\*(C`undef\*(C'\fR, and the entire production fails. The solution is: |
| 559 | .PP |
| 560 | .Vb 5 |
| 561 | \& description: name rank serial_number |
| 562 | \& { print "Got $item[2] $item[1] ($item[3])\en" |
| 563 | \& if $::debugging; |
| 564 | \& 1; |
| 565 | \& } |
| 566 | .Ve |
| 567 | .PP |
| 568 | Within an action, a number of useful parse-time variables are |
| 569 | available in the special parser namespace (there are other variables |
| 570 | also accessible, but meddling with them will probably just break your |
| 571 | parser. As a general rule, if you avoid referring to unqualified |
| 572 | variables \- especially those starting with an underscore \- inside an action, |
| 573 | things should be okay): |
| 574 | .ie n .IP "@item\fR and \f(CW%item" 4 |
| 575 | .el .IP "\f(CW@item\fR and \f(CW%item\fR" 4 |
| 576 | .IX Item "@item and %item" |
| 577 | The array slice \f(CW@item[1..$#item]\fR stores the value associated with each item |
| 578 | (that is, each subrule, token, or action) in the current production. The |
| 579 | analogy is to \f(CW$1\fR, \f(CW$2\fR, etc. in a \fIyacc\fR grammar. |
| 580 | Note that, for obvious reasons, \f(CW@item\fR only contains the |
| 581 | values of items \fIbefore\fR the current point in the production. |
| 582 | .Sp |
| 583 | The first element (\f(CW$item[0]\fR) stores the name of the current rule |
| 584 | being matched. |
| 585 | .Sp |
| 586 | \&\f(CW@item\fR is a standard Perl array, so it can also be indexed with negative |
| 587 | numbers, representing the number of items \fIback\fR from the current position in |
| 588 | the parse: |
| 589 | .Sp |
| 590 | .Vb 3 |
| 591 | \& stuff: /various/ bits 'and' pieces "then" data 'end' |
| 592 | \& { print $item[-2] } # PRINTS data |
| 593 | \& # (EASIER THAN: $item[6]) |
| 594 | .Ve |
| 595 | .Sp |
| 596 | The \f(CW%item\fR hash complements the \f(CW@item\fR array, providing named |
| 597 | access to the same item values: |
| 598 | .Sp |
| 599 | .Vb 3 |
| 600 | \& stuff: /various/ bits 'and' pieces "then" data 'end' |
| 601 | \& { print $item{data} } # PRINTS data |
| 602 | \& # (EVEN EASIER THAN USING @item) |
| 603 | .Ve |
| 604 | .Sp |
| 605 | The results of named subrules are stored in the hash under each |
| 606 | subrule's name, whilst all other items are stored under a \*(L"named |
| 607 | positional\*(R" key that indicates their ordinal position within their item |
| 608 | type: _\|_STRING\fIn\fR_\|_, _\|_PATTERN\fIn\fR_\|_, _\|_DIRECTIVE\fIn\fR_\|_, _\|_ACTION\fIn\fR_\|_: |
| 609 | .Sp |
| 610 | .Vb 6 |
| 611 | \& stuff: /various/ bits 'and' pieces "then" data 'end' { save } |
| 612 | \& { print $item{__PATTERN1__}, # PRINTS 'various' |
| 613 | \& $item{__STRING2__}, # PRINTS 'then' |
| 614 | \& $item{__ACTION1__}, # PRINTS RETURN |
| 615 | \& # VALUE OF save |
| 616 | \& } |
| 617 | .Ve |
| 618 | .Sp |
| 619 | If you want proper \fInamed\fR access to patterns or literals, you need to turn |
| 620 | them into separate rules: |
| 621 | .Sp |
| 622 | .Vb 3 |
| 623 | \& stuff: various bits 'and' pieces "then" data 'end' |
| 624 | \& { print $item{various} # PRINTS various |
| 625 | \& } |
| 626 | .Ve |
| 627 | .Sp |
| 628 | .Vb 1 |
| 629 | \& various: /various/ |
| 630 | .Ve |
| 631 | .Sp |
| 632 | The special entry \f(CW$item{_\|_RULE_\|_}\fR stores the name of the current |
| 633 | rule (i.e. the same value as \f(CW$item[0]\fR). |
| 634 | .Sp |
| 635 | The advantage of using \f(CW%item\fR, instead of \f(CW@item\fR, is that it |
| 636 | removes the need to track item positions that may change as a grammar |
| 637 | evolves. For example, adding an interim \f(CW\*(C`<skip>\*(C'\fR directive |
| 638 | or action can silently ruin a trailing action, by moving an \f(CW@item\fR |
| 639 | element \*(L"down\*(R" the array one place. In contrast, the named entry |
| 640 | of \f(CW%item\fR is unaffected by such an insertion. |
| 641 | .Sp |
| 642 | A limitation of the \f(CW%item\fR hash is that it only records the \fIlast\fR |
| 643 | value of a particular subrule. For example: |
| 644 | .Sp |
| 645 | .Vb 2 |
| 646 | \& range: '(' number '..' number ')' |
| 647 | \& { $return = $item{number} } |
| 648 | .Ve |
| 649 | .Sp |
| 650 | will return only the value corresponding to the \fIsecond\fR match of the |
| 651 | \&\f(CW\*(C`number\*(C'\fR subrule. In other words, successive calls to a subrule |
| 652 | overwrite the corresponding entry in \f(CW%item\fR. Once again, the |
| 653 | solution is to rename each subrule in its own rule: |
| 654 | .Sp |
| 655 | .Vb 2 |
| 656 | \& range: '(' from_num '..' to_num ')' |
| 657 | \& { $return = $item{from_num} } |
| 658 | .Ve |
| 659 | .Sp |
| 660 | .Vb 2 |
| 661 | \& from_num: number |
| 662 | \& to_num: number |
| 663 | .Ve |
| 664 | .ie n .IP "@arg\fR and \f(CW%arg" 4 |
| 665 | .el .IP "\f(CW@arg\fR and \f(CW%arg\fR" 4 |
| 666 | .IX Item "@arg and %arg" |
| 667 | The array \f(CW@arg\fR and the hash \f(CW%arg\fR store any arguments passed to |
| 668 | the rule from some other rule (see \*(L"Subrule argument lists\*(R"). Changes |
| 669 | to the elements of either variable do not propagate back to the calling |
| 670 | rule (data can be passed back from a subrule via the \f(CW$return\fR |
| 671 | variable \- see next item). |
| 672 | .ie n .IP "$return" 4 |
| 673 | .el .IP "\f(CW$return\fR" 4 |
| 674 | .IX Item "$return" |
| 675 | If a value is assigned to \f(CW$return\fR within an action, that value is |
| 676 | returned if the production containing the action eventually matches |
| 677 | successfully. Note that setting \f(CW$return\fR \fIdoesn't\fR cause the current |
| 678 | production to succeed. It merely tells it what to return if it \fIdoes\fR succeed. |
| 679 | Hence \f(CW$return\fR is analogous to \f(CW$$\fR in a \fIyacc\fR grammar. |
| 680 | .Sp |
| 681 | If \f(CW$return\fR is not assigned within a production, the value of the |
| 682 | last component of the production (namely: \f(CW$item[$#item]\fR) is |
| 683 | returned if the production succeeds. |
| 684 | .ie n .IP "$commit" 4 |
| 685 | .el .IP "\f(CW$commit\fR" 4 |
| 686 | .IX Item "$commit" |
| 687 | The current state of commitment to the current production (see \*(L"Directives\*(R" |
| 688 | below). |
| 689 | .ie n .IP "$skip" 4 |
| 690 | .el .IP "\f(CW$skip\fR" 4 |
| 691 | .IX Item "$skip" |
| 692 | The current terminal prefix (see \*(L"Directives\*(R" below). |
| 693 | .ie n .IP "$text" 4 |
| 694 | .el .IP "\f(CW$text\fR" 4 |
| 695 | .IX Item "$text" |
| 696 | The remaining (unparsed) text. Changes to \f(CW$text\fR \fIdo not |
| 697 | propagate\fR out of unsuccessful productions, but \fIdo\fR survive |
| 698 | successful productions. Hence it is possible to dynamically alter the |
| 699 | text being parsed \- for example, to provide a \f(CW\*(C`#include\*(C'\fR\-like facility: |
| 700 | .Sp |
| 701 | .Vb 2 |
| 702 | \& hash_include: '#include' filename |
| 703 | \& { $text = ::loadfile($item[2]) . $text } |
| 704 | .Ve |
| 705 | .Sp |
| 706 | .Vb 2 |
| 707 | \& filename: '<' /[a-z0-9._-]+/i '>' { $return = $item[2] } |
| 708 | \& | '"' /[a-z0-9._-]+/i '"' { $return = $item[2] } |
| 709 | .Ve |
| 710 | .ie n .IP "$thisline\fR and \f(CW$prevline" 4 |
| 711 | .el .IP "\f(CW$thisline\fR and \f(CW$prevline\fR" 4 |
| 712 | .IX Item "$thisline and $prevline" |
| 713 | \&\f(CW$thisline\fR stores the current line number within the current parse |
| 714 | (starting from 1). \f(CW$prevline\fR stores the line number for the last |
| 715 | character which was already successfully parsed (this will be different from |
| 716 | \&\f(CW$thisline\fR at the end of each line). |
| 717 | .Sp |
| 718 | For efficiency, \f(CW$thisline\fR and \f(CW$prevline\fR are actually tied |
| 719 | scalars, and only recompute the required line number when the variable's |
| 720 | value is used. |
| 721 | .Sp |
| 722 | Assignment to \f(CW$thisline\fR adjusts the line number calculator, so that |
| 723 | it believes that the current line number is the value being assigned. Note |
| 724 | that this adjustment will be reflected in all subsequent line number |
| 725 | calculations. |
| 726 | .Sp |
| 727 | Modifying the value of the variable \f(CW$text\fR (as in the previous |
| 728 | \&\f(CW\*(C`hash_include\*(C'\fR example, for instance) will confuse the line |
| 729 | counting mechanism. To prevent this, you should call |
| 730 | \&\f(CW\*(C`Parse::RecDescent::LineCounter::resync($thisline)\*(C'\fR \fIimmediately\fR |
| 731 | after any assignment to the variable \f(CW$text\fR (or, at least, before the |
| 732 | next attempt to use \f(CW$thisline\fR). |
| 733 | .Sp |
| 734 | Note that if a production fails after assigning to or |
| 735 | resync'ing \f(CW$thisline\fR, the parser's line counter mechanism will |
| 736 | usually be corrupted. |
| 737 | .Sp |
| 738 | Also see the entry for \f(CW@itempos\fR. |
| 739 | .Sp |
| 740 | The line number can be set to values other than 1, by calling the start |
| 741 | rule with a second argument. For example: |
| 742 | .Sp |
| 743 | .Vb 1 |
| 744 | \& $parser = new Parse::RecDescent ($grammar); |
| 745 | .Ve |
| 746 | .Sp |
| 747 | .Vb 1 |
| 748 | \& $parser->input($text, 10); # START LINE NUMBERS AT 10 |
| 749 | .Ve |
| 750 | .ie n .IP "$thiscolumn\fR and \f(CW$prevcolumn" 4 |
| 751 | .el .IP "\f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR" 4 |
| 752 | .IX Item "$thiscolumn and $prevcolumn" |
| 753 | \&\f(CW$thiscolumn\fR stores the current column number within the current line |
| 754 | being parsed (starting from 1). \f(CW$prevcolumn\fR stores the column number |
| 755 | of the last character which was actually successfully parsed. Usually |
| 756 | \&\f(CW\*(C`$prevcolumn == $thiscolumn\-1\*(C'\fR, but not at the end of lines. |
| 757 | .Sp |
| 758 | For efficiency, \f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR are |
| 759 | actually tied hashes, and only recompute the required column number |
| 760 | when the variable's value is used. |
| 761 | .Sp |
| 762 | Assignment to \f(CW$thiscolumn\fR or \f(CW$prevcolumn\fR is a fatal error. |
| 763 | .Sp |
| 764 | Modifying the value of the variable \f(CW$text\fR (as in the previous |
| 765 | \&\f(CW\*(C`hash_include\*(C'\fR example, for instance) may confuse the column |
| 766 | counting mechanism. |
| 767 | .Sp |
| 768 | Note that \f(CW$thiscolumn\fR reports the column number \fIbefore\fR any |
| 769 | whitespace that might be skipped before reading a token. Hence |
| 770 | if you wish to know where a token started (and ended) use something like this: |
| 771 | .Sp |
| 772 | .Vb 2 |
| 773 | \& rule: token1 token2 startcol token3 endcol token4 |
| 774 | \& { print "token3: columns $item[3] to $item[5]"; } |
| 775 | .Ve |
| 776 | .Sp |
| 777 | .Vb 2 |
| 778 | \& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP |
| 779 | \& endcol: { $prevcolumn } |
| 780 | .Ve |
| 781 | .Sp |
| 782 | Also see the entry for \f(CW@itempos\fR. |
| 783 | .ie n .IP "$thisoffset\fR and \f(CW$prevoffset" 4 |
| 784 | .el .IP "\f(CW$thisoffset\fR and \f(CW$prevoffset\fR" 4 |
| 785 | .IX Item "$thisoffset and $prevoffset" |
| 786 | \&\f(CW$thisoffset\fR stores the offset of the current parsing position |
| 787 | within the complete text |
| 788 | being parsed (starting from 0). \f(CW$prevoffset\fR stores the offset |
| 789 | of the last character which was actually successfully parsed. In all |
| 790 | cases \f(CW\*(C`$prevoffset == $thisoffset\-1\*(C'\fR. |
| 791 | .Sp |
| 792 | For efficiency, \f(CW$thisoffset\fR and \f(CW$prevoffset\fR are |
| 793 | actually tied hashes, and only recompute the required offset |
| 794 | when the variable's value is used. |
| 795 | .Sp |
| 796 | Assignment to \f(CW$thisoffset\fR or \f(CW$prevoffset\fR is a fatal error. |
| 797 | .Sp |
| 798 | Modifying the value of the variable \f(CW$text\fR will \fInot\fR affect the |
| 799 | offset counting mechanism. |
| 800 | .Sp |
| 801 | Also see the entry for \f(CW@itempos\fR. |
| 802 | .ie n .IP "@itempos" 4 |
| 803 | .el .IP "\f(CW@itempos\fR" 4 |
| 804 | .IX Item "@itempos" |
| 805 | The array \f(CW@itempos\fR stores a hash reference corresponding to |
| 806 | each element of \f(CW@item\fR. The elements of the hash provide the |
| 807 | following: |
| 808 | .Sp |
| 809 | .Vb 6 |
| 810 | \& $itempos[$n]{offset}{from} # VALUE OF $thisoffset BEFORE $item[$n] |
| 811 | \& $itempos[$n]{offset}{to} # VALUE OF $prevoffset AFTER $item[$n] |
| 812 | \& $itempos[$n]{line}{from} # VALUE OF $thisline BEFORE $item[$n] |
| 813 | \& $itempos[$n]{line}{to} # VALUE OF $prevline AFTER $item[$n] |
| 814 | \& $itempos[$n]{column}{from} # VALUE OF $thiscolumn BEFORE $item[$n] |
| 815 | \& $itempos[$n]{column}{to} # VALUE OF $prevcolumn AFTER $item[$n] |
| 816 | .Ve |
| 817 | .Sp |
| 818 | Note that the various \f(CW\*(C`$itempos[$n]...{from}\*(C'\fR values record the |
| 819 | appropriate value \fIafter\fR any token prefix has been skipped. |
| 820 | .Sp |
| 821 | Hence, instead of the somewhat tedious and error\-prone: |
| 822 | .Sp |
| 823 | .Vb 9 |
| 824 | \& rule: startcol token1 endcol |
| 825 | \& startcol token2 endcol |
| 826 | \& startcol token3 endcol |
| 827 | \& { print "token1: columns $item[1] |
| 828 | \& to $item[3] |
| 829 | \& token2: columns $item[4] |
| 830 | \& to $item[6] |
| 831 | \& token3: columns $item[7] |
| 832 | \& to $item[9]" } |
| 833 | .Ve |
| 834 | .Sp |
| 835 | .Vb 2 |
| 836 | \& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP |
| 837 | \& endcol: { $prevcolumn } |
| 838 | .Ve |
| 839 | .Sp |
| 840 | it is possible to write: |
| 841 | .Sp |
| 842 | .Vb 7 |
| 843 | \& rule: token1 token2 token3 |
| 844 | \& { print "token1: columns $itempos[1]{column}{from} |
| 845 | \& to $itempos[1]{column}{to} |
| 846 | \& token2: columns $itempos[2]{column}{from} |
| 847 | \& to $itempos[2]{column}{to} |
| 848 | \& token3: columns $itempos[3]{column}{from} |
| 849 | \& to $itempos[3]{column}{to}" } |
| 850 | .Ve |
| 851 | .Sp |
| 852 | Note however that (in the current implementation) the use of \f(CW@itempos\fR |
| 853 | anywhere in a grammar implies that item positioning information is |
| 854 | collected \fIeverywhere\fR during the parse. Depending on the grammar |
| 855 | and the size of the text to be parsed, this may be prohibitively |
| 856 | expensive and the explicit use of \f(CW$thisline\fR, \f(CW$thiscolumn\fR, etc. may |
| 857 | be a better choice. |
| 858 | .ie n .IP "$thisparser" 4 |
| 859 | .el .IP "\f(CW$thisparser\fR" 4 |
| 860 | .IX Item "$thisparser" |
| 861 | A reference to the \f(CW\*(C`Parse::RecDescent\*(C'\fR object through which |
| 862 | parsing was initiated. |
| 863 | .Sp |
| 864 | The value of \f(CW$thisparser\fR propagates down the subrules of a parse |
| 865 | but not back up. Hence, you can invoke subrules from another parser |
| 866 | for the scope of the current rule as follows: |
| 867 | .Sp |
| 868 | .Vb 4 |
| 869 | \& rule: subrule1 subrule2 |
| 870 | \& | { $thisparser = $::otherparser } <reject> |
| 871 | \& | subrule3 subrule4 |
| 872 | \& | subrule5 |
| 873 | .Ve |
| 874 | .Sp |
| 875 | The result is that the first production calls \*(L"subrule1\*(R" and \*(L"subrule2\*(R" of |
| 876 | the current parser, and the remaining productions call the named subrules |
| 877 | from \f(CW$::otherparser\fR. Note, however, that \*(L"Bad Things\*(R" will happen if |
| 878 | \&\f(CW\*(C`$::otherparser\*(C'\fR isn't a blessed reference and/or doesn't have methods |
| 879 | with the same names as the required subrules! |
| 880 | .ie n .IP "$thisrule" 4 |
| 881 | .el .IP "\f(CW$thisrule\fR" 4 |
| 882 | .IX Item "$thisrule" |
| 883 | A reference to the \f(CW\*(C`Parse::RecDescent::Rule\*(C'\fR object corresponding to the |
| 884 | rule currently being matched. |
| 885 | .ie n .IP "$thisprod" 4 |
| 886 | .el .IP "\f(CW$thisprod\fR" 4 |
| 887 | .IX Item "$thisprod" |
| 888 | A reference to the \f(CW\*(C`Parse::RecDescent::Production\*(C'\fR object |
| 889 | corresponding to the production currently being matched. |
| 890 | .ie n .IP "$score\fR and \f(CW$score_return" 4 |
| 891 | .el .IP "\f(CW$score\fR and \f(CW$score_return\fR" 4 |
| 892 | .IX Item "$score and $score_return" |
| 893 | \&\f(CW$score\fR stores the best production score to date, as specified by |
| 894 | an earlier \f(CW\*(C`<score:...>\*(C'\fR directive. \f(CW$score_return\fR stores |
| 895 | the corresponding return value for the successful production. |
| 896 | .Sp |
| 897 | See \*(L"Scored productions\*(R". |
| 898 | .PP |
| 899 | \&\fBWarning:\fR the parser relies on the information in the various \f(CW\*(C`this...\*(C'\fR |
| 900 | objects in some non-obvious ways. Tinkering with the other members of |
| 901 | these objects will probably cause Bad Things to happen, unless you |
| 902 | \&\fIreally\fR know what you're doing. The only exception to this advice is |
| 903 | that the use of \f(CW\*(C`$this...\->{local}\*(C'\fR is always safe. |
| 904 | .Sh "Start-up Actions" |
| 905 | .IX Subsection "Start-up Actions" |
| 906 | Any actions which appear \fIbefore\fR the first rule definition in a |
| 907 | grammar are treated as \*(L"start\-up\*(R" actions. Each such action is |
| 908 | stripped of its outermost brackets and then evaluated (in the parser's |
| 909 | special namespace) just before the rules of the grammar are first |
| 910 | compiled. |
| 911 | .PP |
| 912 | The main use of start-up actions is to declare local variables within the |
| 913 | parser's special namespace: |
| 914 | .PP |
| 915 | .Vb 1 |
| 916 | \& { my $lastitem = '???'; } |
| 917 | .Ve |
| 918 | .PP |
| 919 | .Vb 1 |
| 920 | \& list: item(s) { $return = $lastitem } |
| 921 | .Ve |
| 922 | .PP |
| 923 | .Vb 3 |
| 924 | \& item: book { $lastitem = 'book'; } |
| 925 | \& bell { $lastitem = 'bell'; } |
| 926 | \& candle { $lastitem = 'candle'; } |
| 927 | .Ve |
| 928 | .PP |
| 929 | but start-up actions can be used to execute \fIany\fR valid Perl code |
| 930 | within a parser's special namespace. |
| 931 | .PP |
| 932 | Start-up actions can appear within a grammar extension or replacement |
| 933 | (that is, a partial grammar installed via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR or |
| 934 | \&\f(CW\*(C`Parse::RecDescent::Replace()\*(C'\fR \- see \*(L"Incremental Parsing\*(R"), and will be |
| 935 | executed before the new grammar is installed. Note, however, that a |
| 936 | particular start-up action is only ever executed once. |
| 937 | .Sh "Autoactions" |
| 938 | .IX Subsection "Autoactions" |
| 939 | It is sometimes desirable to be able to specify a default action to be |
| 940 | taken at the end of every production (for example, in order to easily |
| 941 | build a parse tree). If the variable \f(CW$::RD_AUTOACTION\fR is defined |
| 942 | when \f(CW\*(C`Parse::RecDescent::new()\*(C'\fR is called, the contents of that |
| 943 | variable are treated as a specification of an action which is to be appended |
| 944 | to each production in the corresponding grammar. So, for example, to construct |
| 945 | a simple parse tree: |
| 946 | .PP |
| 947 | .Vb 1 |
| 948 | \& $::RD_AUTOACTION = q { [@item] }; |
| 949 | .Ve |
| 950 | .PP |
| 951 | .Vb 7 |
| 952 | \& $parser = new Parse::RecDescent (q{ |
| 953 | \& expression: and_expr '||' expression | and_expr |
| 954 | \& and_expr: not_expr '&&' and_expr | not_expr |
| 955 | \& not_expr: '!' brack_expr | brack_expr |
| 956 | \& brack_expr: '(' expression ')' | identifier |
| 957 | \& identifier: /[a-z]+/i |
| 958 | \& }); |
| 959 | .Ve |
| 960 | .PP |
| 961 | which is equivalent to: |
| 962 | .PP |
| 963 | .Vb 5 |
| 964 | \& $parser = new Parse::RecDescent (q{ |
| 965 | \& expression: and_expr '||' expression |
| 966 | \& { [@item] } |
| 967 | \& | and_expr |
| 968 | \& { [@item] } |
| 969 | .Ve |
| 970 | .PP |
| 971 | .Vb 4 |
| 972 | \& and_expr: not_expr '&&' and_expr |
| 973 | \& { [@item] } |
| 974 | \& | not_expr |
| 975 | \& { [@item] } |
| 976 | .Ve |
| 977 | .PP |
| 978 | .Vb 4 |
| 979 | \& not_expr: '!' brack_expr |
| 980 | \& { [@item] } |
| 981 | \& | brack_expr |
| 982 | \& { [@item] } |
| 983 | .Ve |
| 984 | .PP |
| 985 | .Vb 4 |
| 986 | \& brack_expr: '(' expression ')' |
| 987 | \& { [@item] } |
| 988 | \& | identifier |
| 989 | \& { [@item] } |
| 990 | .Ve |
| 991 | .PP |
| 992 | .Vb 3 |
| 993 | \& identifier: /[a-z]+/i |
| 994 | \& { [@item] } |
| 995 | \& }); |
| 996 | .Ve |
| 997 | .PP |
| 998 | Alternatively, we could take an object-oriented approach, using a different |
| 999 | class for each node (and also eliminating redundant intermediate nodes): |
| 1000 | .PP |
| 1001 | .Vb 2 |
| 1002 | \& $::RD_AUTOACTION = q |
| 1003 | \& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; |
| 1004 | .Ve |
| 1005 | .PP |
| 1006 | .Vb 7 |
| 1007 | \& $parser = new Parse::RecDescent (q{ |
| 1008 | \& expression: and_expr '||' expression | and_expr |
| 1009 | \& and_expr: not_expr '&&' and_expr | not_expr |
| 1010 | \& not_expr: '!' brack_expr | brack_expr |
| 1011 | \& brack_expr: '(' expression ')' | identifier |
| 1012 | \& identifier: /[a-z]+/i |
| 1013 | \& }); |
| 1014 | .Ve |
| 1015 | .PP |
| 1016 | which is equivalent to: |
| 1017 | .PP |
| 1018 | .Vb 4 |
| 1019 | \& $parser = new Parse::RecDescent (q{ |
| 1020 | \& expression: and_expr '||' expression |
| 1021 | \& { new expression_node (@item[1..3]) } |
| 1022 | \& | and_expr |
| 1023 | .Ve |
| 1024 | .PP |
| 1025 | .Vb 3 |
| 1026 | \& and_expr: not_expr '&&' and_expr |
| 1027 | \& { new and_expr_node (@item[1..3]) } |
| 1028 | \& | not_expr |
| 1029 | .Ve |
| 1030 | .PP |
| 1031 | .Vb 3 |
| 1032 | \& not_expr: '!' brack_expr |
| 1033 | \& { new not_expr_node (@item[1..2]) } |
| 1034 | \& | brack_expr |
| 1035 | .Ve |
| 1036 | .PP |
| 1037 | .Vb 3 |
| 1038 | \& brack_expr: '(' expression ')' |
| 1039 | \& { new brack_expr_node (@item[1..3]) } |
| 1040 | \& | identifier |
| 1041 | .Ve |
| 1042 | .PP |
| 1043 | .Vb 3 |
| 1044 | \& identifier: /[a-z]+/i |
| 1045 | \& { new identifier_node (@item[1]) } |
| 1046 | \& }); |
| 1047 | .Ve |
| 1048 | .PP |
| 1049 | Note that, if a production already ends in an action, no autoaction is appended |
| 1050 | to it. For example, in this version: |
| 1051 | .PP |
| 1052 | .Vb 2 |
| 1053 | \& $::RD_AUTOACTION = q |
| 1054 | \& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; |
| 1055 | .Ve |
| 1056 | .PP |
| 1057 | .Vb 8 |
| 1058 | \& $parser = new Parse::RecDescent (q{ |
| 1059 | \& expression: and_expr '||' expression | and_expr |
| 1060 | \& and_expr: not_expr '&&' and_expr | not_expr |
| 1061 | \& not_expr: '!' brack_expr | brack_expr |
| 1062 | \& brack_expr: '(' expression ')' | identifier |
| 1063 | \& identifier: /[a-z]+/i |
| 1064 | \& { new terminal_node($item[1]) } |
| 1065 | \& }); |
| 1066 | .Ve |
| 1067 | .PP |
| 1068 | each \f(CW\*(C`identifier\*(C'\fR match produces a \f(CW\*(C`terminal_node\*(C'\fR object, \fInot\fR an |
| 1069 | \&\f(CW\*(C`identifier_node\*(C'\fR object. |
| 1070 | .PP |
| 1071 | A level 1 warning is issued each time an \*(L"autoaction\*(R" is added to |
| 1072 | some production. |
| 1073 | .Sh "Autotrees" |
| 1074 | .IX Subsection "Autotrees" |
| 1075 | A commonly needed autoaction is one that builds a parse\-tree. It is moderately |
| 1076 | tricky to set up such an action (which must treat terminals differently from |
| 1077 | non\-terminals), so Parse::RecDescent simplifies the process by providing the |
| 1078 | \&\f(CW\*(C`<autotree>\*(C'\fR directive. |
| 1079 | .PP |
| 1080 | If this directive appears at the start of a grammar, it causes |
| 1081 | Parse::RecDescent to insert autoactions at the end of any rule except |
| 1082 | those which already end in an action. The action inserted depends on whether |
| 1083 | the production is an intermediate rule (two or more items), or a terminal |
| 1084 | of the grammar (i.e. a single pattern or string item). |
| 1085 | .PP |
| 1086 | So, for example, the following grammar: |
| 1087 | .PP |
| 1088 | .Vb 1 |
| 1089 | \& <autotree> |
| 1090 | .Ve |
| 1091 | .PP |
| 1092 | .Vb 7 |
| 1093 | \& file : command(s) |
| 1094 | \& command : get | set | vet |
| 1095 | \& get : 'get' ident ';' |
| 1096 | \& set : 'set' ident 'to' value ';' |
| 1097 | \& vet : 'check' ident 'is' value ';' |
| 1098 | \& ident : /\ew+/ |
| 1099 | \& value : /\ed+/ |
| 1100 | .Ve |
| 1101 | .PP |
| 1102 | is equivalent to: |
| 1103 | .PP |
| 1104 | .Vb 7 |
| 1105 | \& file : command(s) { bless \e%item, $item[0] } |
| 1106 | \& command : get { bless \e%item, $item[0] } |
| 1107 | \& | set { bless \e%item, $item[0] } |
| 1108 | \& | vet { bless \e%item, $item[0] } |
| 1109 | \& get : 'get' ident ';' { bless \e%item, $item[0] } |
| 1110 | \& set : 'set' ident 'to' value ';' { bless \e%item, $item[0] } |
| 1111 | \& vet : 'check' ident 'is' value ';' { bless \e%item, $item[0] } |
| 1112 | .Ve |
| 1113 | .PP |
| 1114 | .Vb 2 |
| 1115 | \& ident : /\ew+/ { bless {__VALUE__=>$item[1]}, $item[0] } |
| 1116 | \& value : /\ed+/ { bless {__VALUE__=>$item[1]}, $item[0] } |
| 1117 | .Ve |
| 1118 | .PP |
| 1119 | Note that each node in the tree is blessed into a class of the same name |
| 1120 | as the rule itself. This makes it easy to build object-oriented |
| 1121 | processors for the parse-trees that the grammar produces. Note too that |
| 1122 | the last two rules produce special objects with the single attribute |
| 1123 | \&'_\|_VALUE_\|_'. This is because they consist solely of a single terminal. |
| 1124 | .PP |
| 1125 | This autoaction-ed grammar would then produce a parse tree in a data |
| 1126 | structure like this: |
| 1127 | .PP |
| 1128 | .Vb 18 |
| 1129 | \& { |
| 1130 | \& file => { |
| 1131 | \& command => { |
| 1132 | \& [ get => { |
| 1133 | \& identifier => { __VALUE__ => 'a' }, |
| 1134 | \& }, |
| 1135 | \& set => { |
| 1136 | \& identifier => { __VALUE__ => 'b' }, |
| 1137 | \& value => { __VALUE__ => '7' }, |
| 1138 | \& }, |
| 1139 | \& vet => { |
| 1140 | \& identifier => { __VALUE__ => 'b' }, |
| 1141 | \& value => { __VALUE__ => '7' }, |
| 1142 | \& }, |
| 1143 | \& ], |
| 1144 | \& }, |
| 1145 | \& } |
| 1146 | \& } |
| 1147 | .Ve |
| 1148 | .PP |
| 1149 | (except, of course, that each nested hash would also be blessed into |
| 1150 | the appropriate class). |
| 1151 | .Sh "Autostubbing" |
| 1152 | .IX Subsection "Autostubbing" |
| 1153 | Normally, if a subrule appears in some production, but no rule of that |
| 1154 | name is ever defined in the grammar, the production which refers to the |
| 1155 | non-existent subrule fails immediately. This typically occurs as a |
| 1156 | result of misspellings, and is a sufficiently common occurrence that a |
| 1157 | warning is generated for such situations. |
| 1158 | .PP |
| 1159 | However, when prototyping a grammar it is sometimes useful to be |
| 1160 | able to use subrules before a proper specification of them is |
| 1161 | really possible. For example, a grammar might include a section like: |
| 1162 | .PP |
| 1163 | .Vb 1 |
| 1164 | \& function_call: identifier '(' arg(s?) ')' |
| 1165 | .Ve |
| 1166 | .PP |
| 1167 | .Vb 1 |
| 1168 | \& identifier: /[a-z]\ew*/i |
| 1169 | .Ve |
| 1170 | .PP |
| 1171 | where the possible format of an argument is sufficiently complex that |
| 1172 | it is not worth specifying in full until the general function call |
| 1173 | syntax has been debugged. In this situation it is convenient to leave |
| 1174 | the real rule \f(CW\*(C`arg\*(C'\fR undefined and just slip in a placeholder (or |
| 1175 | \&\*(L"stub\*(R"): |
| 1176 | .PP |
| 1177 | .Vb 1 |
| 1178 | \& arg: 'arg' |
| 1179 | .Ve |
| 1180 | .PP |
| 1181 | so that the function call syntax can be tested with dummy input such as: |
| 1182 | .PP |
| 1183 | .Vb 4 |
| 1184 | \& f0() |
| 1185 | \& f1(arg) |
| 1186 | \& f2(arg arg) |
| 1187 | \& f3(arg arg arg) |
| 1188 | .Ve |
| 1189 | .PP |
| 1190 | et cetera. |
| 1191 | .PP |
| 1192 | Early in prototyping, many such \*(L"stubs\*(R" may be required, so |
| 1193 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides a means of automating their definition. |
| 1194 | If the variable \f(CW$::RD_AUTOSTUB\fR is defined when a parser is built, |
| 1195 | a subrule reference to any non-existent rule (say, \f(CW\*(C`sr\*(C'\fR), |
| 1196 | causes a \*(L"stub\*(R" rule of the form: |
| 1197 | .PP |
| 1198 | .Vb 1 |
| 1199 | \& sr: 'sr' |
| 1200 | .Ve |
| 1201 | .PP |
| 1202 | to be automatically defined in the generated parser. |
| 1203 | A level 1 warning is issued for each such \*(L"autostubbed\*(R" rule. |
| 1204 | .PP |
| 1205 | Hence, with \f(CW$::RD_AUTOSTUB\fR defined, it is possible to only partially |
| 1206 | specify a grammar, and then \*(L"fake\*(R" matches of the unspecified |
| 1207 | (sub)rules by just typing in their name. |
| 1208 | .Sh "Look-ahead" |
| 1209 | .IX Subsection "Look-ahead" |
| 1210 | If a subrule, token, or action is prefixed by \*(L"...\*(R", then it is |
| 1211 | treated as a \*(L"look\-ahead\*(R" request. That means that the current production can |
| 1212 | (as usual) only succeed if the specified item is matched, but that the matching |
| 1213 | \&\fIdoes not consume any of the text being parsed\fR. This is very similar to the |
| 1214 | \&\f(CW\*(C`/(?=...)/\*(C'\fR look-ahead construct in Perl patterns. Thus, the rule: |
| 1215 | .PP |
| 1216 | .Vb 1 |
| 1217 | \& inner_word: word ...word |
| 1218 | .Ve |
| 1219 | .PP |
| 1220 | will match whatever the subrule \*(L"word\*(R" matches, provided that match is followed |
| 1221 | by some more text which subrule \*(L"word\*(R" would also match (although this |
| 1222 | second substring is not actually consumed by \*(L"inner_word\*(R"). |
| 1223 | .PP |
| 1224 | Likewise, a \*(L"...!\*(R" prefix causes the following item to succeed (without |
| 1225 | consuming any text) if and only if it would normally fail. Hence, a |
| 1226 | rule such as: |
| 1227 | .PP |
| 1228 | .Vb 1 |
| 1229 | \& identifier: ...!keyword ...!'_' /[A-Za-z_]\ew*/ |
| 1230 | .Ve |
| 1231 | .PP |
| 1232 | matches a string of characters which satisfies the pattern |
| 1233 | \&\f(CW\*(C`/[A\-Za\-z_]\ew*/\*(C'\fR, but only if the same sequence of characters would |
| 1234 | not match either subrule \*(L"keyword\*(R" or the literal token '_'. |
| 1235 | .PP |
| 1236 | Sequences of look-ahead prefixes accumulate, multiplying their positive and/or |
| 1237 | negative senses. Hence: |
| 1238 | .PP |
| 1239 | .Vb 1 |
| 1240 | \& inner_word: word ...!......!word |
| 1241 | .Ve |
| 1242 | .PP |
| 1243 | is exactly equivalent to the original example above (a warning is issued in |
| 1244 | cases like these, since they often indicate something left out, or |
| 1245 | misunderstood). |
| 1246 | .PP |
| 1247 | Note that actions can also be treated as look\-aheads. In such cases, |
| 1248 | the state of the parser text (in the local variable \f(CW$text\fR) |
| 1249 | \&\fIafter\fR the look-ahead action is guaranteed to be identical to its |
| 1250 | state \fIbefore\fR the action, regardless of how it's changed \fIwithin\fR |
| 1251 | the action (unless you actually undefine \f(CW$text\fR, in which case you |
| 1252 | get the disaster you deserve :\-). |
| 1253 | .Sh "Directives" |
| 1254 | .IX Subsection "Directives" |
| 1255 | Directives are special pre-defined actions which may be used to alter |
| 1256 | the behaviour of the parser. There are currently eighteen directives: |
| 1257 | \&\f(CW\*(C`<commit>\*(C'\fR, |
| 1258 | \&\f(CW\*(C`<uncommit>\*(C'\fR, |
| 1259 | \&\f(CW\*(C`<reject>\*(C'\fR, |
| 1260 | \&\f(CW\*(C`<score>\*(C'\fR, |
| 1261 | \&\f(CW\*(C`<autoscore>\*(C'\fR, |
| 1262 | \&\f(CW\*(C`<skip>\*(C'\fR, |
| 1263 | \&\f(CW\*(C`<resync>\*(C'\fR, |
| 1264 | \&\f(CW\*(C`<error>\*(C'\fR, |
| 1265 | \&\f(CW\*(C`<rulevar>\*(C'\fR, |
| 1266 | \&\f(CW\*(C`<matchrule>\*(C'\fR, |
| 1267 | \&\f(CW\*(C`<leftop>\*(C'\fR, |
| 1268 | \&\f(CW\*(C`<rightop>\*(C'\fR, |
| 1269 | \&\f(CW\*(C`<defer>\*(C'\fR, |
| 1270 | \&\f(CW\*(C`<nocheck>\*(C'\fR, |
| 1271 | \&\f(CW\*(C`<perl_quotelike>\*(C'\fR, |
| 1272 | \&\f(CW\*(C`<perl_codeblock>\*(C'\fR, |
| 1273 | \&\f(CW\*(C`<perl_variable>\*(C'\fR, |
| 1274 | and \f(CW\*(C`<token>\*(C'\fR. |
| 1275 | .IP "Committing and uncommitting" 4 |
| 1276 | .IX Item "Committing and uncommitting" |
| 1277 | The \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives permit the recursive |
| 1278 | descent of the parse tree to be pruned (or \*(L"cut\*(R") for efficiency. |
| 1279 | Within a rule, a \f(CW\*(C`<commit>\*(C'\fR directive instructs the rule to ignore subsequent |
| 1280 | productions if the current production fails. For example: |
| 1281 | .Sp |
| 1282 | .Vb 3 |
| 1283 | \& command: 'find' <commit> filename |
| 1284 | \& | 'open' <commit> filename |
| 1285 | \& | 'move' filename filename |
| 1286 | .Ve |
| 1287 | .Sp |
| 1288 | Clearly, if the leading token 'find' is matched in the first production but that |
| 1289 | production fails for some other reason, then the remaining |
| 1290 | productions cannot possibly match. The presence of the |
| 1291 | \&\f(CW\*(C`<commit>\*(C'\fR causes the \*(L"command\*(R" rule to fail immediately if |
| 1292 | an invalid \*(L"find\*(R" command is found, and likewise if an invalid \*(L"open\*(R" |
| 1293 | command is encountered. |
| 1294 | .Sp |
| 1295 | It is also possible to revoke a previous commitment. For example: |
| 1296 | .Sp |
| 1297 | .Vb 5 |
| 1298 | \& if_statement: 'if' <commit> condition |
| 1299 | \& 'then' block <uncommit> |
| 1300 | \& 'else' block |
| 1301 | \& | 'if' <commit> condition |
| 1302 | \& 'then' block |
| 1303 | .Ve |
| 1304 | .Sp |
| 1305 | In this case, a failure to find an \*(L"else\*(R" block in the first |
| 1306 | production shouldn't preclude trying the second production, but a |
| 1307 | failure to find a \*(L"condition\*(R" certainly should. |
| 1308 | .Sp |
| 1309 | As a special case, any production in which the \fIfirst\fR item is an |
| 1310 | \&\f(CW\*(C`<uncommit>\*(C'\fR immediately revokes a preceding \f(CW\*(C`<commit>\*(C'\fR |
| 1311 | (even though the production would not otherwise have been tried). For |
| 1312 | example, in the rule: |
| 1313 | .Sp |
| 1314 | .Vb 5 |
| 1315 | \& request: 'explain' expression |
| 1316 | \& | 'explain' <commit> keyword |
| 1317 | \& | 'save' |
| 1318 | \& | 'quit' |
| 1319 | \& | <uncommit> term '?' |
| 1320 | .Ve |
| 1321 | .Sp |
| 1322 | if the text being matched was \*(L"explain?\*(R", and the first two |
| 1323 | productions failed, then the \f(CW\*(C`<commit>\*(C'\fR in production two would cause |
| 1324 | productions three and four to be skipped, but the leading |
| 1325 | \&\f(CW\*(C`<uncommit>\*(C'\fR in production five would allow that production to |
| 1326 | attempt a match. |
| 1327 | .Sp |
| 1328 | Note in the preceding example, that the \f(CW\*(C`<commit>\*(C'\fR was only placed |
| 1329 | in production two. If production one had been: |
| 1330 | .Sp |
| 1331 | .Vb 1 |
| 1332 | \& request: 'explain' <commit> expression |
| 1333 | .Ve |
| 1334 | .Sp |
| 1335 | then production two would be (inappropriately) skipped if a leading |
| 1336 | \&\*(L"explain...\*(R" was encountered. |
| 1337 | .Sp |
| 1338 | Both \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives always succeed, and their value |
| 1339 | is always 1. |
| 1340 | .IP "Rejecting a production" 4 |
| 1341 | .IX Item "Rejecting a production" |
| 1342 | The \f(CW\*(C`<reject>\*(C'\fR directive immediately causes the current production |
| 1343 | to fail (it is exactly equivalent to, but more obvious than, the |
| 1344 | action \f(CW\*(C`{undef}\*(C'\fR). A \f(CW\*(C`<reject>\*(C'\fR is useful when it is desirable to get |
| 1345 | the side effects of the actions in one production, without prejudicing a match |
| 1346 | by some other production later in the rule. For example, to insert |
| 1347 | tracing code into the parse: |
| 1348 | .Sp |
| 1349 | .Vb 1 |
| 1350 | \& complex_rule: { print "In complex rule...\en"; } <reject> |
| 1351 | .Ve |
| 1352 | .Sp |
| 1353 | .Vb 3 |
| 1354 | \& complex_rule: simple_rule '+' 'i' '*' simple_rule |
| 1355 | \& | 'i' '*' simple_rule |
| 1356 | \& | simple_rule |
| 1357 | .Ve |
| 1358 | .Sp |
| 1359 | It is also possible to specify a conditional rejection, using the |
| 1360 | form \f(CW\*(C`<reject:\f(CIcondition\f(CW>\*(C'\fR, which only rejects if the |
| 1361 | specified condition is true. This form of rejection is exactly |
| 1362 | equivalent to the action \f(CW\*(C`{(\f(CIcondition\f(CW)?undef:1}\*(C'\fR. |
| 1363 | For example: |
| 1364 | .Sp |
| 1365 | .Vb 4 |
| 1366 | \& command: save_command |
| 1367 | \& | restore_command |
| 1368 | \& | <reject: defined $::tolerant> { exit } |
| 1369 | \& | <error: Unknown command. Ignored.> |
| 1370 | .Ve |
| 1371 | .Sp |
| 1372 | A \f(CW\*(C`<reject>\*(C'\fR directive never succeeds (and hence has no |
| 1373 | associated value). A conditional rejection may succeed (if its |
| 1374 | condition is not satisfied), in which case its value is 1. |
| 1375 | .Sp |
| 1376 | As an extra optimization, \f(CW\*(C`Parse::RecDescent\*(C'\fR ignores any production |
| 1377 | which \fIbegins\fR with an unconditional \f(CW\*(C`<reject>\*(C'\fR directive, |
| 1378 | since any such production can never successfully match or have any |
| 1379 | useful side\-effects. A level 1 warning is issued in all such cases. |
| 1380 | .Sp |
| 1381 | Note that productions beginning with conditional |
| 1382 | \&\f(CW\*(C`<reject:...>\*(C'\fR directives are \fInever\fR \*(L"optimized away\*(R" in |
| 1383 | this manner, even if they are always guaranteed to fail (for example: |
| 1384 | \&\f(CW\*(C`<reject:1>\*(C'\fR). |
| 1385 | .Sp |
| 1386 | Due to the way grammars are parsed, there is a minor restriction on the |
| 1387 | condition of a conditional \f(CW\*(C`<reject:...>\*(C'\fR: it cannot |
| 1388 | contain any raw '<' or '>' characters. For example: |
| 1389 | .Sp |
| 1390 | .Vb 1 |
| 1391 | \& line: cmd <reject: $thiscolumn > max> data |
| 1392 | .Ve |
| 1393 | .Sp |
| 1394 | results in an error when a parser is built from this grammar (since the |
| 1395 | grammar parser has no way of knowing whether the first > is a \*(L"greater than\*(R" |
| 1396 | or the end of the \f(CW\*(C`<reject:...>\*(C'\fR). |
| 1397 | .Sp |
| 1398 | To overcome this problem, put the condition inside a do{} block: |
| 1399 | .Sp |
| 1400 | .Vb 1 |
| 1401 | \& line: cmd <reject: do{$thiscolumn > max}> data |
| 1402 | .Ve |
| 1403 | .Sp |
| 1404 | Note that the same problem may occur in other directives that take |
| 1405 | arguments. The same solution will work in all cases. |
| 1406 | .IP "Skipping between terminals" 4 |
| 1407 | .IX Item "Skipping between terminals" |
| 1408 | The \f(CW\*(C`<skip>\*(C'\fR directive enables the terminal prefix used in |
| 1409 | a production to be changed. For example: |
| 1410 | .Sp |
| 1411 | .Vb 1 |
| 1412 | \& OneLiner: Command <skip:'[ \et]*'> Arg(s) /;/ |
| 1413 | .Ve |
| 1414 | .Sp |
| 1415 | causes only blanks and tabs to be skipped before terminals in the \f(CW\*(C`Arg\*(C'\fR |
| 1416 | subrule (and any of \fIits\fR subrules), and also before the final \f(CW\*(C`/;/\*(C'\fR terminal. |
| 1417 | Once the production is complete, the previous terminal prefix is |
| 1418 | reinstated. Note that this implies that distinct productions of a rule |
| 1419 | must reset their terminal prefixes individually. |
| 1420 | .Sp |
| 1421 | The \f(CW\*(C`<skip>\*(C'\fR directive evaluates to the \fIprevious\fR terminal prefix, |
| 1422 | so it's easy to reinstate a prefix later in a production: |
| 1423 | .Sp |
| 1424 | .Vb 1 |
| 1425 | \& Command: <skip:","> CSV(s) <skip:$item[1]> Modifier |
| 1426 | .Ve |
| 1427 | .Sp |
| 1428 | The value specified after the colon is interpolated into a pattern, so all of |
| 1429 | the following are equivalent (though their efficiency increases down the list): |
| 1430 | .Sp |
| 1431 | .Vb 1 |
| 1432 | \& <skip: "$colon|$comma"> # ASSUMING THE VARS HOLD THE OBVIOUS VALUES |
| 1433 | .Ve |
| 1434 | .Sp |
| 1435 | .Vb 1 |
| 1436 | \& <skip: ':|,'> |
| 1437 | .Ve |
| 1438 | .Sp |
| 1439 | .Vb 1 |
| 1440 | \& <skip: q{[:,]}> |
| 1441 | .Ve |
| 1442 | .Sp |
| 1443 | .Vb 1 |
| 1444 | \& <skip: qr/[:,]/> |
| 1445 | .Ve |
| 1446 | .Sp |
| 1447 | There is no way of directly setting the prefix for |
| 1448 | an entire rule, except as follows: |
| 1449 | .Sp |
| 1450 | .Vb 3 |
| 1451 | \& Rule: <skip: '[ \et]*'> Prod1 |
| 1452 | \& | <skip: '[ \et]*'> Prod2a Prod2b |
| 1453 | \& | <skip: '[ \et]*'> Prod3 |
| 1454 | .Ve |
| 1455 | .Sp |
| 1456 | or, better: |
| 1457 | .Sp |
| 1458 | .Vb 6 |
| 1459 | \& Rule: <skip: '[ \et]*'> |
| 1460 | \& ( |
| 1461 | \& Prod1 |
| 1462 | \& | Prod2a Prod2b |
| 1463 | \& | Prod3 |
| 1464 | \& ) |
| 1465 | .Ve |
| 1466 | .Sp |
| 1467 | \&\fBNote: Up to release 1.51 of Parse::RecDescent, an entirely different |
| 1468 | mechanism was used for specifying terminal prefixes. The current method |
| 1469 | is not backwards-compatible with that early approach. The current approach |
| 1470 | is stable and will not change again.\fR |
| 1471 | .IP "Resynchronization" 4 |
| 1472 | .IX Item "Resynchronization" |
| 1473 | The \f(CW\*(C`<resync>\*(C'\fR directive provides a visually distinctive |
| 1474 | means of consuming some of the text being parsed, usually to skip an |
| 1475 | erroneous input. In its simplest form \f(CW\*(C`<resync>\*(C'\fR simply |
| 1476 | consumes text up to and including the next newline (\f(CW"\en"\fR) |
| 1477 | character, succeeding only if the newline is found, in which case it |
| 1478 | causes its surrounding rule to return zero on success. |
| 1479 | .Sp |
| 1480 | In other words, a \f(CW\*(C`<resync>\*(C'\fR is exactly equivalent to the token |
| 1481 | \&\f(CW\*(C`/[^\en]*\en/\*(C'\fR followed by the action \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR (except that |
| 1482 | productions beginning with a \f(CW\*(C`<resync>\*(C'\fR are ignored when generating |
| 1483 | error messages). A typical use might be: |
| 1484 | .Sp |
| 1485 | .Vb 1 |
| 1486 | \& script : command(s) |
| 1487 | .Ve |
| 1488 | .Sp |
| 1489 | .Vb 3 |
| 1490 | \& command: save_command |
| 1491 | \& | restore_command |
| 1492 | \& | <resync> # TRY NEXT LINE, IF POSSIBLE |
| 1493 | .Ve |
| 1494 | .Sp |
| 1495 | It is also possible to explicitly specify a resynchronization |
| 1496 | pattern, using the \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR variant. This version |
| 1497 | succeeds only if the specified pattern matches (and consumes) the |
| 1498 | parsed text. In other words, \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR is exactly |
| 1499 | equivalent to the token \f(CW\*(C`/\f(CIpattern\f(CW/\*(C'\fR (followed by a \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR |
| 1500 | action). For example, if commands were terminated by newlines or semi\-colons: |
| 1501 | .Sp |
| 1502 | .Vb 3 |
| 1503 | \& command: save_command |
| 1504 | \& | restore_command |
| 1505 | \& | <resync:[^;\en]*[;\en]> |
| 1506 | .Ve |
| 1507 | .Sp |
| 1508 | The value of a successfully matched \f(CW\*(C`<resync>\*(C'\fR directive (of either |
| 1509 | type) is the text that it consumed. Note, however, that since the |
| 1510 | directive also sets \f(CW$return\fR, a production consisting of a lone |
| 1511 | \&\f(CW\*(C`<resync>\*(C'\fR succeeds but returns the value zero (which a calling rule |
| 1512 | may find useful to distinguish between \*(L"true\*(R" matches and \*(L"tolerant\*(R" matches). |
| 1513 | Remember that returning a zero value indicates that the rule \fIsucceeded\fR (since |
| 1514 | only an \f(CW\*(C`undef\*(C'\fR denotes failure within \f(CW\*(C`Parse::RecDescent\*(C'\fR parsers). |
| 1515 | .IP "Error handling" 4 |
| 1516 | .IX Item "Error handling" |
| 1517 | The \f(CW\*(C`<error>\*(C'\fR directive provides automatic or user-defined |
| 1518 | generation of error messages during a parse. In its simplest form |
| 1519 | \&\f(CW\*(C`<error>\*(C'\fR prepares an error message based on |
| 1520 | the mismatch between the last item expected and the text which caused |
| 1521 | it to fail. For example, given the rule: |
| 1522 | .Sp |
| 1523 | .Vb 3 |
| 1524 | \& McCoy: curse ',' name ', I'm a doctor, not a' a_profession '!' |
| 1525 | \& | pronoun 'dead,' name '!' |
| 1526 | \& | <error> |
| 1527 | .Ve |
| 1528 | .Sp |
| 1529 | the following strings would produce the following messages: |
| 1530 | .RS 4 |
| 1531 | .ie n .IP """Amen, Jim!""" 4 |
| 1532 | .el .IP "``Amen, Jim!''" 4 |
| 1533 | .IX Item "Amen, Jim!" |
| 1534 | .Vb 2 |
| 1535 | \& ERROR (line 1): Invalid McCoy: Expected curse or pronoun |
| 1536 | \& not found |
| 1537 | .Ve |
| 1538 | .ie n .IP """Dammit, Jim, I'm a doctor!""" 4 |
| 1539 | .el .IP "``Dammit, Jim, I'm a doctor!''" 4 |
| 1540 | .IX Item "Dammit, Jim, I'm a doctor!" |
| 1541 | .Vb 2 |
| 1542 | \& ERROR (line 1): Invalid McCoy: Expected ", I'm a doctor, not a" |
| 1543 | \& but found ", I'm a doctor!" instead |
| 1544 | .Ve |
| 1545 | .ie n .IP """He's dead,\en""" 4 |
| 1546 | .el .IP "``He's dead,\en''" 4 |
| 1547 | .IX Item "He's dead,n" |
| 1548 | .Vb 1 |
| 1549 | \& ERROR (line 2): Invalid McCoy: Expected name not found |
| 1550 | .Ve |
| 1551 | .ie n .IP """He's alive!""" 4 |
| 1552 | .el .IP "``He's alive!''" 4 |
| 1553 | .IX Item "He's alive!" |
| 1554 | .Vb 2 |
| 1555 | \& ERROR (line 1): Invalid McCoy: Expected 'dead,' but found |
| 1556 | \& "alive!" instead |
| 1557 | .Ve |
| 1558 | .ie n .IP """Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!""" 4 |
| 1559 | .el .IP "``Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!''" 4 |
| 1560 | .IX Item "Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!" |
| 1561 | .Vb 2 |
| 1562 | \& ERROR (line 1): Invalid McCoy: Expected a profession but found |
| 1563 | \& "pointy-eared Vulcan!" instead |
| 1564 | .Ve |
| 1565 | .RE |
| 1566 | .RS 4 |
| 1567 | .Sp |
| 1568 | Note that, when autogenerating error messages, all underscores in any |
| 1569 | rule name used in a message are replaced by single spaces (for example |
| 1570 | \&\*(L"a_production\*(R" becomes \*(L"a production\*(R"). Judicious choice of rule |
| 1571 | names can therefore considerably improve the readability of automatic |
| 1572 | error messages (as well as the maintainability of the original |
| 1573 | grammar). |
| 1574 | .Sp |
| 1575 | If the automatically generated error is not sufficient, it is possible to |
| 1576 | provide an explicit message as part of the error directive. For example: |
| 1577 | .Sp |
| 1578 | .Vb 3 |
| 1579 | \& Spock: "Fascinating" ',' (name | 'Captain') '.' |
| 1580 | \& | "Highly illogical, doctor." |
| 1581 | \& | <error: He never said that!> |
| 1582 | .Ve |
| 1583 | .Sp |
| 1584 | which would result in \fIall\fR failures to parse a \*(L"Spock\*(R" subrule printing the |
| 1585 | following message: |
| 1586 | .Sp |
| 1587 | .Vb 1 |
| 1588 | \& ERROR (line <N>): Invalid Spock: He never said that! |
| 1589 | .Ve |
| 1590 | .Sp |
| 1591 | The error message is treated as a \*(L"qq{...}\*(R" string and interpolated |
| 1592 | when the error is generated (\fInot\fR when the directive is specified!). |
| 1593 | Hence: |
| 1594 | .Sp |
| 1595 | .Vb 1 |
| 1596 | \& <error: Mystical error near "$text"> |
| 1597 | .Ve |
| 1598 | .Sp |
| 1599 | would correctly insert the ambient text string which caused the error. |
| 1600 | .Sp |
| 1601 | There are two other forms of error directive: \f(CW\*(C`<error?>\*(C'\fR and |
| 1602 | \&\f(CW\*(C`<error?:\ msg>\*(C'\fR. These behave just like \f(CW\*(C`<error>\*(C'\fR |
| 1603 | and \f(CW\*(C`<error:\ msg>\*(C'\fR respectively, except that they are |
| 1604 | only triggered if the rule is \*(L"committed\*(R" at the time they are |
| 1605 | encountered. For example: |
| 1606 | .Sp |
| 1607 | .Vb 3 |
| 1608 | \& Scotty: "Ya kenna change the Laws of Phusics," <commit> name |
| 1609 | \& | name <commit> ',' 'she's goanta blaw!' |
| 1610 | \& | <error?> |
| 1611 | .Ve |
| 1612 | .Sp |
| 1613 | will only generate an error for a string beginning with \*(L"Ya kenna |
| 1614 | change the Laws of Phusics,\*(R" or a valid name, but which still fails to match the |
| 1615 | corresponding production. That is, \f(CW\*(C`$parser\->Scotty("Aye, Cap'ain")\*(C'\fR will |
| 1616 | fail silently (since neither production will \*(L"commit\*(R" the rule on that |
| 1617 | input), whereas \f(CW\*(C`$parser\->Scotty("Mr\ Spock,\ ah\ jest\ kenna\ do'ut!")\*(C'\fR |
| 1618 | will fail with the error message: |
| 1619 | .Sp |
| 1620 | .Vb 2 |
| 1621 | \& ERROR (line 1): Invalid Scotty: expected 'she's goanta blaw!' |
| 1622 | \& but found 'I jest kenna do'ut!' instead. |
| 1623 | .Ve |
| 1624 | .Sp |
| 1625 | since in that case the second production would commit after matching |
| 1626 | the leading name. |
| 1627 | .Sp |
| 1628 | Note that to allow this behaviour, all \f(CW\*(C`<error>\*(C'\fR directives which are |
| 1629 | the first item in a production automatically uncommit the rule just |
| 1630 | long enough to allow their production to be attempted (that is, when |
| 1631 | their production fails, the commitment is reinstated so that |
| 1632 | subsequent productions are skipped). |
| 1633 | .Sp |
| 1634 | In order to \fIpermanently\fR uncommit the rule before an error message, |
| 1635 | it is necessary to put an explicit \f(CW\*(C`<uncommit>\*(C'\fR before the |
| 1636 | \&\f(CW\*(C`<error>\*(C'\fR. For example: |
| 1637 | .Sp |
| 1638 | .Vb 5 |
| 1639 | \& line: 'Kirk:' <commit> Kirk |
| 1640 | \& | 'Spock:' <commit> Spock |
| 1641 | \& | 'McCoy:' <commit> McCoy |
| 1642 | \& | <uncommit> <error?> <reject> |
| 1643 | \& | <resync> |
| 1644 | .Ve |
| 1645 | .Sp |
| 1646 | Error messages generated by the various \f(CW\*(C`<error...>\*(C'\fR directives |
| 1647 | are not displayed immediately. Instead, they are \*(L"queued\*(R" in a buffer and |
| 1648 | are only displayed once parsing ultimately fails. Moreover, |
| 1649 | \&\f(CW\*(C`<error...>\*(C'\fR directives that cause one production of a rule |
| 1650 | to fail are automatically removed from the message queue |
| 1651 | if another production subsequently causes the entire rule to succeed. |
| 1652 | This means that you can put |
| 1653 | \&\f(CW\*(C`<error...>\*(C'\fR directives wherever useful diagnosis can be done, |
| 1654 | and only those associated with actual parser failure will ever be |
| 1655 | displayed. Also see \*(L"Gotchas\*(R". |
| 1656 | .Sp |
| 1657 | As a general rule, the most useful diagnostics are usually generated |
| 1658 | either at the very lowest level within the grammar, or at the very |
| 1659 | highest. A good rule of thumb is to identify those subrules which |
| 1660 | consist mainly (or entirely) of terminals, and then put an |
| 1661 | \&\f(CW\*(C`<error...>\*(C'\fR directive at the end of any other rule which calls |
| 1662 | one or more of those subrules. |
| 1663 | .Sp |
| 1664 | There is one other situation in which the output of the various types of |
| 1665 | error directive is suppressed; namely, when the rule containing them |
| 1666 | is being parsed as part of a \*(L"look\-ahead\*(R" (see \*(L"Look\-ahead\*(R"). In this |
| 1667 | case, the error directive will still cause the rule to fail, but will do |
| 1668 | so silently. |
| 1669 | .Sp |
| 1670 | An unconditional \f(CW\*(C`<error>\*(C'\fR directive always fails (and hence has no |
| 1671 | associated value). This means that encountering such a directive |
| 1672 | always causes the production containing it to fail. Hence an |
| 1673 | \&\f(CW\*(C`<error>\*(C'\fR directive will inevitably be the last (useful) item of a |
| 1674 | rule (a level 3 warning is issued if a production contains items after an unconditional |
| 1675 | \&\f(CW\*(C`<error>\*(C'\fR directive). |
| 1676 | .Sp |
| 1677 | An \f(CW\*(C`<error?>\*(C'\fR directive will \fIsucceed\fR (that is: fail to fail :\-), if |
| 1678 | the current rule is uncommitted when the directive is encountered. In |
| 1679 | that case the directive's associated value is zero. Hence, this type |
| 1680 | of error directive \fIcan\fR be used before the end of a |
| 1681 | production. For example: |
| 1682 | .Sp |
| 1683 | .Vb 3 |
| 1684 | \& command: 'do' <commit> something |
| 1685 | \& | 'report' <commit> something |
| 1686 | \& | <error?: Syntax error> <error: Unknown command> |
| 1687 | .Ve |
| 1688 | .Sp |
| 1689 | \&\fBWarning:\fR The \f(CW\*(C`<error?>\*(C'\fR directive does \fInot\fR mean \*(L"always fail (but |
| 1690 | do so silently unless committed)\*(R". It actually means \*(L"only fail (and report) if |
| 1691 | committed, otherwise \fIsucceed\fR\*(R". To achieve the \*(L"fail silently if uncommitted\*(R" |
| 1692 | semantics, it is necessary to use: |
| 1693 | .Sp |
| 1694 | .Vb 2 |
| 1695 | \& rule: item <commit> item(s) |
| 1696 | \& | <error?> <reject> # FAIL SILENTLY UNLESS COMMITTED |
| 1697 | .Ve |
| 1698 | .Sp |
| 1699 | However, because people seem to expect a lone \f(CW\*(C`<error?>\*(C'\fR directive |
| 1700 | to work like this: |
| 1701 | .Sp |
| 1702 | .Vb 3 |
| 1703 | \& rule: item <commit> item(s) |
| 1704 | \& | <error?: Error message if committed> |
| 1705 | \& | <error: Error message if uncommitted> |
| 1706 | .Ve |
| 1707 | .Sp |
| 1708 | Parse::RecDescent automatically appends a |
| 1709 | \&\f(CW\*(C`<reject>\*(C'\fR directive if the \f(CW\*(C`<error?>\*(C'\fR directive |
| 1710 | is the only item in a production. A level 2 warning (see below) |
| 1711 | is issued when this happens. |
| 1712 | .Sp |
| 1713 | The level of error reporting during both parser construction and |
| 1714 | parsing is controlled by the presence or absence of four global |
| 1715 | variables: \f(CW$::RD_ERRORS\fR, \f(CW$::RD_WARN\fR, \f(CW$::RD_HINT\fR, and |
| 1716 | \f(CW$::RD_TRACE\fR. If \f(CW$::RD_ERRORS\fR is defined (and, by default, it is) |
| 1717 | then fatal errors are reported. |
| 1718 | .Sp |
| 1719 | Whenever \f(CW$::RD_WARN\fR is defined, certain non-fatal problems are also reported. |
| 1720 | Warnings have an associated \*(L"level\*(R": 1, 2, or 3. The higher the level, |
| 1721 | the more serious the warning. The value of the corresponding global |
| 1722 | variable (\f(CW$::RD_WARN\fR) determines the \fIlowest\fR level of warning to |
| 1723 | be displayed. Hence, to see \fIall\fR warnings, set \f(CW$::RD_WARN\fR to 1. |
| 1724 | To see only the most serious warnings set \f(CW$::RD_WARN\fR to 3. |
| 1725 | By default \f(CW$::RD_WARN\fR is initialized to 3, ensuring that serious but |
| 1726 | non-fatal errors are automatically reported. |
| 1727 | .Sp |
| 1728 | See \fI\*(L"\s-1DIAGNOSTICS\s0\*(R"\fR for a list of the various error and warning messages |
| 1729 | that Parse::RecDescent generates when these two variables are defined. |
| 1730 | .Sp |
| 1731 | Defining any of the remaining variables (which are not defined by |
| 1732 | default) further increases the amount of information reported. |
| 1733 | Defining \f(CW$::RD_HINT\fR causes the parser generator to offer |
| 1734 | more detailed analyses and hints on both errors and warnings. |
| 1735 | Note that setting \f(CW$::RD_HINT\fR at any point automagically |
| 1736 | sets \f(CW$::RD_WARN\fR to 1. |
| 1737 | .Sp |
| 1738 | Defining \f(CW$::RD_TRACE\fR causes the parser generator and the parser to |
| 1739 | report their progress to \s-1STDERR\s0 in excruciating detail (although, without hints |
| 1740 | unless \f(CW$::RD_HINT\fR is separately defined). This detail |
| 1741 | can be moderated in only one respect: if \f(CW$::RD_TRACE\fR has an |
| 1742 | integer value (\fIN\fR) greater than 1, only the \fIN\fR characters of |
| 1743 | the \*(L"current parsing context\*(R" (that is, where in the input string we |
| 1744 | are at any point in the parse) is reported at any time. |
| 1745 | .Sp |
| 1746 | \&\f(CW$::RD_TRACE\fR is mainly useful for debugging a grammar that isn't |
| 1747 | behaving as you expected it to. To this end, if \f(CW$::RD_TRACE\fR is |
| 1748 | defined when a parser is built, any actual parser code which is |
| 1749 | generated is also written to a file named \*(L"\s-1RD_TRACE\s0\*(R" in the local |
| 1750 | directory. |
| 1751 | .Sp |
| 1752 | Note that the four variables belong to the \*(L"main\*(R" package, which |
| 1753 | makes them easier to refer to in the code controlling the parser, and |
| 1754 | also makes it easy to turn them into command line flags (\*(L"\-RD_ERRORS\*(R", |
| 1755 | \&\*(L"\-RD_WARN\*(R", \*(L"\-RD_HINT\*(R", \*(L"\-RD_TRACE\*(R") under \fBperl \-s\fR. |
| 1756 | .RE |
| 1757 | .IP "Specifying local variables" 4 |
| 1758 | .IX Item "Specifying local variables" |
| 1759 | It is occasionally convenient to specify variables which are local |
| 1760 | to a single rule. This may be achieved by including a |
| 1761 | \&\f(CW\*(C`<rulevar:...>\*(C'\fR directive anywhere in the rule. For example: |
| 1762 | .Sp |
| 1763 | .Vb 1 |
| 1764 | \& markup: <rulevar: $tag> |
| 1765 | .Ve |
| 1766 | .Sp |
| 1767 | .Vb 1 |
| 1768 | \& markup: tag {($tag=$item[1]) =~ s/^<|>$//g} body[$tag] |
| 1769 | .Ve |
| 1770 | .Sp |
| 1771 | The example \f(CW\*(C`<rulevar: $tag>\*(C'\fR directive causes a \*(L"my\*(R" variable named |
| 1772 | \&\f(CW$tag\fR to be declared at the start of the subroutine implementing the |
| 1773 | \&\f(CW\*(C`markup\*(C'\fR rule (that is, \fIbefore\fR the first production, regardless of |
| 1774 | where in the rule it is specified). |
| 1775 | .Sp |
| 1776 | Specifically, any directive of the form: |
| 1777 | \&\f(CW\*(C`<rulevar:\f(CItext\f(CW>\*(C'\fR causes a line of the form \f(CW\*(C`my \f(CItext\f(CW;\*(C'\fR |
| 1778 | to be added at the beginning of the rule subroutine, immediately after |
| 1779 | the definitions of the following local variables: |
| 1780 | .Sp |
| 1781 | .Vb 4 |
| 1782 | \& $thisparser $commit |
| 1783 | \& $thisrule @item |
| 1784 | \& $thisline @arg |
| 1785 | \& $text %arg |
| 1786 | .Ve |
| 1787 | .Sp |
| 1788 | This means that the following \f(CW\*(C`<rulevar>\*(C'\fR directives work |
| 1789 | as expected: |
| 1790 | .Sp |
| 1791 | .Vb 1 |
| 1792 | \& <rulevar: $count = 0 > |
| 1793 | .Ve |
| 1794 | .Sp |
| 1795 | .Vb 1 |
| 1796 | \& <rulevar: $firstarg = $arg[0] || '' > |
| 1797 | .Ve |
| 1798 | .Sp |
| 1799 | .Vb 1 |
| 1800 | \& <rulevar: $myItems = \e@item > |
| 1801 | .Ve |
| 1802 | .Sp |
| 1803 | .Vb 1 |
| 1804 | \& <rulevar: @context = ( $thisline, $text, @arg ) > |
| 1805 | .Ve |
| 1806 | .Sp |
| 1807 | .Vb 1 |
| 1808 | \& <rulevar: ($name,$age) = @arg{"name","age"} > |
| 1809 | .Ve |
| 1810 | .Sp |
| 1811 | Note however that, because all such variables are \*(L"my\*(R" variables, their |
| 1812 | values \fIdo not persist\fR between match attempts on a given rule. To |
| 1813 | preserve values between match attempts, values can be stored within the |
| 1814 | \&\*(L"local\*(R" member of the \f(CW$thisrule\fR object: |
| 1815 | .Sp |
| 1816 | .Vb 6 |
| 1817 | \& countedrule: { $thisrule->{"local"}{"count"}++ } |
| 1818 | \& <reject> |
| 1819 | \& | subrule1 |
| 1820 | \& | subrule2 |
| 1821 | \& | <reject: $thisrule->{"local"}{"count"} == 1> |
| 1822 | \& subrule3 |
| 1823 | .Ve |
| 1824 | .Sp |
| 1825 | When matching a rule, each \f(CW\*(C`<rulevar>\*(C'\fR directive is matched as |
| 1826 | if it were an unconditional \f(CW\*(C`<reject>\*(C'\fR directive (that is, it |
| 1827 | causes any production in which it appears to immediately fail to match). |
| 1828 | For this reason (and to improve readability) it is usual to specify any |
| 1829 | \&\f(CW\*(C`<rulevar>\*(C'\fR directive in a separate production at the start of |
| 1830 | the rule (this has the added advantage that it enables |
| 1831 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR to optimize away such productions, just as it does |
| 1832 | for the \f(CW\*(C`<reject>\*(C'\fR directive). |
| 1833 | .IP "Dynamically matched rules" 4 |
| 1834 | .IX Item "Dynamically matched rules" |
| 1835 | Because regexes and double-quoted strings are interpolated, it is relatively |
| 1836 | easy to specify productions with \*(L"context sensitive\*(R" tokens. For example: |
| 1837 | .Sp |
| 1838 | .Vb 1 |
| 1839 | \& command: keyword body "end $item[1]" |
| 1840 | .Ve |
| 1841 | .Sp |
| 1842 | which ensures that a command block is bounded by a |
| 1843 | "\fI<keyword>\fR...end \fI<same keyword>\fR" pair. |
| 1844 | .Sp |
| 1845 | Building productions in which subrules are context sensitive is also possible, |
| 1846 | via the \f(CW\*(C`<matchrule:...>\*(C'\fR directive. This directive behaves |
| 1847 | identically to a subrule item, except that the rule which is invoked to match |
| 1848 | it is determined by the string specified after the colon. For example, we could |
| 1849 | rewrite the \f(CW\*(C`command\*(C'\fR rule like this: |
| 1850 | .Sp |
| 1851 | .Vb 1 |
| 1852 | \& command: keyword <matchrule:body> "end $item[1]" |
| 1853 | .Ve |
| 1854 | .Sp |
| 1855 | Whatever appears after the colon in the directive is treated as an interpolated |
| 1856 | string (that is, as if it appeared in a \f(CW\*(C`qq{...}\*(C'\fR operator) and the value of |
| 1857 | that interpolated string is the name of the subrule to be matched. |
| 1858 | .Sp |
| 1859 | Of course, just putting a constant string like \f(CW\*(C`body\*(C'\fR in a |
| 1860 | \&\f(CW\*(C`<matchrule:...>\*(C'\fR directive is of little interest or benefit. |
| 1861 | The power of the directive is seen when we use a string that interpolates |
| 1862 | to something interesting. For example: |
| 1863 | .Sp |
| 1864 | .Vb 1 |
| 1865 | \& command: keyword <matchrule:$item[1]_body> "end $item[1]" |
| 1866 | .Ve |
| 1867 | .Sp |
| 1868 | .Vb 1 |
| 1869 | \& keyword: 'while' | 'if' | 'function' |
| 1870 | .Ve |
| 1871 | .Sp |
| 1872 | .Vb 1 |
| 1873 | \& while_body: condition block |
| 1874 | .Ve |
| 1875 | .Sp |
| 1876 | .Vb 1 |
| 1877 | \& if_body: condition block ('else' block)(?) |
| 1878 | .Ve |
| 1879 | .Sp |
| 1880 | .Vb 1 |
| 1881 | \& function_body: arglist block |
| 1882 | .Ve |
| 1883 | .Sp |
| 1884 | Now the \f(CW\*(C`command\*(C'\fR rule selects how to proceed on the basis of the keyword |
| 1885 | that is found. It is as if \f(CW\*(C`command\*(C'\fR were declared: |
| 1886 | .Sp |
| 1887 | .Vb 3 |
| 1888 | \& command: 'while' while_body "end while" |
| 1889 | \& | 'if' if_body "end if" |
| 1890 | \& | 'function' function_body "end function" |
| 1891 | .Ve |
| 1892 | .Sp |
| 1893 | When a \f(CW\*(C`<matchrule:...>\*(C'\fR directive is used as a repeated |
| 1894 | subrule, the rule name expression is \*(L"late\-bound\*(R". That is, the name of |
| 1895 | the rule to be called is re-evaluated \fIeach time\fR a match attempt is |
| 1896 | made. Hence, the following grammar: |
| 1897 | .Sp |
| 1898 | .Vb 1 |
| 1899 | \& { $::species = 'dogs' } |
| 1900 | .Ve |
| 1901 | .Sp |
| 1902 | .Vb 1 |
| 1903 | \& pair: 'two' <matchrule:$::species>(s) |
| 1904 | .Ve |
| 1905 | .Sp |
| 1906 | .Vb 1 |
| 1907 | \& dogs: /dogs/ { $::species = 'cats' } |
| 1908 | .Ve |
| 1909 | .Sp |
| 1910 | .Vb 1 |
| 1911 | \& cats: /cats/ |
| 1912 | .Ve |
| 1913 | .Sp |
| 1914 | will match the string \*(L"two dogs cats cats\*(R" completely, whereas it will |
| 1915 | only match the string \*(L"two dogs dogs dogs\*(R" up to the eighth letter. If |
| 1916 | the rule name were \*(L"early bound\*(R" (that is, evaluated only the first |
| 1917 | time the directive is encountered in a production), the reverse |
| 1918 | behaviour would be expected. |
| 1919 | .IP "Deferred actions" 4 |
| 1920 | .IX Item "Deferred actions" |
| 1921 | The \f(CW\*(C`<defer:...>\*(C'\fR directive is used to specify an action to be |
| 1922 | performed when (and only if!) the current production ultimately succeeds. |
| 1923 | .Sp |
| 1924 | Whenever a \f(CW\*(C`<defer:...>\*(C'\fR directive appears, the code it specifies |
| 1925 | is converted to a closure (an anonymous subroutine reference) which is |
| 1926 | queued within the active parser object. Note that, |
| 1927 | because the deferred code is converted to a closure, the values of any |
| 1928 | \&\*(L"local\*(R" variable (such as \f(CW$text\fR, \f(CW@item\fR, etc.) are preserved |
| 1929 | until the deferred code is actually executed. |
| 1930 | .Sp |
| 1931 | If the parse ultimately succeeds |
| 1932 | \&\fIand\fR the production in which the \f(CW\*(C`<defer:...>\*(C'\fR directive was |
| 1933 | evaluated formed part of the successful parse, then the deferred code is |
| 1934 | executed immediately before the parse returns. If however the production |
| 1935 | which queued a deferred action fails, or one of the higher-level |
| 1936 | rules which called that production fails, then the deferred action is |
| 1937 | removed from the queue, and hence is never executed. |
| 1938 | .Sp |
| 1939 | For example, given the grammar: |
| 1940 | .Sp |
| 1941 | .Vb 2 |
| 1942 | \& sentence: noun trans noun |
| 1943 | \& | noun intrans |
| 1944 | .Ve |
| 1945 | .Sp |
| 1946 | .Vb 4 |
| 1947 | \& noun: 'the dog' |
| 1948 | \& { print "$item[1]\et(noun)\en" } |
| 1949 | \& | 'the meat' |
| 1950 | \& { print "$item[1]\et(noun)\en" } |
| 1951 | .Ve |
| 1952 | .Sp |
| 1953 | .Vb 2 |
| 1954 | \& trans: 'ate' |
| 1955 | \& { print "$item[1]\et(transitive)\en" } |
| 1956 | .Ve |
| 1957 | .Sp |
| 1958 | .Vb 4 |
| 1959 | \& intrans: 'ate' |
| 1960 | \& { print "$item[1]\et(intransitive)\en" } |
| 1961 | \& | 'barked' |
| 1962 | \& { print "$item[1]\et(intransitive)\en" } |
| 1963 | .Ve |
| 1964 | .Sp |
| 1965 | then parsing the sentence \f(CW"the dog ate"\fR would produce the output: |
| 1966 | .Sp |
| 1967 | .Vb 4 |
| 1968 | \& the dog (noun) |
| 1969 | \& ate (transitive) |
| 1970 | \& the dog (noun) |
| 1971 | \& ate (intransitive) |
| 1972 | .Ve |
| 1973 | .Sp |
| 1974 | This is because, even though the first production of \f(CW\*(C`sentence\*(C'\fR |
| 1975 | ultimately fails, its initial subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR do match, |
| 1976 | and hence they execute their associated actions. |
| 1977 | Then the second production of \f(CW\*(C`sentence\*(C'\fR succeeds, causing the |
| 1978 | actions of the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`intrans\*(C'\fR to be executed as well. |
| 1979 | .Sp |
| 1980 | On the other hand, if the actions were replaced by \f(CW\*(C`<defer:...>\*(C'\fR |
| 1981 | directives: |
| 1982 | .Sp |
| 1983 | .Vb 2 |
| 1984 | \& sentence: noun trans noun |
| 1985 | \& | noun intrans |
| 1986 | .Ve |
| 1987 | .Sp |
| 1988 | .Vb 4 |
| 1989 | \& noun: 'the dog' |
| 1990 | \& <defer: print "$item[1]\et(noun)\en" > |
| 1991 | \& | 'the meat' |
| 1992 | \& <defer: print "$item[1]\et(noun)\en" > |
| 1993 | .Ve |
| 1994 | .Sp |
| 1995 | .Vb 2 |
| 1996 | \& trans: 'ate' |
| 1997 | \& <defer: print "$item[1]\et(transitive)\en" > |
| 1998 | .Ve |
| 1999 | .Sp |
| 2000 | .Vb 4 |
| 2001 | \& intrans: 'ate' |
| 2002 | \& <defer: print "$item[1]\et(intransitive)\en" > |
| 2003 | \& | 'barked' |
| 2004 | \& <defer: print "$item[1]\et(intransitive)\en" > |
| 2005 | .Ve |
| 2006 | .Sp |
| 2007 | the output would be: |
| 2008 | .Sp |
| 2009 | .Vb 2 |
| 2010 | \& the dog (noun) |
| 2011 | \& ate (intransitive) |
| 2012 | .Ve |
| 2013 | .Sp |
| 2014 | since deferred actions are only executed if they were evaluated in |
| 2015 | a production which ultimately contributes to the successful parse. |
| 2016 | .Sp |
| 2017 | In this case, even though the first production of \f(CW\*(C`sentence\*(C'\fR caused |
| 2018 | the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR to match, that production ultimately |
| 2019 | failed and so the deferred actions queued by those subrules were subsequently |
| 2020 | discarded. The second production then succeeded, causing the entire |
| 2021 | parse to succeed, and so the deferred actions queued by the (second) match of |
| 2022 | the \f(CW\*(C`noun\*(C'\fR subrule and the subsequent match of \f(CW\*(C`intrans\*(C'\fR \fIare\fR preserved and |
| 2023 | eventually executed. |
| 2024 | .Sp |
| 2025 | Deferred actions provide a means of improving the performance of a parser, |
| 2026 | by only executing those actions which are part of the final parse-tree |
| 2027 | for the input data. |
| 2028 | .Sp |
| 2029 | Alternatively, deferred actions can be viewed as a mechanism for building |
| 2030 | (and executing) a |
| 2031 | customized subroutine corresponding to the given input data, much in the |
| 2032 | same way that autoactions (see \*(L"Autoactions\*(R") can be used to build a |
| 2033 | customized data structure for specific input. |
| 2034 | .Sp |
| 2035 | Whether or not the action it specifies is ever executed, |
| 2036 | a \f(CW\*(C`<defer:...>\*(C'\fR directive always succeeds, returning the |
| 2037 | number of deferred actions currently queued at that point. |
| 2038 | .IP "Parsing Perl" 4 |
| 2039 | .IX Item "Parsing Perl" |
| 2040 | Parse::RecDescent provides limited support for parsing subsets of Perl, |
| 2041 | namely: quote-like operators, Perl variables, and complete code blocks. |
| 2042 | .Sp |
| 2043 | The \f(CW\*(C`<perl_quotelike>\*(C'\fR directive can be used to parse any Perl |
| 2044 | quote-like operator: \f(CW'a string'\fR, \f(CW\*(C`m/a pattern/\*(C'\fR, \f(CW\*(C`tr{ans}{lation}\*(C'\fR, |
| 2045 | etc. It does this by calling \fIText::Balanced::extract_quotelike()\fR. |
| 2046 | .Sp |
| 2047 | If a quote-like operator is found, a reference to an array of eight elements |
| 2048 | is returned. Those elements are identical to the last eight elements returned |
| 2049 | by \fIText::Balanced::extract_quotelike()\fR in an array context, namely: |
| 2050 | .RS 4 |
| 2051 | .IP "[0]" 4 |
| 2052 | .IX Item "[0]" |
| 2053 | the name of the quotelike operator \*(-- 'q', 'qq', 'm', 's', 'tr' \*(-- if the |
| 2054 | operator was named; otherwise \f(CW\*(C`undef\*(C'\fR, |
| 2055 | .IP "[1]" 4 |
| 2056 | .IX Item "[1]" |
| 2057 | the left delimiter of the first block of the operation, |
| 2058 | .IP "[2]" 4 |
| 2059 | .IX Item "[2]" |
| 2060 | the text of the first block of the operation |
| 2061 | (that is, the contents of |
| 2062 | a quote, the regex of a match or substitution, or the target list of a |
| 2063 | translation), |
| 2064 | .IP "[3]" 4 |
| 2065 | .IX Item "[3]" |
| 2066 | the right delimiter of the first block of the operation, |
| 2067 | .IP "[4]" 4 |
| 2068 | .IX Item "[4]" |
| 2069 | the left delimiter of the second block of the operation if there is one |
| 2070 | (that is, if it is a \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, or \f(CW\*(C`y\*(C'\fR); otherwise \f(CW\*(C`undef\*(C'\fR, |
| 2071 | .IP "[5]" 4 |
| 2072 | .IX Item "[5]" |
| 2073 | the text of the second block of the operation if there is one |
| 2074 | (that is, the replacement of a substitution or the translation list |
| 2075 | of a translation); otherwise \f(CW\*(C`undef\*(C'\fR, |
| 2076 | .IP "[6]" 4 |
| 2077 | .IX Item "[6]" |
| 2078 | the right delimiter of the second block of the operation (if any); |
| 2079 | otherwise \f(CW\*(C`undef\*(C'\fR, |
| 2080 | .IP "[7]" 4 |
| 2081 | .IX Item "[7]" |
| 2082 | the trailing modifiers on the operation (if any); otherwise \f(CW\*(C`undef\*(C'\fR. |
| 2083 | .RE |
| 2084 | .RS 4 |
| 2085 | .Sp |
| 2086 | If a quote-like expression is not found, the directive fails with the usual |
| 2087 | \&\f(CW\*(C`undef\*(C'\fR value. |
| 2088 | .Sp |
| 2089 | The \f(CW\*(C`<perl_variable>\*(C'\fR directive can be used to parse any Perl |
| 2090 | variable: \f(CW$scalar\fR, \f(CW@array\fR, \f(CW%hash\fR, \f(CW$ref\fR\->{field}[$index], etc. |
| 2091 | It does this by calling \fIText::Balanced::extract_variable()\fR. |
| 2092 | .Sp |
| 2093 | If the directive matches text representing a valid Perl variable |
| 2094 | specification, it returns that text. Otherwise it fails with the usual |
| 2095 | \&\f(CW\*(C`undef\*(C'\fR value. |
| 2096 | .Sp |
| 2097 | The \f(CW\*(C`<perl_codeblock>\*(C'\fR directive can be used to parse any curly-brace-delimited block of Perl code, such as: { \f(CW$a\fR = 1; f() =~ m/pat/; }. |
| 2098 | It does this by calling \fIText::Balanced::extract_codeblock()\fR. |
| 2099 | .Sp |
| 2100 | If the directive matches text representing a valid Perl code block, |
| 2101 | it returns that text. Otherwise it fails with the usual \f(CW\*(C`undef\*(C'\fR value. |
| 2102 | .RE |
| 2103 | .IP "Constructing tokens" 4 |
| 2104 | .IX Item "Constructing tokens" |
| 2105 | Eventually, Parse::RecDescent will be able to parse tokenized input, as |
| 2106 | well as ordinary strings. In preparation for this joyous day, the |
| 2107 | \&\f(CW\*(C`<token:...>\*(C'\fR directive has been provided. |
| 2108 | This directive creates a token which will be suitable for |
| 2109 | input to a Parse::RecDescent parser (when it eventually supports |
| 2110 | tokenized input). |
| 2111 | .Sp |
| 2112 | The text of the token is the value of the |
| 2113 | immediately preceding item in the production. A |
| 2114 | \&\f(CW\*(C`<token:...>\*(C'\fR directive always succeeds with a return |
| 2115 | value which is the hash reference that is the new token. It also |
| 2116 | sets the return value for the production to that hash ref. |
| 2117 | .Sp |
| 2118 | The \f(CW\*(C`<token:...>\*(C'\fR directive makes it easy to build |
| 2119 | a Parse::RecDescent\-compatible lexer in Parse::RecDescent: |
| 2120 | .Sp |
| 2121 | .Vb 3 |
| 2122 | \& my $lexer = new Parse::RecDescent q |
| 2123 | \& { |
| 2124 | \& lex: token(s) |
| 2125 | .Ve |
| 2126 | .Sp |
| 2127 | .Vb 5 |
| 2128 | \& token: /a\eb/ <token:INDEF> |
| 2129 | \& | /the\eb/ <token:DEF> |
| 2130 | \& | /fly\eb/ <token:NOUN,VERB> |
| 2131 | \& | /[a-z]+/i { lc $item[1] } <token:ALPHA> |
| 2132 | \& | <error: Unknown token> |
| 2133 | .Ve |
| 2134 | .Sp |
| 2135 | .Vb 1 |
| 2136 | \& }; |
| 2137 | .Ve |
| 2138 | .Sp |
| 2139 | which will eventually be able to be used with a regular Parse::RecDescent |
| 2140 | grammar: |
| 2141 | .Sp |
| 2142 | .Vb 3 |
| 2143 | \& my $parser = new Parse::RecDescent q |
| 2144 | \& { |
| 2145 | \& startrule: subrule1 subrule2 |
| 2146 | .Ve |
| 2147 | .Sp |
| 2148 | .Vb 2 |
| 2149 | \& # ETC... |
| 2150 | \& }; |
| 2151 | .Ve |
| 2152 | .Sp |
| 2153 | either with a pre-lexing phase: |
| 2154 | .Sp |
| 2155 | .Vb 1 |
| 2156 | \& $parser->startrule( $lexer->lex($data) ); |
| 2157 | .Ve |
| 2158 | .Sp |
| 2159 | or with a lex-on-demand approach: |
| 2160 | .Sp |
| 2161 | .Vb 1 |
| 2162 | \& $parser->startrule( sub{$lexer->token(\e$data)} ); |
| 2163 | .Ve |
| 2164 | .Sp |
| 2165 | But at present, only the \f(CW\*(C`<token:...>\*(C'\fR directive is |
| 2166 | actually implemented. The rest is vapourware. |
| 2167 | .IP "Specifying operations" 4 |
| 2168 | .IX Item "Specifying operations" |
| 2169 | One of the commonest requirements when building a parser is to specify |
| 2170 | binary operators. Unfortunately, in a normal grammar, the rules for |
| 2171 | such things are awkward: |
| 2172 | .Sp |
| 2173 | .Vb 2 |
| 2174 | \& disjunction: conjunction ('or' conjunction)(s?) |
| 2175 | \& { $return = [ $item[1], @{$item[2]} ] } |
| 2176 | .Ve |
| 2177 | .Sp |
| 2178 | .Vb 2 |
| 2179 | \& conjunction: atom ('and' atom)(s?) |
| 2180 | \& { $return = [ $item[1], @{$item[2]} ] } |
| 2181 | .Ve |
| 2182 | .Sp |
| 2183 | or inefficient: |
| 2184 | .Sp |
| 2185 | .Vb 4 |
| 2186 | \& disjunction: conjunction 'or' disjunction |
| 2187 | \& { $return = [ $item[1], @{$item[3]} ] } |
| 2188 | \& | conjunction |
| 2189 | \& { $return = [ $item[1] ] } |
| 2190 | .Ve |
| 2191 | .Sp |
| 2192 | .Vb 4 |
| 2193 | \& conjunction: atom 'and' conjunction |
| 2194 | \& { $return = [ $item[1], @{$item[3]} ] } |
| 2195 | \& | atom |
| 2196 | \& { $return = [ $item[1] ] } |
| 2197 | .Ve |
| 2198 | .Sp |
| 2199 | and either way is ugly and hard to get right. |
| 2200 | .Sp |
| 2201 | The \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives provide an |
| 2202 | easier way of specifying such operations. Using \f(CW\*(C`<leftop:...>\*(C'\fR the |
| 2203 | above examples become: |
| 2204 | .Sp |
| 2205 | .Vb 2 |
| 2206 | \& disjunction: <leftop: conjunction 'or' conjunction> |
| 2207 | \& conjunction: <leftop: atom 'and' atom> |
| 2208 | .Ve |
| 2209 | .Sp |
| 2210 | The \f(CW\*(C`<leftop:...>\*(C'\fR directive specifies a left-associative binary operator. |
| 2211 | It is specified around three other grammar elements |
| 2212 | (typically subrules or terminals), which match the left operand, |
| 2213 | the operator itself, and the right operand respectively. |
| 2214 | .Sp |
| 2215 | A \f(CW\*(C`<leftop:...>\*(C'\fR directive such as: |
| 2216 | .Sp |
| 2217 | .Vb 1 |
| 2218 | \& disjunction: <leftop: conjunction 'or' conjunction> |
| 2219 | .Ve |
| 2220 | .Sp |
| 2221 | is converted to the following: |
| 2222 | .Sp |
| 2223 | .Vb 2 |
| 2224 | \& disjunction: ( conjunction ('or' conjunction)(s?) |
| 2225 | \& { $return = [ $item[1], @{$item[2]} ] } ) |
| 2226 | .Ve |
| 2227 | .Sp |
| 2228 | In other words, a \f(CW\*(C`<leftop:...>\*(C'\fR directive matches the left operand followed by zero |
| 2229 | or more repetitions of both the operator and the right operand. It then |
| 2230 | flattens the matched items into an anonymous array which becomes the |
| 2231 | (single) value of the entire \f(CW\*(C`<leftop:...>\*(C'\fR directive. |
| 2232 | .Sp |
| 2233 | For example, a \f(CW\*(C`<leftop:...>\*(C'\fR directive such as: |
| 2234 | .Sp |
| 2235 | .Vb 1 |
| 2236 | \& output: <leftop: ident '<<' expr > |
| 2237 | .Ve |
| 2238 | .Sp |
| 2239 | when given a string such as: |
| 2240 | .Sp |
| 2241 | .Vb 1 |
| 2242 | \& cout << var << "str" << 3 |
| 2243 | .Ve |
| 2244 | .Sp |
| 2245 | would match, and \f(CW$item[1]\fR would be set to: |
| 2246 | .Sp |
| 2247 | .Vb 1 |
| 2248 | \& [ 'cout', 'var', '"str"', '3' ] |
| 2249 | .Ve |
| 2250 | .Sp |
| 2251 | In other words: |
| 2252 | .Sp |
| 2253 | .Vb 1 |
| 2254 | \& output: <leftop: ident '<<' expr > |
| 2255 | .Ve |
| 2256 | .Sp |
| 2257 | is equivalent to a left-associative operator: |
| 2258 | .Sp |
| 2259 | .Vb 5 |
| 2260 | \& output: ident { $return = [$item[1]] } |
| 2261 | \& | ident '<<' expr { $return = [@item[1,3]] } |
| 2262 | \& | ident '<<' expr '<<' expr { $return = [@item[1,3,5]] } |
| 2263 | \& | ident '<<' expr '<<' expr '<<' expr { $return = [@item[1,3,5,7]] } |
| 2264 | \& # ...etc... |
| 2265 | .Ve |
| 2266 | .Sp |
| 2267 | Similarly, the \f(CW\*(C`<rightop:...>\*(C'\fR directive takes a left operand, an operator, and a right operand: |
| 2268 | .Sp |
| 2269 | .Vb 1 |
| 2270 | \& assign: <rightop: var '=' expr > |
| 2271 | .Ve |
| 2272 | .Sp |
| 2273 | and converts them to: |
| 2274 | .Sp |
| 2275 | .Vb 2 |
| 2276 | \& assign: ( (var '=' {$return=$item[1]})(s?) expr |
| 2277 | \& { $return = [ @{$item[1]}, $item[2] ] } ) |
| 2278 | .Ve |
| 2279 | .Sp |
| 2280 | which is equivalent to a right-associative operator: |
| 2281 | .Sp |
| 2282 | .Vb 5 |
| 2283 | \& assign: var { $return = [$item[1]] } |
| 2284 | \& | var '=' expr { $return = [@item[1,3]] } |
| 2285 | \& | var '=' var '=' expr { $return = [@item[1,3,5]] } |
| 2286 | \& | var '=' var '=' var '=' expr { $return = [@item[1,3,5,7]] } |
| 2287 | \& # ...etc... |
| 2288 | .Ve |
| 2289 | .Sp |
| 2290 | Note that for both the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives, the directive does not normally |
| 2291 | return the operator itself, just a list of the operands involved. This is |
| 2292 | particularly handy for specifying lists: |
| 2293 | .Sp |
| 2294 | .Vb 2 |
| 2295 | \& list: '(' <leftop: list_item ',' list_item> ')' |
| 2296 | \& { $return = $item[2] } |
| 2297 | .Ve |
| 2298 | .Sp |
| 2299 | There is, however, a problem: sometimes the operator is itself significant. |
| 2300 | For example, in a Perl list a comma and a \f(CW\*(C`=>\*(C'\fR are both |
| 2301 | valid separators, but the \f(CW\*(C`=>\*(C'\fR has additional stringification semantics. |
| 2302 | Hence it's important to know which was used in each case. |
| 2303 | .Sp |
| 2304 | To solve this problem the |
| 2305 | \&\f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives |
| 2306 | \&\fIdo\fR return the operator(s) as well, under two circumstances. |
| 2307 | The first case is where the operator is specified as a subrule. In that instance, |
| 2308 | whatever the operator matches is returned (on the assumption that if the operator |
| 2309 | is important enough to have its own subrule, then it's important enough to return). |
| 2310 | .Sp |
| 2311 | The second case is where the operator is specified as a regular |
| 2312 | expression. In that case, if the first bracketed subpattern of the |
| 2313 | regular expression matches, that matching value is returned (this is analogous to |
| 2314 | the behaviour of the Perl \f(CW\*(C`split\*(C'\fR function, except that only the first subpattern |
| 2315 | is returned). |
| 2316 | .Sp |
| 2317 | In other words, given the input: |
| 2318 | .Sp |
| 2319 | .Vb 1 |
| 2320 | \& ( a=>1, b=>2 ) |
| 2321 | .Ve |
| 2322 | .Sp |
| 2323 | the specifications: |
| 2324 | .Sp |
| 2325 | .Vb 1 |
| 2326 | \& list: '(' <leftop: list_item separator list_item> ')' |
| 2327 | .Ve |
| 2328 | .Sp |
| 2329 | .Vb 1 |
| 2330 | \& separator: ',' | '=>' |
| 2331 | .Ve |
| 2332 | .Sp |
| 2333 | or: |
| 2334 | .Sp |
| 2335 | .Vb 1 |
| 2336 | \& list: '(' <leftop: list_item /(,|=>)/ list_item> ')' |
| 2337 | .Ve |
| 2338 | .Sp |
| 2339 | cause the list separators to be interleaved with the operands in the |
| 2340 | anonymous array in \f(CW$item[2]\fR: |
| 2341 | .Sp |
| 2342 | .Vb 1 |
| 2343 | \& [ 'a', '=>', '1', ',', 'b', '=>', '2' ] |
| 2344 | .Ve |
| 2345 | .Sp |
| 2346 | But the following version: |
| 2347 | .Sp |
| 2348 | .Vb 1 |
| 2349 | \& list: '(' <leftop: list_item /,|=>/ list_item> ')' |
| 2350 | .Ve |
| 2351 | .Sp |
| 2352 | returns only the operands: |
| 2353 | .Sp |
| 2354 | .Vb 1 |
| 2355 | \& [ 'a', '1', 'b', '2' ] |
| 2356 | .Ve |
| 2357 | .Sp |
| 2358 | Of course, none of the above specifications handle the case of an empty |
| 2359 | list, since the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives |
| 2360 | require at least a single right or left operand to match. To specify |
| 2361 | that the operator can match \*(L"trivially\*(R", |
| 2362 | it's necessary to add a \f(CW\*(C`(?)\*(C'\fR qualifier to the directive: |
| 2363 | .Sp |
| 2364 | .Vb 1 |
| 2365 | \& list: '(' <leftop: list_item /(,|=>)/ list_item>(?) ')' |
| 2366 | .Ve |
| 2367 | .Sp |
| 2368 | Note that in almost all the above examples, the first and third arguments |
| 2369 | of the \f(CW\*(C`<leftop:...>\*(C'\fR directive were the same subrule. That is because |
| 2370 | \&\f(CW\*(C`<leftop:...>\*(C'\fR's are frequently used to specify \*(L"separated\*(R" lists of the |
| 2371 | same type of item. To make such lists easier to specify, the following |
| 2372 | syntax: |
| 2373 | .Sp |
| 2374 | .Vb 1 |
| 2375 | \& list: element(s /,/) |
| 2376 | .Ve |
| 2377 | .Sp |
| 2378 | is exactly equivalent to: |
| 2379 | .Sp |
| 2380 | .Vb 1 |
| 2381 | \& list: <leftop: element /,/ element> |
| 2382 | .Ve |
| 2383 | .Sp |
| 2384 | Note that the separator must be specified as a raw pattern (i.e. |
| 2385 | not a string or subrule). |
| 2386 | .IP "Scored productions" 4 |
| 2387 | .IX Item "Scored productions" |
| 2388 | By default, Parse::RecDescent grammar rules always accept the first |
| 2389 | production that matches the input. But if two or more productions may |
| 2390 | potentially match the same input, choosing the first that does so may |
| 2391 | not be optimal. |
| 2392 | .Sp |
| 2393 | For example, if you were parsing the sentence \*(L"time flies like an arrow\*(R", |
| 2394 | you might use a rule like this: |
| 2395 | .Sp |
| 2396 | .Vb 3 |
| 2397 | \& sentence: verb noun preposition article noun { [@item] } |
| 2398 | \& | adjective noun verb article noun { [@item] } |
| 2399 | \& | noun verb preposition article noun { [@item] } |
| 2400 | .Ve |
| 2401 | .Sp |
| 2402 | Each of these productions matches the sentence, but the third one |
| 2403 | is the most likely interpretation. However, if the sentence had been |
| 2404 | \&\*(L"fruit flies like a banana\*(R", then the second production is probably |
| 2405 | the right match. |
| 2406 | .Sp |
| 2407 | To cater for such situations, the \f(CW\*(C`<score:...>\*(C'\fR directive can be used. |
| 2408 | The directive is equivalent to an unconditional \f(CW\*(C`<reject>\*(C'\fR, |
| 2409 | except that it allows you to specify a \*(L"score\*(R" for the current |
| 2410 | production. If that score is numerically greater than the best |
| 2411 | score of any preceding production, the current production is cached for later |
| 2412 | consideration. If no later production matches, then the cached |
| 2413 | production is treated as having matched, and the value of the |
| 2414 | item immediately before its \f(CW\*(C`<score:...>\*(C'\fR directive is returned as the |
| 2415 | result. |
| 2416 | .Sp |
| 2417 | In other words, by putting a \f(CW\*(C`<score:...>\*(C'\fR directive at the end of |
| 2418 | each production, you can select which production matches using |
| 2419 | criteria other than specification order. For example: |
| 2420 | .Sp |
| 2421 | .Vb 3 |
| 2422 | \& sentence: verb noun preposition article noun { [@item] } <score: sensible(@item)> |
| 2423 | \& | adjective noun verb article noun { [@item] } <score: sensible(@item)> |
| 2424 | \& | noun verb preposition article noun { [@item] } <score: sensible(@item)> |
| 2425 | .Ve |
| 2426 | .Sp |
| 2427 | Now, when each production reaches its respective \f(CW\*(C`<score:...>\*(C'\fR |
| 2428 | directive, the subroutine \f(CW\*(C`sensible\*(C'\fR will be called to evaluate the |
| 2429 | matched items (somehow). Once all productions have been tried, the |
| 2430 | one which \f(CW\*(C`sensible\*(C'\fR scored most highly will be the one that is |
| 2431 | accepted as a match for the rule. |
| 2432 | .Sp |
| 2433 | The variable \f(CW$score\fR always holds the current best score of any production, |
| 2434 | and the variable \f(CW$score_return\fR holds the corresponding return value. |
| 2435 | .Sp |
| 2436 | As another example, the following grammar matches lines that may be |
| 2437 | separated by commas, colons, or spaces. This can be tricky if |
| 2438 | a colon-separated line also contains commas, or vice versa. The grammar |
| 2439 | resolves the ambiguity by selecting the rule that results in the |
| 2440 | fewest fields: |
| 2441 | .Sp |
| 2442 | .Vb 3 |
| 2443 | \& line: seplist[sep=>','] <score: -@{$item[1]}> |
| 2444 | \& | seplist[sep=>':'] <score: -@{$item[1]}> |
| 2445 | \& | seplist[sep=>" "] <score: -@{$item[1]}> |
| 2446 | .Ve |
| 2447 | .Sp |
| 2448 | .Vb 1 |
| 2449 | \& seplist: <skip:""> <leftop: /[^$arg{sep}]*/ "$arg{sep}" /[^$arg{sep}]*/> |
| 2450 | .Ve |
| 2451 | .Sp |
| 2452 | Note the use of negation within the \f(CW\*(C`<score:...>\*(C'\fR directive |
| 2453 | to ensure that the seplist with the most items gets the lowest score. |
| 2454 | .Sp |
| 2455 | As the above examples indicate, it is often the case that all productions |
| 2456 | in a rule use exactly the same \f(CW\*(C`<score:...>\*(C'\fR directive. It is |
| 2457 | tedious to have to repeat this identical directive in every production, so |
| 2458 | Parse::RecDescent also provides the \f(CW\*(C`<autoscore:...>\*(C'\fR directive. |
| 2459 | .Sp |
| 2460 | If an \f(CW\*(C`<autoscore:...>\*(C'\fR directive appears in any |
| 2461 | production of a rule, the code it specifies is used as the scoring |
| 2462 | code for every production of that rule, except productions that already |
| 2463 | end with an explicit \f(CW\*(C`<score:...>\*(C'\fR directive. Thus the rules above could |
| 2464 | be rewritten: |
| 2465 | .Sp |
| 2466 | .Vb 4 |
| 2467 | \& line: <autoscore: -@{$item[1]}> |
| 2468 | \& line: seplist[sep=>','] |
| 2469 | \& | seplist[sep=>':'] |
| 2470 | \& | seplist[sep=>" "] |
| 2471 | .Ve |
| 2472 | .Sp |
| 2473 | .Vb 4 |
| 2474 | \& sentence: <autoscore: sensible(@item)> |
| 2475 | \& | verb noun preposition article noun { [@item] } |
| 2476 | \& | adjective noun verb article noun { [@item] } |
| 2477 | \& | noun verb preposition article noun { [@item] } |
| 2478 | .Ve |
| 2479 | .Sp |
| 2480 | Note that the \f(CW\*(C`<autoscore:...>\*(C'\fR directive itself acts as an |
| 2481 | unconditional \f(CW\*(C`<reject>\*(C'\fR, and (like the \f(CW\*(C`<rulevar:...>\*(C'\fR |
| 2482 | directive) is pruned at compile-time wherever possible. |
| 2483 | .IP "Dispensing with grammar checks" 4 |
| 2484 | .IX Item "Dispensing with grammar checks" |
| 2485 | During the compilation phase of parser construction, Parse::RecDescent performs |
| 2486 | a small number of checks on the grammar it's given. Specifically it checks that |
| 2487 | the grammar is not left\-recursive, that there are no \*(L"insatiable\*(R" constructs of |
| 2488 | the form: |
| 2489 | .Sp |
| 2490 | .Vb 1 |
| 2491 | \& rule: subrule(s) subrule |
| 2492 | .Ve |
| 2493 | .Sp |
| 2494 | and that there are no rules missing (i.e. referred to, but never defined). |
| 2495 | .Sp |
| 2496 | These checks are important during development, but can slow down parser |
| 2497 | construction in stable code. So Parse::RecDescent provides the |
| 2498 | \&\f(CW\*(C`<nocheck>\*(C'\fR directive to turn them off. The directive can only appear |
| 2499 | before the first rule definition, and switches off checking throughout the rest |
| 2500 | of the current grammar. |
| 2501 | .Sp |
| 2502 | Typically, this directive would be added when a parser has been thoroughly |
| 2503 | tested and is ready for release. |
| 2504 | .Sh "Subrule argument lists" |
| 2505 | .IX Subsection "Subrule argument lists" |
| 2506 | It is occasionally useful to pass data to a subrule which is being invoked. For |
| 2507 | example, consider the following grammar fragment: |
| 2508 | .PP |
| 2509 | .Vb 1 |
| 2510 | \& classdecl: keyword decl |
| 2511 | .Ve |
| 2512 | .PP |
| 2513 | .Vb 1 |
| 2514 | \& keyword: 'struct' | 'class'; |
| 2515 | .Ve |
| 2516 | .PP |
| 2517 | .Vb 1 |
| 2518 | \& decl: # WHATEVER |
| 2519 | .Ve |
| 2520 | .PP |
| 2521 | The \f(CW\*(C`decl\*(C'\fR rule might wish to know which of the two keywords was used |
| 2522 | (since it may affect some aspect of the way the subsequent declaration |
| 2523 | is interpreted). \f(CW\*(C`Parse::RecDescent\*(C'\fR allows the grammar designer to |
| 2524 | pass data into a rule, by placing that data in an \fIargument list\fR |
| 2525 | (that is, in square brackets) immediately after any subrule item in a |
| 2526 | production. Hence, we could pass the keyword to \f(CW\*(C`decl\*(C'\fR as follows: |
| 2527 | .PP |
| 2528 | .Vb 1 |
| 2529 | \& classdecl: keyword decl[ $item[1] ] |
| 2530 | .Ve |
| 2531 | .PP |
| 2532 | .Vb 1 |
| 2533 | \& keyword: 'struct' | 'class'; |
| 2534 | .Ve |
| 2535 | .PP |
| 2536 | .Vb 1 |
| 2537 | \& decl: # WHATEVER |
| 2538 | .Ve |
| 2539 | .PP |
| 2540 | The argument list can consist of any number (including zero!) of comma-separated |
| 2541 | Perl expressions. In other words, it looks exactly like a Perl anonymous |
| 2542 | array reference. For example, we could pass the keyword, the name of the |
| 2543 | surrounding rule, and the literal 'keyword' to \f(CW\*(C`decl\*(C'\fR like so: |
| 2544 | .PP |
| 2545 | .Vb 1 |
| 2546 | \& classdecl: keyword decl[$item[1],$item[0],'keyword'] |
| 2547 | .Ve |
| 2548 | .PP |
| 2549 | .Vb 1 |
| 2550 | \& keyword: 'struct' | 'class'; |
| 2551 | .Ve |
| 2552 | .PP |
| 2553 | .Vb 1 |
| 2554 | \& decl: # WHATEVER |
| 2555 | .Ve |
| 2556 | .PP |
| 2557 | Within the rule to which the data is passed (\f(CW\*(C`decl\*(C'\fR in the above examples) |
| 2558 | that data is available as the elements of a local variable \f(CW@arg\fR. Hence |
| 2559 | \&\f(CW\*(C`decl\*(C'\fR might report its intentions as follows: |
| 2560 | .PP |
| 2561 | .Vb 1 |
| 2562 | \& classdecl: keyword decl[$item[1],$item[0],'keyword'] |
| 2563 | .Ve |
| 2564 | .PP |
| 2565 | .Vb 1 |
| 2566 | \& keyword: 'struct' | 'class'; |
| 2567 | .Ve |
| 2568 | .PP |
| 2569 | .Vb 2 |
| 2570 | \& decl: { print "Declaring $arg[0] (a $arg[2])\en"; |
| 2571 | \& print "(this rule called by $arg[1])" } |
| 2572 | .Ve |
| 2573 | .PP |
| 2574 | Subrule argument lists can also be interpreted as hashes, simply by using |
| 2575 | the local variable \f(CW%arg\fR instead of \f(CW@arg\fR. Hence we could rewrite the |
| 2576 | previous example: |
| 2577 | .PP |
| 2578 | .Vb 3 |
| 2579 | \& classdecl: keyword decl[keyword => $item[1], |
| 2580 | \& caller => $item[0], |
| 2581 | \& type => 'keyword'] |
| 2582 | .Ve |
| 2583 | .PP |
| 2584 | .Vb 1 |
| 2585 | \& keyword: 'struct' | 'class'; |
| 2586 | .Ve |
| 2587 | .PP |
| 2588 | .Vb 2 |
| 2589 | \& decl: { print "Declaring $arg{keyword} (a $arg{type})\en"; |
| 2590 | \& print "(this rule called by $arg{caller})" } |
| 2591 | .Ve |
| 2592 | .PP |
| 2593 | Both \f(CW@arg\fR and \f(CW%arg\fR are always available, so the grammar designer may |
| 2594 | choose whichever convention (or combination of conventions) suits best. |
| 2595 | .PP |
| 2596 | Subrule argument lists are also useful for creating \*(L"rule templates\*(R" |
| 2597 | (especially when used in conjunction with the \f(CW\*(C`<matchrule:...>\*(C'\fR |
| 2598 | directive). For example, the subrule: |
| 2599 | .PP |
| 2600 | .Vb 4 |
| 2601 | \& list: <matchrule:$arg{rule}> /$arg{sep}/ list[%arg] |
| 2602 | \& { $return = [ $item[1], @{$item[3]} ] } |
| 2603 | \& | <matchrule:$arg{rule}> |
| 2604 | \& { $return = [ $item[1]] } |
| 2605 | .Ve |
| 2606 | .PP |
| 2607 | is a handy template for the common problem of matching a separated list. |
| 2608 | For example: |
| 2609 | .PP |
| 2610 | .Vb 1 |
| 2611 | \& function: 'func' name '(' list[rule=>'param',sep=>';'] ')' |
| 2612 | .Ve |
| 2613 | .PP |
| 2614 | .Vb 1 |
| 2615 | \& param: list[rule=>'name',sep=>','] ':' typename |
| 2616 | .Ve |
| 2617 | .PP |
| 2618 | .Vb 1 |
| 2619 | \& name: /\ew+/ |
| 2620 | .Ve |
| 2621 | .PP |
| 2622 | .Vb 1 |
| 2623 | \& typename: name |
| 2624 | .Ve |
| 2625 | .PP |
| 2626 | When a subrule argument list is used with a repeated subrule, the argument list |
| 2627 | goes \fIbefore\fR the repetition specifier: |
| 2628 | .PP |
| 2629 | .Vb 1 |
| 2630 | \& list: /some|many/ thing[ $item[1] ](s) |
| 2631 | .Ve |
| 2632 | .PP |
| 2633 | The argument list is \*(L"late bound\*(R". That is, it is re-evaluated for every |
| 2634 | repetition of the repeated subrule. |
| 2635 | This means that each repeated attempt to match the subrule may be |
| 2636 | passed a completely different set of arguments if the value of the |
| 2637 | expression in the argument list changes between attempts. So, for |
| 2638 | example, the grammar: |
| 2639 | .PP |
| 2640 | .Vb 1 |
| 2641 | \& { $::species = 'dogs' } |
| 2642 | .Ve |
| 2643 | .PP |
| 2644 | .Vb 1 |
| 2645 | \& pair: 'two' animal[$::species](s) |
| 2646 | .Ve |
| 2647 | .PP |
| 2648 | .Vb 1 |
| 2649 | \& animal: /$arg[0]/ { $::species = 'cats' } |
| 2650 | .Ve |
| 2651 | .PP |
| 2652 | will match the string \*(L"two dogs cats cats\*(R" completely, whereas |
| 2653 | it will only match the string \*(L"two dogs dogs dogs\*(R" up to the |
| 2654 | eighth letter. If the value of the argument list were \*(L"early bound\*(R" |
| 2655 | (that is, evaluated only the first time a repeated subrule match is |
| 2656 | attempted), one would expect the matching behaviours to be reversed. |
| 2657 | .PP |
| 2658 | Of course, it is possible to effectively \*(L"early bind\*(R" such argument lists |
| 2659 | by passing them a value which does not change on each repetition. For example: |
| 2660 | .PP |
| 2661 | .Vb 1 |
| 2662 | \& { $::species = 'dogs' } |
| 2663 | .Ve |
| 2664 | .PP |
| 2665 | .Vb 1 |
| 2666 | \& pair: 'two' { $::species } animal[$item[2]](s) |
| 2667 | .Ve |
| 2668 | .PP |
| 2669 | .Vb 1 |
| 2670 | \& animal: /$arg[0]/ { $::species = 'cats' } |
| 2671 | .Ve |
| 2672 | .PP |
| 2673 | Arguments can also be passed to the start rule, simply by appending them |
| 2674 | to the argument list with which the start rule is called (\fIafter\fR the |
| 2675 | \&\*(L"line number\*(R" parameter). For example, given: |
| 2676 | .PP |
| 2677 | .Vb 1 |
| 2678 | \& $parser = new Parse::RecDescent ( $grammar ); |
| 2679 | .Ve |
| 2680 | .PP |
| 2681 | .Vb 1 |
| 2682 | \& $parser->data($text, 1, "str", 2, \e@arr); |
| 2683 | .Ve |
| 2684 | .PP |
| 2685 | .Vb 5 |
| 2686 | \& # ^^^^^ ^ ^^^^^^^^^^^^^^^ |
| 2687 | \& # | | | |
| 2688 | \& # TEXT TO BE PARSED | | |
| 2689 | \& # STARTING LINE NUMBER | |
| 2690 | \& # ELEMENTS OF @arg WHICH IS PASSED TO RULE data |
| 2691 | .Ve |
| 2692 | .PP |
| 2693 | then within the productions of the rule \f(CW\*(C`data\*(C'\fR, the array \f(CW@arg\fR will contain |
| 2694 | \&\f(CW\*(C`("str", 2, \e@arr)\*(C'\fR. |
| 2695 | .Sh "Alternations" |
| 2696 | .IX Subsection "Alternations" |
| 2697 | Alternations are implicit (unnamed) rules defined as part of a production. An |
| 2698 | alternation is defined as a series of '|'\-separated productions inside a |
| 2699 | pair of round brackets. For example: |
| 2700 | .PP |
| 2701 | .Vb 1 |
| 2702 | \& character: 'the' ( good | bad | ugly ) /dude/ |
| 2703 | .Ve |
| 2704 | .PP |
| 2705 | Every alternation implicitly defines a new subrule, whose |
| 2706 | automatically-generated name indicates its origin: |
| 2707 | \&\*(L"_alternation_<I>_of_production_<P>_of_rule_<R>\*(R" for the appropriate |
| 2708 | values of <I>, <P>, and <R>. A call to this implicit subrule is then |
| 2709 | inserted in place of the brackets. Hence the above example is merely a |
| 2710 | convenient short-hand for: |
| 2711 | .PP |
| 2712 | .Vb 3 |
| 2713 | \& character: 'the' |
| 2714 | \& _alternation_1_of_production_1_of_rule_character |
| 2715 | \& /dude/ |
| 2716 | .Ve |
| 2717 | .PP |
| 2718 | .Vb 2 |
| 2719 | \& _alternation_1_of_production_1_of_rule_character: |
| 2720 | \& good | bad | ugly |
| 2721 | .Ve |
| 2722 | .PP |
| 2723 | Since alternations are parsed by recursively calling the parser generator, |
| 2724 | any type(s) of item can appear in an alternation. For example: |
| 2725 | .PP |
| 2726 | .Vb 5 |
| 2727 | \& character: 'the' ( 'high' "plains" # Silent, with poncho |
| 2728 | \& | /no[- ]name/ # Silent, no poncho |
| 2729 | \& | vengeance_seeking # Poncho-optional |
| 2730 | \& | <error> |
| 2731 | \& ) drifter |
| 2732 | .Ve |
| 2733 | .PP |
| 2734 | In this case, if an error occurred, the automatically generated |
| 2735 | message would be: |
| 2736 | .PP |
| 2737 | .Vb 3 |
| 2738 | \& ERROR (line <N>): Invalid implicit subrule: Expected |
| 2739 | \& 'high' or /no[- ]name/ or generic, |
| 2740 | \& but found "pacifist" instead |
| 2741 | .Ve |
| 2742 | .PP |
| 2743 | Since every alternation actually has a name, it's even possible |
| 2744 | to extend or replace them: |
| 2745 | .PP |
| 2746 | .Vb 4 |
| 2747 | \& $parser->Replace( |
| 2748 | \& "_alternation_1_of_production_1_of_rule_character: |
| 2749 | \& 'generic Eastwood'" |
| 2750 | \& ); |
| 2751 | .Ve |
| 2752 | .PP |
| 2753 | More importantly, since alternations are a form of subrule, they can be given |
| 2754 | repetition specifiers: |
| 2755 | .PP |
| 2756 | .Vb 1 |
| 2757 | \& character: 'the' ( good | bad | ugly )(?) /dude/ |
| 2758 | .Ve |
| 2759 | .Sh "Incremental Parsing" |
| 2760 | .IX Subsection "Incremental Parsing" |
| 2761 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides two methods \- \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR \- which |
| 2762 | can be used to alter the grammar matched by a parser. Both methods |
| 2763 | take the same argument as \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, namely a |
| 2764 | grammar specification string. |
| 2765 | .PP |
| 2766 | \&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR interprets the grammar specification and adds any |
| 2767 | productions it finds to the end of the rules for which they are specified. For |
| 2768 | example: |
| 2769 | .PP |
| 2770 | .Vb 2 |
| 2771 | \& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/"; |
| 2772 | \& $parser->Extend($add); |
| 2773 | .Ve |
| 2774 | .PP |
| 2775 | adds two productions to the rule \*(L"name\*(R" (creating it if necessary) and one |
| 2776 | production to the rule \*(L"desc\*(R". |
| 2777 | .PP |
| 2778 | \&\f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR is identical, except that it first resets any |
| 2779 | rule specified in the additional grammar, removing any existing productions. |
| 2780 | Hence after: |
| 2781 | .PP |
| 2782 | .Vb 2 |
| 2783 | \& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/"; |
| 2784 | \& $parser->Replace($add); |
| 2785 | .Ve |
| 2786 | .PP |
| 2787 | there are \fIonly\fR two valid \*(L"name\*(R"s and the one possible description. |
| 2788 | .PP |
| 2789 | A more interesting use of the \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR methods is to call them |
| 2790 | inside the action of an executing parser. For example: |
| 2791 | .PP |
| 2792 | .Vb 3 |
| 2793 | \& typedef: 'typedef' type_name identifier ';' |
| 2794 | \& { $thisparser->Extend("type_name: '$item[3]'") } |
| 2795 | \& | <error> |
| 2796 | .Ve |
| 2797 | .PP |
| 2798 | .Vb 1 |
| 2799 | \& identifier: ...!type_name /[A-Za-z_]\ew*/ |
| 2800 | .Ve |
| 2801 | .PP |
| 2802 | which automatically prevents type names from being typedef'd, or: |
| 2803 | .PP |
| 2804 | .Vb 6 |
| 2805 | \& command: 'map' key_name 'to' abort_key |
| 2806 | \& { $thisparser->Replace("abort_key: '$item[2]'") } |
| 2807 | \& | 'map' key_name 'to' key_name |
| 2808 | \& { map_key($item[2],$item[4]) } |
| 2809 | \& | abort_key |
| 2810 | \& { exit if confirm("abort?") } |
| 2811 | .Ve |
| 2812 | .PP |
| 2813 | .Vb 1 |
| 2814 | \& abort_key: 'q' |
| 2815 | .Ve |
| 2816 | .PP |
| 2817 | .Vb 1 |
| 2818 | \& key_name: ...!abort_key /[A-Za-z]/ |
| 2819 | .Ve |
| 2820 | .PP |
| 2821 | which allows the user to change the abort key binding, but not to unbind it. |
| 2822 | .PP |
| 2823 | The careful use of such constructs makes it possible to reconfigure |
| 2824 | a running parser, eliminating the need for semantic feedback by |
| 2825 | providing syntactic feedback instead. However, as currently implemented, |
| 2826 | \&\f(CW\*(C`Replace()\*(C'\fR and \f(CW\*(C`Extend()\*(C'\fR have to regenerate and re\-\f(CW\*(C`eval\*(C'\fR the |
| 2827 | entire parser whenever they are called. This makes them quite slow for |
| 2828 | large grammars. |
| 2829 | .PP |
| 2830 | In such cases, the judicious use of an interpolated regex is likely to |
| 2831 | be far more efficient: |
| 2832 | .PP |
| 2833 | .Vb 3 |
| 2834 | \& typedef: 'typedef' type_name identifier ';' |
| 2835 | \& { $thisparser->{local}{type_name} .= "|$item[3]" } |
| 2836 | \& | <error> |
| 2837 | .Ve |
| 2838 | .PP |
| 2839 | .Vb 1 |
| 2840 | \& identifier: ...!type_name /[A-Za-z_]\ew*/ |
| 2841 | .Ve |
| 2842 | .PP |
| 2843 | .Vb 1 |
| 2844 | \& type_name: /$thisparser->{local}{type_name}/ |
| 2845 | .Ve |
| 2846 | .Sh "Precompiling parsers" |
| 2847 | .IX Subsection "Precompiling parsers" |
| 2848 | Normally Parse::RecDescent builds a parser from a grammar at run\-time. |
| 2849 | That approach simplifies the design and implementation of parsing code, |
| 2850 | but has the disadvantage that it slows the parsing process down \- you |
| 2851 | have to wait for Parse::RecDescent to build the parser every time the |
| 2852 | program runs. Long or complex grammars can be particularly slow to |
| 2853 | build, leading to unacceptable delays at start\-up. |
| 2854 | .PP |
| 2855 | To overcome this, the module provides a way of \*(L"pre\-building\*(R" a parser |
| 2856 | object and saving it in a separate module. That module can then be used |
| 2857 | to create clones of the original parser. |
| 2858 | .PP |
| 2859 | A grammar may be precompiled using the \f(CW\*(C`Precompile\*(C'\fR class method. |
| 2860 | For example, to precompile a grammar stored in the scalar \f(CW$grammar\fR, |
| 2861 | and produce a class named PreGrammar in a module file named PreGrammar.pm, |
| 2862 | you could use: |
| 2863 | .PP |
| 2864 | .Vb 1 |
| 2865 | \& use Parse::RecDescent; |
| 2866 | .Ve |
| 2867 | .PP |
| 2868 | .Vb 1 |
| 2869 | \& Parse::RecDescent->Precompile($grammar, "PreGrammar"); |
| 2870 | .Ve |
| 2871 | .PP |
| 2872 | The first argument is the grammar string, the second is the name of the class |
| 2873 | to be built. The name of the module file is generated automatically by |
| 2874 | appending \*(L".pm\*(R" to the last element of the class name. Thus |
| 2875 | .PP |
| 2876 | .Vb 1 |
| 2877 | \& Parse::RecDescent->Precompile($grammar, "My::New::Parser"); |
| 2878 | .Ve |
| 2879 | .PP |
| 2880 | would produce a module file named Parser.pm. |
| 2881 | .PP |
| 2882 | It is somewhat tedious to have to write a small Perl program just to |
| 2883 | generate a precompiled grammar class, so Parse::RecDescent has some special |
| 2884 | magic that allows you to do the job directly from the command\-line. |
| 2885 | .PP |
| 2886 | If your grammar is specified in a file named \fIgrammar\fR, you can generate |
| 2887 | a class named Yet::Another::Grammar like so: |
| 2888 | .PP |
| 2889 | .Vb 1 |
| 2890 | \& > perl -MParse::RecDescent - grammar Yet::Another::Grammar |
| 2891 | .Ve |
| 2892 | .PP |
| 2893 | This would produce a file named \fIGrammar.pm\fR containing the full |
| 2894 | definition of a class called Yet::Another::Grammar. Of course, to use |
| 2895 | that class, you would need to put the \fIGrammar.pm\fR file in a |
| 2896 | directory named \fIYet/Another\fR, somewhere in your Perl include path. |
| 2897 | .PP |
| 2898 | Having created the new class, it's very easy to use it to build |
| 2899 | a parser. You simply \f(CW\*(C`use\*(C'\fR the new module, and then call its |
| 2900 | \&\f(CW\*(C`new\*(C'\fR method to create a parser object. For example: |
| 2901 | .PP |
| 2902 | .Vb 2 |
| 2903 | \& use Yet::Another::Grammar; |
| 2904 | \& my $parser = Yet::Another::Grammar->new(); |
| 2905 | .Ve |
| 2906 | .PP |
| 2907 | The effect of these two lines is exactly the same as: |
| 2908 | .PP |
| 2909 | .Vb 1 |
| 2910 | \& use Parse::RecDescent; |
| 2911 | .Ve |
| 2912 | .PP |
| 2913 | .Vb 3 |
| 2914 | \& open GRAMMAR_FILE, "grammar" or die; |
| 2915 | \& local $/; |
| 2916 | \& my $grammar = <GRAMMAR_FILE>; |
| 2917 | .Ve |
| 2918 | .PP |
| 2919 | .Vb 1 |
| 2920 | \& my $parser = Parse::RecDescent->new($grammar); |
| 2921 | .Ve |
| 2922 | .PP |
| 2923 | only considerably faster. |
| 2924 | .PP |
| 2925 | Note however that the parsers produced by either approach are exactly |
| 2926 | the same, so whilst precompilation has an effect on \fIset-up\fR speed, |
| 2927 | it has no effect on \fIparsing\fR speed. RecDescent 2.0 will address that |
| 2928 | problem. |
| 2929 | .ie n .Sh "A Metagrammar for ""Parse::RecDescent""" |
| 2930 | .el .Sh "A Metagrammar for \f(CWParse::RecDescent\fP" |
| 2931 | .IX Subsection "A Metagrammar for Parse::RecDescent" |
| 2932 | The following is a specification of the grammar format accepted by |
| 2933 | \&\f(CW\*(C`Parse::RecDescent::new\*(C'\fR (specified in the \f(CW\*(C`Parse::RecDescent\*(C'\fR grammar format!): |
| 2934 | .PP |
| 2935 | .Vb 1 |
| 2936 | \& grammar : component(s) |
| 2937 | .Ve |
| 2938 | .PP |
| 2939 | .Vb 1 |
| 2940 | \& component : rule | comment |
| 2941 | .Ve |
| 2942 | .PP |
| 2943 | .Vb 1 |
| 2944 | \& rule : "\en" identifier ":" production(s?) |
| 2945 | .Ve |
| 2946 | .PP |
| 2947 | .Vb 1 |
| 2948 | \& production : item(s) |
| 2949 | .Ve |
| 2950 | .PP |
| 2951 | .Vb 3 |
| 2952 | \& item : lookahead(?) simpleitem |
| 2953 | \& | directive |
| 2954 | \& | comment |
| 2955 | .Ve |
| 2956 | .PP |
| 2957 | .Vb 1 |
| 2958 | \& lookahead : '...' | '...!' # +'ve or -'ve lookahead |
| 2959 | .Ve |
| 2960 | .PP |
| 2961 | .Vb 5 |
| 2962 | \& simpleitem : subrule args(?) # match another rule |
| 2963 | \& | repetition # match repeated subrules |
| 2964 | \& | terminal # match the next input |
| 2965 | \& | bracket args(?) # match alternative items |
| 2966 | \& | action # do something |
| 2967 | .Ve |
| 2968 | .PP |
| 2969 | .Vb 1 |
| 2970 | \& subrule : identifier # the name of the rule |
| 2971 | .Ve |
| 2972 | .PP |
| 2973 | .Vb 1 |
| 2974 | \& args : {extract_codeblock($text,'[]')} # just like a [...] array ref |
| 2975 | .Ve |
| 2976 | .PP |
| 2977 | .Vb 1 |
| 2978 | \& repetition : subrule args(?) howoften |
| 2979 | .Ve |
| 2980 | .PP |
| 2981 | .Vb 6 |
| 2982 | \& howoften : '(?)' # 0 or 1 times |
| 2983 | \& | '(s?)' # 0 or more times |
| 2984 | \& | '(s)' # 1 or more times |
| 2985 | \& | /[(](\ed+)[.][.](\ed+)[)]/ # $1 to $2 times |
| 2986 | \& | /[(][.][.](\ed+)[)]/ # at most $1 times |
| 2987 | \& | /[(](\ed+)[.][.][)]/ # at least $1 times |
| 2988 | .Ve |
| 2989 | .PP |
| 2990 | .Vb 3 |
| 2991 | \& terminal : /[/]([\e][/]|[^/])*[/]/ # interpolated pattern |
| 2992 | \& | /"([\e]"|[^"])*"/ # interpolated literal |
| 2993 | \& | /'([\e]'|[^'])*'/ # uninterpolated literal |
| 2994 | .Ve |
| 2995 | .PP |
| 2996 | .Vb 1 |
| 2997 | \& action : { extract_codeblock($text) } # embedded Perl code |
| 2998 | .Ve |
| 2999 | .PP |
| 3000 | .Vb 1 |
| 3001 | \& bracket : '(' item(s) production(s?) ')' # alternative subrules |
| 3002 | .Ve |
| 3003 | .PP |
| 3004 | .Vb 12 |
| 3005 | \& directive : '<commit>' # commit to production |
| 3006 | \& | '<uncommit>' # cancel commitment |
| 3007 | \& | '<resync>' # skip to newline |
| 3008 | \& | '<resync:' pattern '>' # skip <pattern> |
| 3009 | \& | '<reject>' # fail this production |
| 3010 | \& | '<reject:' condition '>' # fail if <condition> |
| 3011 | \& | '<error>' # report an error |
| 3012 | \& | '<error:' string '>' # report error as "<string>" |
| 3013 | \& | '<error?>' # error only if committed |
| 3014 | \& | '<error?:' string '>' # " " " " |
| 3015 | \& | '<rulevar:' /[^>]+/ '>' # define rule-local variable |
| 3016 | \& | '<matchrule:' string '>' # invoke rule named in string |
| 3017 | .Ve |
| 3018 | .PP |
| 3019 | .Vb 1 |
| 3020 | \& identifier : /[a-z]\ew*/i # must start with alpha |
| 3021 | .Ve |
| 3022 | .PP |
| 3023 | .Vb 1 |
| 3024 | \& comment : /#[^\en]*/ # same as Perl |
| 3025 | .Ve |
| 3026 | .PP |
| 3027 | .Vb 1 |
| 3028 | \& pattern : {extract_bracketed($text,'<')} # allow embedded "<..>" |
| 3029 | .Ve |
| 3030 | .PP |
| 3031 | .Vb 1 |
| 3032 | \& condition : {extract_codeblock($text,'{<')} # full Perl expression |
| 3033 | .Ve |
| 3034 | .PP |
| 3035 | .Vb 3 |
| 3036 | \& string : {extract_variable($text)} # any Perl variable |
| 3037 | \& | {extract_quotelike($text)} # or quotelike string |
| 3038 | \& | {extract_bracketed($text,'<')} # or balanced brackets |
| 3039 | .Ve |
| 3040 | .SH "GOTCHAS" |
| 3041 | .IX Header "GOTCHAS" |
| 3042 | This section describes common mistakes that grammar writers seem to |
| 3043 | make on a regular basis. |
| 3044 | .Sh "1. Expecting an error to always invalidate a parse" |
| 3045 | .IX Subsection "1. Expecting an error to always invalidate a parse" |
| 3046 | A common mistake when using error messages is to write the grammar like this: |
| 3047 | .PP |
| 3048 | .Vb 1 |
| 3049 | \& file: line(s) |
| 3050 | .Ve |
| 3051 | .PP |
| 3052 | .Vb 4 |
| 3053 | \& line: line_type_1 |
| 3054 | \& | line_type_2 |
| 3055 | \& | line_type_3 |
| 3056 | \& | <error> |
| 3057 | .Ve |
| 3058 | .PP |
| 3059 | The expectation seems to be that any line that is not of type 1, 2 or 3 will |
| 3060 | invoke the \f(CW\*(C`<error>\*(C'\fR directive and thereby cause the parse to fail. |
| 3061 | .PP |
| 3062 | Unfortunately, that only happens if the error occurs in the very first line. |
| 3063 | The first rule states that a \f(CW\*(C`file\*(C'\fR is matched by one or more lines, so if |
| 3064 | even a single line succeeds, the first rule is completely satisfied and the |
| 3065 | parse as a whole succeeds. That means that any error messages generated by |
| 3066 | subsequent failures in the \f(CW\*(C`line\*(C'\fR rule are quietly ignored. |
| 3067 | .PP |
| 3068 | Typically what's really needed is this: |
| 3069 | .PP |
| 3070 | .Vb 1 |
| 3071 | \& file: line(s) eofile { $return = $item[1] } |
| 3072 | .Ve |
| 3073 | .PP |
| 3074 | .Vb 4 |
| 3075 | \& line: line_type_1 |
| 3076 | \& | line_type_2 |
| 3077 | \& | line_type_3 |
| 3078 | \& | <error> |
| 3079 | .Ve |
| 3080 | .PP |
| 3081 | .Vb 1 |
| 3082 | \& eofile: /^\eZ/ |
| 3083 | .Ve |
| 3084 | .PP |
| 3085 | The addition of the \f(CW\*(C`eofile\*(C'\fR subrule to the first production means that |
| 3086 | a file only matches a series of successful \f(CW\*(C`line\*(C'\fR matches \fIthat consume the |
| 3087 | complete input text\fR. If any input text remains after the lines are matched, |
| 3088 | there must have been an error in the last \f(CW\*(C`line\*(C'\fR. In that case the \f(CW\*(C`eofile\*(C'\fR |
| 3089 | rule will fail, causing the entire \f(CW\*(C`file\*(C'\fR rule to fail too. |
| 3090 | .PP |
| 3091 | Note too that \f(CW\*(C`eofile\*(C'\fR must match \f(CW\*(C`/^\eZ/\*(C'\fR (end\-of\-text), \fInot\fR |
| 3092 | \&\f(CW\*(C`/^\ecZ/\*(C'\fR or \f(CW\*(C`/^\ecD/\*(C'\fR (end\-of\-file). |
| 3093 | .PP |
| 3094 | And don't forget the action at the end of the production. If you just |
| 3095 | write: |
| 3096 | .PP |
| 3097 | .Vb 1 |
| 3098 | \& file: line(s) eofile |
| 3099 | .Ve |
| 3100 | .PP |
| 3101 | then the value returned by the \f(CW\*(C`file\*(C'\fR rule will be the value of its |
| 3102 | last item: \f(CW\*(C`eofile\*(C'\fR. Since \f(CW\*(C`eofile\*(C'\fR always returns an empty string |
| 3103 | on success, that will cause the \f(CW\*(C`file\*(C'\fR rule to return that empty |
| 3104 | string. Apart from returning the wrong value, returning an empty string |
| 3105 | will trip up code such as: |
| 3106 | .PP |
| 3107 | .Vb 1 |
| 3108 | \& $parser->file($filetext) || die; |
| 3109 | .Ve |
| 3110 | .PP |
| 3111 | (since "" is false). |
| 3112 | .PP |
| 3113 | Remember that Parse::RecDescent returns undef on failure, |
| 3114 | so the only safe test for failure is: |
| 3115 | .PP |
| 3116 | .Vb 1 |
| 3117 | \& defined($parser->file($filetext)) || die; |
| 3118 | .Ve |
| 3119 | .SH "DIAGNOSTICS" |
| 3120 | .IX Header "DIAGNOSTICS" |
| 3121 | Diagnostics are intended to be self-explanatory (particularly if you |
| 3122 | use \fB\-RD_HINT\fR (under \fBperl \-s\fR) or define \f(CW$::RD_HINT\fR inside the program). |
| 3123 | .PP |
| 3124 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR currently diagnoses the following: |
| 3125 | .IP "\(bu" 4 |
| 3126 | Invalid regular expressions used as pattern terminals (fatal error). |
| 3127 | .IP "\(bu" 4 |
| 3128 | Invalid Perl code in code blocks (fatal error). |
| 3129 | .IP "\(bu" 4 |
| 3130 | Lookahead used in the wrong place or in a nonsensical way (fatal error). |
| 3131 | .IP "\(bu" 4 |
| 3132 | \&\*(L"Obvious\*(R" cases of left-recursion (fatal error). |
| 3133 | .IP "\(bu" 4 |
| 3134 | Missing or extra components in a \f(CW\*(C`<leftop>\*(C'\fR or \f(CW\*(C`<rightop>\*(C'\fR |
| 3135 | directive. |
| 3136 | .IP "\(bu" 4 |
| 3137 | Unrecognisable components in the grammar specification (fatal error). |
| 3138 | .IP "\(bu" 4 |
| 3139 | \&\*(L"Orphaned\*(R" rule components specified before the first rule (fatal error) |
| 3140 | or after an \f(CW\*(C`<error>\*(C'\fR directive (level 3 warning). |
| 3141 | .IP "\(bu" 4 |
| 3142 | Missing rule definitions (this only generates a level 3 warning, since you |
| 3143 | may be providing them later via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR). |
| 3144 | .IP "\(bu" 4 |
| 3145 | Instances where greedy repetition behaviour will almost certainly |
| 3146 | cause the failure of a production (a level 3 warning \- see |
| 3147 | \&\*(L"\s-1ON\-GOING\s0 \s-1ISSUES\s0 \s-1AND\s0 \s-1FUTURE\s0 \s-1DIRECTIONS\s0\*(R" below). |
| 3148 | .IP "\(bu" 4 |
| 3149 | Attempts to define rules named 'Replace' or 'Extend', which cannot be |
| 3150 | called directly through the parser object because of the predefined |
| 3151 | meaning of \f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR and |
| 3152 | \&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR. (Only a level 2 warning is generated, since |
| 3153 | such rules \fIcan\fR still be used as subrules). |
| 3154 | .IP "\(bu" 4 |
| 3155 | Productions which consist of a single \f(CW\*(C`<error?>\*(C'\fR |
| 3156 | directive, and which therefore may succeed unexpectedly |
| 3157 | (a level 2 warning, since this might conceivably be the desired effect). |
| 3158 | .IP "\(bu" 4 |
| 3159 | Multiple consecutive lookahead specifiers (a level 1 warning only, since their |
| 3160 | effects simply accumulate). |
| 3161 | .IP "\(bu" 4 |
| 3162 | Productions which start with a \f(CW\*(C`<reject>\*(C'\fR or \f(CW\*(C`<rulevar:...>\*(C'\fR |
| 3163 | directive. Such productions are optimized away (a level 1 warning). |
| 3164 | .IP "\(bu" 4 |
| 3165 | Rules which are autogenerated under \f(CW$::AUTOSTUB\fR (a level 1 warning). |
| 3166 | .SH "AUTHOR" |
| 3167 | .IX Header "AUTHOR" |
| 3168 | Damian Conway (damian@conway.org) |
| 3169 | .SH "BUGS AND IRRITATIONS" |
| 3170 | .IX Header "BUGS AND IRRITATIONS" |
| 3171 | There are undoubtedly serious bugs lurking somewhere in this much code :\-) |
| 3172 | Bug reports and other feedback are most welcome. |
| 3173 | .PP |
| 3174 | Ongoing annoyances include: |
| 3175 | .IP "\(bu" 4 |
| 3176 | There's no support for parsing directly from an input stream. |
| 3177 | If and when the Perl Gods give us regular expressions on streams, |
| 3178 | this should be trivial (ahem!) to implement. |
| 3179 | .IP "\(bu" 4 |
| 3180 | The parser generator can get confused if actions aren't properly |
| 3181 | closed or if they contain particularly nasty Perl syntax errors |
| 3182 | (especially unmatched curly brackets). |
| 3183 | .IP "\(bu" 4 |
| 3184 | The generator only detects the most obvious form of left recursion |
| 3185 | (potential recursion on the first subrule in a rule). More subtle |
| 3186 | forms of left recursion (for example, through the second item in a |
| 3187 | rule after a \*(L"zero\*(R" match of a preceding \*(L"zero\-or\-more\*(R" repetition, |
| 3188 | or after a match of a subrule with an empty production) are not found. |
| 3189 | .IP "\(bu" 4 |
| 3190 | Instead of complaining about left\-recursion, the generator should |
| 3191 | silently transform the grammar to remove it. Don't expect this |
| 3192 | feature any time soon as it would require a more sophisticated |
| 3193 | approach to parser generation than is currently used. |
| 3194 | .IP "\(bu" 4 |
| 3195 | The generated parsers don't always run as fast as might be wished. |
| 3196 | .IP "\(bu" 4 |
| 3197 | The meta-parser should be bootstrapped using \f(CW\*(C`Parse::RecDescent\*(C'\fR :\-) |
| 3198 | .SH "ON-GOING ISSUES AND FUTURE DIRECTIONS" |
| 3199 | .IX Header "ON-GOING ISSUES AND FUTURE DIRECTIONS" |
| 3200 | .IP "1." 4 |
| 3201 | Repetitions are \*(L"incorrigibly greedy\*(R" in that they will eat everything they can |
| 3202 | and won't backtrack if that behaviour causes a production to fail needlessly. |
| 3203 | So, for example: |
| 3204 | .Sp |
| 3205 | .Vb 1 |
| 3206 | \& rule: subrule(s) subrule |
| 3207 | .Ve |
| 3208 | .Sp |
| 3209 | will \fInever\fR succeed, because the repetition will eat all the |
| 3210 | subrules it finds, leaving none to match the second item. Such |
| 3211 | constructions are relatively rare (and \f(CW\*(C`Parse::RecDescent::new\*(C'\fR generates a |
| 3212 | warning whenever they occur) so this may not be a problem, especially |
| 3213 | since the insatiable behaviour can be overcome \*(L"manually\*(R" by writing: |
| 3214 | .Sp |
| 3215 | .Vb 1 |
| 3216 | \& rule: penultimate_subrule(s) subrule |
| 3217 | .Ve |
| 3218 | .Sp |
| 3219 | .Vb 1 |
| 3220 | \& penultimate_subrule: subrule ...subrule |
| 3221 | .Ve |
| 3222 | .Sp |
| 3223 | The issue is that this construction is exactly twice as expensive as the |
| 3224 | original, whereas backtracking would add only 1/\fIN\fR to the cost (for |
| 3225 | matching \fIN\fR repetitions of \f(CW\*(C`subrule\*(C'\fR). I would welcome feedback on |
| 3226 | the need for backtracking; particularly on cases where the lack of it |
| 3227 | makes parsing performance problematical. |
| 3228 | .IP "2." 4 |
| 3229 | Having opened that can of worms, it's also necessary to consider whether there |
| 3230 | is a need for non-greedy repetition specifiers. Again, it's possible (at some |
| 3231 | cost) to manually provide the required functionality: |
| 3232 | .Sp |
| 3233 | .Vb 1 |
| 3234 | \& rule: nongreedy_subrule(s) othersubrule |
| 3235 | .Ve |
| 3236 | .Sp |
| 3237 | .Vb 1 |
| 3238 | \& nongreedy_subrule: subrule ...!othersubrule |
| 3239 | .Ve |
| 3240 | .Sp |
| 3241 | Overall, the issue is whether the benefit of this extra functionality |
| 3242 | outweighs the drawbacks of further complicating the (currently |
| 3243 | minimalist) grammar specification syntax, and (worse) introducing more overhead |
| 3244 | into the generated parsers. |
| 3245 | .IP "3." 4 |
| 3246 | An \f(CW\*(C`<autocommit>\*(C'\fR directive would be nice. That is, it would be useful to be |
| 3247 | able to say: |
| 3248 | .Sp |
| 3249 | .Vb 7 |
| 3250 | \& command: <autocommit> |
| 3251 | \& command: 'find' name |
| 3252 | \& | 'find' address |
| 3253 | \& | 'do' command 'at' time 'if' condition |
| 3254 | \& | 'do' command 'at' time |
| 3255 | \& | 'do' command |
| 3256 | \& | unusual_command |
| 3257 | .Ve |
| 3258 | .Sp |
| 3259 | and have the generator work out that this should be \*(L"pruned\*(R" thus: |
| 3260 | .Sp |
| 3261 | .Vb 9 |
| 3262 | \& command: 'find' name |
| 3263 | \& | 'find' <commit> address |
| 3264 | \& | 'do' <commit> command <uncommit> |
| 3265 | \& 'at' time |
| 3266 | \& 'if' <commit> condition |
| 3267 | \& | 'do' <commit> command <uncommit> |
| 3268 | \& 'at' <commit> time |
| 3269 | \& | 'do' <commit> command |
| 3270 | \& | unusual_command |
| 3271 | .Ve |
| 3272 | .Sp |
| 3273 | There are several issues here. Firstly, should the |
| 3274 | \&\f(CW\*(C`<autocommit>\*(C'\fR automatically install an \f(CW\*(C`<uncommit>\*(C'\fR |
| 3275 | at the start of the last production (on the grounds that the \*(L"command\*(R" |
| 3276 | rule doesn't know whether an \*(L"unusual_command\*(R" might start with \*(L"find\*(R" |
| 3277 | or \*(L"do\*(R") or should the \*(L"unusual_command\*(R" subgraph be analysed (to see |
| 3278 | if it \fImight\fR be viable after a \*(L"find\*(R" or \*(L"do\*(R")? |
| 3279 | .Sp |
| 3280 | The second issue is how regular expressions should be treated. The simplest |
| 3281 | approach would be simply to uncommit before them (on the grounds that they |
| 3282 | \&\fImight\fR match). Better efficiency would be obtained by analyzing all preceding |
| 3283 | literal tokens to determine whether the pattern would match them. |
| 3284 | .Sp |
| 3285 | Overall, the issues are: can such automated \*(L"pruning\*(R" approach a hand-tuned |
| 3286 | version sufficiently closely to warrant the extra set-up expense, and (more |
| 3287 | importantly) is the problem important enough to even warrant the non-trivial |
| 3288 | effort of building an automated solution? |
| 3289 | .SH "COPYRIGHT" |
| 3290 | .IX Header "COPYRIGHT" |
| 3291 | Copyright (c) 1997\-2000, Damian Conway. All Rights Reserved. |
| 3292 | This module is free software. It may be used, redistributed |
| 3293 | and/or modified under the terms of the Perl Artistic License |
| 3294 | (see http://www.perl.com/perl/misc/Artistic.html). |