Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | .\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13 |
2 | .\" | |
3 | .\" Standard preamble: | |
4 | .\" ======================================================================== | |
5 | .de Sh \" Subsection heading | |
6 | .br | |
7 | .if t .Sp | |
8 | .ne 5 | |
9 | .PP | |
10 | \fB\\$1\fR | |
11 | .PP | |
12 | .. | |
13 | .de Sp \" Vertical space (when we can't use .PP) | |
14 | .if t .sp .5v | |
15 | .if n .sp | |
16 | .. | |
17 | .de Vb \" Begin verbatim text | |
18 | .ft CW | |
19 | .nf | |
20 | .ne \\$1 | |
21 | .. | |
22 | .de Ve \" End verbatim text | |
23 | .ft R | |
24 | .fi | |
25 | .. | |
26 | .\" Set up some character translations and predefined strings. \*(-- will | |
27 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left | |
28 | .\" double quote, and \*(R" will give a right double quote. | will give a | |
29 | .\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to | |
30 | .\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C' | |
31 | .\" expand to `' in nroff, nothing in troff, for use with C<>. | |
32 | .tr \(*W-|\(bv\*(Tr | |
33 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' | |
34 | .ie n \{\ | |
35 | . ds -- \(*W- | |
36 | . ds PI pi | |
37 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch | |
38 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch | |
39 | . ds L" "" | |
40 | . ds R" "" | |
41 | . ds C` "" | |
42 | . ds C' "" | |
43 | 'br\} | |
44 | .el\{\ | |
45 | . ds -- \|\(em\| | |
46 | . ds PI \(*p | |
47 | . ds L" `` | |
48 | . ds R" '' | |
49 | 'br\} | |
50 | .\" | |
51 | .\" If the F register is turned on, we'll generate index entries on stderr for | |
52 | .\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index | |
53 | .\" entries marked with X<> in POD. Of course, you'll have to process the | |
54 | .\" output yourself in some meaningful fashion. | |
55 | .if \nF \{\ | |
56 | . de IX | |
57 | . tm Index:\\$1\t\\n%\t"\\$2" | |
58 | .. | |
59 | . nr % 0 | |
60 | . rr F | |
61 | .\} | |
62 | .\" | |
63 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes | |
64 | .\" way too many mistakes in technical documents. | |
65 | .hy 0 | |
66 | .if n .na | |
67 | .\" | |
68 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). | |
69 | .\" Fear. Run. Save yourself. No user-serviceable parts. | |
70 | . \" fudge factors for nroff and troff | |
71 | .if n \{\ | |
72 | . ds #H 0 | |
73 | . ds #V .8m | |
74 | . ds #F .3m | |
75 | . ds #[ \f1 | |
76 | . ds #] \fP | |
77 | .\} | |
78 | .if t \{\ | |
79 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) | |
80 | . ds #V .6m | |
81 | . ds #F 0 | |
82 | . ds #[ \& | |
83 | . ds #] \& | |
84 | .\} | |
85 | . \" simple accents for nroff and troff | |
86 | .if n \{\ | |
87 | . ds ' \& | |
88 | . ds ` \& | |
89 | . ds ^ \& | |
90 | . ds , \& | |
91 | . ds ~ ~ | |
92 | . ds / | |
93 | .\} | |
94 | .if t \{\ | |
95 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" | |
96 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' | |
97 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' | |
98 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' | |
99 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' | |
100 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' | |
101 | .\} | |
102 | . \" troff and (daisy-wheel) nroff accents | |
103 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' | |
104 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' | |
105 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] | |
106 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' | |
107 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' | |
108 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] | |
109 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] | |
110 | .ds ae a\h'-(\w'a'u*4/10)'e | |
111 | .ds Ae A\h'-(\w'A'u*4/10)'E | |
112 | . \" corrections for vroff | |
113 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' | |
114 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' | |
115 | . \" for low resolution devices (crt and lpr) | |
116 | .if \n(.H>23 .if \n(.V>19 \ | |
117 | \{\ | |
118 | . ds : e | |
119 | . ds 8 ss | |
120 | . ds o a | |
121 | . ds d- d\h'-1'\(ga | |
122 | . ds D- D\h'-1'\(hy | |
123 | . ds th \o'bp' | |
124 | . ds Th \o'LP' | |
125 | . ds ae ae | |
126 | . ds Ae AE | |
127 | .\} | |
128 | .rm #[ #] #H #V #F C | |
129 | .\" ======================================================================== | |
130 | .\" | |
131 | .IX Title "RECDESCENT 1" | |
132 | .TH RECDESCENT 1 "2000-08-20" "perl v5.8.0" "User Contributed Perl Documentation" | |
133 | .SH "NAME" | |
134 | Parse::RecDescent \- Generate Recursive\-Descent Parsers | |
135 | .SH "VERSION" | |
136 | .IX Header "VERSION" | |
137 | This document describes version 1.79 of Parse::RecDescent, | |
138 | released August 21, 2000. | |
139 | .SH "SYNOPSIS" | |
140 | .IX Header "SYNOPSIS" | |
141 | .Vb 1 | |
142 | \& use Parse::RecDescent; | |
143 | .Ve | |
144 | .PP | |
145 | .Vb 1 | |
146 | \& # Generate a parser from the specification in $grammar: | |
147 | .Ve | |
148 | .PP | |
149 | .Vb 1 | |
150 | \& $parser = new Parse::RecDescent ($grammar); | |
151 | .Ve | |
152 | .PP | |
153 | .Vb 1 | |
154 | \& # Generate a parser from the specification in $othergrammar | |
155 | .Ve | |
156 | .PP | |
157 | .Vb 1 | |
158 | \& $anotherparser = new Parse::RecDescent ($othergrammar); | |
159 | .Ve | |
160 | .PP | |
161 | .Vb 2 | |
162 | \& # Parse $text using rule 'startrule' (which must be | |
163 | \& # defined in $grammar): | |
164 | .Ve | |
165 | .PP | |
166 | .Vb 1 | |
167 | \& $parser->startrule($text); | |
168 | .Ve | |
169 | .PP | |
170 | .Vb 2 | |
171 | \& # Parse $text using rule 'otherrule' (which must also | |
172 | \& # be defined in $grammar): | |
173 | .Ve | |
174 | .PP | |
175 | .Vb 1 | |
176 | \& $parser->otherrule($text); | |
177 | .Ve | |
178 | .PP | |
179 | .Vb 2 | |
180 | \& # Change the universal token prefix pattern | |
181 | \& # (the default is: '\es*'): | |
182 | .Ve | |
183 | .PP | |
184 | .Vb 1 | |
185 | \& $Parse::RecDescent::skip = '[ \et]+'; | |
186 | .Ve | |
187 | .PP | |
188 | .Vb 2 | |
189 | \& # Replace productions of existing rules (or create new ones) | |
190 | \& # with the productions defined in $newgrammar: | |
191 | .Ve | |
192 | .PP | |
193 | .Vb 1 | |
194 | \& $parser->Replace($newgrammar); | |
195 | .Ve | |
196 | .PP | |
197 | .Vb 2 | |
198 | \& # Extend existing rules (or create new ones) | |
199 | \& # by adding extra productions defined in $moregrammar: | |
200 | .Ve | |
201 | .PP | |
202 | .Vb 1 | |
203 | \& $parser->Extend($moregrammar); | |
204 | .Ve | |
205 | .PP | |
206 | .Vb 1 | |
207 | \& # Global flags (useful as command line arguments under -s): | |
208 | .Ve | |
209 | .PP | |
210 | .Vb 6 | |
211 | \& $::RD_ERRORS # unless undefined, report fatal errors | |
212 | \& $::RD_WARN # unless undefined, also report non-fatal problems | |
213 | \& $::RD_HINT # if defined, also suggest remedies | |
214 | \& $::RD_TRACE # if defined, also trace parsers' behaviour | |
215 | \& $::RD_AUTOSTUB # if defined, generates "stubs" for undefined rules | |
216 | \& $::RD_AUTOACTION # if defined, appends specified action to productions | |
217 | .Ve | |
218 | .SH "DESCRIPTION" | |
219 | .IX Header "DESCRIPTION" | |
220 | .Sh "Overview" | |
221 | .IX Subsection "Overview" | |
222 | Parse::RecDescent incrementally generates top-down recursive-descent text | |
223 | parsers from simple \fIyacc\fR\-like grammar specifications. It provides: | |
224 | .IP "\(bu" 4 | |
225 | Regular expressions or literal strings as terminals (tokens), | |
226 | .IP "\(bu" 4 | |
227 | Multiple (non\-contiguous) productions for any rule, | |
228 | .IP "\(bu" 4 | |
229 | Repeated and optional subrules within productions, | |
230 | .IP "\(bu" 4 | |
231 | Full access to Perl within actions specified as part of the grammar, | |
232 | .IP "\(bu" 4 | |
233 | Simple automated error reporting during parser generation and parsing, | |
234 | .IP "\(bu" 4 | |
235 | The ability to commit to, uncommit to, or reject particular | |
236 | productions during a parse, | |
237 | .IP "\(bu" 4 | |
238 | The ability to pass data up and down the parse tree (\*(L"down\*(R" via subrule | |
239 | argument lists, \*(L"up\*(R" via subrule return values), | |
240 | .IP "\(bu" 4 | |
241 | Incremental extension of the parsing grammar (even during a parse), | |
242 | .IP "\(bu" 4 | |
243 | Precompilation of parser objects, | |
244 | .IP "\(bu" 4 | |
245 | User-definable reduce-reduce conflict resolution via | |
246 | \&\*(L"scoring\*(R" of matching productions. | |
247 | .ie n .Sh "Using ""Parse::RecDescent""" | |
248 | .el .Sh "Using \f(CWParse::RecDescent\fP" | |
249 | .IX Subsection "Using Parse::RecDescent" | |
250 | Parser objects are created by calling \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, passing in a | |
251 | grammar specification (see the following subsections). If the grammar is | |
252 | correct, \f(CW\*(C`new\*(C'\fR returns a blessed reference which can then be used to initiate | |
253 | parsing through any rule specified in the original grammar. A typical sequence | |
254 | looks like this: | |
255 | .PP | |
256 | .Vb 3 | |
257 | \& $grammar = q { | |
258 | \& # GRAMMAR SPECIFICATION HERE | |
259 | \& }; | |
260 | .Ve | |
261 | .PP | |
262 | .Vb 1 | |
263 | \& $parser = new Parse::RecDescent ($grammar) or die "Bad grammar!\en"; | |
264 | .Ve | |
265 | .PP | |
266 | .Vb 1 | |
267 | \& # acquire $text | |
268 | .Ve | |
269 | .PP | |
270 | .Vb 1 | |
271 | \& defined $parser->startrule($text) or print "Bad text!\en"; | |
272 | .Ve | |
273 | .PP | |
274 | The rule through which parsing is initiated must be explicitly defined | |
275 | in the grammar (i.e. for the above example, the grammar must include a | |
276 | rule of the form: \*(L"startrule: <subrules>\*(R"). | |
277 | .PP | |
278 | If the starting rule succeeds, its value (see below) | |
279 | is returned. Failure to generate the original parser or failure to match a text | |
280 | is indicated by returning \f(CW\*(C`undef\*(C'\fR. Note that it's easy to set up grammars | |
281 | that can succeed, but which return a value of 0, \*(L"0\*(R", or "". So don't be | |
282 | tempted to write: | |
283 | .PP | |
284 | .Vb 1 | |
285 | \& $parser->startrule($text) or print "Bad text!\en"; | |
286 | .Ve | |
287 | .PP | |
288 | Normally, the parser has no effect on the original text. So in the | |
289 | previous example the value of \f(CW$text\fR would be unchanged after having | |
290 | been parsed. | |
291 | .PP | |
292 | If, however, the text to be matched is passed by reference: | |
293 | .PP | |
294 | .Vb 1 | |
295 | \& $parser->startrule(\e$text) | |
296 | .Ve | |
297 | .PP | |
298 | then any text which was consumed during the match will be removed from the | |
299 | start of \f(CW$text\fR. | |
300 | .Sh "Rules" | |
301 | .IX Subsection "Rules" | |
302 | In the grammar from which the parser is built, rules are specified by | |
303 | giving an identifier (which must satisfy /[A\-Za\-z]\ew*/), followed by a | |
304 | colon \fIon the same line\fR, followed by one or more productions, | |
305 | separated by single vertical bars. The layout of the productions | |
306 | is entirely free\-format: | |
307 | .PP | |
308 | .Vb 3 | |
309 | \& rule1: production1 | |
310 | \& | production2 | | |
311 | \& production3 | production4 | |
312 | .Ve | |
313 | .PP | |
314 | At any point in the grammar previously defined rules may be extended with | |
315 | additional productions. This is achieved by redeclaring the rule with the new | |
316 | productions. Thus: | |
317 | .PP | |
318 | .Vb 3 | |
319 | \& rule1: a | b | c | |
320 | \& rule2: d | e | f | |
321 | \& rule1: g | h | |
322 | .Ve | |
323 | .PP | |
324 | is exactly equivalent to: | |
325 | .PP | |
326 | .Vb 2 | |
327 | \& rule1: a | b | c | g | h | |
328 | \& rule2: d | e | f | |
329 | .Ve | |
330 | .PP | |
331 | Each production in a rule consists of zero or more items, each of which | |
332 | may be either: the name of another rule to be matched (a \*(L"subrule\*(R"), | |
333 | a pattern or string literal to be matched directly (a \*(L"token\*(R"), a | |
334 | block of Perl code to be executed (an \*(L"action\*(R"), a special instruction | |
335 | to the parser (a \*(L"directive\*(R"), or a standard Perl comment (which is | |
336 | ignored). | |
337 | .PP | |
338 | A rule matches a text if one of its productions matches. A production | |
339 | matches if each of its items matches consecutive substrings of the | |
340 | text. The productions of a rule being matched are tried in the same | |
341 | order that they appear in the original grammar, and the first matching | |
342 | production terminates the match attempt (successfully). If all | |
343 | productions are tried and none matches, the match attempt fails. | |
344 | .PP | |
345 | Note that this behaviour is quite different from the \*(L"prefer the longer match\*(R" | |
346 | behaviour of \fIyacc\fR. For example, if \fIyacc\fR were parsing the rule: | |
347 | .PP | |
348 | .Vb 2 | |
349 | \& seq : 'A' 'B' | |
350 | \& | 'A' 'B' 'C' | |
351 | .Ve | |
352 | .PP | |
353 | upon matching \*(L"\s-1AB\s0\*(R" it would look ahead to see if a 'C' is next and, if | |
354 | so, would match the second production in preference to the first. In | |
355 | other words, \fIyacc\fR effectively tries all the productions of a rule | |
356 | breadth-first in parallel, and selects the \*(L"best\*(R" match, where \*(L"best\*(R" | |
357 | means longest (note that this is a gross simplification of the true | |
358 | behaviour of \fIyacc\fR but it will do for our purposes). | |
359 | .PP | |
360 | In contrast, \f(CW\*(C`Parse::RecDescent\*(C'\fR tries each production depth-first in | |
361 | sequence, and selects the \*(L"best\*(R" match, where \*(L"best\*(R" means first. This is | |
362 | the fundamental difference between \*(L"bottom\-up\*(R" and \*(L"recursive descent\*(R" | |
363 | parsing. | |
364 | .PP | |
365 | Each successfully matched item in a production is assigned a value, | |
366 | which can be accessed in subsequent actions within the same | |
367 | production (or, in some cases, as the return value of a successful | |
368 | subrule call). Unsuccessful items don't have an associated value, | |
369 | since the failure of an item causes the entire surrounding production | |
370 | to immediately fail. The following sections describe the various types | |
371 | of items and their success values. | |
372 | .Sh "Subrules" | |
373 | .IX Subsection "Subrules" | |
374 | A subrule which appears in a production is an instruction to the parser to | |
375 | attempt to match the named rule at that point in the text being | |
376 | parsed. If the named subrule is not defined when requested the | |
377 | production containing it immediately fails (unless it was \*(L"autostubbed\*(R" \- see | |
378 | Autostubbing). | |
379 | .PP | |
380 | A rule may (recursively) call itself as a subrule, but \fInot\fR as the | |
381 | left-most item in any of its productions (since such recursions are usually | |
382 | non\-terminating). | |
383 | .PP | |
384 | The value associated with a subrule is the value associated with its | |
385 | \&\f(CW$return\fR variable (see \*(L"Actions\*(R" below), or with the last successfully | |
386 | matched item in the subrule match. | |
387 | .PP | |
388 | Subrules may also be specified with a trailing repetition specifier, | |
389 | indicating that they are to be (greedily) matched the specified number | |
390 | of times. The available specifiers are: | |
391 | .PP | |
392 | .Vb 7 | |
393 | \& subrule(?) # Match one-or-zero times | |
394 | \& subrule(s) # Match one-or-more times | |
395 | \& subrule(s?) # Match zero-or-more times | |
396 | \& subrule(N) # Match exactly N times for integer N > 0 | |
397 | \& subrule(N..M) # Match between N and M times | |
398 | \& subrule(..M) # Match between 1 and M times | |
399 | \& subrule(N..) # Match at least N times | |
400 | .Ve | |
401 | .PP | |
402 | Repeated subrules keep matching until either the subrule fails to | |
403 | match, or it has matched the minimal number of times but fails to | |
404 | consume any of the parsed text (this second condition prevents the | |
405 | subrule matching forever in some cases). | |
406 | .PP | |
407 | Since a repeated subrule may match many instances of the subrule itself, the | |
408 | value associated with it is not a simple scalar, but rather a reference to a | |
409 | list of scalars, each of which is the value associated with one of the | |
410 | individual subrule matches. In other words in the rule: | |
411 | .PP | |
412 | .Vb 1 | |
413 | \& program: statement(s) | |
414 | .Ve | |
415 | .PP | |
416 | the value associated with the repeated subrule \*(L"statement(s)\*(R" is a reference | |
417 | to an array containing the values matched by each call to the individual | |
418 | subrule \*(L"statement\*(R". | |
419 | .PP | |
420 | Repetition modifiers may include a separator pattern: | |
421 | .PP | |
422 | .Vb 1 | |
423 | \& program: statement(s /;/) | |
424 | .Ve | |
425 | .PP | |
426 | specifying some sequence of characters to be skipped between each repetition. | |
427 | This is really just a shorthand for the <leftop:...> directive | |
428 | (see below). | |
429 | .Sh "Tokens" | |
430 | .IX Subsection "Tokens" | |
431 | If a quote-delimited string or a Perl regex appears in a production, | |
432 | the parser attempts to match that string or pattern at that point in | |
433 | the text. For example: | |
434 | .PP | |
435 | .Vb 1 | |
436 | \& typedef: "typedef" typename identifier ';' | |
437 | .Ve | |
438 | .PP | |
439 | .Vb 1 | |
440 | \& identifier: /[A-Za-z_][A-Za-z0-9_]*/ | |
441 | .Ve | |
442 | .PP | |
443 | As in regular Perl, a single quoted string is uninterpolated, whilst | |
444 | a double-quoted string or a pattern is interpolated (at the time | |
445 | of matching, \fInot\fR when the parser is constructed). Hence, it is | |
446 | possible to define rules in which tokens can be set at run\-time: | |
447 | .PP | |
448 | .Vb 1 | |
449 | \& typedef: "$::typedefkeyword" typename identifier ';' | |
450 | .Ve | |
451 | .PP | |
452 | .Vb 1 | |
453 | \& identifier: /$::identpat/ | |
454 | .Ve | |
455 | .PP | |
456 | Note that, since each rule is implemented inside a special namespace | |
457 | belonging to its parser, it is necessary to explicitly qualify | |
458 | variables from the main package. | |
459 | .PP | |
460 | Regex tokens can be specified using just slashes as delimiters | |
461 | or with the explicit \f(CW\*(C`m<delimiter>......<delimiter>\*(C'\fR syntax: | |
462 | .PP | |
463 | .Vb 1 | |
464 | \& typedef: "typedef" typename identifier ';' | |
465 | .Ve | |
466 | .PP | |
467 | .Vb 1 | |
468 | \& typename: /[A-Za-z_][A-Za-z0-9_]*/ | |
469 | .Ve | |
470 | .PP | |
471 | .Vb 1 | |
472 | \& identifier: m{[A-Za-z_][A-Za-z0-9_]*} | |
473 | .Ve | |
474 | .PP | |
475 | A regex of either type can also have any valid trailing parameter(s) | |
476 | (that is, any of [cgimsox]): | |
477 | .PP | |
478 | .Vb 1 | |
479 | \& typedef: "typedef" typename identifier ';' | |
480 | .Ve | |
481 | .PP | |
482 | .Vb 3 | |
483 | \& identifier: / [a-z_] # LEADING ALPHA OR UNDERSCORE | |
484 | \& [a-z0-9_]* # THEN DIGITS ALSO ALLOWED | |
485 | \& /ix # CASE/SPACE/COMMENT INSENSITIVE | |
486 | .Ve | |
487 | .PP | |
488 | The value associated with any successfully matched token is a string | |
489 | containing the actual text which was matched by the token. | |
490 | .PP | |
491 | It is important to remember that, since each grammar is specified in a | |
492 | Perl string, all instances of the universal escape character '\e' within | |
493 | a grammar must be \*(L"doubled\*(R", so that they interpolate to single '\e's when | |
494 | the string is compiled. For example, to use the grammar: | |
495 | .PP | |
496 | .Vb 3 | |
497 | \& word: /\eS+/ | backslash | |
498 | \& line: prefix word(s) "\en" | |
499 | \& backslash: '\e\e' | |
500 | .Ve | |
501 | .PP | |
502 | the following code is required: | |
503 | .PP | |
504 | .Vb 1 | |
505 | \& $parser = new Parse::RecDescent (q{ | |
506 | .Ve | |
507 | .PP | |
508 | .Vb 3 | |
509 | \& word: /\e\eS+/ | backslash | |
510 | \& line: prefix word(s) "\e\en" | |
511 | \& backslash: '\e\e\e\e' | |
512 | .Ve | |
513 | .PP | |
514 | .Vb 1 | |
515 | \& }); | |
516 | .Ve | |
517 | .Sh "Terminal Separators" | |
518 | .IX Subsection "Terminal Separators" | |
519 | For the purpose of matching, each terminal in a production is considered | |
520 | to be preceded by a \*(L"prefix\*(R" \- a pattern which must be | |
521 | matched before a token match is attempted. By default, the | |
522 | prefix is optional whitespace (which always matches, at | |
523 | least trivially), but this default may be reset in any production. | |
524 | .PP | |
525 | The variable \f(CW$Parse::RecDescent::skip\fR stores the universal | |
526 | prefix, which is the default for all terminal matches in all parsers | |
527 | built with \f(CW\*(C`Parse::RecDescent\*(C'\fR. | |
528 | .PP | |
529 | The prefix for an individual production can be altered | |
530 | by using the \f(CW\*(C`<skip:...>\*(C'\fR directive (see below). | |
531 | .Sh "Actions" | |
532 | .IX Subsection "Actions" | |
533 | An action is a block of Perl code which is to be executed (as the | |
534 | block of a \f(CW\*(C`do\*(C'\fR statement) when the parser reaches that point in a | |
535 | production. The action executes within a special namespace belonging to | |
536 | the active parser, so care must be taken in correctly qualifying variable | |
537 | names (see also \*(L"Start\-up Actions\*(R" below). | |
538 | .PP | |
539 | The action is considered to succeed if the final value of the block | |
540 | is defined (that is, if the implied \f(CW\*(C`do\*(C'\fR statement evaluates to a | |
541 | defined value \- \fIeven one which would be treated as \*(L"false\*(R"\fR). Note | |
542 | that the value associated with a successful action is also the final | |
543 | value in the block. | |
544 | .PP | |
545 | An action will \fIfail\fR if its last evaluated value is \f(CW\*(C`undef\*(C'\fR. This is | |
546 | surprisingly easy to accomplish by accident. For instance, here's an | |
547 | infuriating case of an action that makes its production fail, but only | |
548 | when debugging \fIisn't\fR activated: | |
549 | .PP | |
550 | .Vb 4 | |
551 | \& description: name rank serial_number | |
552 | \& { print "Got $item[2] $item[1] ($item[3])\en" | |
553 | \& if $::debugging | |
554 | \& } | |
555 | .Ve | |
556 | .PP | |
557 | If \f(CW$debugging\fR is false, no statement in the block is executed, so | |
558 | the final value is \f(CW\*(C`undef\*(C'\fR, and the entire production fails. The solution is: | |
559 | .PP | |
560 | .Vb 5 | |
561 | \& description: name rank serial_number | |
562 | \& { print "Got $item[2] $item[1] ($item[3])\en" | |
563 | \& if $::debugging; | |
564 | \& 1; | |
565 | \& } | |
566 | .Ve | |
567 | .PP | |
568 | Within an action, a number of useful parse-time variables are | |
569 | available in the special parser namespace (there are other variables | |
570 | also accessible, but meddling with them will probably just break your | |
571 | parser. As a general rule, if you avoid referring to unqualified | |
572 | variables \- especially those starting with an underscore \- inside an action, | |
573 | things should be okay): | |
574 | .ie n .IP "@item\fR and \f(CW%item" 4 | |
575 | .el .IP "\f(CW@item\fR and \f(CW%item\fR" 4 | |
576 | .IX Item "@item and %item" | |
577 | The array slice \f(CW@item[1..$#item]\fR stores the value associated with each item | |
578 | (that is, each subrule, token, or action) in the current production. The | |
579 | analogy is to \f(CW$1\fR, \f(CW$2\fR, etc. in a \fIyacc\fR grammar. | |
580 | Note that, for obvious reasons, \f(CW@item\fR only contains the | |
581 | values of items \fIbefore\fR the current point in the production. | |
582 | .Sp | |
583 | The first element (\f(CW$item[0]\fR) stores the name of the current rule | |
584 | being matched. | |
585 | .Sp | |
586 | \&\f(CW@item\fR is a standard Perl array, so it can also be indexed with negative | |
587 | numbers, representing the number of items \fIback\fR from the current position in | |
588 | the parse: | |
589 | .Sp | |
590 | .Vb 3 | |
591 | \& stuff: /various/ bits 'and' pieces "then" data 'end' | |
592 | \& { print $item[-2] } # PRINTS data | |
593 | \& # (EASIER THAN: $item[6]) | |
594 | .Ve | |
595 | .Sp | |
596 | The \f(CW%item\fR hash complements the \f(CW@item\fR array, providing named | |
597 | access to the same item values: | |
598 | .Sp | |
599 | .Vb 3 | |
600 | \& stuff: /various/ bits 'and' pieces "then" data 'end' | |
601 | \& { print $item{data} } # PRINTS data | |
602 | \& # (EVEN EASIER THAN USING @item) | |
603 | .Ve | |
604 | .Sp | |
605 | The results of named subrules are stored in the hash under each | |
606 | subrule's name, whilst all other items are stored under a \*(L"named | |
607 | positional\*(R" key that indicates their ordinal position within their item | |
608 | type: _\|_STRING\fIn\fR_\|_, _\|_PATTERN\fIn\fR_\|_, _\|_DIRECTIVE\fIn\fR_\|_, _\|_ACTION\fIn\fR_\|_: | |
609 | .Sp | |
610 | .Vb 6 | |
611 | \& stuff: /various/ bits 'and' pieces "then" data 'end' { save } | |
612 | \& { print $item{__PATTERN1__}, # PRINTS 'various' | |
613 | \& $item{__STRING2__}, # PRINTS 'then' | |
614 | \& $item{__ACTION1__}, # PRINTS RETURN | |
615 | \& # VALUE OF save | |
616 | \& } | |
617 | .Ve | |
618 | .Sp | |
619 | If you want proper \fInamed\fR access to patterns or literals, you need to turn | |
620 | them into separate rules: | |
621 | .Sp | |
622 | .Vb 3 | |
623 | \& stuff: various bits 'and' pieces "then" data 'end' | |
624 | \& { print $item{various} # PRINTS various | |
625 | \& } | |
626 | .Ve | |
627 | .Sp | |
628 | .Vb 1 | |
629 | \& various: /various/ | |
630 | .Ve | |
631 | .Sp | |
632 | The special entry \f(CW$item{_\|_RULE_\|_}\fR stores the name of the current | |
633 | rule (i.e. the same value as \f(CW$item[0]\fR). | |
634 | .Sp | |
635 | The advantage of using \f(CW%item\fR, instead of \f(CW@item\fR, is that it | |
636 | removes the need to track item positions that may change as a grammar | |
637 | evolves. For example, adding an interim \f(CW\*(C`<skip>\*(C'\fR directive | |
638 | or action can silently ruin a trailing action, by moving an \f(CW@item\fR | |
639 | element \*(L"down\*(R" the array one place. In contrast, the named entry | |
640 | of \f(CW%item\fR is unaffected by such an insertion. | |
641 | .Sp | |
642 | A limitation of the \f(CW%item\fR hash is that it only records the \fIlast\fR | |
643 | value of a particular subrule. For example: | |
644 | .Sp | |
645 | .Vb 2 | |
646 | \& range: '(' number '..' number ')' | |
647 | \& { $return = $item{number} } | |
648 | .Ve | |
649 | .Sp | |
650 | will return only the value corresponding to the \fIsecond\fR match of the | |
651 | \&\f(CW\*(C`number\*(C'\fR subrule. In other words, successive calls to a subrule | |
652 | overwrite the corresponding entry in \f(CW%item\fR. Once again, the | |
653 | solution is to rename each subrule in its own rule: | |
654 | .Sp | |
655 | .Vb 2 | |
656 | \& range: '(' from_num '..' to_num ')' | |
657 | \& { $return = $item{from_num} } | |
658 | .Ve | |
659 | .Sp | |
660 | .Vb 2 | |
661 | \& from_num: number | |
662 | \& to_num: number | |
663 | .Ve | |
664 | .ie n .IP "@arg\fR and \f(CW%arg" 4 | |
665 | .el .IP "\f(CW@arg\fR and \f(CW%arg\fR" 4 | |
666 | .IX Item "@arg and %arg" | |
667 | The array \f(CW@arg\fR and the hash \f(CW%arg\fR store any arguments passed to | |
668 | the rule from some other rule (see \*(L"Subrule argument lists\*(R"). Changes | |
669 | to the elements of either variable do not propagate back to the calling | |
670 | rule (data can be passed back from a subrule via the \f(CW$return\fR | |
671 | variable \- see next item). | |
672 | .ie n .IP "$return" 4 | |
673 | .el .IP "\f(CW$return\fR" 4 | |
674 | .IX Item "$return" | |
675 | If a value is assigned to \f(CW$return\fR within an action, that value is | |
676 | returned if the production containing the action eventually matches | |
677 | successfully. Note that setting \f(CW$return\fR \fIdoesn't\fR cause the current | |
678 | production to succeed. It merely tells it what to return if it \fIdoes\fR succeed. | |
679 | Hence \f(CW$return\fR is analogous to \f(CW$$\fR in a \fIyacc\fR grammar. | |
680 | .Sp | |
681 | If \f(CW$return\fR is not assigned within a production, the value of the | |
682 | last component of the production (namely: \f(CW$item[$#item]\fR) is | |
683 | returned if the production succeeds. | |
684 | .ie n .IP "$commit" 4 | |
685 | .el .IP "\f(CW$commit\fR" 4 | |
686 | .IX Item "$commit" | |
687 | The current state of commitment to the current production (see \*(L"Directives\*(R" | |
688 | below). | |
689 | .ie n .IP "$skip" 4 | |
690 | .el .IP "\f(CW$skip\fR" 4 | |
691 | .IX Item "$skip" | |
692 | The current terminal prefix (see \*(L"Directives\*(R" below). | |
693 | .ie n .IP "$text" 4 | |
694 | .el .IP "\f(CW$text\fR" 4 | |
695 | .IX Item "$text" | |
696 | The remaining (unparsed) text. Changes to \f(CW$text\fR \fIdo not | |
697 | propagate\fR out of unsuccessful productions, but \fIdo\fR survive | |
698 | successful productions. Hence it is possible to dynamically alter the | |
699 | text being parsed \- for example, to provide a \f(CW\*(C`#include\*(C'\fR\-like facility: | |
700 | .Sp | |
701 | .Vb 2 | |
702 | \& hash_include: '#include' filename | |
703 | \& { $text = ::loadfile($item[2]) . $text } | |
704 | .Ve | |
705 | .Sp | |
706 | .Vb 2 | |
707 | \& filename: '<' /[a-z0-9._-]+/i '>' { $return = $item[2] } | |
708 | \& | '"' /[a-z0-9._-]+/i '"' { $return = $item[2] } | |
709 | .Ve | |
710 | .ie n .IP "$thisline\fR and \f(CW$prevline" 4 | |
711 | .el .IP "\f(CW$thisline\fR and \f(CW$prevline\fR" 4 | |
712 | .IX Item "$thisline and $prevline" | |
713 | \&\f(CW$thisline\fR stores the current line number within the current parse | |
714 | (starting from 1). \f(CW$prevline\fR stores the line number for the last | |
715 | character which was already successfully parsed (this will be different from | |
716 | \&\f(CW$thisline\fR at the end of each line). | |
717 | .Sp | |
718 | For efficiency, \f(CW$thisline\fR and \f(CW$prevline\fR are actually tied | |
719 | hashes, and only recompute the required line number when the variable's | |
720 | value is used. | |
721 | .Sp | |
722 | Assignment to \f(CW$thisline\fR adjusts the line number calculator, so that | |
723 | it believes that the current line number is the value being assigned. Note | |
724 | that this adjustment will be reflected in all subsequent line number | |
725 | calculations. | |
726 | .Sp | |
727 | Modifying the value of the variable \f(CW$text\fR (as in the previous | |
728 | \&\f(CW\*(C`hash_include\*(C'\fR example, for instance) will confuse the line | |
729 | counting mechanism. To prevent this, you should call | |
730 | \&\f(CW\*(C`Parse::RecDescent::LineCounter::resync($thisline)\*(C'\fR \fIimmediately\fR | |
731 | after any assignment to the variable \f(CW$text\fR (or, at least, before the | |
732 | next attempt to use \f(CW$thisline\fR). | |
733 | .Sp | |
734 | Note that if a production fails after assigning to or | |
735 | resync'ing \f(CW$thisline\fR, the parser's line counter mechanism will | |
736 | usually be corrupted. | |
737 | .Sp | |
738 | Also see the entry for \f(CW@itempos\fR. | |
739 | .Sp | |
740 | The line number can be set to values other than 1, by calling the start | |
741 | rule with a second argument. For example: | |
742 | .Sp | |
743 | .Vb 1 | |
744 | \& $parser = new Parse::RecDescent ($grammar); | |
745 | .Ve | |
746 | .Sp | |
747 | .Vb 1 | |
748 | \& $parser->input($text, 10); # START LINE NUMBERS AT 10 | |
749 | .Ve | |
750 | .ie n .IP "$thiscolumn\fR and \f(CW$prevcolumn" 4 | |
751 | .el .IP "\f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR" 4 | |
752 | .IX Item "$thiscolumn and $prevcolumn" | |
753 | \&\f(CW$thiscolumn\fR stores the current column number within the current line | |
754 | being parsed (starting from 1). \f(CW$prevcolumn\fR stores the column number | |
755 | of the last character which was actually successfully parsed. Usually | |
756 | \&\f(CW\*(C`$prevcolumn == $thiscolumn\-1\*(C'\fR, but not at the end of lines. | |
757 | .Sp | |
758 | For efficiency, \f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR are | |
759 | actually tied hashes, and only recompute the required column number | |
760 | when the variable's value is used. | |
761 | .Sp | |
762 | Assignment to \f(CW$thiscolumn\fR or \f(CW$prevcolumn\fR is a fatal error. | |
763 | .Sp | |
764 | Modifying the value of the variable \f(CW$text\fR (as in the previous | |
765 | \&\f(CW\*(C`hash_include\*(C'\fR example, for instance) may confuse the column | |
766 | counting mechanism. | |
767 | .Sp | |
768 | Note that \f(CW$thiscolumn\fR reports the column number \fIbefore\fR any | |
769 | whitespace that might be skipped before reading a token. Hence | |
770 | if you wish to know where a token started (and ended) use something like this: | |
771 | .Sp | |
772 | .Vb 2 | |
773 | \& rule: token1 token2 startcol token3 endcol token4 | |
774 | \& { print "token3: columns $item[3] to $item[5]"; } | |
775 | .Ve | |
776 | .Sp | |
777 | .Vb 2 | |
778 | \& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP | |
779 | \& endcol: { $prevcolumn } | |
780 | .Ve | |
781 | .Sp | |
782 | Also see the entry for \f(CW@itempos\fR. | |
783 | .ie n .IP "$thisoffset\fR and \f(CW$prevoffset" 4 | |
784 | .el .IP "\f(CW$thisoffset\fR and \f(CW$prevoffset\fR" 4 | |
785 | .IX Item "$thisoffset and $prevoffset" | |
786 | \&\f(CW$thisoffset\fR stores the offset of the current parsing position | |
787 | within the complete text | |
788 | being parsed (starting from 0). \f(CW$prevoffset\fR stores the offset | |
789 | of the last character which was actually successfully parsed. In all | |
790 | cases \f(CW\*(C`$prevoffset == $thisoffset\-1\*(C'\fR. | |
791 | .Sp | |
792 | For efficiency, \f(CW$thisoffset\fR and \f(CW$prevoffset\fR are | |
793 | actually tied hashes, and only recompute the required offset | |
794 | when the variable's value is used. | |
795 | .Sp | |
796 | Assignment to \f(CW$thisoffset\fR or \f(CW$prevoffset\fR is a fatal error. | |
797 | .Sp | |
798 | Modifying the value of the variable \f(CW$text\fR will \fInot\fR affect the | |
799 | offset counting mechanism. | |
800 | .Sp | |
801 | Also see the entry for \f(CW@itempos\fR. | |
802 | .ie n .IP "@itempos" 4 | |
803 | .el .IP "\f(CW@itempos\fR" 4 | |
804 | .IX Item "@itempos" | |
805 | The array \f(CW@itempos\fR stores a hash reference corresponding to | |
806 | each element of \f(CW@item\fR. The elements of the hash provide the | |
807 | following: | |
808 | .Sp | |
809 | .Vb 6 | |
810 | \& $itempos[$n]{offset}{from} # VALUE OF $thisoffset BEFORE $item[$n] | |
811 | \& $itempos[$n]{offset}{to} # VALUE OF $prevoffset AFTER $item[$n] | |
812 | \& $itempos[$n]{line}{from} # VALUE OF $thisline BEFORE $item[$n] | |
813 | \& $itempos[$n]{line}{to} # VALUE OF $prevline AFTER $item[$n] | |
814 | \& $itempos[$n]{column}{from} # VALUE OF $thiscolumn BEFORE $item[$n] | |
815 | \& $itempos[$n]{column}{to} # VALUE OF $prevcolumn AFTER $item[$n] | |
816 | .Ve | |
817 | .Sp | |
818 | Note that the various \f(CW\*(C`$itempos[$n]...{from}\*(C'\fR values record the | |
819 | appropriate value \fIafter\fR any token prefix has been skipped. | |
820 | .Sp | |
821 | Hence, instead of the somewhat tedious and error\-prone: | |
822 | .Sp | |
823 | .Vb 9 | |
824 | \& rule: startcol token1 endcol | |
825 | \& startcol token2 endcol | |
826 | \& startcol token3 endcol | |
827 | \& { print "token1: columns $item[1] | |
828 | \& to $item[3] | |
829 | \& token2: columns $item[4] | |
830 | \& to $item[6] | |
831 | \& token3: columns $item[7] | |
832 | \& to $item[9]" } | |
833 | .Ve | |
834 | .Sp | |
835 | .Vb 2 | |
836 | \& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP | |
837 | \& endcol: { $prevcolumn } | |
838 | .Ve | |
839 | .Sp | |
840 | it is possible to write: | |
841 | .Sp | |
842 | .Vb 7 | |
843 | \& rule: token1 token2 token3 | |
844 | \& { print "token1: columns $itempos[1]{column}{from} | |
845 | \& to $itempos[1]{column}{to} | |
846 | \& token2: columns $itempos[2]{column}{from} | |
847 | \& to $itempos[2]{column}{to} | |
848 | \& token3: columns $itempos[3]{column}{from} | |
849 | \& to $itempos[3]{column}{to}" } | |
850 | .Ve | |
851 | .Sp | |
852 | Note however that (in the current implementation) the use of \f(CW@itempos\fR | |
853 | anywhere in a grammar implies that item positioning information is | |
854 | collected \fIeverywhere\fR during the parse. Depending on the grammar | |
855 | and the size of the text to be parsed, this may be prohibitively | |
856 | expensive and the explicit use of \f(CW$thisline\fR, \f(CW$thiscolumn\fR, etc. may | |
857 | be a better choice. | |
858 | .ie n .IP "$thisparser" 4 | |
859 | .el .IP "\f(CW$thisparser\fR" 4 | |
860 | .IX Item "$thisparser" | |
861 | A reference to the \f(CW\*(C`Parse::RecDescent\*(C'\fR object through which | |
862 | parsing was initiated. | |
863 | .Sp | |
864 | The value of \f(CW$thisparser\fR propagates down the subrules of a parse | |
865 | but not back up. Hence, you can invoke subrules from another parser | |
866 | for the scope of the current rule as follows: | |
867 | .Sp | |
868 | .Vb 4 | |
869 | \& rule: subrule1 subrule2 | |
870 | \& | { $thisparser = $::otherparser } <reject> | |
871 | \& | subrule3 subrule4 | |
872 | \& | subrule5 | |
873 | .Ve | |
874 | .Sp | |
875 | The result is that the production calls \*(L"subrule1\*(R" and \*(L"subrule2\*(R" of | |
876 | the current parser, and the remaining productions call the named subrules | |
877 | from \f(CW$::otherparser\fR. Note, however, that \*(L"Bad Things\*(R" will happen if | |
878 | \&\f(CW$::otherparser\fR isn't a blessed reference and/or doesn't have methods | |
879 | with the same names as the required subrules! | |
880 | .ie n .IP "$thisrule" 4 | |
881 | .el .IP "\f(CW$thisrule\fR" 4 | |
882 | .IX Item "$thisrule" | |
883 | A reference to the \f(CW\*(C`Parse::RecDescent::Rule\*(C'\fR object corresponding to the | |
884 | rule currently being matched. | |
885 | .ie n .IP "$thisprod" 4 | |
886 | .el .IP "\f(CW$thisprod\fR" 4 | |
887 | .IX Item "$thisprod" | |
888 | A reference to the \f(CW\*(C`Parse::RecDescent::Production\*(C'\fR object | |
889 | corresponding to the production currently being matched. | |
890 | .ie n .IP "$score\fR and \f(CW$score_return" 4 | |
891 | .el .IP "\f(CW$score\fR and \f(CW$score_return\fR" 4 | |
892 | .IX Item "$score and $score_return" | |
893 | \&\f(CW$score\fR stores the best production score to date, as specified by | |
894 | an earlier \f(CW\*(C`<score:...>\*(C'\fR directive. \f(CW$score_return\fR stores | |
895 | the corresponding return value for the successful production. | |
896 | .Sp | |
897 | See \*(L"Scored productions\*(R". | |
898 | .PP | |
899 | \&\fBWarning:\fR the parser relies on the information in the various \f(CW\*(C`this...\*(C'\fR | |
900 | objects in some non-obvious ways. Tinkering with the other members of | |
901 | these objects will probably cause Bad Things to happen, unless you | |
902 | \&\fIreally\fR know what you're doing. The only exception to this advice is | |
903 | that the use of \f(CW\*(C`$this...\->{local}\*(C'\fR is always safe. | |
904 | .Sh "Start-up Actions" | |
905 | .IX Subsection "Start-up Actions" | |
906 | Any actions which appear \fIbefore\fR the first rule definition in a | |
907 | grammar are treated as \*(L"start\-up\*(R" actions. Each such action is | |
908 | stripped of its outermost brackets and then evaluated (in the parser's | |
909 | special namespace) just before the rules of the grammar are first | |
910 | compiled. | |
911 | .PP | |
912 | The main use of start-up actions is to declare local variables within the | |
913 | parser's special namespace: | |
914 | .PP | |
915 | .Vb 1 | |
916 | \& { my $lastitem = '???'; } | |
917 | .Ve | |
918 | .PP | |
919 | .Vb 1 | |
920 | \& list: item(s) { $return = $lastitem } | |
921 | .Ve | |
922 | .PP | |
923 | .Vb 3 | |
924 | \& item: book { $lastitem = 'book'; } | |
925 | \& bell { $lastitem = 'bell'; } | |
926 | \& candle { $lastitem = 'candle'; } | |
927 | .Ve | |
928 | .PP | |
929 | but start-up actions can be used to execute \fIany\fR valid Perl code | |
930 | within a parser's special namespace. | |
931 | .PP | |
932 | Start-up actions can appear within a grammar extension or replacement | |
933 | (that is, a partial grammar installed via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR or | |
934 | \&\f(CW\*(C`Parse::RecDescent::Replace()\*(C'\fR \- see \*(L"Incremental Parsing\*(R"), and will be | |
935 | executed before the new grammar is installed. Note, however, that a | |
936 | particular start-up action is only ever executed once. | |
937 | .Sh "Autoactions" | |
938 | .IX Subsection "Autoactions" | |
939 | It is sometimes desirable to be able to specify a default action to be | |
940 | taken at the end of every production (for example, in order to easily | |
941 | build a parse tree). If the variable \f(CW$::RD_AUTOACTION\fR is defined | |
942 | when \f(CW\*(C`Parse::RecDescent::new()\*(C'\fR is called, the contents of that | |
943 | variable are treated as a specification of an action which is to be appended | |
944 | to each production in the corresponding grammar. So, for example, to construct | |
945 | a simple parse tree: | |
946 | .PP | |
947 | .Vb 1 | |
948 | \& $::RD_AUTOACTION = q { [@item] }; | |
949 | .Ve | |
950 | .PP | |
951 | .Vb 7 | |
952 | \& $parser = new Parse::RecDescent (q{ | |
953 | \& expression: and_expr '||' expression | and_expr | |
954 | \& and_expr: not_expr '&&' and_expr | not_expr | |
955 | \& not_expr: '!' brack_expr | brack_expr | |
956 | \& brack_expr: '(' expression ')' | identifier | |
957 | \& identifier: /[a-z]+/i | |
958 | \& }); | |
959 | .Ve | |
960 | .PP | |
961 | which is equivalent to: | |
962 | .PP | |
963 | .Vb 5 | |
964 | \& $parser = new Parse::RecDescent (q{ | |
965 | \& expression: and_expr '||' expression | |
966 | \& { [@item] } | |
967 | \& | and_expr | |
968 | \& { [@item] } | |
969 | .Ve | |
970 | .PP | |
971 | .Vb 4 | |
972 | \& and_expr: not_expr '&&' and_expr | |
973 | \& { [@item] } | |
974 | \& | not_expr | |
975 | \& { [@item] } | |
976 | .Ve | |
977 | .PP | |
978 | .Vb 4 | |
979 | \& not_expr: '!' brack_expr | |
980 | \& { [@item] } | |
981 | \& | brack_expr | |
982 | \& { [@item] } | |
983 | .Ve | |
984 | .PP | |
985 | .Vb 4 | |
986 | \& brack_expr: '(' expression ')' | |
987 | \& { [@item] } | |
988 | \& | identifier | |
989 | \& { [@item] } | |
990 | .Ve | |
991 | .PP | |
992 | .Vb 3 | |
993 | \& identifier: /[a-z]+/i | |
994 | \& { [@item] } | |
995 | \& }); | |
996 | .Ve | |
997 | .PP | |
998 | Alternatively, we could take an object-oriented approach, using different | |
999 | classes for each node (and also eliminating redundant intermediate nodes): | |
1000 | .PP | |
1001 | .Vb 2 | |
1002 | \& $::RD_AUTOACTION = q | |
1003 | \& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; | |
1004 | .Ve | |
1005 | .PP | |
1006 | .Vb 7 | |
1007 | \& $parser = new Parse::RecDescent (q{ | |
1008 | \& expression: and_expr '||' expression | and_expr | |
1009 | \& and_expr: not_expr '&&' and_expr | not_expr | |
1010 | \& not_expr: '!' brack_expr | brack_expr | |
1011 | \& brack_expr: '(' expression ')' | identifier | |
1012 | \& identifier: /[a-z]+/i | |
1013 | \& }); | |
1014 | .Ve | |
1015 | .PP | |
1016 | which is equivalent to: | |
1017 | .PP | |
1018 | .Vb 4 | |
1019 | \& $parser = new Parse::RecDescent (q{ | |
1020 | \& expression: and_expr '||' expression | |
1021 | \& { new expression_node (@item[1..3]) } | |
1022 | \& | and_expr | |
1023 | .Ve | |
1024 | .PP | |
1025 | .Vb 3 | |
1026 | \& and_expr: not_expr '&&' and_expr | |
1027 | \& { new and_expr_node (@item[1..3]) } | |
1028 | \& | not_expr | |
1029 | .Ve | |
1030 | .PP | |
1031 | .Vb 3 | |
1032 | \& not_expr: '!' brack_expr | |
1033 | \& { new not_expr_node (@item[1..2]) } | |
1034 | \& | brack_expr | |
1035 | .Ve | |
1036 | .PP | |
1037 | .Vb 3 | |
1038 | \& brack_expr: '(' expression ')' | |
1039 | \& { new brack_expr_node (@item[1..3]) } | |
1040 | \& | identifier | |
1041 | .Ve | |
1042 | .PP | |
1043 | .Vb 3 | |
1044 | \& identifier: /[a-z]+/i | |
1045 | \& { new identifier_node (@item[1]) } | |
1046 | \& }); | |
1047 | .Ve | |
1048 | .PP | |
1049 | Note that, if a production already ends in an action, no autoaction is appended | |
1050 | to it. For example, in this version: | |
1051 | .PP | |
1052 | .Vb 2 | |
1053 | \& $::RD_AUTOACTION = q | |
1054 | \& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) }; | |
1055 | .Ve | |
1056 | .PP | |
1057 | .Vb 8 | |
1058 | \& $parser = new Parse::RecDescent (q{ | |
1059 | \& expression: and_expr '||' expression | and_expr | |
1060 | \& and_expr: not_expr '&&' and_expr | not_expr | |
1061 | \& not_expr: '!' brack_expr | brack_expr | |
1062 | \& brack_expr: '(' expression ')' | identifier | |
1063 | \& identifier: /[a-z]+/i | |
1064 | \& { new terminal_node($item[1]) } | |
1065 | \& }); | |
1066 | .Ve | |
1067 | .PP | |
1068 | each \f(CW\*(C`identifier\*(C'\fR match produces a \f(CW\*(C`terminal_node\*(C'\fR object, \fInot\fR an | |
1069 | \&\f(CW\*(C`identifier_node\*(C'\fR object. | |
1070 | .PP | |
1071 | A level 1 warning is issued each time an \*(L"autoaction\*(R" is added to | |
1072 | some production. | |
1073 | .Sh "Autotrees" | |
1074 | .IX Subsection "Autotrees" | |
1075 | A commonly needed autoaction is one that builds a parse\-tree. It is moderately | |
1076 | tricky to set up such an action (which must treat terminals differently from | |
1077 | non\-terminals), so Parse::RecDescent simplifies the process by providing the | |
1078 | \&\f(CW\*(C`<autotree>\*(C'\fR directive. | |
1079 | .PP | |
1080 | If this directive appears at the start of a grammar, it causes | |
1081 | Parse::RecDescent to insert autoactions at the end of any rule except | |
1082 | those which already end in an action. The action inserted depends on whether | |
1083 | the production is an intermediate rule (two or more items), or a terminal | |
1084 | of the grammar (i.e. a single pattern or string item). | |
1085 | .PP | |
1086 | So, for example, the following grammar: | |
1087 | .PP | |
1088 | .Vb 1 | |
1089 | \& <autotree> | |
1090 | .Ve | |
1091 | .PP | |
1092 | .Vb 7 | |
1093 | \& file : command(s) | |
1094 | \& command : get | set | vet | |
1095 | \& get : 'get' ident ';' | |
1096 | \& set : 'set' ident 'to' value ';' | |
1097 | \& vet : 'check' ident 'is' value ';' | |
1098 | \& ident : /\ew+/ | |
1099 | \& value : /\ed+/ | |
1100 | .Ve | |
1101 | .PP | |
1102 | is equivalent to: | |
1103 | .PP | |
1104 | .Vb 7 | |
1105 | \& file : command(s) { bless \e%item, $item[0] } | |
1106 | \& command : get { bless \e%item, $item[0] } | |
1107 | \& | set { bless \e%item, $item[0] } | |
1108 | \& | vet { bless \e%item, $item[0] } | |
1109 | \& get : 'get' ident ';' { bless \e%item, $item[0] } | |
1110 | \& set : 'set' ident 'to' value ';' { bless \e%item, $item[0] } | |
1111 | \& vet : 'check' ident 'is' value ';' { bless \e%item, $item[0] } | |
1112 | .Ve | |
1113 | .PP | |
1114 | .Vb 2 | |
1115 | \& ident : /\ew+/ { bless {__VALUE__=>$item[1]}, $item[0] } | |
1116 | \& value : /\ed+/ { bless {__VALUE__=>$item[1]}, $item[0] } | |
1117 | .Ve | |
1118 | .PP | |
1119 | Note that each node in the tree is blessed into a class of the same name | |
1120 | as the rule itself. This makes it easy to build object-oriented | |
1121 | processors for the parse-trees that the grammar produces. Note too that | |
1122 | the last two rules produce special objects with the single attribute | |
1123 | \&'_\|_VALUE_\|_'. This is because they consist solely of a single terminal. | |
1124 | .PP | |
1125 | This autoaction-ed grammar would then produce a parse tree in a data | |
1126 | structure like this: | |
1127 | .PP | |
1128 | .Vb 18 | |
1129 | \& { | |
1130 | \& file => { | |
1131 | \& command => { | |
1132 | \& [ get => { | |
1133 | \& identifier => { __VALUE__ => 'a' }, | |
1134 | \& }, | |
1135 | \& set => { | |
1136 | \& identifier => { __VALUE__ => 'b' }, | |
1137 | \& value => { __VALUE__ => '7' }, | |
1138 | \& }, | |
1139 | \& vet => { | |
1140 | \& identifier => { __VALUE__ => 'b' }, | |
1141 | \& value => { __VALUE__ => '7' }, | |
1142 | \& }, | |
1143 | \& ], | |
1144 | \& }, | |
1145 | \& } | |
1146 | \& } | |
1147 | .Ve | |
1148 | .PP | |
1149 | (except, of course, that each nested hash would also be blessed into | |
1150 | the appropriate class). | |
1151 | .Sh "Autostubbing" | |
1152 | .IX Subsection "Autostubbing" | |
1153 | Normally, if a subrule appears in some production, but no rule of that | |
1154 | name is ever defined in the grammar, the production which refers to the | |
1155 | non-existent subrule fails immediately. This typically occurs as a | |
1156 | result of misspellings, and is a sufficiently common occurrence that a | |
1157 | warning is generated for such situations. | |
1158 | .PP | |
1159 | However, when prototyping a grammar it is sometimes useful to be | |
1160 | able to use subrules before a proper specification of them is | |
1161 | really possible. For example, a grammar might include a section like: | |
1162 | .PP | |
1163 | .Vb 1 | |
1164 | \& function_call: identifier '(' arg(s?) ')' | |
1165 | .Ve | |
1166 | .PP | |
1167 | .Vb 1 | |
1168 | \& identifier: /[a-z]\ew*/i | |
1169 | .Ve | |
1170 | .PP | |
1171 | where the possible format of an argument is sufficiently complex that | |
1172 | it is not worth specifying in full until the general function call | |
1173 | syntax has been debugged. In this situation it is convenient to leave | |
1174 | the real rule \f(CW\*(C`arg\*(C'\fR undefined and just slip in a placeholder (or | |
1175 | \&\*(L"stub\*(R"): | |
1176 | .PP | |
1177 | .Vb 1 | |
1178 | \& arg: 'arg' | |
1179 | .Ve | |
1180 | .PP | |
1181 | so that the function call syntax can be tested with dummy input such as: | |
1182 | .PP | |
1183 | .Vb 4 | |
1184 | \& f0() | |
1185 | \& f1(arg) | |
1186 | \& f2(arg arg) | |
1187 | \& f3(arg arg arg) | |
1188 | .Ve | |
1189 | .PP | |
1190 | et cetera. | |
1191 | .PP | |
1192 | Early in prototyping, many such \*(L"stubs\*(R" may be required, so | |
1193 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides a means of automating their definition. | |
1194 | If the variable \f(CW$::RD_AUTOSTUB\fR is defined when a parser is built, | |
1195 | a subrule reference to any non-existent rule (say, \f(CW\*(C`sr\*(C'\fR), | |
1196 | causes a \*(L"stub\*(R" rule of the form: | |
1197 | .PP | |
1198 | .Vb 1 | |
1199 | \& sr: 'sr' | |
1200 | .Ve | |
1201 | .PP | |
1202 | to be automatically defined in the generated parser. | |
1203 | A level 1 warning is issued for each such \*(L"autostubbed\*(R" rule. | |
1204 | .PP | |
1205 | Hence, with \f(CW$::RD_AUTOSTUB\fR defined, it is possible to only partially | |
1206 | specify a grammar, and then \*(L"fake\*(R" matches of the unspecified | |
1207 | (sub)rules by just typing in their name. | |
1208 | .Sh "Look-ahead" | |
1209 | .IX Subsection "Look-ahead" | |
1210 | If a subrule, token, or action is prefixed by \*(L"...\*(R", then it is | |
1211 | treated as a \*(L"look\-ahead\*(R" request. That means that the current production can | |
1212 | (as usual) only succeed if the specified item is matched, but that the matching | |
1213 | \&\fIdoes not consume any of the text being parsed\fR. This is very similar to the | |
1214 | \&\f(CW\*(C`/(?=...)/\*(C'\fR look-ahead construct in Perl patterns. Thus, the rule: | |
1215 | .PP | |
1216 | .Vb 1 | |
1217 | \& inner_word: word ...word | |
1218 | .Ve | |
1219 | .PP | |
1220 | will match whatever the subrule \*(L"word\*(R" matches, provided that match is followed | |
1221 | by some more text which subrule \*(L"word\*(R" would also match (although this | |
1222 | second substring is not actually consumed by \*(L"inner_word\*(R"). | |
1223 | .PP | |
1224 | Likewise, a \*(L"...!\*(R" prefix causes the following item to succeed (without | |
1225 | consuming any text) if and only if it would normally fail. Hence, a | |
1226 | rule such as: | |
1227 | .PP | |
1228 | .Vb 1 | |
1229 | \& identifier: ...!keyword ...!'_' /[A-Za-z_]\ew*/ | |
1230 | .Ve | |
1231 | .PP | |
1232 | matches a string of characters which satisfies the pattern | |
1233 | \&\f(CW\*(C`/[A\-Za\-z_]\ew*/\*(C'\fR, but only if the same sequence of characters would | |
1234 | not match either subrule \*(L"keyword\*(R" or the literal token '_'. | |
1235 | .PP | |
1236 | Sequences of look-ahead prefixes accumulate, multiplying their positive and/or | |
1237 | negative senses. Hence: | |
1238 | .PP | |
1239 | .Vb 1 | |
1240 | \& inner_word: word ...!......!word | |
1241 | .Ve | |
1242 | .PP | |
1243 | is exactly equivalent to the original example above (a warning is issued in | |
1244 | cases like these, since they often indicate something left out, or | |
1245 | misunderstood). | |
1246 | .PP | |
1247 | Note that actions can also be treated as look\-aheads. In such cases, | |
1248 | the state of the parser text (in the local variable \f(CW$text\fR) | |
1249 | \&\fIafter\fR the look-ahead action is guaranteed to be identical to its | |
1250 | state \fIbefore\fR the action, regardless of how it's changed \fIwithin\fR | |
1251 | the action (unless you actually undefine \f(CW$text\fR, in which case you | |
1252 | get the disaster you deserve :\-). | |
1253 | .Sh "Directives" | |
1254 | .IX Subsection "Directives" | |
1255 | Directives are special pre-defined actions which may be used to alter | |
1256 | the behaviour of the parser. There are currently eighteen directives: | |
1257 | \&\f(CW\*(C`<commit>\*(C'\fR, | |
1258 | \&\f(CW\*(C`<uncommit>\*(C'\fR, | |
1259 | \&\f(CW\*(C`<reject>\*(C'\fR, | |
1260 | \&\f(CW\*(C`<score>\*(C'\fR, | |
1261 | \&\f(CW\*(C`<autoscore>\*(C'\fR, | |
1262 | \&\f(CW\*(C`<skip>\*(C'\fR, | |
1263 | \&\f(CW\*(C`<resync>\*(C'\fR, | |
1264 | \&\f(CW\*(C`<error>\*(C'\fR, | |
1265 | \&\f(CW\*(C`<rulevar>\*(C'\fR, | |
1266 | \&\f(CW\*(C`<matchrule>\*(C'\fR, | |
1267 | \&\f(CW\*(C`<leftop>\*(C'\fR, | |
1268 | \&\f(CW\*(C`<rightop>\*(C'\fR, | |
1269 | \&\f(CW\*(C`<defer>\*(C'\fR, | |
1270 | \&\f(CW\*(C`<nocheck>\*(C'\fR, | |
1271 | \&\f(CW\*(C`<perl_quotelike>\*(C'\fR, | |
1272 | \&\f(CW\*(C`<perl_codeblock>\*(C'\fR, | |
1273 | \&\f(CW\*(C`<perl_variable>\*(C'\fR, | |
1274 | and \f(CW\*(C`<token>\*(C'\fR. | |
1275 | .IP "Committing and uncommitting" 4 | |
1276 | .IX Item "Committing and uncommitting" | |
1277 | The \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives permit the recursive | |
1278 | descent of the parse tree to be pruned (or \*(L"cut\*(R") for efficiency. | |
1279 | Within a rule, a \f(CW\*(C`<commit>\*(C'\fR directive instructs the rule to ignore subsequent | |
1280 | productions if the current production fails. For example: | |
1281 | .Sp | |
1282 | .Vb 3 | |
1283 | \& command: 'find' <commit> filename | |
1284 | \& | 'open' <commit> filename | |
1285 | \& | 'move' filename filename | |
1286 | .Ve | |
1287 | .Sp | |
1288 | Clearly, if the leading token 'find' is matched in the first production but that | |
1289 | production fails for some other reason, then the remaining | |
1290 | productions cannot possibly match. The presence of the | |
1291 | \&\f(CW\*(C`<commit>\*(C'\fR causes the \*(L"command\*(R" rule to fail immediately if | |
1292 | an invalid \*(L"find\*(R" command is found, and likewise if an invalid \*(L"open\*(R" | |
1293 | command is encountered. | |
1294 | .Sp | |
1295 | It is also possible to revoke a previous commitment. For example: | |
1296 | .Sp | |
1297 | .Vb 5 | |
1298 | \& if_statement: 'if' <commit> condition | |
1299 | \& 'then' block <uncommit> | |
1300 | \& 'else' block | |
1301 | \& | 'if' <commit> condition | |
1302 | \& 'then' block | |
1303 | .Ve | |
1304 | .Sp | |
1305 | In this case, a failure to find an \*(L"else\*(R" block in the first | |
1306 | production shouldn't preclude trying the second production, but a | |
1307 | failure to find a \*(L"condition\*(R" certainly should. | |
1308 | .Sp | |
1309 | As a special case, any production in which the \fIfirst\fR item is an | |
1310 | \&\f(CW\*(C`<uncommit>\*(C'\fR immediately revokes a preceding \f(CW\*(C`<commit>\*(C'\fR | |
1311 | (even though the production would not otherwise have been tried). For | |
1312 | example, in the rule: | |
1313 | .Sp | |
1314 | .Vb 5 | |
1315 | \& request: 'explain' expression | |
1316 | \& | 'explain' <commit> keyword | |
1317 | \& | 'save' | |
1318 | \& | 'quit' | |
1319 | \& | <uncommit> term '?' | |
1320 | .Ve | |
1321 | .Sp | |
1322 | if the text being matched was \*(L"explain?\*(R", and the first two | |
1323 | productions failed, then the \f(CW\*(C`<commit>\*(C'\fR in production two would cause | |
1324 | productions three and four to be skipped, but the leading | |
1325 | \&\f(CW\*(C`<uncommit>\*(C'\fR in the production five would allow that production to | |
1326 | attempt a match. | |
1327 | .Sp | |
1328 | Note in the preceding example, that the \f(CW\*(C`<commit>\*(C'\fR was only placed | |
1329 | in production two. If production one had been: | |
1330 | .Sp | |
1331 | .Vb 1 | |
1332 | \& request: 'explain' <commit> expression | |
1333 | .Ve | |
1334 | .Sp | |
1335 | then production two would be (inappropriately) skipped if a leading | |
1336 | \&\*(L"explain...\*(R" was encountered. | |
1337 | .Sp | |
1338 | Both \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives always succeed, and their value | |
1339 | is always 1. | |
1340 | .IP "Rejecting a production" 4 | |
1341 | .IX Item "Rejecting a production" | |
1342 | The \f(CW\*(C`<reject>\*(C'\fR directive immediately causes the current production | |
1343 | to fail (it is exactly equivalent to, but more obvious than, the | |
1344 | action \f(CW\*(C`{undef}\*(C'\fR). A \f(CW\*(C`<reject>\*(C'\fR is useful when it is desirable to get | |
1345 | the side effects of the actions in one production, without prejudicing a match | |
1346 | by some other production later in the rule. For example, to insert | |
1347 | tracing code into the parse: | |
1348 | .Sp | |
1349 | .Vb 1 | |
1350 | \& complex_rule: { print "In complex rule...\en"; } <reject> | |
1351 | .Ve | |
1352 | .Sp | |
1353 | .Vb 3 | |
1354 | \& complex_rule: simple_rule '+' 'i' '*' simple_rule | |
1355 | \& | 'i' '*' simple_rule | |
1356 | \& | simple_rule | |
1357 | .Ve | |
1358 | .Sp | |
1359 | It is also possible to specify a conditional rejection, using the | |
1360 | form \f(CW\*(C`<reject:\f(CIcondition\f(CW>\*(C'\fR, which only rejects if the | |
1361 | specified condition is true. This form of rejection is exactly | |
1362 | equivalent to the action \f(CW\*(C`{(\f(CIcondition\f(CW)?undef:1}\*(C'\fR. | |
1363 | For example: | |
1364 | .Sp | |
1365 | .Vb 4 | |
1366 | \& command: save_command | |
1367 | \& | restore_command | |
1368 | \& | <reject: defined $::tolerant> { exit } | |
1369 | \& | <error: Unknown command. Ignored.> | |
1370 | .Ve | |
1371 | .Sp | |
1372 | A \f(CW\*(C`<reject>\*(C'\fR directive never succeeds (and hence has no | |
1373 | associated value). A conditional rejection may succeed (if its | |
1374 | condition is not satisfied), in which case its value is 1. | |
1375 | .Sp | |
1376 | As an extra optimization, \f(CW\*(C`Parse::RecDescent\*(C'\fR ignores any production | |
1377 | which \fIbegins\fR with an unconditional \f(CW\*(C`<reject>\*(C'\fR directive, | |
1378 | since any such production can never successfully match or have any | |
1379 | useful side\-effects. A level 1 warning is issued in all such cases. | |
1380 | .Sp | |
1381 | Note that productions beginning with conditional | |
1382 | \&\f(CW\*(C`<reject:...>\*(C'\fR directives are \fInever\fR \*(L"optimized away\*(R" in | |
1383 | this manner, even if they are always guaranteed to fail (for example: | |
1384 | \&\f(CW\*(C`<reject:1>\*(C'\fR). | |
1385 | .Sp | |
1386 | Due to the way grammars are parsed, there is a minor restriction on the | |
1387 | condition of a conditional \f(CW\*(C`<reject:...>\*(C'\fR: it cannot | |
1388 | contain any raw '<' or '>' characters. For example: | |
1389 | .Sp | |
1390 | .Vb 1 | |
1391 | \& line: cmd <reject: $thiscolumn > max> data | |
1392 | .Ve | |
1393 | .Sp | |
1394 | results in an error when a parser is built from this grammar (since the | |
1395 | grammar parser has no way of knowing whether the first > is a \*(L"greater than\*(R" | |
1396 | or the end of the \f(CW\*(C`<reject:...>\*(C'\fR). | |
1397 | .Sp | |
1398 | To overcome this problem, put the condition inside a do{} block: | |
1399 | .Sp | |
1400 | .Vb 1 | |
1401 | \& line: cmd <reject: do{$thiscolumn > max}> data | |
1402 | .Ve | |
1403 | .Sp | |
1404 | Note that the same problem may occur in other directives that take | |
1405 | arguments. The same solution will work in all cases. | |
1406 | .IP "Skipping between terminals" 4 | |
1407 | .IX Item "Skipping between terminals" | |
1408 | The \f(CW\*(C`<skip>\*(C'\fR directive enables the terminal prefix used in | |
1409 | a production to be changed. For example: | |
1410 | .Sp | |
1411 | .Vb 1 | |
1412 | \& OneLiner: Command <skip:'[ \et]*'> Arg(s) /;/ | |
1413 | .Ve | |
1414 | .Sp | |
1415 | causes only blanks and tabs to be skipped before terminals in the \f(CW\*(C`Arg\*(C'\fR | |
1416 | subrule (and any of \fIits\fR subrules), and also before the final \f(CW\*(C`/;/\*(C'\fR terminal. | |
1417 | Once the production is complete, the previous terminal prefix is | |
1418 | reinstated. Note that this implies that distinct productions of a rule | |
1419 | must reset their terminal prefixes individually. | |
1420 | .Sp | |
1421 | The \f(CW\*(C`<skip>\*(C'\fR directive evaluates to the \fIprevious\fR terminal prefix, | |
1422 | so it's easy to reinstate a prefix later in a production: | |
1423 | .Sp | |
1424 | .Vb 1 | |
1425 | \& Command: <skip:","> CSV(s) <skip:$item[1]> Modifier | |
1426 | .Ve | |
1427 | .Sp | |
1428 | The value specified after the colon is interpolated into a pattern, so all of | |
1429 | the following are equivalent (though their efficiency increases down the list): | |
1430 | .Sp | |
1431 | .Vb 1 | |
1432 | \& <skip: "$colon|$comma"> # ASSUMING THE VARS HOLD THE OBVIOUS VALUES | |
1433 | .Ve | |
1434 | .Sp | |
1435 | .Vb 1 | |
1436 | \& <skip: ':|,'> | |
1437 | .Ve | |
1438 | .Sp | |
1439 | .Vb 1 | |
1440 | \& <skip: q{[:,]}> | |
1441 | .Ve | |
1442 | .Sp | |
1443 | .Vb 1 | |
1444 | \& <skip: qr/[:,]/> | |
1445 | .Ve | |
1446 | .Sp | |
1447 | There is no way of directly setting the prefix for | |
1448 | an entire rule, except as follows: | |
1449 | .Sp | |
1450 | .Vb 3 | |
1451 | \& Rule: <skip: '[ \et]*'> Prod1 | |
1452 | \& | <skip: '[ \et]*'> Prod2a Prod2b | |
1453 | \& | <skip: '[ \et]*'> Prod3 | |
1454 | .Ve | |
1455 | .Sp | |
1456 | or, better: | |
1457 | .Sp | |
1458 | .Vb 6 | |
1459 | \& Rule: <skip: '[ \et]*'> | |
1460 | \& ( | |
1461 | \& Prod1 | |
1462 | \& | Prod2a Prod2b | |
1463 | \& | Prod3 | |
1464 | \& ) | |
1465 | .Ve | |
1466 | .Sp | |
1467 | \&\fBNote: Up to release 1.51 of Parse::RecDescent, an entirely different | |
1468 | mechanism was used for specifying terminal prefixes. The current method | |
1469 | is not backwards-compatible with that early approach. The current approach | |
1470 | is stable and will not change again.\fR | |
1471 | .IP "Resynchronization" 4 | |
1472 | .IX Item "Resynchronization" | |
1473 | The \f(CW\*(C`<resync>\*(C'\fR directive provides a visually distinctive | |
1474 | means of consuming some of the text being parsed, usually to skip an | |
1475 | erroneous input. In its simplest form \f(CW\*(C`<resync>\*(C'\fR simply | |
1476 | consumes text up to and including the next newline (\f(CW"\en"\fR) | |
1477 | character, succeeding only if the newline is found, in which case it | |
1478 | causes its surrounding rule to return zero on success. | |
1479 | .Sp | |
1480 | In other words, a \f(CW\*(C`<resync>\*(C'\fR is exactly equivalent to the token | |
1481 | \&\f(CW\*(C`/[^\en]*\en/\*(C'\fR followed by the action \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR (except that | |
1482 | productions beginning with a \f(CW\*(C`<resync>\*(C'\fR are ignored when generating | |
1483 | error messages). A typical use might be: | |
1484 | .Sp | |
1485 | .Vb 1 | |
1486 | \& script : command(s) | |
1487 | .Ve | |
1488 | .Sp | |
1489 | .Vb 3 | |
1490 | \& command: save_command | |
1491 | \& | restore_command | |
1492 | \& | <resync> # TRY NEXT LINE, IF POSSIBLE | |
1493 | .Ve | |
1494 | .Sp | |
1495 | It is also possible to explicitly specify a resynchronization | |
1496 | pattern, using the \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR variant. This version | |
1497 | succeeds only if the specified pattern matches (and consumes) the | |
1498 | parsed text. In other words, \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR is exactly | |
1499 | equivalent to the token \f(CW\*(C`/\f(CIpattern\f(CW/\*(C'\fR (followed by a \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR | |
1500 | action). For example, if commands were terminated by newlines or semi\-colons: | |
1501 | .Sp | |
1502 | .Vb 3 | |
1503 | \& command: save_command | |
1504 | \& | restore_command | |
1505 | \& | <resync:[^;\en]*[;\en]> | |
1506 | .Ve | |
1507 | .Sp | |
1508 | The value of a successfully matched \f(CW\*(C`<resync>\*(C'\fR directive (of either | |
1509 | type) is the text that it consumed. Note, however, that since the | |
1510 | directive also sets \f(CW$return\fR, a production consisting of a lone | |
1511 | \&\f(CW\*(C`<resync>\*(C'\fR succeeds but returns the value zero (which a calling rule | |
1512 | may find useful to distinguish between \*(L"true\*(R" matches and \*(L"tolerant\*(R" matches). | |
1513 | Remember that returning a zero value indicates that the rule \fIsucceeded\fR (since | |
1514 | only an \f(CW\*(C`undef\*(C'\fR denotes failure within \f(CW\*(C`Parse::RecDescent\*(C'\fR parsers). | |
1515 | .IP "Error handling" 4 | |
1516 | .IX Item "Error handling" | |
1517 | The \f(CW\*(C`<error>\*(C'\fR directive provides automatic or user-defined | |
1518 | generation of error messages during a parse. In its simplest form | |
1519 | \&\f(CW\*(C`<error>\*(C'\fR prepares an error message based on | |
1520 | the mismatch between the last item expected and the text which caused | |
1521 | it to fail. For example, given the rule: | |
1522 | .Sp | |
1523 | .Vb 3 | |
1524 | \& McCoy: curse ',' name ', I'm a doctor, not a' a_profession '!' | |
1525 | \& | pronoun 'dead,' name '!' | |
1526 | \& | <error> | |
1527 | .Ve | |
1528 | .Sp | |
1529 | the following strings would produce the following messages: | |
1530 | .RS 4 | |
1531 | .ie n .IP """Amen, Jim!""" 4 | |
1532 | .el .IP "``Amen, Jim!''" 4 | |
1533 | .IX Item "Amen, Jim!" | |
1534 | .Vb 2 | |
1535 | \& ERROR (line 1): Invalid McCoy: Expected curse or pronoun | |
1536 | \& not found | |
1537 | .Ve | |
1538 | .ie n .IP """Dammit, Jim, I'm a doctor!""" 4 | |
1539 | .el .IP "``Dammit, Jim, I'm a doctor!''" 4 | |
1540 | .IX Item "Dammit, Jim, I'm a doctor!" | |
1541 | .Vb 2 | |
1542 | \& ERROR (line 1): Invalid McCoy: Expected ", I'm a doctor, not a" | |
1543 | \& but found ", I'm a doctor!" instead | |
1544 | .Ve | |
1545 | .ie n .IP """He's dead,\en""" 4 | |
1546 | .el .IP "``He's dead,\en''" 4 | |
1547 | .IX Item "He's dead,n" | |
1548 | .Vb 1 | |
1549 | \& ERROR (line 2): Invalid McCoy: Expected name not found | |
1550 | .Ve | |
1551 | .ie n .IP """He's alive!""" 4 | |
1552 | .el .IP "``He's alive!''" 4 | |
1553 | .IX Item "He's alive!" | |
1554 | .Vb 2 | |
1555 | \& ERROR (line 1): Invalid McCoy: Expected 'dead,' but found | |
1556 | \& "alive!" instead | |
1557 | .Ve | |
1558 | .ie n .IP """Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!""" 4 | |
1559 | .el .IP "``Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!''" 4 | |
1560 | .IX Item "Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!" | |
1561 | .Vb 2 | |
1562 | \& ERROR (line 1): Invalid McCoy: Expected a profession but found | |
1563 | \& "pointy-eared Vulcan!" instead | |
1564 | .Ve | |
1565 | .RE | |
1566 | .RS 4 | |
1567 | .Sp | |
1568 | Note that, when autogenerating error messages, all underscores in any | |
1569 | rule name used in a message are replaced by single spaces (for example | |
1570 | \&\*(L"a_production\*(R" becomes \*(L"a production\*(R"). Judicious choice of rule | |
1571 | names can therefore considerably improve the readability of automatic | |
1572 | error messages (as well as the maintainability of the original | |
1573 | grammar). | |
1574 | .Sp | |
1575 | If the automatically generated error is not sufficient, it is possible to | |
1576 | provide an explicit message as part of the error directive. For example: | |
1577 | .Sp | |
1578 | .Vb 3 | |
1579 | \& Spock: "Fascinating" ',' (name | 'Captain') '.' | |
1580 | \& | "Highly illogical, doctor." | |
1581 | \& | <error: He never said that!> | |
1582 | .Ve | |
1583 | .Sp | |
1584 | which would result in \fIall\fR failures to parse a \*(L"Spock\*(R" subrule printing the | |
1585 | following message: | |
1586 | .Sp | |
1587 | .Vb 1 | |
1588 | \& ERROR (line <N>): Invalid Spock: He never said that! | |
1589 | .Ve | |
1590 | .Sp | |
1591 | The error message is treated as a \*(L"qq{...}\*(R" string and interpolated | |
1592 | when the error is generated (\fInot\fR when the directive is specified!). | |
1593 | Hence: | |
1594 | .Sp | |
1595 | .Vb 1 | |
1596 | \& <error: Mystical error near "$text"> | |
1597 | .Ve | |
1598 | .Sp | |
1599 | would correctly insert the ambient text string which caused the error. | |
1600 | .Sp | |
1601 | There are two other forms of error directive: \f(CW\*(C`<error?>\*(C'\fR and | |
1602 | \&\f(CW\*(C`<error?:\ msg>\*(C'\fR. These behave just like \f(CW\*(C`<error>\*(C'\fR | |
1603 | and \f(CW\*(C`<error:\ msg>\*(C'\fR respectively, except that they are | |
1604 | only triggered if the rule is \*(L"committed\*(R" at the time they are | |
1605 | encountered. For example: | |
1606 | .Sp | |
1607 | .Vb 3 | |
1608 | \& Scotty: "Ya kenna change the Laws of Phusics," <commit> name | |
1609 | \& | name <commit> ',' 'she's goanta blaw!' | |
1610 | \& | <error?> | |
1611 | .Ve | |
1612 | .Sp | |
1613 | will only generate an error for a string beginning with \*(L"Ya kenna | |
1614 | change the Laws of Phusics,\*(R" or a valid name, but which still fails to match the | |
1615 | corresponding production. That is, \f(CW\*(C`$parser\->Scotty("Aye, Cap'ain")\*(C'\fR will | |
1616 | fail silently (since neither production will \*(L"commit\*(R" the rule on that | |
1617 | input), whereas \f(CW\*(C`$parser\->Scotty("Mr\ Spock,\ ah\ jest\ kenna\ do'ut!")\*(C'\fR | |
1618 | will fail with the error message: | |
1619 | .Sp | |
1620 | .Vb 2 | |
1621 | \& ERROR (line 1): Invalid Scotty: expected 'she's goanta blaw!' | |
1622 | \& but found 'ah jest kenna do'ut!' instead. | |
1623 | .Ve | |
1624 | .Sp | |
1625 | since in that case the second production would commit after matching | |
1626 | the leading name. | |
1627 | .Sp | |
1628 | Note that to allow this behaviour, all \f(CW\*(C`<error>\*(C'\fR directives which are | |
1629 | the first item in a production automatically uncommit the rule just | |
1630 | long enough to allow their production to be attempted (that is, when | |
1631 | their production fails, the commitment is reinstated so that | |
1632 | subsequent productions are skipped). | |
1633 | .Sp | |
1634 | In order to \fIpermanently\fR uncommit the rule before an error message, | |
1635 | it is necessary to put an explicit \f(CW\*(C`<uncommit>\*(C'\fR before the | |
1636 | \&\f(CW\*(C`<error>\*(C'\fR. For example: | |
1637 | .Sp | |
1638 | .Vb 5 | |
1639 | \& line: 'Kirk:' <commit> Kirk | |
1640 | \& | 'Spock:' <commit> Spock | |
1641 | \& | 'McCoy:' <commit> McCoy | |
1642 | \& | <uncommit> <error?> <reject> | |
1643 | \& | <resync> | |
1644 | .Ve | |
1645 | .Sp | |
1646 | Error messages generated by the various \f(CW\*(C`<error...>\*(C'\fR directives | |
1647 | are not displayed immediately. Instead, they are \*(L"queued\*(R" in a buffer and | |
1648 | are only displayed once parsing ultimately fails. Moreover, | |
1649 | \&\f(CW\*(C`<error...>\*(C'\fR directives that cause one production of a rule | |
1650 | to fail are automatically removed from the message queue | |
1651 | if another production subsequently causes the entire rule to succeed. | |
1652 | This means that you can put | |
1653 | \&\f(CW\*(C`<error...>\*(C'\fR directives wherever useful diagnosis can be done, | |
1654 | and only those associated with actual parser failure will ever be | |
1655 | displayed. Also see \*(L"Gotchas\*(R". | |
1656 | .Sp | |
1657 | As a general rule, the most useful diagnostics are usually generated | |
1658 | either at the very lowest level within the grammar, or at the very | |
1659 | highest. A good rule of thumb is to identify those subrules which | |
1660 | consist mainly (or entirely) of terminals, and then put an | |
1661 | \&\f(CW\*(C`<error...>\*(C'\fR directive at the end of any other rule which calls | |
1662 | one or more of those subrules. | |
1663 | .Sp | |
1664 | There is one other situation in which the output of the various types of | |
1665 | error directive is suppressed; namely, when the rule containing them | |
1666 | is being parsed as part of a \*(L"look\-ahead\*(R" (see \*(L"Look\-ahead\*(R"). In this | |
1667 | case, the error directive will still cause the rule to fail, but will do | |
1668 | so silently. | |
1669 | .Sp | |
1670 | An unconditional \f(CW\*(C`<error>\*(C'\fR directive always fails (and hence has no | |
1671 | associated value). This means that encountering such a directive | |
1672 | always causes the production containing it to fail. Hence an | |
1673 | \&\f(CW\*(C`<error>\*(C'\fR directive will inevitably be the last (useful) item of a | |
1674 | rule (a level 3 warning is issued if a production contains items after an unconditional | |
1675 | \&\f(CW\*(C`<error>\*(C'\fR directive). | |
1676 | .Sp | |
1677 | An \f(CW\*(C`<error?>\*(C'\fR directive will \fIsucceed\fR (that is: fail to fail :\-), if | |
1678 | the current rule is uncommitted when the directive is encountered. In | |
1679 | that case the directive's associated value is zero. Hence, this type | |
1680 | of error directive \fIcan\fR be used before the end of a | |
1681 | production. For example: | |
1682 | .Sp | |
1683 | .Vb 3 | |
1684 | \& command: 'do' <commit> something | |
1685 | \& | 'report' <commit> something | |
1686 | \& | <error?: Syntax error> <error: Unknown command> | |
1687 | .Ve | |
1688 | .Sp | |
1689 | \&\fBWarning:\fR The \f(CW\*(C`<error?>\*(C'\fR directive does \fInot\fR mean \*(L"always fail (but | |
1690 | do so silently unless committed)\*(R". It actually means \*(L"only fail (and report) if | |
1691 | committed, otherwise \fIsucceed\fR\*(R". To achieve the \*(L"fail silently if uncommitted\*(R" | |
1692 | semantics, it is necessary to use: | |
1693 | .Sp | |
1694 | .Vb 2 | |
1695 | \& rule: item <commit> item(s) | |
1696 | \& | <error?> <reject> # FAIL SILENTLY UNLESS COMMITTED | |
1697 | .Ve | |
1698 | .Sp | |
1699 | However, because people seem to expect a lone \f(CW\*(C`<error?>\*(C'\fR directive | |
1700 | to work like this: | |
1701 | .Sp | |
1702 | .Vb 3 | |
1703 | \& rule: item <commit> item(s) | |
1704 | \& | <error?: Error message if committed> | |
1705 | \& | <error: Error message if uncommitted> | |
1706 | .Ve | |
1707 | .Sp | |
1708 | Parse::RecDescent automatically appends a | |
1709 | \&\f(CW\*(C`<reject>\*(C'\fR directive if the \f(CW\*(C`<error?>\*(C'\fR directive | |
1710 | is the only item in a production. A level 2 warning (see below) | |
1711 | is issued when this happens. | |
1712 | .Sp | |
1713 | The level of error reporting during both parser construction and | |
1714 | parsing is controlled by the presence or absence of four global | |
1715 | variables: \f(CW$::RD_ERRORS\fR, \f(CW$::RD_WARN\fR, \f(CW$::RD_HINT\fR, and | |
1716 | \&\f(CW$::RD_TRACE\fR. If \f(CW$::RD_ERRORS\fR is defined (and, by default, it is) | |
1717 | then fatal errors are reported. | |
1718 | .Sp | |
1719 | Whenever \f(CW$::RD_WARN\fR is defined, certain non-fatal problems are also reported. | |
1720 | Warnings have an associated \*(L"level\*(R": 1, 2, or 3. The higher the level, | |
1721 | the more serious the warning. The value of the corresponding global | |
1722 | variable (\f(CW$::RD_WARN\fR) determines the \fIlowest\fR level of warning to | |
1723 | be displayed. Hence, to see \fIall\fR warnings, set \f(CW$::RD_WARN\fR to 1. | |
1724 | To see only the most serious warnings set \f(CW$::RD_WARN\fR to 3. | |
1725 | By default \f(CW$::RD_WARN\fR is initialized to 3, ensuring that serious but | |
1726 | non-fatal errors are automatically reported. | |
1727 | .Sp | |
1728 | See \fI\*(L"\s-1DIAGNOSTICS\s0\*(R"\fR for a list of the various error and warning messages | |
1729 | that Parse::RecDescent generates when these two variables are defined. | |
1730 | .Sp | |
1731 | Defining any of the remaining variables (which are not defined by | |
1732 | default) further increases the amount of information reported. | |
1733 | Defining \f(CW$::RD_HINT\fR causes the parser generator to offer | |
1734 | more detailed analyses and hints on both errors and warnings. | |
1735 | Note that setting \f(CW$::RD_HINT\fR at any point automagically | |
1736 | sets \f(CW$::RD_WARN\fR to 1. | |
1737 | .Sp | |
1738 | Defining \f(CW$::RD_TRACE\fR causes the parser generator and the parser to | |
1739 | report their progress to \s-1STDERR\s0 in excruciating detail (although, without hints | |
1740 | unless \f(CW$::RD_HINT\fR is separately defined). This detail | |
1741 | can be moderated in only one respect: if \f(CW$::RD_TRACE\fR has an | |
1742 | integer value (\fIN\fR) greater than 1, only the \fIN\fR characters of | |
1743 | the \*(L"current parsing context\*(R" (that is, where in the input string we | |
1744 | are at any point in the parse) is reported at any time. | |
1745 | .Sp | |
1746 | \&\f(CW$::RD_TRACE\fR is mainly useful for debugging a grammar that isn't | |
1747 | behaving as you expected it to. To this end, if \f(CW$::RD_TRACE\fR is | |
1748 | defined when a parser is built, any actual parser code which is | |
1749 | generated is also written to a file named \*(L"\s-1RD_TRACE\s0\*(R" in the local | |
1750 | directory. | |
1751 | .Sp | |
1752 | Note that the four variables belong to the \*(L"main\*(R" package, which | |
1753 | makes them easier to refer to in the code controlling the parser, and | |
1754 | also makes it easy to turn them into command line flags (\*(L"\-RD_ERRORS\*(R", | |
1755 | \&\*(L"\-RD_WARN\*(R", \*(L"\-RD_HINT\*(R", \*(L"\-RD_TRACE\*(R") under \fBperl \-s\fR. | |
1756 | .RE | |
1757 | .IP "Specifying local variables" 4 | |
1758 | .IX Item "Specifying local variables" | |
1759 | It is occasionally convenient to specify variables which are local | |
1760 | to a single rule. This may be achieved by including a | |
1761 | \&\f(CW\*(C`<rulevar:...>\*(C'\fR directive anywhere in the rule. For example: | |
1762 | .Sp | |
1763 | .Vb 1 | |
1764 | \& markup: <rulevar: $tag> | |
1765 | .Ve | |
1766 | .Sp | |
1767 | .Vb 1 | |
1768 | \& markup: tag {($tag=$item[1]) =~ s/^<|>$//g} body[$tag] | |
1769 | .Ve | |
1770 | .Sp | |
1771 | The example \f(CW\*(C`<rulevar: $tag>\*(C'\fR directive causes a \*(L"my\*(R" variable named | |
1772 | \&\f(CW$tag\fR to be declared at the start of the subroutine implementing the | |
1773 | \&\f(CW\*(C`markup\*(C'\fR rule (that is, \fIbefore\fR the first production, regardless of | |
1774 | where in the rule it is specified). | |
1775 | .Sp | |
1776 | Specifically, any directive of the form: | |
1777 | \&\f(CW\*(C`<rulevar:\f(CItext\f(CW>\*(C'\fR causes a line of the form \f(CW\*(C`my \f(CItext\f(CW;\*(C'\fR | |
1778 | to be added at the beginning of the rule subroutine, immediately after | |
1779 | the definitions of the following local variables: | |
1780 | .Sp | |
1781 | .Vb 4 | |
1782 | \& $thisparser $commit | |
1783 | \& $thisrule @item | |
1784 | \& $thisline @arg | |
1785 | \& $text %arg | |
1786 | .Ve | |
1787 | .Sp | |
1788 | This means that the following \f(CW\*(C`<rulevar>\*(C'\fR directives work | |
1789 | as expected: | |
1790 | .Sp | |
1791 | .Vb 1 | |
1792 | \& <rulevar: $count = 0 > | |
1793 | .Ve | |
1794 | .Sp | |
1795 | .Vb 1 | |
1796 | \& <rulevar: $firstarg = $arg[0] || '' > | |
1797 | .Ve | |
1798 | .Sp | |
1799 | .Vb 1 | |
1800 | \& <rulevar: $myItems = \e@item > | |
1801 | .Ve | |
1802 | .Sp | |
1803 | .Vb 1 | |
1804 | \& <rulevar: @context = ( $thisline, $text, @arg ) > | |
1805 | .Ve | |
1806 | .Sp | |
1807 | .Vb 1 | |
1808 | \& <rulevar: ($name,$age) = @arg{"name","age"} > | |
1809 | .Ve | |
1810 | .Sp | |
1811 | Note however that, because all such variables are \*(L"my\*(R" variables, their | |
1812 | values \fIdo not persist\fR between match attempts on a given rule. To | |
1813 | preserve values between match attempts, values can be stored within the | |
1814 | \&\*(L"local\*(R" member of the \f(CW$thisrule\fR object: | |
1815 | .Sp | |
1816 | .Vb 6 | |
1817 | \& countedrule: { $thisrule->{"local"}{"count"}++ } | |
1818 | \& <reject> | |
1819 | \& | subrule1 | |
1820 | \& | subrule2 | |
1821 | \& | <reject: $thisrule->{"local"}{"count"} == 1> | |
1822 | \& subrule3 | |
1823 | .Ve | |
1824 | .Sp | |
1825 | When matching a rule, each \f(CW\*(C`<rulevar>\*(C'\fR directive is matched as | |
1826 | if it were an unconditional \f(CW\*(C`<reject>\*(C'\fR directive (that is, it | |
1827 | causes any production in which it appears to immediately fail to match). | |
1828 | For this reason (and to improve readability) it is usual to specify any | |
1829 | \&\f(CW\*(C`<rulevar>\*(C'\fR directive in a separate production at the start of | |
1830 | the rule (this has the added advantage that it enables | |
1831 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR to optimize away such productions, just as it does | |
1832 | for the \f(CW\*(C`<reject>\*(C'\fR directive). | |
1833 | .IP "Dynamically matched rules" 4 | |
1834 | .IX Item "Dynamically matched rules" | |
1835 | Because regexes and double-quoted strings are interpolated, it is relatively | |
1836 | easy to specify productions with \*(L"context sensitive\*(R" tokens. For example: | |
1837 | .Sp | |
1838 | .Vb 1 | |
1839 | \& command: keyword body "end $item[1]" | |
1840 | .Ve | |
1841 | .Sp | |
1842 | which ensures that a command block is bounded by a | |
1843 | "\fI<keyword>\fR...end \fI<same keyword>\fR" pair. | |
1844 | .Sp | |
1845 | Building productions in which subrules are context sensitive is also possible, | |
1846 | via the \f(CW\*(C`<matchrule:...>\*(C'\fR directive. This directive behaves | |
1847 | identically to a subrule item, except that the rule which is invoked to match | |
1848 | it is determined by the string specified after the colon. For example, we could | |
1849 | rewrite the \f(CW\*(C`command\*(C'\fR rule like this: | |
1850 | .Sp | |
1851 | .Vb 1 | |
1852 | \& command: keyword <matchrule:body> "end $item[1]" | |
1853 | .Ve | |
1854 | .Sp | |
1855 | Whatever appears after the colon in the directive is treated as an interpolated | |
1856 | string (that is, as if it appeared in a \f(CW\*(C`qq{...}\*(C'\fR operator) and the value of | |
1857 | that interpolated string is the name of the subrule to be matched. | |
1858 | .Sp | |
1859 | Of course, just putting a constant string like \f(CW\*(C`body\*(C'\fR in a | |
1860 | \&\f(CW\*(C`<matchrule:...>\*(C'\fR directive is of little interest or benefit. | |
1861 | The power of the directive is seen when we use a string that interpolates | |
1862 | to something interesting. For example: | |
1863 | .Sp | |
1864 | .Vb 1 | |
1865 | \& command: keyword <matchrule:$item[1]_body> "end $item[1]" | |
1866 | .Ve | |
1867 | .Sp | |
1868 | .Vb 1 | |
1869 | \& keyword: 'while' | 'if' | 'function' | |
1870 | .Ve | |
1871 | .Sp | |
1872 | .Vb 1 | |
1873 | \& while_body: condition block | |
1874 | .Ve | |
1875 | .Sp | |
1876 | .Vb 1 | |
1877 | \& if_body: condition block ('else' block)(?) | |
1878 | .Ve | |
1879 | .Sp | |
1880 | .Vb 1 | |
1881 | \& function_body: arglist block | |
1882 | .Ve | |
1883 | .Sp | |
1884 | Now the \f(CW\*(C`command\*(C'\fR rule selects how to proceed on the basis of the keyword | |
1885 | that is found. It is as if \f(CW\*(C`command\*(C'\fR were declared: | |
1886 | .Sp | |
1887 | .Vb 3 | |
1888 | \& command: 'while' while_body "end while" | |
1889 | \& | 'if' if_body "end if" | |
1890 | \& | 'function' function_body "end function" | |
1891 | .Ve | |
1892 | .Sp | |
1893 | When a \f(CW\*(C`<matchrule:...>\*(C'\fR directive is used as a repeated | |
1894 | subrule, the rule name expression is \*(L"late\-bound\*(R". That is, the name of | |
1895 | the rule to be called is re-evaluated \fIeach time\fR a match attempt is | |
1896 | made. Hence, the following grammar: | |
1897 | .Sp | |
1898 | .Vb 1 | |
1899 | \& { $::species = 'dogs' } | |
1900 | .Ve | |
1901 | .Sp | |
1902 | .Vb 1 | |
1903 | \& pair: 'two' <matchrule:$::species>(s) | |
1904 | .Ve | |
1905 | .Sp | |
1906 | .Vb 1 | |
1907 | \& dogs: /dogs/ { $::species = 'cats' } | |
1908 | .Ve | |
1909 | .Sp | |
1910 | .Vb 1 | |
1911 | \& cats: /cats/ | |
1912 | .Ve | |
1913 | .Sp | |
1914 | will match the string \*(L"two dogs cats cats\*(R" completely, whereas it will | |
1915 | only match the string \*(L"two dogs dogs dogs\*(R" up to the eighth letter. If | |
1916 | the rule name were \*(L"early bound\*(R" (that is, evaluated only the first | |
1917 | time the directive is encountered in a production), the reverse | |
1918 | behaviour would be expected. | |
1919 | .IP "Deferred actions" 4 | |
1920 | .IX Item "Deferred actions" | |
1921 | The \f(CW\*(C`<defer:...>\*(C'\fR directive is used to specify an action to be | |
1922 | performed when (and only if!) the current production ultimately succeeds. | |
1923 | .Sp | |
1924 | Whenever a \f(CW\*(C`<defer:...>\*(C'\fR directive appears, the code it specifies | |
1925 | is converted to a closure (an anonymous subroutine reference) which is | |
1926 | queued within the active parser object. Note that, | |
1927 | because the deferred code is converted to a closure, the values of any | |
1928 | \&\*(L"local\*(R" variable (such as \f(CW$text\fR, \f(CW@item\fR, etc.) are preserved | |
1929 | until the deferred code is actually executed. | |
1930 | .Sp | |
1931 | If the parse ultimately succeeds | |
1932 | \&\fIand\fR the production in which the \f(CW\*(C`<defer:...>\*(C'\fR directive was | |
1933 | evaluated formed part of the successful parse, then the deferred code is | |
1934 | executed immediately before the parse returns. If however the production | |
1935 | which queued a deferred action fails, or one of the higher-level | |
1936 | rules which called that production fails, then the deferred action is | |
1937 | removed from the queue, and hence is never executed. | |
1938 | .Sp | |
1939 | For example, given the grammar: | |
1940 | .Sp | |
1941 | .Vb 2 | |
1942 | \& sentence: noun trans noun | |
1943 | \& | noun intrans | |
1944 | .Ve | |
1945 | .Sp | |
1946 | .Vb 4 | |
1947 | \& noun: 'the dog' | |
1948 | \& { print "$item[1]\et(noun)\en" } | |
1949 | \& | 'the meat' | |
1950 | \& { print "$item[1]\et(noun)\en" } | |
1951 | .Ve | |
1952 | .Sp | |
1953 | .Vb 2 | |
1954 | \& trans: 'ate' | |
1955 | \& { print "$item[1]\et(transitive)\en" } | |
1956 | .Ve | |
1957 | .Sp | |
1958 | .Vb 4 | |
1959 | \& intrans: 'ate' | |
1960 | \& { print "$item[1]\et(intransitive)\en" } | |
1961 | \& | 'barked' | |
1962 | \& { print "$item[1]\et(intransitive)\en" } | |
1963 | .Ve | |
1964 | .Sp | |
1965 | then parsing the sentence \f(CW"the dog ate"\fR would produce the output: | |
1966 | .Sp | |
1967 | .Vb 4 | |
1968 | \& the dog (noun) | |
1969 | \& ate (transitive) | |
1970 | \& the dog (noun) | |
1971 | \& ate (intransitive) | |
1972 | .Ve | |
1973 | .Sp | |
1974 | This is because, even though the first production of \f(CW\*(C`sentence\*(C'\fR | |
1975 | ultimately fails, its initial subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR do match, | |
1976 | and hence they execute their associated actions. | |
1977 | Then the second production of \f(CW\*(C`sentence\*(C'\fR succeeds, causing the | |
1978 | actions of the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`intrans\*(C'\fR to be executed as well. | |
1979 | .Sp | |
1980 | On the other hand, if the actions were replaced by \f(CW\*(C`<defer:...>\*(C'\fR | |
1981 | directives: | |
1982 | .Sp | |
1983 | .Vb 2 | |
1984 | \& sentence: noun trans noun | |
1985 | \& | noun intrans | |
1986 | .Ve | |
1987 | .Sp | |
1988 | .Vb 4 | |
1989 | \& noun: 'the dog' | |
1990 | \& <defer: print "$item[1]\et(noun)\en" > | |
1991 | \& | 'the meat' | |
1992 | \& <defer: print "$item[1]\et(noun)\en" > | |
1993 | .Ve | |
1994 | .Sp | |
1995 | .Vb 2 | |
1996 | \& trans: 'ate' | |
1997 | \& <defer: print "$item[1]\et(transitive)\en" > | |
1998 | .Ve | |
1999 | .Sp | |
2000 | .Vb 4 | |
2001 | \& intrans: 'ate' | |
2002 | \& <defer: print "$item[1]\et(intransitive)\en" > | |
2003 | \& | 'barked' | |
2004 | \& <defer: print "$item[1]\et(intransitive)\en" > | |
2005 | .Ve | |
2006 | .Sp | |
2007 | the output would be: | |
2008 | .Sp | |
2009 | .Vb 2 | |
2010 | \& the dog (noun) | |
2011 | \& ate (intransitive) | |
2012 | .Ve | |
2013 | .Sp | |
2014 | since deferred actions are only executed if they were evaluated in | |
2015 | a production which ultimately contributes to the successful parse. | |
2016 | .Sp | |
2017 | In this case, even though the first production of \f(CW\*(C`sentence\*(C'\fR caused | |
2018 | the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR to match, that production ultimately | |
2019 | failed and so the deferred actions queued by those subrules were subsequently | |
2020 | discarded. The second production then succeeded, causing the entire | |
2021 | parse to succeed, and so the deferred actions queued by the (second) match of | |
2022 | the \f(CW\*(C`noun\*(C'\fR subrule and the subsequent match of \f(CW\*(C`intrans\*(C'\fR \fIare\fR preserved and | |
2023 | eventually executed. | |
2024 | .Sp | |
2025 | Deferred actions provide a means of improving the performance of a parser, | |
2026 | by only executing those actions which are part of the final parse-tree | |
2027 | for the input data. | |
2028 | .Sp | |
2029 | Alternatively, deferred actions can be viewed as a mechanism for building | |
2030 | (and executing) a | |
2031 | customized subroutine corresponding to the given input data, much in the | |
2032 | same way that autoactions (see \*(L"Autoactions\*(R") can be used to build a | |
2033 | customized data structure for specific input. | |
2034 | .Sp | |
2035 | Whether or not the action it specifies is ever executed, | |
2036 | a \f(CW\*(C`<defer:...>\*(C'\fR directive always succeeds, returning the | |
2037 | number of deferred actions currently queued at that point. | |
2038 | .IP "Parsing Perl" 4 | |
2039 | .IX Item "Parsing Perl" | |
2040 | Parse::RecDescent provides limited support for parsing subsets of Perl, | |
2041 | namely: quote-like operators, Perl variables, and complete code blocks. | |
2042 | .Sp | |
2043 | The \f(CW\*(C`<perl_quotelike>\*(C'\fR directive can be used to parse any Perl | |
2044 | quote-like operator: \f(CW'a string'\fR, \f(CW\*(C`m/a pattern/\*(C'\fR, \f(CW\*(C`tr{ans}{lation}\*(C'\fR, | |
2045 | etc. It does this by calling \fIText::Balanced::extract_quotelike()\fR. | |
2046 | .Sp | |
2047 | If a quote-like operator is found, a reference to an array of eight elements | |
2048 | is returned. Those elements are identical to the last eight elements returned | |
2049 | by \fIText::Balanced::extract_quotelike()\fR in an array context, namely: | |
2050 | .RS 4 | |
2051 | .IP "[0]" 4 | |
2052 | .IX Item "[0]" | |
2053 | the name of the quotelike operator \*(-- 'q', 'qq', 'm', 's', 'tr' \*(-- if the | |
2054 | operator was named; otherwise \f(CW\*(C`undef\*(C'\fR, | |
2055 | .IP "[1]" 4 | |
2056 | .IX Item "[1]" | |
2057 | the left delimiter of the first block of the operation, | |
2058 | .IP "[2]" 4 | |
2059 | .IX Item "[2]" | |
2060 | the text of the first block of the operation | |
2061 | (that is, the contents of | |
2062 | a quote, the regex of a match or substitution, or the target list of a | |
2063 | translation), | |
2064 | .IP "[3]" 4 | |
2065 | .IX Item "[3]" | |
2066 | the right delimiter of the first block of the operation, | |
2067 | .IP "[4]" 4 | |
2068 | .IX Item "[4]" | |
2069 | the left delimiter of the second block of the operation if there is one | |
2070 | (that is, if it is a \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, or \f(CW\*(C`y\*(C'\fR); otherwise \f(CW\*(C`undef\*(C'\fR, | |
2071 | .IP "[5]" 4 | |
2072 | .IX Item "[5]" | |
2073 | the text of the second block of the operation if there is one | |
2074 | (that is, the replacement of a substitution or the translation list | |
2075 | of a translation); otherwise \f(CW\*(C`undef\*(C'\fR, | |
2076 | .IP "[6]" 4 | |
2077 | .IX Item "[6]" | |
2078 | the right delimiter of the second block of the operation (if any); | |
2079 | otherwise \f(CW\*(C`undef\*(C'\fR, | |
2080 | .IP "[7]" 4 | |
2081 | .IX Item "[7]" | |
2082 | the trailing modifiers on the operation (if any); otherwise \f(CW\*(C`undef\*(C'\fR. | |
2083 | .RE | |
2084 | .RS 4 | |
2085 | .Sp | |
2086 | If a quote-like expression is not found, the directive fails with the usual | |
2087 | \&\f(CW\*(C`undef\*(C'\fR value. | |
2088 | .Sp | |
2089 | The \f(CW\*(C`<perl_variable>\*(C'\fR directive can be used to parse any Perl | |
2090 | variable: \f(CW$scalar\fR, \f(CW@array\fR, \f(CW%hash\fR, \f(CW$ref\fR\->{field}[$index], etc. | |
2091 | It does this by calling \fIText::Balanced::extract_variable()\fR. | |
2092 | .Sp | |
2093 | If the directive matches text representing a valid Perl variable | |
2094 | specification, it returns that text. Otherwise it fails with the usual | |
2095 | \&\f(CW\*(C`undef\*(C'\fR value. | |
2096 | .Sp | |
2097 | The \f(CW\*(C`<perl_codeblock>\*(C'\fR directive can be used to parse any curly-brace-delimited block of Perl code, such as: { \f(CW$a\fR = 1; f() =~ m/pat/; }. | |
2098 | It does this by calling \fIText::Balanced::extract_codeblock()\fR. | |
2099 | .Sp | |
2100 | If the directive matches text representing a valid Perl code block, | |
2101 | it returns that text. Otherwise it fails with the usual \f(CW\*(C`undef\*(C'\fR value. | |
2102 | .RE | |
2103 | .IP "Constructing tokens" 4 | |
2104 | .IX Item "Constructing tokens" | |
2105 | Eventually, Parse::RecDescent will be able to parse tokenized input, as | |
2106 | well as ordinary strings. In preparation for this joyous day, the | |
2107 | \&\f(CW\*(C`<token:...>\*(C'\fR directive has been provided. | |
2108 | This directive creates a token which will be suitable for | |
2109 | input to a Parse::RecDescent parser (when it eventually supports | |
2110 | tokenized input). | |
2111 | .Sp | |
2112 | The text of the token is the value of the | |
2113 | immediately preceding item in the production. A | |
2114 | \&\f(CW\*(C`<token:...>\*(C'\fR directive always succeeds with a return | |
2115 | value which is the hash reference that is the new token. It also | |
2116 | sets the return value for the production to that hash ref. | |
2117 | .Sp | |
2118 | The \f(CW\*(C`<token:...>\*(C'\fR directive makes it easy to build | |
2119 | a Parse::RecDescent\-compatible lexer in Parse::RecDescent: | |
2120 | .Sp | |
2121 | .Vb 3 | |
2122 | \& my $lexer = new Parse::RecDescent q | |
2123 | \& { | |
2124 | \& lex: token(s) | |
2125 | .Ve | |
2126 | .Sp | |
2127 | .Vb 5 | |
2128 | \& token: /a\eb/ <token:INDEF> | |
2129 | \& | /the\eb/ <token:DEF> | |
2130 | \& | /fly\eb/ <token:NOUN,VERB> | |
2131 | \& | /[a-z]+/i { lc $item[1] } <token:ALPHA> | |
2132 | \& | <error: Unknown token> | |
2133 | .Ve | |
2134 | .Sp | |
2135 | .Vb 1 | |
2136 | \& }; | |
2137 | .Ve | |
2138 | .Sp | |
2139 | which will eventually be able to be used with a regular Parse::RecDescent | |
2140 | grammar: | |
2141 | .Sp | |
2142 | .Vb 3 | |
2143 | \& my $parser = new Parse::RecDescent q | |
2144 | \& { | |
2145 | \& startrule: subrule1 subrule2 | |
2146 | .Ve | |
2147 | .Sp | |
2148 | .Vb 2 | |
2149 | \& # ETC... | |
2150 | \& }; | |
2151 | .Ve | |
2152 | .Sp | |
2153 | either with a pre-lexing phase: | |
2154 | .Sp | |
2155 | .Vb 1 | |
2156 | \& $parser->startrule( $lexer->lex($data) ); | |
2157 | .Ve | |
2158 | .Sp | |
2159 | or with a lex-on-demand approach: | |
2160 | .Sp | |
2161 | .Vb 1 | |
2162 | \& $parser->startrule( sub{$lexer->token(\e$data)} ); | |
2163 | .Ve | |
2164 | .Sp | |
2165 | But at present, only the \f(CW\*(C`<token:...>\*(C'\fR directive is | |
2166 | actually implemented. The rest is vapourware. | |
2167 | .IP "Specifying operations" 4 | |
2168 | .IX Item "Specifying operations" | |
2169 | One of the commonest requirements when building a parser is to specify | |
2170 | binary operators. Unfortunately, in a normal grammar, the rules for | |
2171 | such things are awkward: | |
2172 | .Sp | |
2173 | .Vb 2 | |
2174 | \& disjunction: conjunction ('or' conjunction)(s?) | |
2175 | \& { $return = [ $item[1], @{$item[2]} ] } | |
2176 | .Ve | |
2177 | .Sp | |
2178 | .Vb 2 | |
2179 | \& conjunction: atom ('and' atom)(s?) | |
2180 | \& { $return = [ $item[1], @{$item[2]} ] } | |
2181 | .Ve | |
2182 | .Sp | |
2183 | or inefficient: | |
2184 | .Sp | |
2185 | .Vb 4 | |
2186 | \& disjunction: conjunction 'or' disjunction | |
2187 | \& { $return = [ $item[1], @{$item[3]} ] } | |
2188 | \& | conjunction | |
2189 | \& { $return = [ $item[1] ] } | |
2190 | .Ve | |
2191 | .Sp | |
2192 | .Vb 4 | |
2193 | \& conjunction: atom 'and' conjunction | |
2194 | \& { $return = [ $item[1], @{$item[3]} ] } | |
2195 | \& | atom | |
2196 | \& { $return = [ $item[1] ] } | |
2197 | .Ve | |
2198 | .Sp | |
2199 | and either way is ugly and hard to get right. | |
2200 | .Sp | |
2201 | The \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives provide an | |
2202 | easier way of specifying such operations. Using \f(CW\*(C`<leftop:...>\*(C'\fR the | |
2203 | above examples become: | |
2204 | .Sp | |
2205 | .Vb 2 | |
2206 | \& disjunction: <leftop: conjunction 'or' conjunction> | |
2207 | \& conjunction: <leftop: atom 'and' atom> | |
2208 | .Ve | |
2209 | .Sp | |
2210 | The \f(CW\*(C`<leftop:...>\*(C'\fR directive specifies a left-associative binary operator. | |
2211 | It is specified around three other grammar elements | |
2212 | (typically subrules or terminals), which match the left operand, | |
2213 | the operator itself, and the right operand respectively. | |
2214 | .Sp | |
2215 | A \f(CW\*(C`<leftop:...>\*(C'\fR directive such as: | |
2216 | .Sp | |
2217 | .Vb 1 | |
2218 | \& disjunction: <leftop: conjunction 'or' conjunction> | |
2219 | .Ve | |
2220 | .Sp | |
2221 | is converted to the following: | |
2222 | .Sp | |
2223 | .Vb 2 | |
2224 | \& disjunction: ( conjunction ('or' conjunction)(s?) | |
2225 | \& { $return = [ $item[1], @{$item[2]} ] } ) | |
2226 | .Ve | |
2227 | .Sp | |
2228 | In other words, a \f(CW\*(C`<leftop:...>\*(C'\fR directive matches the left operand followed by zero | |
2229 | or more repetitions of both the operator and the right operand. It then | |
2230 | flattens the matched items into an anonymous array which becomes the | |
2231 | (single) value of the entire \f(CW\*(C`<leftop:...>\*(C'\fR directive. | |
2232 | .Sp | |
2233 | For example, a \f(CW\*(C`<leftop:...>\*(C'\fR directive such as: | |
2234 | .Sp | |
2235 | .Vb 1 | |
2236 | \& output: <leftop: ident '<<' expr > | |
2237 | .Ve | |
2238 | .Sp | |
2239 | when given a string such as: | |
2240 | .Sp | |
2241 | .Vb 1 | |
2242 | \& cout << var << "str" << 3 | |
2243 | .Ve | |
2244 | .Sp | |
2245 | would match, and \f(CW$item[1]\fR would be set to: | |
2246 | .Sp | |
2247 | .Vb 1 | |
2248 | \& [ 'cout', 'var', '"str"', '3' ] | |
2249 | .Ve | |
2250 | .Sp | |
2251 | In other words: | |
2252 | .Sp | |
2253 | .Vb 1 | |
2254 | \& output: <leftop: ident '<<' expr > | |
2255 | .Ve | |
2256 | .Sp | |
2257 | is equivalent to a left-associative operator: | |
2258 | .Sp | |
2259 | .Vb 5 | |
2260 | \& output: ident { $return = [$item[1]] } | |
2261 | \& | ident '<<' expr { $return = [@item[1,3]] } | |
2262 | \& | ident '<<' expr '<<' expr { $return = [@item[1,3,5]] } | |
2263 | \& | ident '<<' expr '<<' expr '<<' expr { $return = [@item[1,3,5,7]] } | |
2264 | \& # ...etc... | |
2265 | .Ve | |
2266 | .Sp | |
2267 | Similarly, the \f(CW\*(C`<rightop:...>\*(C'\fR directive takes a left operand, an operator, and a right operand: | |
2268 | .Sp | |
2269 | .Vb 1 | |
2270 | \& assign: <rightop: var '=' expr > | |
2271 | .Ve | |
2272 | .Sp | |
2273 | and converts them to: | |
2274 | .Sp | |
2275 | .Vb 2 | |
2276 | \& assign: ( (var '=' {$return=$item[1]})(s?) expr | |
2277 | \& { $return = [ @{$item[1]}, $item[2] ] } ) | |
2278 | .Ve | |
2279 | .Sp | |
2280 | which is equivalent to a right-associative operator: | |
2281 | .Sp | |
2282 | .Vb 5 | |
2283 | \& assign: var { $return = [$item[1]] } | |
2284 | \& | var '=' expr { $return = [@item[1,3]] } | |
2285 | \& | var '=' var '=' expr { $return = [@item[1,3,5]] } | |
2286 | \& | var '=' var '=' var '=' expr { $return = [@item[1,3,5,7]] } | |
2287 | \& # ...etc... | |
2288 | .Ve | |
2289 | .Sp | |
2290 | Note that for both the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives, the directive does not normally | |
2291 | return the operator itself, just a list of the operands involved. This is | |
2292 | particularly handy for specifying lists: | |
2293 | .Sp | |
2294 | .Vb 2 | |
2295 | \& list: '(' <leftop: list_item ',' list_item> ')' | |
2296 | \& { $return = $item[2] } | |
2297 | .Ve | |
2298 | .Sp | |
2299 | There is, however, a problem: sometimes the operator is itself significant. | |
2300 | For example, in a Perl list a comma and a \f(CW\*(C`=>\*(C'\fR are both | |
2301 | valid separators, but the \f(CW\*(C`=>\*(C'\fR has additional stringification semantics. | |
2302 | Hence it's important to know which was used in each case. | |
2303 | .Sp | |
2304 | To solve this problem the | |
2305 | \&\f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives | |
2306 | \&\fIdo\fR return the operator(s) as well, under two circumstances. | |
2307 | The first case is where the operator is specified as a subrule. In that instance, | |
2308 | whatever the operator matches is returned (on the assumption that if the operator | |
2309 | is important enough to have its own subrule, then it's important enough to return). | |
2310 | .Sp | |
2311 | The second case is where the operator is specified as a regular | |
2312 | expression. In that case, if the first bracketed subpattern of the | |
2313 | regular expression matches, that matching value is returned (this is analogous to | |
2314 | the behaviour of the Perl \f(CW\*(C`split\*(C'\fR function, except that only the first subpattern | |
2315 | is returned). | |
2316 | .Sp | |
2317 | In other words, given the input: | |
2318 | .Sp | |
2319 | .Vb 1 | |
2320 | \& ( a=>1, b=>2 ) | |
2321 | .Ve | |
2322 | .Sp | |
2323 | the specifications: | |
2324 | .Sp | |
2325 | .Vb 1 | |
2326 | \& list: '(' <leftop: list_item separator list_item> ')' | |
2327 | .Ve | |
2328 | .Sp | |
2329 | .Vb 1 | |
2330 | \& separator: ',' | '=>' | |
2331 | .Ve | |
2332 | .Sp | |
2333 | or: | |
2334 | .Sp | |
2335 | .Vb 1 | |
2336 | \& list: '(' <leftop: list_item /(,|=>)/ list_item> ')' | |
2337 | .Ve | |
2338 | .Sp | |
2339 | cause the list separators to be interleaved with the operands in the | |
2340 | anonymous array in \f(CW$item[2]\fR: | |
2341 | .Sp | |
2342 | .Vb 1 | |
2343 | \& [ 'a', '=>', '1', ',', 'b', '=>', '2' ] | |
2344 | .Ve | |
2345 | .Sp | |
2346 | But the following version: | |
2347 | .Sp | |
2348 | .Vb 1 | |
2349 | \& list: '(' <leftop: list_item /,|=>/ list_item> ')' | |
2350 | .Ve | |
2351 | .Sp | |
2352 | returns only the operands: | |
2353 | .Sp | |
2354 | .Vb 1 | |
2355 | \& [ 'a', '1', 'b', '2' ] | |
2356 | .Ve | |
2357 | .Sp | |
2358 | Of course, none of the above specifications handle the case of an empty | |
2359 | list, since the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives | |
2360 | require at least a single right or left operand to match. To specify | |
2361 | that the operator can match \*(L"trivially\*(R", | |
2362 | it's necessary to add a \f(CW\*(C`(?)\*(C'\fR qualifier to the directive: | |
2363 | .Sp | |
2364 | .Vb 1 | |
2365 | \& list: '(' <leftop: list_item /(,|=>)/ list_item>(?) ')' | |
2366 | .Ve | |
2367 | .Sp | |
2368 | Note that in almost all the above examples, the first and third arguments | |
2369 | of the \f(CW\*(C`<leftop:...>\*(C'\fR directive were the same subrule. That is because | |
2370 | \&\f(CW\*(C`<leftop:...>\*(C'\fR's are frequently used to specify \*(L"separated\*(R" lists of the | |
2371 | same type of item. To make such lists easier to specify, the following | |
2372 | syntax: | |
2373 | .Sp | |
2374 | .Vb 1 | |
2375 | \& list: element(s /,/) | |
2376 | .Ve | |
2377 | .Sp | |
2378 | is exactly equivalent to: | |
2379 | .Sp | |
2380 | .Vb 1 | |
2381 | \& list: <leftop: element /,/ element> | |
2382 | .Ve | |
2383 | .Sp | |
2384 | Note that the separator must be specified as a raw pattern (i.e. | |
2385 | not a string or subrule). | |
2386 | .IP "Scored productions" 4 | |
2387 | .IX Item "Scored productions" | |
2388 | By default, Parse::RecDescent grammar rules always accept the first | |
2389 | production that matches the input. But if two or more productions may | |
2390 | potentially match the same input, choosing the first that does so may | |
2391 | not be optimal. | |
2392 | .Sp | |
2393 | For example, if you were parsing the sentence \*(L"time flies like an arrow\*(R", | |
2394 | you might use a rule like this: | |
2395 | .Sp | |
2396 | .Vb 3 | |
2397 | \& sentence: verb noun preposition article noun { [@item] } | |
2398 | \& | adjective noun verb article noun { [@item] } | |
2399 | \& | noun verb preposition article noun { [@item] } | |
2400 | .Ve | |
2401 | .Sp | |
2402 | Each of these productions matches the sentence, but the third one | |
2403 | is the most likely interpretation. However, if the sentence had been | |
2404 | \&\*(L"fruit flies like a banana\*(R", then the second production is probably | |
2405 | the right match. | |
2406 | .Sp | |
2407 | To cater for such situations, the \f(CW\*(C`<score:...>\*(C'\fR directive can be used. | |
2408 | The directive is equivalent to an unconditional \f(CW\*(C`<reject>\*(C'\fR, | |
2409 | except that it allows you to specify a \*(L"score\*(R" for the current | |
2410 | production. If that score is numerically greater than the best | |
2411 | score of any preceding production, the current production is cached for later | |
2412 | consideration. If no later production matches, then the cached | |
2413 | production is treated as having matched, and the value of the | |
2414 | item immediately before its \f(CW\*(C`<score:...>\*(C'\fR directive is returned as the | |
2415 | result. | |
2416 | .Sp | |
2417 | In other words, by putting a \f(CW\*(C`<score:...>\*(C'\fR directive at the end of | |
2418 | each production, you can select which production matches using | |
2419 | criteria other than specification order. For example: | |
2420 | .Sp | |
2421 | .Vb 3 | |
2422 | \& sentence: verb noun preposition article noun { [@item] } <score: sensible(@item)> | |
2423 | \& | adjective noun verb article noun { [@item] } <score: sensible(@item)> | |
2424 | \& | noun verb preposition article noun { [@item] } <score: sensible(@item)> | |
2425 | .Ve | |
2426 | .Sp | |
2427 | Now, when each production reaches its respective \f(CW\*(C`<score:...>\*(C'\fR | |
2428 | directive, the subroutine \f(CW\*(C`sensible\*(C'\fR will be called to evaluate the | |
2429 | matched items (somehow). Once all productions have been tried, the | |
2430 | one which \f(CW\*(C`sensible\*(C'\fR scored most highly will be the one that is | |
2431 | accepted as a match for the rule. | |
2432 | .Sp | |
2433 | The variable \f(CW$score\fR always holds the current best score of any production, | |
2434 | and the variable \f(CW$score_return\fR holds the corresponding return value. | |
2435 | .Sp | |
2436 | As another example, the following grammar matches lines that may be | |
2437 | separated by commas, colons, or spaces. This can be tricky if | |
2438 | a colon-separated line also contains commas, or vice versa. The grammar | |
2439 | resolves the ambiguity by selecting the rule that results in the | |
2440 | fewest fields: | |
2441 | .Sp | |
2442 | .Vb 3 | |
2443 | \& line: seplist[sep=>','] <score: -@{$item[1]}> | |
2444 | \& | seplist[sep=>':'] <score: -@{$item[1]}> | |
2445 | \& | seplist[sep=>" "] <score: -@{$item[1]}> | |
2446 | .Ve | |
2447 | .Sp | |
2448 | .Vb 1 | |
2449 | \& seplist: <skip:""> <leftop: /[^$arg{sep}]*/ "$arg{sep}" /[^$arg{sep}]*/> | |
2450 | .Ve | |
2451 | .Sp | |
2452 | Note the use of negation within the \f(CW\*(C`<score:...>\*(C'\fR directive | |
2453 | to ensure that the seplist with the most items gets the lowest score. | |
2454 | .Sp | |
2455 | As the above examples indicate, it is often the case that all productions | |
2456 | in a rule use exactly the same \f(CW\*(C`<score:...>\*(C'\fR directive. It is | |
2457 | tedious to have to repeat this identical directive in every production, so | |
2458 | Parse::RecDescent also provides the \f(CW\*(C`<autoscore:...>\*(C'\fR directive. | |
2459 | .Sp | |
2460 | If an \f(CW\*(C`<autoscore:...>\*(C'\fR directive appears in any | |
2461 | production of a rule, the code it specifies is used as the scoring | |
2462 | code for every production of that rule, except productions that already | |
2463 | end with an explicit \f(CW\*(C`<score:...>\*(C'\fR directive. Thus the rules above could | |
2464 | be rewritten: | |
2465 | .Sp | |
2466 | .Vb 4 | |
2467 | \& line: <autoscore: -@{$item[1]}> | |
2468 | \& line: seplist[sep=>','] | |
2469 | \& | seplist[sep=>':'] | |
2470 | \& | seplist[sep=>" "] | |
2471 | .Ve | |
2472 | .Sp | |
2473 | .Vb 4 | |
2474 | \& sentence: <autoscore: sensible(@item)> | |
2475 | \& | verb noun preposition article noun { [@item] } | |
2476 | \& | adjective noun verb article noun { [@item] } | |
2477 | \& | noun verb preposition article noun { [@item] } | |
2478 | .Ve | |
2479 | .Sp | |
2480 | Note that the \f(CW\*(C`<autoscore:...>\*(C'\fR directive itself acts as an | |
2481 | unconditional \f(CW\*(C`<reject>\*(C'\fR, and (like the \f(CW\*(C`<rulevar:...>\*(C'\fR | |
2482 | directive) is pruned at compile-time wherever possible. | |
2483 | .IP "Dispensing with grammar checks" 4 | |
2484 | .IX Item "Dispensing with grammar checks" | |
2485 | During the compilation phase of parser construction, Parse::RecDescent performs | |
2486 | a small number of checks on the grammar it's given. Specifically it checks that | |
2487 | the grammar is not left\-recursive, that there are no \*(L"insatiable\*(R" constructs of | |
2488 | the form: | |
2489 | .Sp | |
2490 | .Vb 1 | |
2491 | \& rule: subrule(s) subrule | |
2492 | .Ve | |
2493 | .Sp | |
2494 | and that there are no rules missing (i.e. referred to, but never defined). | |
2495 | .Sp | |
2496 | These checks are important during development, but can slow down parser | |
2497 | construction in stable code. So Parse::RecDescent provides the | |
2498 | <nocheck> directive to turn them off. The directive can only appear | |
2499 | before the first rule definition, and switches off checking throughout the rest | |
2500 | of the current grammar. | |
2501 | .Sp | |
2502 | Typically, this directive would be added when a parser has been thoroughly | |
2503 | tested and is ready for release. | |
2504 | .Sh "Subrule argument lists" | |
2505 | .IX Subsection "Subrule argument lists" | |
2506 | It is occasionally useful to pass data to a subrule which is being invoked. For | |
2507 | example, consider the following grammar fragment: | |
2508 | .PP | |
2509 | .Vb 1 | |
2510 | \& classdecl: keyword decl | |
2511 | .Ve | |
2512 | .PP | |
2513 | .Vb 1 | |
2514 | \& keyword: 'struct' | 'class'; | |
2515 | .Ve | |
2516 | .PP | |
2517 | .Vb 1 | |
2518 | \& decl: # WHATEVER | |
2519 | .Ve | |
2520 | .PP | |
2521 | The \f(CW\*(C`decl\*(C'\fR rule might wish to know which of the two keywords was used | |
2522 | (since it may affect some aspect of the way the subsequent declaration | |
2523 | is interpreted). \f(CW\*(C`Parse::RecDescent\*(C'\fR allows the grammar designer to | |
2524 | pass data into a rule, by placing that data in an \fIargument list\fR | |
2525 | (that is, in square brackets) immediately after any subrule item in a | |
2526 | production. Hence, we could pass the keyword to \f(CW\*(C`decl\*(C'\fR as follows: | |
2527 | .PP | |
2528 | .Vb 1 | |
2529 | \& classdecl: keyword decl[ $item[1] ] | |
2530 | .Ve | |
2531 | .PP | |
2532 | .Vb 1 | |
2533 | \& keyword: 'struct' | 'class'; | |
2534 | .Ve | |
2535 | .PP | |
2536 | .Vb 1 | |
2537 | \& decl: # WHATEVER | |
2538 | .Ve | |
2539 | .PP | |
2540 | The argument list can consist of any number (including zero!) of comma-separated | |
2541 | Perl expressions. In other words, it looks exactly like a Perl anonymous | |
2542 | array reference. For example, we could pass the keyword, the name of the | |
2543 | surrounding rule, and the literal 'keyword' to \f(CW\*(C`decl\*(C'\fR like so: | |
2544 | .PP | |
2545 | .Vb 1 | |
2546 | \& classdecl: keyword decl[$item[1],$item[0],'keyword'] | |
2547 | .Ve | |
2548 | .PP | |
2549 | .Vb 1 | |
2550 | \& keyword: 'struct' | 'class'; | |
2551 | .Ve | |
2552 | .PP | |
2553 | .Vb 1 | |
2554 | \& decl: # WHATEVER | |
2555 | .Ve | |
2556 | .PP | |
2557 | Within the rule to which the data is passed (\f(CW\*(C`decl\*(C'\fR in the above examples) | |
2558 | that data is available as the elements of a local variable \f(CW@arg\fR. Hence | |
2559 | \&\f(CW\*(C`decl\*(C'\fR might report its intentions as follows: | |
2560 | .PP | |
2561 | .Vb 1 | |
2562 | \& classdecl: keyword decl[$item[1],$item[0],'keyword'] | |
2563 | .Ve | |
2564 | .PP | |
2565 | .Vb 1 | |
2566 | \& keyword: 'struct' | 'class'; | |
2567 | .Ve | |
2568 | .PP | |
2569 | .Vb 2 | |
2570 | \& decl: { print "Declaring $arg[0] (a $arg[2])\en"; | |
2571 | \& print "(this rule called by $arg[1])" } | |
2572 | .Ve | |
2573 | .PP | |
2574 | Subrule argument lists can also be interpreted as hashes, simply by using | |
2575 | the local variable \f(CW%arg\fR instead of \f(CW@arg\fR. Hence we could rewrite the | |
2576 | previous example: | |
2577 | .PP | |
2578 | .Vb 3 | |
2579 | \& classdecl: keyword decl[keyword => $item[1], | |
2580 | \& caller => $item[0], | |
2581 | \& type => 'keyword'] | |
2582 | .Ve | |
2583 | .PP | |
2584 | .Vb 1 | |
2585 | \& keyword: 'struct' | 'class'; | |
2586 | .Ve | |
2587 | .PP | |
2588 | .Vb 2 | |
2589 | \& decl: { print "Declaring $arg{keyword} (a $arg{type})\en"; | |
2590 | \& print "(this rule called by $arg{caller})" } | |
2591 | .Ve | |
2592 | .PP | |
2593 | Both \f(CW@arg\fR and \f(CW%arg\fR are always available, so the grammar designer may | |
2594 | choose whichever convention (or combination of conventions) suits best. | |
2595 | .PP | |
2596 | Subrule argument lists are also useful for creating \*(L"rule templates\*(R" | |
2597 | (especially when used in conjunction with the \f(CW\*(C`<matchrule:...>\*(C'\fR | |
2598 | directive). For example, the subrule: | |
2599 | .PP | |
2600 | .Vb 4 | |
2601 | \& list: <matchrule:$arg{rule}> /$arg{sep}/ list[%arg] | |
2602 | \& { $return = [ $item[1], @{$item[3]} ] } | |
2603 | \& | <matchrule:$arg{rule}> | |
2604 | \& { $return = [ $item[1]] } | |
2605 | .Ve | |
2606 | .PP | |
2607 | is a handy template for the common problem of matching a separated list. | |
2608 | For example: | |
2609 | .PP | |
2610 | .Vb 1 | |
2611 | \& function: 'func' name '(' list[rule=>'param',sep=>';'] ')' | |
2612 | .Ve | |
2613 | .PP | |
2614 | .Vb 1 | |
2615 | \& param: list[rule=>'name',sep=>','] ':' typename | |
2616 | .Ve | |
2617 | .PP | |
2618 | .Vb 1 | |
2619 | \& name: /\ew+/ | |
2620 | .Ve | |
2621 | .PP | |
2622 | .Vb 1 | |
2623 | \& typename: name | |
2624 | .Ve | |
2625 | .PP | |
2626 | When a subrule argument list is used with a repeated subrule, the argument list | |
2627 | goes \fIbefore\fR the repetition specifier: | |
2628 | .PP | |
2629 | .Vb 1 | |
2630 | \& list: /some|many/ thing[ $item[1] ](s) | |
2631 | .Ve | |
2632 | .PP | |
2633 | The argument list is \*(L"late bound\*(R". That is, it is re-evaluated for every | |
2634 | repetition of the repeated subrule. | |
2635 | This means that each repeated attempt to match the subrule may be | |
2636 | passed a completely different set of arguments if the value of the | |
2637 | expression in the argument list changes between attempts. So, for | |
2638 | example, the grammar: | |
2639 | .PP | |
2640 | .Vb 1 | |
2641 | \& { $::species = 'dogs' } | |
2642 | .Ve | |
2643 | .PP | |
2644 | .Vb 1 | |
2645 | \& pair: 'two' animal[$::species](s) | |
2646 | .Ve | |
2647 | .PP | |
2648 | .Vb 1 | |
2649 | \& animal: /$arg[0]/ { $::species = 'cats' } | |
2650 | .Ve | |
2651 | .PP | |
2652 | will match the string \*(L"two dogs cats cats\*(R" completely, whereas | |
2653 | it will only match the string \*(L"two dogs dogs dogs\*(R" up to the | |
2654 | eighth letter. If the value of the argument list were \*(L"early bound\*(R" | |
2655 | (that is, evaluated only the first time a repeated subrule match is | |
2656 | attempted), one would expect the matching behaviours to be reversed. | |
2657 | .PP | |
2658 | Of course, it is possible to effectively \*(L"early bind\*(R" such argument lists | |
2659 | by passing them a value which does not change on each repetition. For example: | |
2660 | .PP | |
2661 | .Vb 1 | |
2662 | \& { $::species = 'dogs' } | |
2663 | .Ve | |
2664 | .PP | |
2665 | .Vb 1 | |
2666 | \& pair: 'two' { $::species } animal[$item[2]](s) | |
2667 | .Ve | |
2668 | .PP | |
2669 | .Vb 1 | |
2670 | \& animal: /$arg[0]/ { $::species = 'cats' } | |
2671 | .Ve | |
2672 | .PP | |
2673 | Arguments can also be passed to the start rule, simply by appending them | |
2674 | to the argument list with which the start rule is called (\fIafter\fR the | |
2675 | \&\*(L"line number\*(R" parameter). For example, given: | |
2676 | .PP | |
2677 | .Vb 1 | |
2678 | \& $parser = new Parse::RecDescent ( $grammar ); | |
2679 | .Ve | |
2680 | .PP | |
2681 | .Vb 1 | |
2682 | \& $parser->data($text, 1, "str", 2, \e@arr); | |
2683 | .Ve | |
2684 | .PP | |
2685 | .Vb 5 | |
2686 | \& # ^^^^^ ^ ^^^^^^^^^^^^^^^ | |
2687 | \& # | | | | |
2688 | \& # TEXT TO BE PARSED | | | |
2689 | \& # STARTING LINE NUMBER | | |
2690 | \& # ELEMENTS OF @arg WHICH IS PASSED TO RULE data | |
2691 | .Ve | |
2692 | .PP | |
2693 | then within the productions of the rule \f(CW\*(C`data\*(C'\fR, the array \f(CW@arg\fR will contain | |
2694 | \&\f(CW\*(C`("str", 2, \e@arr)\*(C'\fR. | |
2695 | .Sh "Alternations" | |
2696 | .IX Subsection "Alternations" | |
2697 | Alternations are implicit (unnamed) rules defined as part of a production. An | |
2698 | alternation is defined as a series of '|'\-separated productions inside a | |
2699 | pair of round brackets. For example: | |
2700 | .PP | |
2701 | .Vb 1 | |
2702 | \& character: 'the' ( good | bad | ugly ) /dude/ | |
2703 | .Ve | |
2704 | .PP | |
2705 | Every alternation implicitly defines a new subrule, whose | |
2706 | automatically-generated name indicates its origin: | |
2707 | \&\*(L"_alternation_<I>_of_production_<P>_of_rule_<R>\*(R" for the appropriate | |
2708 | values of <I>, <P>, and <R>. A call to this implicit subrule is then | |
2709 | inserted in place of the brackets. Hence the above example is merely a | |
2710 | convenient short-hand for: | |
2711 | .PP | |
2712 | .Vb 3 | |
2713 | \& character: 'the' | |
2714 | \& _alternation_1_of_production_1_of_rule_character | |
2715 | \& /dude/ | |
2716 | .Ve | |
2717 | .PP | |
2718 | .Vb 2 | |
2719 | \& _alternation_1_of_production_1_of_rule_character: | |
2720 | \& good | bad | ugly | |
2721 | .Ve | |
2722 | .PP | |
2723 | Since alternations are parsed by recursively calling the parser generator, | |
2724 | any type(s) of item can appear in an alternation. For example: | |
2725 | .PP | |
2726 | .Vb 5 | |
2727 | \& character: 'the' ( 'high' "plains" # Silent, with poncho | |
2728 | \& | /no[- ]name/ # Silent, no poncho | |
2729 | \& | vengeance_seeking # Poncho-optional | |
2730 | \& | <error> | |
2731 | \& ) drifter | |
2732 | .Ve | |
2733 | .PP | |
2734 | In this case, if an error occurred, the automatically generated | |
2735 | message would be: | |
2736 | .PP | |
2737 | .Vb 3 | |
2738 | \& ERROR (line <N>): Invalid implicit subrule: Expected | |
2739 | \& 'high' or /no[- ]name/ or generic, | |
2740 | \& but found "pacifist" instead | |
2741 | .Ve | |
2742 | .PP | |
2743 | Since every alternation actually has a name, it's even possible | |
2744 | to extend or replace them: | |
2745 | .PP | |
2746 | .Vb 4 | |
2747 | \& $parser->Replace( | |
2748 | \& "_alternation_1_of_production_1_of_rule_character: | |
2749 | \& 'generic Eastwood'" | |
2750 | \& ); | |
2751 | .Ve | |
2752 | .PP | |
2753 | More importantly, since alternations are a form of subrule, they can be given | |
2754 | repetition specifiers: | |
2755 | .PP | |
2756 | .Vb 1 | |
2757 | \& character: 'the' ( good | bad | ugly )(?) /dude/ | |
2758 | .Ve | |
2759 | .Sh "Incremental Parsing" | |
2760 | .IX Subsection "Incremental Parsing" | |
2761 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides two methods \- \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR \- which | |
2762 | can be used to alter the grammar matched by a parser. Both methods | |
2763 | take the same argument as \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, namely a | |
2764 | grammar specification string. | |
2765 | .PP | |
2766 | \&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR interprets the grammar specification and adds any | |
2767 | productions it finds to the end of the rules for which they are specified. For | |
2768 | example: | |
2769 | .PP | |
2770 | .Vb 2 | |
2771 | \& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/"; | |
2772 | \& $parser->Extend($add); | |
2773 | .Ve | |
2774 | .PP | |
2775 | adds two productions to the rule \*(L"name\*(R" (creating it if necessary) and one | |
2776 | production to the rule \*(L"desc\*(R". | |
2777 | .PP | |
2778 | \&\f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR is identical, except that it first resets any | |
2779 | rule specified in the additional grammar, removing any existing productions. | |
2780 | Hence after: | |
2781 | .PP | |
2782 | .Vb 2 | |
2783 | \& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/"; | |
2784 | \& $parser->Replace($add); | |
2785 | .Ve | |
2786 | .PP | |
2787 | there are \fIonly\fR two valid \*(L"name\*(R"s and the one possible description. | |
2788 | .PP | |
2789 | A more interesting use of the \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR methods is to call them | |
2790 | inside the action of an executing parser. For example: | |
2791 | .PP | |
2792 | .Vb 3 | |
2793 | \& typedef: 'typedef' type_name identifier ';' | |
2794 | \& { $thisparser->Extend("type_name: '$item[3]'") } | |
2795 | \& | <error> | |
2796 | .Ve | |
2797 | .PP | |
2798 | .Vb 1 | |
2799 | \& identifier: ...!type_name /[A-Za-z_]\ew*/ | |
2800 | .Ve | |
2801 | .PP | |
2802 | which automatically prevents type names from being typedef'd, or: | |
2803 | .PP | |
2804 | .Vb 6 | |
2805 | \& command: 'map' key_name 'to' abort_key | |
2806 | \& { $thisparser->Replace("abort_key: '$item[2]'") } | |
2807 | \& | 'map' key_name 'to' key_name | |
2808 | \& { map_key($item[2],$item[4]) } | |
2809 | \& | abort_key | |
2810 | \& { exit if confirm("abort?") } | |
2811 | .Ve | |
2812 | .PP | |
2813 | .Vb 1 | |
2814 | \& abort_key: 'q' | |
2815 | .Ve | |
2816 | .PP | |
2817 | .Vb 1 | |
2818 | \& key_name: ...!abort_key /[A-Za-z]/ | |
2819 | .Ve | |
2820 | .PP | |
2821 | which allows the user to change the abort key binding, but not to unbind it. | |
2822 | .PP | |
2823 | The careful use of such constructs makes it possible to reconfigure | |
2824 | a running parser, eliminating the need for semantic feedback by | |
2825 | providing syntactic feedback instead. However, as currently implemented, | |
2826 | \&\f(CW\*(C`Replace()\*(C'\fR and \f(CW\*(C`Extend()\*(C'\fR have to regenerate and re\-\f(CW\*(C`eval\*(C'\fR the | |
2827 | entire parser whenever they are called. This makes them quite slow for | |
2828 | large grammars. | |
2829 | .PP | |
2830 | In such cases, the judicious use of an interpolated regex is likely to | |
2831 | be far more efficient: | |
2832 | .PP | |
2833 | .Vb 3 | |
2834 | \& typedef: 'typedef' type_name identifier ';' | |
2835 | \& { $thisparser->{local}{type_name} .= "|$item[3]" } | |
2836 | \& | <error> | |
2837 | .Ve | |
2838 | .PP | |
2839 | .Vb 1 | |
2840 | \& identifier: ...!type_name /[A-Za-z_]\ew*/ | |
2841 | .Ve | |
2842 | .PP | |
2843 | .Vb 1 | |
2844 | \& type_name: /$thisparser->{local}{type_name}/ | |
2845 | .Ve | |
2846 | .Sh "Precompiling parsers" | |
2847 | .IX Subsection "Precompiling parsers" | |
2848 | Normally Parse::RecDescent builds a parser from a grammar at run\-time. | |
2849 | That approach simplifies the design and implementation of parsing code, | |
2850 | but has the disadvantage that it slows the parsing process down \- you | |
2851 | have to wait for Parse::RecDescent to build the parser every time the | |
2852 | program runs. Long or complex grammars can be particularly slow to | |
2853 | build, leading to unacceptable delays at start\-up. | |
2854 | .PP | |
2855 | To overcome this, the module provides a way of \*(L"pre\-building\*(R" a parser | |
2856 | object and saving it in a separate module. That module can then be used | |
2857 | to create clones of the original parser. | |
2858 | .PP | |
2859 | A grammar may be precompiled using the \f(CW\*(C`Precompile\*(C'\fR class method. | |
2860 | For example, to precompile a grammar stored in the scalar \f(CW$grammar\fR, | |
2861 | and produce a class named PreGrammar in a module file named PreGrammar.pm, | |
2862 | you could use: | |
2863 | .PP | |
2864 | .Vb 1 | |
2865 | \& use Parse::RecDescent; | |
2866 | .Ve | |
2867 | .PP | |
2868 | .Vb 1 | |
2869 | \& Parse::RecDescent->Precompile($grammar, "PreGrammar"); | |
2870 | .Ve | |
2871 | .PP | |
2872 | The first argument is the grammar string, the second is the name of the class | |
2873 | to be built. The name of the module file is generated automatically by | |
2874 | appending \*(L".pm\*(R" to the last element of the class name. Thus | |
2875 | .PP | |
2876 | .Vb 1 | |
2877 | \& Parse::RecDescent->Precompile($grammar, "My::New::Parser"); | |
2878 | .Ve | |
2879 | .PP | |
2880 | would produce a module file named Parser.pm. | |
2881 | .PP | |
2882 | It is somewhat tedious to have to write a small Perl program just to | |
2883 | generate a precompiled grammar class, so Parse::RecDescent has some special | |
2884 | magic that allows you to do the job directly from the command\-line. | |
2885 | .PP | |
2886 | If your grammar is specified in a file named \fIgrammar\fR, you can generate | |
2887 | a class named Yet::Another::Grammar like so: | |
2888 | .PP | |
2889 | .Vb 1 | |
2890 | \& > perl -MParse::RecDescent - grammar Yet::Another::Grammar | |
2891 | .Ve | |
2892 | .PP | |
2893 | This would produce a file named \fIGrammar.pm\fR containing the full | |
2894 | definition of a class called Yet::Another::Grammar. Of course, to use | |
2895 | that class, you would need to put the \fIGrammar.pm\fR file in a | |
2896 | directory named \fIYet/Another\fR, somewhere in your Perl include path. | |
2897 | .PP | |
2898 | Having created the new class, it's very easy to use it to build | |
2899 | a parser. You simply \f(CW\*(C`use\*(C'\fR the new module, and then call its | |
2900 | \&\f(CW\*(C`new\*(C'\fR method to create a parser object. For example: | |
2901 | .PP | |
2902 | .Vb 2 | |
2903 | \& use Yet::Another::Grammar; | |
2904 | \& my $parser = Yet::Another::Grammar->new(); | |
2905 | .Ve | |
2906 | .PP | |
2907 | The effect of these two lines is exactly the same as: | |
2908 | .PP | |
2909 | .Vb 1 | |
2910 | \& use Parse::RecDescent; | |
2911 | .Ve | |
2912 | .PP | |
2913 | .Vb 3 | |
2914 | \& open GRAMMAR_FILE, "grammar" or die; | |
2915 | \& local $/; | |
2916 | \& my $grammar = <GRAMMAR_FILE>; | |
2917 | .Ve | |
2918 | .PP | |
2919 | .Vb 1 | |
2920 | \& my $parser = Parse::RecDescent->new($grammar); | |
2921 | .Ve | |
2922 | .PP | |
2923 | only considerably faster. | |
2924 | .PP | |
2925 | Note however that the parsers produced by either approach are exactly | |
2926 | the same, so whilst precompilation has an effect on \fIset-up\fR speed, | |
2927 | it has no effect on \fIparsing\fR speed. RecDescent 2.0 will address that | |
2928 | problem. | |
2929 | .ie n .Sh "A Metagrammar for ""Parse::RecDescent""" | |
2930 | .el .Sh "A Metagrammar for \f(CWParse::RecDescent\fP" | |
2931 | .IX Subsection "A Metagrammar for Parse::RecDescent" | |
2932 | The following is a specification of the grammar format accepted by | |
2933 | \&\f(CW\*(C`Parse::RecDescent::new\*(C'\fR (specified in the \f(CW\*(C`Parse::RecDescent\*(C'\fR grammar format!): | |
2934 | .PP | |
2935 | .Vb 1 | |
2936 | \& grammar : component(s) | |
2937 | .Ve | |
2938 | .PP | |
2939 | .Vb 1 | |
2940 | \& component : rule | comment | |
2941 | .Ve | |
2942 | .PP | |
2943 | .Vb 1 | |
2944 | \& rule : "\en" identifier ":" production(s?) | |
2945 | .Ve | |
2946 | .PP | |
2947 | .Vb 1 | |
2948 | \& production : item(s) | |
2949 | .Ve | |
2950 | .PP | |
2951 | .Vb 3 | |
2952 | \& item : lookahead(?) simpleitem | |
2953 | \& | directive | |
2954 | \& | comment | |
2955 | .Ve | |
2956 | .PP | |
2957 | .Vb 1 | |
2958 | \& lookahead : '...' | '...!' # +'ve or -'ve lookahead | |
2959 | .Ve | |
2960 | .PP | |
2961 | .Vb 5 | |
2962 | \& simpleitem : subrule args(?) # match another rule | |
2963 | \& | repetition # match repeated subrules | |
2964 | \& | terminal # match the next input | |
2965 | \& | bracket args(?) # match alternative items | |
2966 | \& | action # do something | |
2967 | .Ve | |
2968 | .PP | |
2969 | .Vb 1 | |
2970 | \& subrule : identifier # the name of the rule | |
2971 | .Ve | |
2972 | .PP | |
2973 | .Vb 1 | |
2974 | \& args : {extract_codeblock($text,'[]')} # just like a [...] array ref | |
2975 | .Ve | |
2976 | .PP | |
2977 | .Vb 1 | |
2978 | \& repetition : subrule args(?) howoften | |
2979 | .Ve | |
2980 | .PP | |
2981 | .Vb 6 | |
2982 | \& howoften : '(?)' # 0 or 1 times | |
2983 | \& | '(s?)' # 0 or more times | |
2984 | \& | '(s)' # 1 or more times | |
2985 | \& | /[(](\ed+)[.][.](\ed+)[)]/ # $1 to $2 times | |
2986 | \& | /[(][.][.](\ed*)[)]/ # at most $1 times | |
2987 | \& | /[(](\ed*)[.][.][)]/ # at least $1 times | |
2988 | .Ve | |
2989 | .PP | |
2990 | .Vb 3 | |
2991 | \& terminal : /[/]([\e][/]|[^/])*[/]/ # interpolated pattern | |
2992 | \& | /"([\e]"|[^"])*"/ # interpolated literal | |
2993 | \& | /'([\e]'|[^'])*'/ # uninterpolated literal | |
2994 | .Ve | |
2995 | .PP | |
2996 | .Vb 1 | |
2997 | \& action : { extract_codeblock($text) } # embedded Perl code | |
2998 | .Ve | |
2999 | .PP | |
3000 | .Vb 1 | |
3001 | \& bracket : '(' item(s) production(s?) ')' # alternative subrules | |
3002 | .Ve | |
3003 | .PP | |
3004 | .Vb 12 | |
3005 | \& directive : '<commit>' # commit to production | |
3006 | \& | '<uncommit>' # cancel commitment | |
3007 | \& | '<resync>' # skip to newline | |
3008 | \& | '<resync:' pattern '>' # skip <pattern> | |
3009 | \& | '<reject>' # fail this production | |
3010 | \& | '<reject:' condition '>' # fail if <condition> | |
3011 | \& | '<error>' # report an error | |
3012 | \& | '<error:' string '>' # report error as "<string>" | |
3013 | \& | '<error?>' # error only if committed | |
3014 | \& | '<error?:' string '>' # " " " " | |
3015 | \& | '<rulevar:' /[^>]+/ '>' # define rule-local variable | |
3016 | \& | '<matchrule:' string '>' # invoke rule named in string | |
3017 | .Ve | |
3018 | .PP | |
3019 | .Vb 1 | |
3020 | \& identifier : /[a-z]\ew*/i # must start with alpha | |
3021 | .Ve | |
3022 | .PP | |
3023 | .Vb 1 | |
3024 | \& comment : /#[^\en]*/ # same as Perl | |
3025 | .Ve | |
3026 | .PP | |
3027 | .Vb 1 | |
3028 | \& pattern : {extract_bracketed($text,'<')} # allow embedded "<..>" | |
3029 | .Ve | |
3030 | .PP | |
3031 | .Vb 1 | |
3032 | \& condition : {extract_codeblock($text,'{<')} # full Perl expression | |
3033 | .Ve | |
3034 | .PP | |
3035 | .Vb 3 | |
3036 | \& string : {extract_variable($text)} # any Perl variable | |
3037 | \& | {extract_quotelike($text)} # or quotelike string | |
3038 | \& | {extract_bracketed($text,'<')} # or balanced brackets | |
3039 | .Ve | |
3040 | .SH "GOTCHAS" | |
3041 | .IX Header "GOTCHAS" | |
3042 | This section describes common mistakes that grammar writers seem to | |
3043 | make on a regular basis. | |
3044 | .Sh "1. Expecting an error to always invalidate a parse" | |
3045 | .IX Subsection "1. Expecting an error to always invalidate a parse" | |
3046 | A common mistake when using error messages is to write the grammar like this: | |
3047 | .PP | |
3048 | .Vb 1 | |
3049 | \& file: line(s) | |
3050 | .Ve | |
3051 | .PP | |
3052 | .Vb 4 | |
3053 | \& line: line_type_1 | |
3054 | \& | line_type_2 | |
3055 | \& | line_type_3 | |
3056 | \& | <error> | |
3057 | .Ve | |
3058 | .PP | |
3059 | The expectation seems to be that any line that is not of type 1, 2 or 3 will | |
3060 | invoke the \f(CW\*(C`<error>\*(C'\fR directive and thereby cause the parse to fail. | |
3061 | .PP | |
3062 | Unfortunately, that only happens if the error occurs in the very first line. | |
3063 | The first rule states that a \f(CW\*(C`file\*(C'\fR is matched by one or more lines, so if | |
3064 | even a single line succeeds, the first rule is completely satisfied and the | |
3065 | parse as a whole succeeds. That means that any error messages generated by | |
3066 | subsequent failures in the \f(CW\*(C`line\*(C'\fR rule are quietly ignored. | |
3067 | .PP | |
3068 | Typically what's really needed is this: | |
3069 | .PP | |
3070 | .Vb 1 | |
3071 | \& file: line(s) eofile { $return = $item[1] } | |
3072 | .Ve | |
3073 | .PP | |
3074 | .Vb 4 | |
3075 | \& line: line_type_1 | |
3076 | \& | line_type_2 | |
3077 | \& | line_type_3 | |
3078 | \& | <error> | |
3079 | .Ve | |
3080 | .PP | |
3081 | .Vb 1 | |
3082 | \& eofile: /^\eZ/ | |
3083 | .Ve | |
3084 | .PP | |
3085 | The addition of the \f(CW\*(C`eofile\*(C'\fR subrule to the first production means that | |
3086 | a file only matches a series of successful \f(CW\*(C`line\*(C'\fR matches \fIthat consume the | |
3087 | complete input text\fR. If any input text remains after the lines are matched, | |
3088 | there must have been an error in the last \f(CW\*(C`line\*(C'\fR. In that case the \f(CW\*(C`eofile\*(C'\fR | |
3089 | rule will fail, causing the entire \f(CW\*(C`file\*(C'\fR rule to fail too. | |
3090 | .PP | |
3091 | Note too that \f(CW\*(C`eofile\*(C'\fR must match \f(CW\*(C`/^\eZ/\*(C'\fR (end\-of\-text), \fInot\fR | |
3092 | \&\f(CW\*(C`/^\ecZ/\*(C'\fR or \f(CW\*(C`/^\ecD/\*(C'\fR (end\-of\-file). | |
3093 | .PP | |
3094 | And don't forget the action at the end of the production. If you just | |
3095 | write: | |
3096 | .PP | |
3097 | .Vb 1 | |
3098 | \& file: line(s) eofile | |
3099 | .Ve | |
3100 | .PP | |
3101 | then the value returned by the \f(CW\*(C`file\*(C'\fR rule will be the value of its | |
3102 | last item: \f(CW\*(C`eofile\*(C'\fR. Since \f(CW\*(C`eofile\*(C'\fR always returns an empty string | |
3103 | on success, that will cause the \f(CW\*(C`file\*(C'\fR rule to return that empty | |
3104 | string. Apart from returning the wrong value, returning an empty string | |
3105 | will trip up code such as: | |
3106 | .PP | |
3107 | .Vb 1 | |
3108 | \& $parser->file($filetext) || die; | |
3109 | .Ve | |
3110 | .PP | |
3111 | (since "" is false). | |
3112 | .PP | |
3113 | Remember that Parse::RecDescent returns undef on failure, | |
3114 | so the only safe test for failure is: | |
3115 | .PP | |
3116 | .Vb 1 | |
3117 | \& defined($parser->file($filetext)) || die; | |
3118 | .Ve | |
3119 | .SH "DIAGNOSTICS" | |
3120 | .IX Header "DIAGNOSTICS" | |
3121 | Diagnostics are intended to be self-explanatory (particularly if you | |
3122 | use \fB\-RD_HINT\fR (under \fBperl \-s\fR) or define \f(CW$::RD_HINT\fR inside the program). | |
3123 | .PP | |
3124 | \&\f(CW\*(C`Parse::RecDescent\*(C'\fR currently diagnoses the following: | |
3125 | .IP "\(bu" 4 | |
3126 | Invalid regular expressions used as pattern terminals (fatal error). | |
3127 | .IP "\(bu" 4 | |
3128 | Invalid Perl code in code blocks (fatal error). | |
3129 | .IP "\(bu" 4 | |
3130 | Lookahead used in the wrong place or in a nonsensical way (fatal error). | |
3131 | .IP "\(bu" 4 | |
3132 | \&\*(L"Obvious\*(R" cases of left-recursion (fatal error). | |
3133 | .IP "\(bu" 4 | |
3134 | Missing or extra components in a \f(CW\*(C`<leftop>\*(C'\fR or \f(CW\*(C`<rightop>\*(C'\fR | |
3135 | directive. | |
3136 | .IP "\(bu" 4 | |
3137 | Unrecognisable components in the grammar specification (fatal error). | |
3138 | .IP "\(bu" 4 | |
3139 | \&\*(L"Orphaned\*(R" rule components specified before the first rule (fatal error) | |
3140 | or after an \f(CW\*(C`<error>\*(C'\fR directive (level 3 warning). | |
3141 | .IP "\(bu" 4 | |
3142 | Missing rule definitions (this only generates a level 3 warning, since you | |
3143 | may be providing them later via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR). | |
3144 | .IP "\(bu" 4 | |
3145 | Instances where greedy repetition behaviour will almost certainly | |
3146 | cause the failure of a production (a level 3 warning \- see | |
3147 | \&\*(L"\s-1ON\-GOING\s0 \s-1ISSUES\s0 \s-1AND\s0 \s-1FUTURE\s0 \s-1DIRECTIONS\s0\*(R" below). | |
3148 | .IP "\(bu" 4 | |
3149 | Attempts to define rules named 'Replace' or 'Extend', which cannot be | |
3150 | called directly through the parser object because of the predefined | |
3151 | meaning of \f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR and | |
3152 | \&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR. (Only a level 2 warning is generated, since | |
3153 | such rules \fIcan\fR still be used as subrules). | |
3154 | .IP "\(bu" 4 | |
3155 | Productions which consist of a single \f(CW\*(C`<error?>\*(C'\fR | |
3156 | directive, and which therefore may succeed unexpectedly | |
3157 | (a level 2 warning, since this might conceivably be the desired effect). | |
3158 | .IP "\(bu" 4 | |
3159 | Multiple consecutive lookahead specifiers (a level 1 warning only, since their | |
3160 | effects simply accumulate). | |
3161 | .IP "\(bu" 4 | |
3162 | Productions which start with a \f(CW\*(C`<reject>\*(C'\fR or \f(CW\*(C`<rulevar:...>\*(C'\fR | |
3163 | directive. Such productions are optimized away (a level 1 warning). | |
3164 | .IP "\(bu" 4 | |
3165 | Rules which are autogenerated under \f(CW$::AUTOSTUB\fR (a level 1 warning). | |
3166 | .SH "AUTHOR" | |
3167 | .IX Header "AUTHOR" | |
3168 | Damian Conway (damian@conway.org) | |
3169 | .SH "BUGS AND IRRITATIONS" | |
3170 | .IX Header "BUGS AND IRRITATIONS" | |
3171 | There are undoubtedly serious bugs lurking somewhere in this much code :\-) | |
3172 | Bug reports and other feedback are most welcome. | |
3173 | .PP | |
3174 | Ongoing annoyances include: | |
3175 | .IP "\(bu" 4 | |
3176 | There's no support for parsing directly from an input stream. | |
3177 | If and when the Perl Gods give us regular expressions on streams, | |
3178 | this should be trivial (ahem!) to implement. | |
3179 | .IP "\(bu" 4 | |
3180 | The parser generator can get confused if actions aren't properly | |
3181 | closed or if they contain particularly nasty Perl syntax errors | |
3182 | (especially unmatched curly brackets). | |
3183 | .IP "\(bu" 4 | |
3184 | The generator only detects the most obvious form of left recursion | |
3185 | (potential recursion on the first subrule in a rule). More subtle | |
3186 | forms of left recursion (for example, through the second item in a | |
3187 | rule after a \*(L"zero\*(R" match of a preceding \*(L"zero\-or\-more\*(R" repetition, | |
3188 | or after a match of a subrule with an empty production) are not found. | |
3189 | .IP "\(bu" 4 | |
3190 | Instead of complaining about left\-recursion, the generator should | |
3191 | silently transform the grammar to remove it. Don't expect this | |
3192 | feature any time soon as it would require a more sophisticated | |
3193 | approach to parser generation than is currently used. | |
3194 | .IP "\(bu" 4 | |
3195 | The generated parsers don't always run as fast as might be wished. | |
3196 | .IP "\(bu" 4 | |
3197 | The meta-parser should be bootstrapped using \f(CW\*(C`Parse::RecDescent\*(C'\fR :\-) | |
3198 | .SH "ON-GOING ISSUES AND FUTURE DIRECTIONS" | |
3199 | .IX Header "ON-GOING ISSUES AND FUTURE DIRECTIONS" | |
3200 | .IP "1." 4 | |
3201 | Repetitions are \*(L"incorrigibly greedy\*(R" in that they will eat everything they can | |
3202 | and won't backtrack if that behaviour causes a production to fail needlessly. | |
3203 | So, for example: | |
3204 | .Sp | |
3205 | .Vb 1 | |
3206 | \& rule: subrule(s) subrule | |
3207 | .Ve | |
3208 | .Sp | |
3209 | will \fInever\fR succeed, because the repetition will eat all the | |
3210 | subrules it finds, leaving none to match the second item. Such | |
3211 | constructions are relatively rare (and \f(CW\*(C`Parse::RecDescent::new\*(C'\fR generates a | |
3212 | warning whenever they occur) so this may not be a problem, especially | |
3213 | since the insatiable behaviour can be overcome \*(L"manually\*(R" by writing: | |
3214 | .Sp | |
3215 | .Vb 1 | |
3216 | \& rule: penultimate_subrule(s) subrule | |
3217 | .Ve | |
3218 | .Sp | |
3219 | .Vb 1 | |
3220 | \& penultimate_subrule: subrule ...subrule | |
3221 | .Ve | |
3222 | .Sp | |
3223 | The issue is that this construction is exactly twice as expensive as the | |
3224 | original, whereas backtracking would add only 1/\fIN\fR to the cost (for | |
3225 | matching \fIN\fR repetitions of \f(CW\*(C`subrule\*(C'\fR). I would welcome feedback on | |
3226 | the need for backtracking; particularly on cases where the lack of it | |
3227 | makes parsing performance problematical. | |
3228 | .IP "2." 4 | |
3229 | Having opened that can of worms, it's also necessary to consider whether there | |
3230 | is a need for non-greedy repetition specifiers. Again, it's possible (at some | |
3231 | cost) to manually provide the required functionality: | |
3232 | .Sp | |
3233 | .Vb 1 | |
3234 | \& rule: nongreedy_subrule(s) othersubrule | |
3235 | .Ve | |
3236 | .Sp | |
3237 | .Vb 1 | |
3238 | \& nongreedy_subrule: subrule ...!othersubrule | |
3239 | .Ve | |
3240 | .Sp | |
3241 | Overall, the issue is whether the benefit of this extra functionality | |
3242 | outweighs the drawbacks of further complicating the (currently | |
3243 | minimalist) grammar specification syntax, and (worse) introducing more overhead | |
3244 | into the generated parsers. | |
3245 | .IP "3." 4 | |
3246 | An \f(CW\*(C`<autocommit>\*(C'\fR directive would be nice. That is, it would be useful to be | |
3247 | able to say: | |
3248 | .Sp | |
3249 | .Vb 7 | |
3250 | \& command: <autocommit> | |
3251 | \& command: 'find' name | |
3252 | \& | 'find' address | |
3253 | \& | 'do' command 'at' time 'if' condition | |
3254 | \& | 'do' command 'at' time | |
3255 | \& | 'do' command | |
3256 | \& | unusual_command | |
3257 | .Ve | |
3258 | .Sp | |
3259 | and have the generator work out that this should be \*(L"pruned\*(R" thus: | |
3260 | .Sp | |
3261 | .Vb 9 | |
3262 | \& command: 'find' name | |
3263 | \& | 'find' <commit> address | |
3264 | \& | 'do' <commit> command <uncommit> | |
3265 | \& 'at' time | |
3266 | \& 'if' <commit> condition | |
3267 | \& | 'do' <commit> command <uncommit> | |
3268 | \& 'at' <commit> time | |
3269 | \& | 'do' <commit> command | |
3270 | \& | unusual_command | |
3271 | .Ve | |
3272 | .Sp | |
3273 | There are several issues here. Firstly, should the | |
3274 | \&\f(CW\*(C`<autocommit>\*(C'\fR automatically install an \f(CW\*(C`<uncommit>\*(C'\fR | |
3275 | at the start of the last production (on the grounds that the \*(L"command\*(R" | |
3276 | rule doesn't know whether an \*(L"unusual_command\*(R" might start with \*(L"find\*(R" | |
3277 | or \*(L"do\*(R") or should the \*(L"unusual_command\*(R" subgraph be analysed (to see | |
3278 | if it \fImight\fR be viable after a \*(L"find\*(R" or \*(L"do\*(R")? | |
3279 | .Sp | |
3280 | The second issue is how regular expressions should be treated. The simplest | |
3281 | approach would be simply to uncommit before them (on the grounds that they | |
3282 | \&\fImight\fR match). Better efficiency would be obtained by analyzing all preceding | |
3283 | literal tokens to determine whether the pattern would match them. | |
3284 | .Sp | |
3285 | Overall, the issues are: can such automated \*(L"pruning\*(R" approach a hand-tuned | |
3286 | version sufficiently closely to warrant the extra set-up expense, and (more | |
3287 | importantly) is the problem important enough to even warrant the non-trivial | |
3288 | effort of building an automated solution? | |
3289 | .SH "COPYRIGHT" | |
3290 | .IX Header "COPYRIGHT" | |
3291 | Copyright (c) 1997\-2000, Damian Conway. All Rights Reserved. | |
3292 | This module is free software. It may be used, redistributed | |
3293 | and/or modified under the terms of the Perl Artistic License | |
3294 | (see http://www.perl.com/perl/misc/Artistic.html) |