Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] / tools / perl-5.8.0 / man / man3 / Parse::RecDescent.3
CommitLineData
86530b38
AT
1.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
2.\"
3.\" Standard preamble:
4.\" ========================================================================
5.de Sh \" Subsection heading
6.br
7.if t .Sp
8.ne 5
9.PP
10\fB\\$1\fR
11.PP
12..
13.de Sp \" Vertical space (when we can't use .PP)
14.if t .sp .5v
15.if n .sp
16..
17.de Vb \" Begin verbatim text
18.ft CW
19.nf
20.ne \\$1
21..
22.de Ve \" End verbatim text
23.ft R
24.fi
25..
26.\" Set up some character translations and predefined strings. \*(-- will
27.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
28.\" double quote, and \*(R" will give a right double quote. | will give a
29.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
30.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
31.\" expand to `' in nroff, nothing in troff, for use with C<>.
32.tr \(*W-|\(bv\*(Tr
33.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
34.ie n \{\
35. ds -- \(*W-
36. ds PI pi
37. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
38. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
39. ds L" ""
40. ds R" ""
41. ds C` ""
42. ds C' ""
43'br\}
44.el\{\
45. ds -- \|\(em\|
46. ds PI \(*p
47. ds L" ``
48. ds R" ''
49'br\}
50.\"
51.\" If the F register is turned on, we'll generate index entries on stderr for
52.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
53.\" entries marked with X<> in POD. Of course, you'll have to process the
54.\" output yourself in some meaningful fashion.
55.if \nF \{\
56. de IX
57. tm Index:\\$1\t\\n%\t"\\$2"
58..
59. nr % 0
60. rr F
61.\}
62.\"
63.\" For nroff, turn off justification. Always turn off hyphenation; it makes
64.\" way too many mistakes in technical documents.
65.hy 0
66.if n .na
67.\"
68.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
69.\" Fear. Run. Save yourself. No user-serviceable parts.
70. \" fudge factors for nroff and troff
71.if n \{\
72. ds #H 0
73. ds #V .8m
74. ds #F .3m
75. ds #[ \f1
76. ds #] \fP
77.\}
78.if t \{\
79. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
80. ds #V .6m
81. ds #F 0
82. ds #[ \&
83. ds #] \&
84.\}
85. \" simple accents for nroff and troff
86.if n \{\
87. ds ' \&
88. ds ` \&
89. ds ^ \&
90. ds , \&
91. ds ~ ~
92. ds /
93.\}
94.if t \{\
95. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
96. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
97. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
98. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
99. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
100. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
101.\}
102. \" troff and (daisy-wheel) nroff accents
103.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
104.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
105.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
106.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
107.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
108.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
109.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
110.ds ae a\h'-(\w'a'u*4/10)'e
111.ds Ae A\h'-(\w'A'u*4/10)'E
112. \" corrections for vroff
113.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
114.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
115. \" for low resolution devices (crt and lpr)
116.if \n(.H>23 .if \n(.V>19 \
117\{\
118. ds : e
119. ds 8 ss
120. ds o a
121. ds d- d\h'-1'\(ga
122. ds D- D\h'-1'\(hy
123. ds th \o'bp'
124. ds Th \o'LP'
125. ds ae ae
126. ds Ae AE
127.\}
128.rm #[ #] #H #V #F C
129.\" ========================================================================
130.\"
131.IX Title "RECDESCENT 1"
132.TH RECDESCENT 1 "2000-08-20" "perl v5.8.0" "User Contributed Perl Documentation"
133.SH "NAME"
134Parse::RecDescent \- Generate Recursive\-Descent Parsers
135.SH "VERSION"
136.IX Header "VERSION"
137This document describes version 1.79 of Parse::RecDescent,
138released August 21, 2000.
139.SH "SYNOPSIS"
140.IX Header "SYNOPSIS"
141.Vb 1
142\& use Parse::RecDescent;
143.Ve
144.PP
145.Vb 1
146\& # Generate a parser from the specification in $grammar:
147.Ve
148.PP
149.Vb 1
150\& $parser = new Parse::RecDescent ($grammar);
151.Ve
152.PP
153.Vb 1
154\& # Generate a parser from the specification in $othergrammar
155.Ve
156.PP
157.Vb 1
158\& $anotherparser = new Parse::RecDescent ($othergrammar);
159.Ve
160.PP
161.Vb 2
162\& # Parse $text using rule 'startrule' (which must be
163\& # defined in $grammar):
164.Ve
165.PP
166.Vb 1
167\& $parser->startrule($text);
168.Ve
169.PP
170.Vb 2
171\& # Parse $text using rule 'otherrule' (which must also
172\& # be defined in $grammar):
173.Ve
174.PP
175.Vb 1
176\& $parser->otherrule($text);
177.Ve
178.PP
179.Vb 2
180\& # Change the universal token prefix pattern
181\& # (the default is: '\es*'):
182.Ve
183.PP
184.Vb 1
185\& $Parse::RecDescent::skip = '[ \et]+';
186.Ve
187.PP
188.Vb 2
189\& # Replace productions of existing rules (or create new ones)
190\& # with the productions defined in $newgrammar:
191.Ve
192.PP
193.Vb 1
194\& $parser->Replace($newgrammar);
195.Ve
196.PP
197.Vb 2
198\& # Extend existing rules (or create new ones)
199\& # by adding extra productions defined in $moregrammar:
200.Ve
201.PP
202.Vb 1
203\& $parser->Extend($moregrammar);
204.Ve
205.PP
206.Vb 1
207\& # Global flags (useful as command line arguments under -s):
208.Ve
209.PP
210.Vb 6
211\& $::RD_ERRORS # unless undefined, report fatal errors
212\& $::RD_WARN # unless undefined, also report non-fatal problems
213\& $::RD_HINT # if defined, also suggest remedies
214\& $::RD_TRACE # if defined, also trace parsers' behaviour
215\& $::RD_AUTOSTUB # if defined, generates "stubs" for undefined rules
216\& $::RD_AUTOACTION # if defined, appends specified action to productions
217.Ve
218.SH "DESCRIPTION"
219.IX Header "DESCRIPTION"
220.Sh "Overview"
221.IX Subsection "Overview"
222Parse::RecDescent incrementally generates top-down recursive-descent text
223parsers from simple \fIyacc\fR\-like grammar specifications. It provides:
224.IP "\(bu" 4
225Regular expressions or literal strings as terminals (tokens),
226.IP "\(bu" 4
227Multiple (non\-contiguous) productions for any rule,
228.IP "\(bu" 4
229Repeated and optional subrules within productions,
230.IP "\(bu" 4
231Full access to Perl within actions specified as part of the grammar,
232.IP "\(bu" 4
233Simple automated error reporting during parser generation and parsing,
234.IP "\(bu" 4
235The ability to commit to, uncommit to, or reject particular
236productions during a parse,
237.IP "\(bu" 4
238The ability to pass data up and down the parse tree (\*(L"down\*(R" via subrule
239argument lists, \*(L"up\*(R" via subrule return values)
240.IP "\(bu" 4
241Incremental extension of the parsing grammar (even during a parse),
242.IP "\(bu" 4
243Precompilation of parser objects,
244.IP "\(bu" 4
245User-definable reduce-reduce conflict resolution via
246\&\*(L"scoring\*(R" of matching productions.
247.ie n .Sh "Using ""Parse::RecDescent"""
248.el .Sh "Using \f(CWParse::RecDescent\fP"
249.IX Subsection "Using Parse::RecDescent"
250Parser objects are created by calling \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, passing in a
251grammar specification (see the following subsections). If the grammar is
252correct, \f(CW\*(C`new\*(C'\fR returns a blessed reference which can then be used to initiate
253parsing through any rule specified in the original grammar. A typical sequence
254looks like this:
255.PP
256.Vb 3
257\& $grammar = q {
258\& # GRAMMAR SPECIFICATION HERE
259\& };
260.Ve
261.PP
262.Vb 1
263\& $parser = new Parse::RecDescent ($grammar) or die "Bad grammar!\en";
264.Ve
265.PP
266.Vb 1
267\& # acquire $text
268.Ve
269.PP
270.Vb 1
271\& defined $parser->startrule($text) or print "Bad text!\en";
272.Ve
273.PP
274The rule through which parsing is initiated must be explicitly defined
275in the grammar (i.e. for the above example, the grammar must include a
276rule of the form: \*(L"startrule: <subrules>\*(R").
277.PP
278If the starting rule succeeds, its value (see below)
279is returned. Failure to generate the original parser or failure to match a text
280is indicated by returning \f(CW\*(C`undef\*(C'\fR. Note that it's easy to set up grammars
281that can succeed, but which return a value of 0, \*(L"0\*(R", or "". So don't be
282tempted to write:
283.PP
284.Vb 1
285\& $parser->startrule($text) or print "Bad text!\en";
286.Ve
287.PP
288Normally, the parser has no effect on the original text. So in the
289previous example the value of \f(CW$text\fR would be unchanged after having
290been parsed.
291.PP
292If, however, the text to be matched is passed by reference:
293.PP
294.Vb 1
295\& $parser->startrule(\e$text)
296.Ve
297.PP
298then any text which was consumed during the match will be removed from the
299start of \f(CW$text\fR.
300.Sh "Rules"
301.IX Subsection "Rules"
302In the grammar from which the parser is built, rules are specified by
303giving an identifier (which must satisfy /[A\-Za\-z]\ew*/), followed by a
304colon \fIon the same line\fR, followed by one or more productions,
305separated by single vertical bars. The layout of the productions
306is entirely free\-format:
307.PP
308.Vb 3
309\& rule1: production1
310\& | production2 |
311\& production3 | production4
312.Ve
313.PP
314At any point in the grammar previously defined rules may be extended with
315additional productions. This is achieved by redeclaring the rule with the new
316productions. Thus:
317.PP
318.Vb 3
319\& rule1: a | b | c
320\& rule2: d | e | f
321\& rule1: g | h
322.Ve
323.PP
324is exactly equivalent to:
325.PP
326.Vb 2
327\& rule1: a | b | c | g | h
328\& rule2: d | e | f
329.Ve
330.PP
331Each production in a rule consists of zero or more items, each of which
332may be either: the name of another rule to be matched (a \*(L"subrule\*(R"),
333a pattern or string literal to be matched directly (a \*(L"token\*(R"), a
334block of Perl code to be executed (an \*(L"action\*(R"), a special instruction
335to the parser (a \*(L"directive\*(R"), or a standard Perl comment (which is
336ignored).
337.PP
338A rule matches a text if one of its productions matches. A production
339matches if each of its items matches consecutive substrings of the
340text. The productions of a rule being matched are tried in the same
341order that they appear in the original grammar, and the first matching
342production terminates the match attempt (successfully). If all
343productions are tried and none matches, the match attempt fails.
344.PP
345Note that this behaviour is quite different from the \*(L"prefer the longer match\*(R"
346behaviour of \fIyacc\fR. For example, if \fIyacc\fR were parsing the rule:
347.PP
348.Vb 2
349\& seq : 'A' 'B'
350\& | 'A' 'B' 'C'
351.Ve
352.PP
353upon matching \*(L"\s-1AB\s0\*(R" it would look ahead to see if a 'C' is next and, if
354so, would match the second production in preference to the first. In
355other words, \fIyacc\fR effectively tries all the productions of a rule
356breadth-first in parallel, and selects the \*(L"best\*(R" match, where \*(L"best\*(R"
357means longest (note that this is a gross simplification of the true
358behaviour of \fIyacc\fR but it will do for our purposes).
359.PP
360In contrast, \f(CW\*(C`Parse::RecDescent\*(C'\fR tries each production depth-first in
361sequence, and selects the \*(L"best\*(R" match, where \*(L"best\*(R" means first. This is
362the fundamental difference between \*(L"bottom\-up\*(R" and \*(L"recursive descent\*(R"
363parsing.
364.PP
365Each successfully matched item in a production is assigned a value,
366which can be accessed in subsequent actions within the same
367production (or, in some cases, as the return value of a successful
368subrule call). Unsuccessful items don't have an associated value,
369since the failure of an item causes the entire surrounding production
370to immediately fail. The following sections describe the various types
371of items and their success values.
372.Sh "Subrules"
373.IX Subsection "Subrules"
374A subrule which appears in a production is an instruction to the parser to
375attempt to match the named rule at that point in the text being
376parsed. If the named subrule is not defined when requested the
377production containing it immediately fails (unless it was \*(L"autostubbed\*(R" \- see
378Autostubbing).
379.PP
380A rule may (recursively) call itself as a subrule, but \fInot\fR as the
381left-most item in any of its productions (since such recursions are usually
382non\-terminating).
383.PP
384The value associated with a subrule is the value associated with its
385\&\f(CW$return\fR variable (see \*(L"Actions\*(R" below), or with the last successfully
386matched item in the subrule match.
387.PP
388Subrules may also be specified with a trailing repetition specifier,
389indicating that they are to be (greedily) matched the specified number
390of times. The available specifiers are:
391.PP
392.Vb 7
393\& subrule(?) # Match one-or-zero times
394\& subrule(s) # Match one-or-more times
395\& subrule(s?) # Match zero-or-more times
396\& subrule(N) # Match exactly N times for integer N > 0
397\& subrule(N..M) # Match between N and M times
398\& subrule(..M) # Match between 1 and M times
399\& subrule(N..) # Match at least N times
400.Ve
401.PP
402Repeated subrules keep matching until either the subrule fails to
403match, or it has matched the minimal number of times but fails to
404consume any of the parsed text (this second condition prevents the
405subrule matching forever in some cases).
406.PP
407Since a repeated subrule may match many instances of the subrule itself, the
408value associated with it is not a simple scalar, but rather a reference to a
409list of scalars, each of which is the value associated with one of the
410individual subrule matches. In other words in the rule:
411.PP
412.Vb 1
413\& program: statement(s)
414.Ve
415.PP
416the value associated with the repeated subrule \*(L"statement(s)\*(R" is a reference
417to an array containing the values matched by each call to the individual
418subrule \*(L"statement\*(R".
419.PP
420Repetition modifiers may include a separator pattern:
421.PP
422.Vb 1
423\& program: statement(s /;/)
424.Ve
425.PP
426specifying some sequence of characters to be skipped between each repetition.
427This is really just a shorthand for the <leftop:...> directive
428(see below).
429.Sh "Tokens"
430.IX Subsection "Tokens"
431If a quote-delimited string or a Perl regex appears in a production,
432the parser attempts to match that string or pattern at that point in
433the text. For example:
434.PP
435.Vb 1
436\& typedef: "typedef" typename identifier ';'
437.Ve
438.PP
439.Vb 1
440\& identifier: /[A-Za-z_][A-Za-z0-9_]*/
441.Ve
442.PP
443As in regular Perl, a single quoted string is uninterpolated, whilst
444a double-quoted string or a pattern is interpolated (at the time
445of matching, \fInot\fR when the parser is constructed). Hence, it is
446possible to define rules in which tokens can be set at run\-time:
447.PP
448.Vb 1
449\& typedef: "$::typedefkeyword" typename identifier ';'
450.Ve
451.PP
452.Vb 1
453\& identifier: /$::identpat/
454.Ve
455.PP
456Note that, since each rule is implemented inside a special namespace
457belonging to its parser, it is necessary to explicitly qualify
458variables from the main package.
459.PP
460Regex tokens can be specified using just slashes as delimiters
461or with the explicit \f(CW\*(C`m<delimiter>......<delimiter>\*(C'\fR syntax:
462.PP
463.Vb 1
464\& typedef: "typedef" typename identifier ';'
465.Ve
466.PP
467.Vb 1
468\& typename: /[A-Za-z_][A-Za-z0-9_]*/
469.Ve
470.PP
471.Vb 1
472\& identifier: m{[A-Za-z_][A-Za-z0-9_]*}
473.Ve
474.PP
475A regex of either type can also have any valid trailing parameter(s)
476(that is, any of [cgimsox]):
477.PP
478.Vb 1
479\& typedef: "typedef" typename identifier ';'
480.Ve
481.PP
482.Vb 3
483\& identifier: / [a-z_] # LEADING ALPHA OR UNDERSCORE
484\& [a-z0-9_]* # THEN DIGITS ALSO ALLOWED
485\& /ix # CASE/SPACE/COMMENT INSENSITIVE
486.Ve
487.PP
488The value associated with any successfully matched token is a string
489containing the actual text which was matched by the token.
490.PP
491It is important to remember that, since each grammar is specified in a
492Perl string, all instances of the universal escape character '\e' within
493a grammar must be \*(L"doubled\*(R", so that they interpolate to single '\e's when
494the string is compiled. For example, to use the grammar:
495.PP
496.Vb 3
497\& word: /\eS+/ | backslash
498\& line: prefix word(s) "\en"
499\& backslash: '\e\e'
500.Ve
501.PP
502the following code is required:
503.PP
504.Vb 1
505\& $parser = new Parse::RecDescent (q{
506.Ve
507.PP
508.Vb 3
509\& word: /\e\eS+/ | backslash
510\& line: prefix word(s) "\e\en"
511\& backslash: '\e\e\e\e'
512.Ve
513.PP
514.Vb 1
515\& });
516.Ve
517.Sh "Terminal Separators"
518.IX Subsection "Terminal Separators"
519For the purpose of matching, each terminal in a production is considered
520to be preceded by a \*(L"prefix\*(R" \- a pattern which must be
521matched before a token match is attempted. By default, the
522prefix is optional whitespace (which always matches, at
523least trivially), but this default may be reset in any production.
524.PP
525The variable \f(CW$Parse::RecDescent::skip\fR stores the universal
526prefix, which is the default for all terminal matches in all parsers
527built with \f(CW\*(C`Parse::RecDescent\*(C'\fR.
528.PP
529The prefix for an individual production can be altered
530by using the \f(CW\*(C`<skip:...>\*(C'\fR directive (see below).
531.Sh "Actions"
532.IX Subsection "Actions"
533An action is a block of Perl code which is to be executed (as the
534block of a \f(CW\*(C`do\*(C'\fR statement) when the parser reaches that point in a
535production. The action executes within a special namespace belonging to
536the active parser, so care must be taken in correctly qualifying variable
537names (see also \*(L"Start\-up Actions\*(R" below).
538.PP
539The action is considered to succeed if the final value of the block
540is defined (that is, if the implied \f(CW\*(C`do\*(C'\fR statement evaluates to a
541defined value \- \fIeven one which would be treated as \*(L"false\*(R"\fR). Note
542that the value associated with a successful action is also the final
543value in the block.
544.PP
545An action will \fIfail\fR if its last evaluated value is \f(CW\*(C`undef\*(C'\fR. This is
546surprisingly easy to accomplish by accident. For instance, here's an
547infuriating case of an action that makes its production fail, but only
548when debugging \fIisn't\fR activated:
549.PP
550.Vb 4
551\& description: name rank serial_number
552\& { print "Got $item[2] $item[1] ($item[3])\en"
553\& if $::debugging
554\& }
555.Ve
556.PP
557If \f(CW$debugging\fR is false, no statement in the block is executed, so
558the final value is \f(CW\*(C`undef\*(C'\fR, and the entire production fails. The solution is:
559.PP
560.Vb 5
561\& description: name rank serial_number
562\& { print "Got $item[2] $item[1] ($item[3])\en"
563\& if $::debugging;
564\& 1;
565\& }
566.Ve
567.PP
568Within an action, a number of useful parse-time variables are
569available in the special parser namespace (there are other variables
570also accessible, but meddling with them will probably just break your
571parser. As a general rule, if you avoid referring to unqualified
572variables \- especially those starting with an underscore \- inside an action,
573things should be okay):
574.ie n .IP "@item\fR and \f(CW%item" 4
575.el .IP "\f(CW@item\fR and \f(CW%item\fR" 4
576.IX Item "@item and %item"
577The array slice \f(CW@item[1..$#item]\fR stores the value associated with each item
578(that is, each subrule, token, or action) in the current production. The
579analogy is to \f(CW$1\fR, \f(CW$2\fR, etc. in a \fIyacc\fR grammar.
580Note that, for obvious reasons, \f(CW@item\fR only contains the
581values of items \fIbefore\fR the current point in the production.
582.Sp
583The first element (\f(CW$item[0]\fR) stores the name of the current rule
584being matched.
585.Sp
586\&\f(CW@item\fR is a standard Perl array, so it can also be indexed with negative
587numbers, representing the number of items \fIback\fR from the current position in
588the parse:
589.Sp
590.Vb 3
591\& stuff: /various/ bits 'and' pieces "then" data 'end'
592\& { print $item[-2] } # PRINTS data
593\& # (EASIER THAN: $item[6])
594.Ve
595.Sp
596The \f(CW%item\fR hash complements the \f(CW@item\fR array, providing named
597access to the same item values:
598.Sp
599.Vb 3
600\& stuff: /various/ bits 'and' pieces "then" data 'end'
601\& { print $item{data} } # PRINTS data
602\& # (EVEN EASIER THAN USING @item)
603.Ve
604.Sp
605The results of named subrules are stored in the hash under each
606subrule's name, whilst all other items are stored under a \*(L"named
607positional\*(R" key that indicates their ordinal position within their item
608type: _\|_STRING\fIn\fR_\|_, _\|_PATTERN\fIn\fR_\|_, _\|_DIRECTIVE\fIn\fR_\|_, _\|_ACTION\fIn\fR_\|_:
609.Sp
610.Vb 6
611\& stuff: /various/ bits 'and' pieces "then" data 'end' { save }
612\& { print $item{__PATTERN1__}, # PRINTS 'various'
613\& $item{__STRING2__}, # PRINTS 'then'
614\& $item{__ACTION1__}, # PRINTS RETURN
615\& # VALUE OF save
616\& }
617.Ve
618.Sp
619If you want proper \fInamed\fR access to patterns or literals, you need to turn
620them into separate rules:
621.Sp
622.Vb 3
623\& stuff: various bits 'and' pieces "then" data 'end'
624\& { print $item{various} # PRINTS various
625\& }
626.Ve
627.Sp
628.Vb 1
629\& various: /various/
630.Ve
631.Sp
632The special entry \f(CW$item{_\|_RULE_\|_}\fR stores the name of the current
633rule (i.e. the same value as \f(CW$item[0]\fR).
634.Sp
635The advantage of using \f(CW%item\fR, instead of \f(CW@item\fR, is that it
636removes the need to track item positions that may change as a grammar
637evolves. For example, adding an interim \f(CW\*(C`<skip>\*(C'\fR directive
638or action can silently ruin a trailing action, by moving an \f(CW@item\fR
639element \*(L"down\*(R" the array one place. In contrast, the named entry
640of \f(CW%item\fR is unaffected by such an insertion.
641.Sp
642A limitation of the \f(CW%item\fR hash is that it only records the \fIlast\fR
643value of a particular subrule. For example:
644.Sp
645.Vb 2
646\& range: '(' number '..' number ')'
647\& { $return = $item{number} }
648.Ve
649.Sp
650will return only the value corresponding to the \fIsecond\fR match of the
651\&\f(CW\*(C`number\*(C'\fR subrule. In other words, successive calls to a subrule
652overwrite the corresponding entry in \f(CW%item\fR. Once again, the
653solution is to rename each subrule in its own rule:
654.Sp
655.Vb 2
656\& range: '(' from_num '..' to_num ')'
657\& { $return = $item{from_num} }
658.Ve
659.Sp
660.Vb 2
661\& from_num: number
662\& to_num: number
663.Ve
664.ie n .IP "@arg\fR and \f(CW%arg" 4
665.el .IP "\f(CW@arg\fR and \f(CW%arg\fR" 4
666.IX Item "@arg and %arg"
667The array \f(CW@arg\fR and the hash \f(CW%arg\fR store any arguments passed to
668the rule from some other rule (see \*(L"Subrule argument lists\*(R"). Changes
669to the elements of either variable do not propagate back to the calling
670rule (data can be passed back from a subrule via the \f(CW$return\fR
671variable \- see next item).
672.ie n .IP "$return" 4
673.el .IP "\f(CW$return\fR" 4
674.IX Item "$return"
675If a value is assigned to \f(CW$return\fR within an action, that value is
676returned if the production containing the action eventually matches
677successfully. Note that setting \f(CW$return\fR \fIdoesn't\fR cause the current
678production to succeed. It merely tells it what to return if it \fIdoes\fR succeed.
679Hence \f(CW$return\fR is analogous to \f(CW$$\fR in a \fIyacc\fR grammar.
680.Sp
681If \f(CW$return\fR is not assigned within a production, the value of the
682last component of the production (namely: \f(CW$item[$#item]\fR) is
683returned if the production succeeds.
684.ie n .IP "$commit" 4
685.el .IP "\f(CW$commit\fR" 4
686.IX Item "$commit"
687The current state of commitment to the current production (see \*(L"Directives\*(R"
688below).
689.ie n .IP "$skip" 4
690.el .IP "\f(CW$skip\fR" 4
691.IX Item "$skip"
692The current terminal prefix (see \*(L"Directives\*(R" below).
693.ie n .IP "$text" 4
694.el .IP "\f(CW$text\fR" 4
695.IX Item "$text"
696The remaining (unparsed) text. Changes to \f(CW$text\fR \fIdo not
697propagate\fR out of unsuccessful productions, but \fIdo\fR survive
698successful productions. Hence it is possible to dynamically alter the
699text being parsed \- for example, to provide a \f(CW\*(C`#include\*(C'\fR\-like facility:
700.Sp
701.Vb 2
702\& hash_include: '#include' filename
703\& { $text = ::loadfile($item[2]) . $text }
704.Ve
705.Sp
706.Vb 2
707\& filename: '<' /[a-z0-9._-]+/i '>' { $return = $item[2] }
708\& | '"' /[a-z0-9._-]+/i '"' { $return = $item[2] }
709.Ve
710.ie n .IP "$thisline\fR and \f(CW$prevline" 4
711.el .IP "\f(CW$thisline\fR and \f(CW$prevline\fR" 4
712.IX Item "$thisline and $prevline"
713\&\f(CW$thisline\fR stores the current line number within the current parse
714(starting from 1). \f(CW$prevline\fR stores the line number for the last
715character which was already successfully parsed (this will be different from
716\&\f(CW$thisline\fR at the end of each line).
717.Sp
718For efficiency, \f(CW$thisline\fR and \f(CW$prevline\fR are actually tied
719scalars, and only recompute the required line number when the variable's
720value is used.
721.Sp
722Assignment to \f(CW$thisline\fR adjusts the line number calculator, so that
723it believes that the current line number is the value being assigned. Note
724that this adjustment will be reflected in all subsequent line number
725calculations.
726.Sp
727Modifying the value of the variable \f(CW$text\fR (as in the previous
728\&\f(CW\*(C`hash_include\*(C'\fR example, for instance) will confuse the line
729counting mechanism. To prevent this, you should call
730\&\f(CW\*(C`Parse::RecDescent::LineCounter::resync($thisline)\*(C'\fR \fIimmediately\fR
731after any assignment to the variable \f(CW$text\fR (or, at least, before the
732next attempt to use \f(CW$thisline\fR).
733.Sp
734Note that if a production fails after assigning to or
735resync'ing \f(CW$thisline\fR, the parser's line counter mechanism will
736usually be corrupted.
737.Sp
738Also see the entry for \f(CW@itempos\fR.
739.Sp
740The line number can be set to values other than 1, by calling the start
741rule with a second argument. For example:
742.Sp
743.Vb 1
744\& $parser = new Parse::RecDescent ($grammar);
745.Ve
746.Sp
747.Vb 1
748\& $parser->input($text, 10); # START LINE NUMBERS AT 10
749.Ve
750.ie n .IP "$thiscolumn\fR and \f(CW$prevcolumn" 4
751.el .IP "\f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR" 4
752.IX Item "$thiscolumn and $prevcolumn"
753\&\f(CW$thiscolumn\fR stores the current column number within the current line
754being parsed (starting from 1). \f(CW$prevcolumn\fR stores the column number
755of the last character which was actually successfully parsed. Usually
756\&\f(CW\*(C`$prevcolumn == $thiscolumn\-1\*(C'\fR, but not at the end of lines.
757.Sp
758For efficiency, \f(CW$thiscolumn\fR and \f(CW$prevcolumn\fR are
759actually tied scalars, and only recompute the required column number
760when the variable's value is used.
761.Sp
762Assignment to \f(CW$thiscolumn\fR or \f(CW$prevcolumn\fR is a fatal error.
763.Sp
764Modifying the value of the variable \f(CW$text\fR (as in the previous
765\&\f(CW\*(C`hash_include\*(C'\fR example, for instance) may confuse the column
766counting mechanism.
767.Sp
768Note that \f(CW$thiscolumn\fR reports the column number \fIbefore\fR any
769whitespace that might be skipped before reading a token. Hence
770if you wish to know where a token started (and ended) use something like this:
771.Sp
772.Vb 2
773\& rule: token1 token2 startcol token3 endcol token4
774\& { print "token3: columns $item[3] to $item[5]"; }
775.Ve
776.Sp
777.Vb 2
778\& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP
779\& endcol: { $prevcolumn }
780.Ve
781.Sp
782Also see the entry for \f(CW@itempos\fR.
783.ie n .IP "$thisoffset\fR and \f(CW$prevoffset" 4
784.el .IP "\f(CW$thisoffset\fR and \f(CW$prevoffset\fR" 4
785.IX Item "$thisoffset and $prevoffset"
786\&\f(CW$thisoffset\fR stores the offset of the current parsing position
787within the complete text
788being parsed (starting from 0). \f(CW$prevoffset\fR stores the offset
789of the last character which was actually successfully parsed. In all
790cases \f(CW\*(C`$prevoffset == $thisoffset\-1\*(C'\fR.
791.Sp
792For efficiency, \f(CW$thisoffset\fR and \f(CW$prevoffset\fR are
793actually tied scalars, and only recompute the required offset
794when the variable's value is used.
795.Sp
796Assignment to \f(CW$thisoffset\fR or \f(CW$prevoffset\fR is a fatal error.
797.Sp
798Modifying the value of the variable \f(CW$text\fR will \fInot\fR affect the
799offset counting mechanism.
800.Sp
801Also see the entry for \f(CW@itempos\fR.
802.ie n .IP "@itempos" 4
803.el .IP "\f(CW@itempos\fR" 4
804.IX Item "@itempos"
805The array \f(CW@itempos\fR stores a hash reference corresponding to
806each element of \f(CW@item\fR. The elements of the hash provide the
807following:
808.Sp
809.Vb 6
810\& $itempos[$n]{offset}{from} # VALUE OF $thisoffset BEFORE $item[$n]
811\& $itempos[$n]{offset}{to} # VALUE OF $prevoffset AFTER $item[$n]
812\& $itempos[$n]{line}{from} # VALUE OF $thisline BEFORE $item[$n]
813\& $itempos[$n]{line}{to} # VALUE OF $prevline AFTER $item[$n]
814\& $itempos[$n]{column}{from} # VALUE OF $thiscolumn BEFORE $item[$n]
815\& $itempos[$n]{column}{to} # VALUE OF $prevcolumn AFTER $item[$n]
816.Ve
817.Sp
818Note that the various \f(CW\*(C`$itempos[$n]...{from}\*(C'\fR values record the
819appropriate value \fIafter\fR any token prefix has been skipped.
820.Sp
821Hence, instead of the somewhat tedious and error\-prone:
822.Sp
823.Vb 9
824\& rule: startcol token1 endcol
825\& startcol token2 endcol
826\& startcol token3 endcol
827\& { print "token1: columns $item[1]
828\& to $item[3]
829\& token2: columns $item[4]
830\& to $item[6]
831\& token3: columns $item[7]
832\& to $item[9]" }
833.Ve
834.Sp
835.Vb 2
836\& startcol: // { $thiscolumn } # NEED THE // TO STEP PAST TOKEN SEP
837\& endcol: { $prevcolumn }
838.Ve
839.Sp
840it is possible to write:
841.Sp
842.Vb 7
843\& rule: token1 token2 token3
844\& { print "token1: columns $itempos[1]{column}{from}
845\& to $itempos[1]{column}{to}
846\& token2: columns $itempos[2]{column}{from}
847\& to $itempos[2]{column}{to}
848\& token3: columns $itempos[3]{column}{from}
849\& to $itempos[3]{column}{to}" }
850.Ve
851.Sp
852Note however that (in the current implementation) the use of \f(CW@itempos\fR
853anywhere in a grammar implies that item positioning information is
854collected \fIeverywhere\fR during the parse. Depending on the grammar
855and the size of the text to be parsed, this may be prohibitively
856expensive and the explicit use of \f(CW$thisline\fR, \f(CW$thiscolumn\fR, etc. may
857be a better choice.
858.ie n .IP "$thisparser" 4
859.el .IP "\f(CW$thisparser\fR" 4
860.IX Item "$thisparser"
861A reference to the \f(CW\*(C`Parse::RecDescent\*(C'\fR object through which
862parsing was initiated.
863.Sp
864The value of \f(CW$thisparser\fR propagates down the subrules of a parse
865but not back up. Hence, you can invoke subrules from another parser
866for the scope of the current rule as follows:
867.Sp
868.Vb 4
869\& rule: subrule1 subrule2
870\& | { $thisparser = $::otherparser } <reject>
871\& | subrule3 subrule4
872\& | subrule5
873.Ve
874.Sp
875The result is that the first production calls \*(L"subrule1\*(R" and \*(L"subrule2\*(R" of
876the current parser, and the remaining productions call the named subrules
877from \f(CW$::otherparser\fR. Note, however, that \*(L"Bad Things\*(R" will happen if
878\&\f(CW$::otherparser\fR isn't a blessed reference and/or doesn't have methods
879with the same names as the required subrules!
880.ie n .IP "$thisrule" 4
881.el .IP "\f(CW$thisrule\fR" 4
882.IX Item "$thisrule"
883A reference to the \f(CW\*(C`Parse::RecDescent::Rule\*(C'\fR object corresponding to the
884rule currently being matched.
885.ie n .IP "$thisprod" 4
886.el .IP "\f(CW$thisprod\fR" 4
887.IX Item "$thisprod"
888A reference to the \f(CW\*(C`Parse::RecDescent::Production\*(C'\fR object
889corresponding to the production currently being matched.
890.ie n .IP "$score\fR and \f(CW$score_return" 4
891.el .IP "\f(CW$score\fR and \f(CW$score_return\fR" 4
892.IX Item "$score and $score_return"
893$score stores the best production score to date, as specified by
894an earlier \f(CW\*(C`<score:...>\*(C'\fR directive. \f(CW$score_return\fR stores
895the corresponding return value for the successful production.
896.Sp
897See \*(L"Scored productions\*(R".
898.PP
899\&\fBWarning:\fR the parser relies on the information in the various \f(CW\*(C`this...\*(C'\fR
900objects in some non-obvious ways. Tinkering with the other members of
901these objects will probably cause Bad Things to happen, unless you
902\&\fIreally\fR know what you're doing. The only exception to this advice is
903that the use of \f(CW\*(C`$this...\->{local}\*(C'\fR is always safe.
904.Sh "Start-up Actions"
905.IX Subsection "Start-up Actions"
906Any actions which appear \fIbefore\fR the first rule definition in a
907grammar are treated as \*(L"start\-up\*(R" actions. Each such action is
908stripped of its outermost brackets and then evaluated (in the parser's
909special namespace) just before the rules of the grammar are first
910compiled.
911.PP
912The main use of start-up actions is to declare local variables within the
913parser's special namespace:
914.PP
915.Vb 1
916\& { my $lastitem = '???'; }
917.Ve
918.PP
919.Vb 1
920\& list: item(s) { $return = $lastitem }
921.Ve
922.PP
923.Vb 3
924\& item: book { $lastitem = 'book'; }
925\& bell { $lastitem = 'bell'; }
926\& candle { $lastitem = 'candle'; }
927.Ve
928.PP
929but start-up actions can be used to execute \fIany\fR valid Perl code
930within a parser's special namespace.
931.PP
932Start-up actions can appear within a grammar extension or replacement
933(that is, a partial grammar installed via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR or
934\&\f(CW\*(C`Parse::RecDescent::Replace()\*(C'\fR \- see \*(L"Incremental Parsing\*(R"), and will be
935executed before the new grammar is installed. Note, however, that a
936particular start-up action is only ever executed once.
937.Sh "Autoactions"
938.IX Subsection "Autoactions"
939It is sometimes desirable to be able to specify a default action to be
940taken at the end of every production (for example, in order to easily
941build a parse tree). If the variable \f(CW$::RD_AUTOACTION\fR is defined
942when \f(CW\*(C`Parse::RecDescent::new()\*(C'\fR is called, the contents of that
943variable are treated as a specification of an action which is to be appended
944to each production in the corresponding grammar. So, for example, to construct
945a simple parse tree:
946.PP
947.Vb 1
948\& $::RD_AUTOACTION = q { [@item] };
949.Ve
950.PP
951.Vb 7
952\& $parser = new Parse::RecDescent (q{
953\& expression: and_expr '||' expression | and_expr
954\& and_expr: not_expr '&&' and_expr | not_expr
955\& not_expr: '!' brack_expr | brack_expr
956\& brack_expr: '(' expression ')' | identifier
957\& identifier: /[a-z]+/i
958\& });
959.Ve
960.PP
961which is equivalent to:
962.PP
963.Vb 5
964\& $parser = new Parse::RecDescent (q{
965\& expression: and_expr '||' expression
966\& { [@item] }
967\& | and_expr
968\& { [@item] }
969.Ve
970.PP
971.Vb 4
972\& and_expr: not_expr '&&' and_expr
973\& { [@item] }
974\& | not_expr
975\& { [@item] }
976.Ve
977.PP
978.Vb 4
979\& not_expr: '!' brack_expr
980\& { [@item] }
981\& | brack_expr
982\& { [@item] }
983.Ve
984.PP
985.Vb 4
986\& brack_expr: '(' expression ')'
987\& { [@item] }
988\& | identifier
989\& { [@item] }
990.Ve
991.PP
992.Vb 3
993\& identifier: /[a-z]+/i
994\& { [@item] }
995\& });
996.Ve
997.PP
998Alternatively, we could take an object-oriented approach, using different
999classes for each node (and also eliminating redundant intermediate nodes):
1000.PP
1001.Vb 2
1002\& $::RD_AUTOACTION = q
1003\& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) };
1004.Ve
1005.PP
1006.Vb 7
1007\& $parser = new Parse::RecDescent (q{
1008\& expression: and_expr '||' expression | and_expr
1009\& and_expr: not_expr '&&' and_expr | not_expr
1010\& not_expr: '!' brack_expr | brack_expr
1011\& brack_expr: '(' expression ')' | identifier
1012\& identifier: /[a-z]+/i
1013\& });
1014.Ve
1015.PP
1016which is equivalent to:
1017.PP
1018.Vb 4
1019\& $parser = new Parse::RecDescent (q{
1020\& expression: and_expr '||' expression
1021\& { new expression_node (@item[1..3]) }
1022\& | and_expr
1023.Ve
1024.PP
1025.Vb 3
1026\& and_expr: not_expr '&&' and_expr
1027\& { new and_expr_node (@item[1..3]) }
1028\& | not_expr
1029.Ve
1030.PP
1031.Vb 3
1032\& not_expr: '!' brack_expr
1033\& { new not_expr_node (@item[1..2]) }
1034\& | brack_expr
1035.Ve
1036.PP
1037.Vb 3
1038\& brack_expr: '(' expression ')'
1039\& { new brack_expr_node (@item[1..3]) }
1040\& | identifier
1041.Ve
1042.PP
1043.Vb 3
1044\& identifier: /[a-z]+/i
1045\& { new identifier_node (@item[1]) }
1046\& });
1047.Ve
1048.PP
1049Note that, if a production already ends in an action, no autoaction is appended
1050to it. For example, in this version:
1051.PP
1052.Vb 2
1053\& $::RD_AUTOACTION = q
1054\& { $#item==1 ? $item[1] : new ${"$item[0]_node"} (@item[1..$#item]) };
1055.Ve
1056.PP
1057.Vb 8
1058\& $parser = new Parse::RecDescent (q{
1059\& expression: and_expr '||' expression | and_expr
1060\& and_expr: not_expr '&&' and_expr | not_expr
1061\& not_expr: '!' brack_expr | brack_expr
1062\& brack_expr: '(' expression ')' | identifier
1063\& identifier: /[a-z]+/i
1064\& { new terminal_node($item[1]) }
1065\& });
1066.Ve
1067.PP
1068each \f(CW\*(C`identifier\*(C'\fR match produces a \f(CW\*(C`terminal_node\*(C'\fR object, \fInot\fR an
1069\&\f(CW\*(C`identifier_node\*(C'\fR object.
1070.PP
1071A level 1 warning is issued each time an \*(L"autoaction\*(R" is added to
1072some production.
1073.Sh "Autotrees"
1074.IX Subsection "Autotrees"
1075A commonly needed autoaction is one that builds a parse\-tree. It is moderately
1076tricky to set up such an action (which must treat terminals differently from
1077non\-terminals), so Parse::RecDescent simplifies the process by providing the
1078\&\f(CW\*(C`<autotree>\*(C'\fR directive.
1079.PP
1080If this directive appears at the start of a grammar, it causes
1081Parse::RecDescent to insert autoactions at the end of any rule except
1082those which already end in an action. The action inserted depends on whether
1083the production is an intermediate rule (two or more items), or a terminal
1084of the grammar (i.e. a single pattern or string item).
1085.PP
1086So, for example, the following grammar:
1087.PP
1088.Vb 1
1089\& <autotree>
1090.Ve
1091.PP
1092.Vb 7
1093\& file : command(s)
1094\& command : get | set | vet
1095\& get : 'get' ident ';'
1096\& set : 'set' ident 'to' value ';'
1097\& vet : 'check' ident 'is' value ';'
1098\& ident : /\ew+/
1099\& value : /\ed+/
1100.Ve
1101.PP
1102is equivalent to:
1103.PP
1104.Vb 7
1105\& file : command(s) { bless \e%item, $item[0] }
1106\& command : get { bless \e%item, $item[0] }
1107\& | set { bless \e%item, $item[0] }
1108\& | vet { bless \e%item, $item[0] }
1109\& get : 'get' ident ';' { bless \e%item, $item[0] }
1110\& set : 'set' ident 'to' value ';' { bless \e%item, $item[0] }
1111\& vet : 'check' ident 'is' value ';' { bless \e%item, $item[0] }
1112.Ve
1113.PP
1114.Vb 2
1115\& ident : /\ew+/ { bless {__VALUE__=>$item[1]}, $item[0] }
1116\& value : /\ed+/ { bless {__VALUE__=>$item[1]}, $item[0] }
1117.Ve
1118.PP
1119Note that each node in the tree is blessed into a class of the same name
1120as the rule itself. This makes it easy to build object-oriented
1121processors for the parse-trees that the grammar produces. Note too that
1122the last two rules produce special objects with the single attribute
1123\&'_\|_VALUE_\|_'. This is because they consist solely of a single terminal.
1124.PP
1125This autoaction-ed grammar would then produce a parse tree in a data
1126structure like this:
1127.PP
1128.Vb 18
1129\& {
1130\& file => {
1131\& command => {
1132\& [ get => {
1133\& identifier => { __VALUE__ => 'a' },
1134\& },
1135\& set => {
1136\& identifier => { __VALUE__ => 'b' },
1137\& value => { __VALUE__ => '7' },
1138\& },
1139\& vet => {
1140\& identifier => { __VALUE__ => 'b' },
1141\& value => { __VALUE__ => '7' },
1142\& },
1143\& ],
1144\& },
1145\& }
1146\& }
1147.Ve
1148.PP
1149(except, of course, that each nested hash would also be blessed into
1150the appropriate class).
1151.Sh "Autostubbing"
1152.IX Subsection "Autostubbing"
1153Normally, if a subrule appears in some production, but no rule of that
1154name is ever defined in the grammar, the production which refers to the
1155non-existent subrule fails immediately. This typically occurs as a
1156result of misspellings, and is a sufficiently common occurrence that a
1157warning is generated for such situations.
1158.PP
1159However, when prototyping a grammar it is sometimes useful to be
1160able to use subrules before a proper specification of them is
1161really possible. For example, a grammar might include a section like:
1162.PP
1163.Vb 1
1164\& function_call: identifier '(' arg(s?) ')'
1165.Ve
1166.PP
1167.Vb 1
1168\& identifier: /[a-z]\ew*/i
1169.Ve
1170.PP
1171where the possible format of an argument is sufficiently complex that
1172it is not worth specifying in full until the general function call
1173syntax has been debugged. In this situation it is convenient to leave
1174the real rule \f(CW\*(C`arg\*(C'\fR undefined and just slip in a placeholder (or
1175\&\*(L"stub\*(R"):
1176.PP
1177.Vb 1
1178\& arg: 'arg'
1179.Ve
1180.PP
1181so that the function call syntax can be tested with dummy input such as:
1182.PP
1183.Vb 4
1184\& f0()
1185\& f1(arg)
1186\& f2(arg arg)
1187\& f3(arg arg arg)
1188.Ve
1189.PP
1190et cetera.
1191.PP
1192Early in prototyping, many such \*(L"stubs\*(R" may be required, so
1193\&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides a means of automating their definition.
1194If the variable \f(CW$::RD_AUTOSTUB\fR is defined when a parser is built,
1195a subrule reference to any non-existent rule (say, \f(CW\*(C`sr\*(C'\fR),
1196causes a \*(L"stub\*(R" rule of the form:
1197.PP
1198.Vb 1
1199\& sr: 'sr'
1200.Ve
1201.PP
1202to be automatically defined in the generated parser.
1203A level 1 warning is issued for each such \*(L"autostubbed\*(R" rule.
1204.PP
1205Hence, with \f(CW$::RD_AUTOSTUB\fR defined, it is possible to only partially
1206specify a grammar, and then \*(L"fake\*(R" matches of the unspecified
1207(sub)rules by just typing in their name.
1208.Sh "Look-ahead"
1209.IX Subsection "Look-ahead"
1210If a subrule, token, or action is prefixed by \*(L"...\*(R", then it is
1211treated as a \*(L"look\-ahead\*(R" request. That means that the current production can
1212(as usual) only succeed if the specified item is matched, but that the matching
1213\&\fIdoes not consume any of the text being parsed\fR. This is very similar to the
1214\&\f(CW\*(C`/(?=...)/\*(C'\fR look-ahead construct in Perl patterns. Thus, the rule:
1215.PP
1216.Vb 1
1217\& inner_word: word ...word
1218.Ve
1219.PP
1220will match whatever the subrule \*(L"word\*(R" matches, provided that match is followed
1221by some more text which subrule \*(L"word\*(R" would also match (although this
1222second substring is not actually consumed by \*(L"inner_word\*(R").
1223.PP
1224Likewise, a \*(L"...!\*(R" prefix, causes the following item to succeed (without
1225consuming any text) if and only if it would normally fail. Hence, a
1226rule such as:
1227.PP
1228.Vb 1
1229\& identifier: ...!keyword ...!'_' /[A-Za-z_]\ew*/
1230.Ve
1231.PP
1232matches a string of characters which satisfies the pattern
1233\&\f(CW\*(C`/[A\-Za\-z_]\ew*/\*(C'\fR, but only if the same sequence of characters would
1234not match either subrule \*(L"keyword\*(R" or the literal token '_'.
1235.PP
1236Sequences of look-ahead prefixes accumulate, multiplying their positive and/or
1237negative senses. Hence:
1238.PP
1239.Vb 1
1240\& inner_word: word ...!......!word
1241.Ve
1242.PP
1243is exactly equivalent to the original example above (a warning is issued in
1244cases like these, since they often indicate something left out, or
1245misunderstood).
1246.PP
1247Note that actions can also be treated as look\-aheads. In such cases,
1248the state of the parser text (in the local variable \f(CW$text\fR)
1249\&\fIafter\fR the look-ahead action is guaranteed to be identical to its
1250state \fIbefore\fR the action, regardless of how it's changed \fIwithin\fR
1251the action (unless you actually undefine \f(CW$text\fR, in which case you
1252get the disaster you deserve :\-).
1253.Sh "Directives"
1254.IX Subsection "Directives"
1255Directives are special pre-defined actions which may be used to alter
1256the behaviour of the parser. There are currently eighteen directives:
1257\&\f(CW\*(C`<commit>\*(C'\fR,
1258\&\f(CW\*(C`<uncommit>\*(C'\fR,
1259\&\f(CW\*(C`<reject>\*(C'\fR,
1260\&\f(CW\*(C`<score>\*(C'\fR,
1261\&\f(CW\*(C`<autoscore>\*(C'\fR,
1262\&\f(CW\*(C`<skip>\*(C'\fR,
1263\&\f(CW\*(C`<resync>\*(C'\fR,
1264\&\f(CW\*(C`<error>\*(C'\fR,
1265\&\f(CW\*(C`<rulevar>\*(C'\fR,
1266\&\f(CW\*(C`<matchrule>\*(C'\fR,
1267\&\f(CW\*(C`<leftop>\*(C'\fR,
1268\&\f(CW\*(C`<rightop>\*(C'\fR,
1269\&\f(CW\*(C`<defer>\*(C'\fR,
1270\&\f(CW\*(C`<nocheck>\*(C'\fR,
1271\&\f(CW\*(C`<perl_quotelike>\*(C'\fR,
1272\&\f(CW\*(C`<perl_codeblock>\*(C'\fR,
1273\&\f(CW\*(C`<perl_variable>\*(C'\fR,
1274and \f(CW\*(C`<token>\*(C'\fR.
1275.IP "Committing and uncommitting" 4
1276.IX Item "Committing and uncommitting"
1277The \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives permit the recursive
1278descent of the parse tree to be pruned (or \*(L"cut\*(R") for efficiency.
1279Within a rule, a \f(CW\*(C`<commit>\*(C'\fR directive instructs the rule to ignore subsequent
1280productions if the current production fails. For example:
1281.Sp
1282.Vb 3
1283\& command: 'find' <commit> filename
1284\& | 'open' <commit> filename
1285\& | 'move' filename filename
1286.Ve
1287.Sp
1288Clearly, if the leading token 'find' is matched in the first production but that
1289production fails for some other reason, then the remaining
1290productions cannot possibly match. The presence of the
1291\&\f(CW\*(C`<commit>\*(C'\fR causes the \*(L"command\*(R" rule to fail immediately if
1292an invalid \*(L"find\*(R" command is found, and likewise if an invalid \*(L"open\*(R"
1293command is encountered.
1294.Sp
1295It is also possible to revoke a previous commitment. For example:
1296.Sp
1297.Vb 5
1298\& if_statement: 'if' <commit> condition
1299\& 'then' block <uncommit>
1300\& 'else' block
1301\& | 'if' <commit> condition
1302\& 'then' block
1303.Ve
1304.Sp
1305In this case, a failure to find an \*(L"else\*(R" block in the first
1306production shouldn't preclude trying the second production, but a
1307failure to find a \*(L"condition\*(R" certainly should.
1308.Sp
1309As a special case, any production in which the \fIfirst\fR item is an
1310\&\f(CW\*(C`<uncommit>\*(C'\fR immediately revokes a preceding \f(CW\*(C`<commit>\*(C'\fR
1311(even though the production would not otherwise have been tried). For
1312example, in the rule:
1313.Sp
1314.Vb 5
1315\& request: 'explain' expression
1316\& | 'explain' <commit> keyword
1317\& | 'save'
1318\& | 'quit'
1319\& | <uncommit> term '?'
1320.Ve
1321.Sp
1322if the text being matched was \*(L"explain?\*(R", and the first two
1323productions failed, then the \f(CW\*(C`<commit>\*(C'\fR in production two would cause
1324productions three and four to be skipped, but the leading
1325\&\f(CW\*(C`<uncommit>\*(C'\fR in production five would allow that production to
1326attempt a match.
1327.Sp
1328Note in the preceding example, that the \f(CW\*(C`<commit>\*(C'\fR was only placed
1329in production two. If production one had been:
1330.Sp
1331.Vb 1
1332\& request: 'explain' <commit> expression
1333.Ve
1334.Sp
1335then production two would be (inappropriately) skipped if a leading
1336\&\*(L"explain...\*(R" was encountered.
1337.Sp
1338Both \f(CW\*(C`<commit>\*(C'\fR and \f(CW\*(C`<uncommit>\*(C'\fR directives always succeed, and their value
1339is always 1.
1340.IP "Rejecting a production" 4
1341.IX Item "Rejecting a production"
1342The \f(CW\*(C`<reject>\*(C'\fR directive immediately causes the current production
1343to fail (it is exactly equivalent to, but more obvious than, the
1344action \f(CW\*(C`{undef}\*(C'\fR). A \f(CW\*(C`<reject>\*(C'\fR is useful when it is desirable to get
1345the side effects of the actions in one production, without prejudicing a match
1346by some other production later in the rule. For example, to insert
1347tracing code into the parse:
1348.Sp
1349.Vb 1
1350\& complex_rule: { print "In complex rule...\en"; } <reject>
1351.Ve
1352.Sp
1353.Vb 3
1354\& complex_rule: simple_rule '+' 'i' '*' simple_rule
1355\& | 'i' '*' simple_rule
1356\& | simple_rule
1357.Ve
1358.Sp
1359It is also possible to specify a conditional rejection, using the
1360form \f(CW\*(C`<reject:\f(CIcondition\f(CW>\*(C'\fR, which only rejects if the
1361specified condition is true. This form of rejection is exactly
1362equivalent to the action \f(CW\*(C`{(\f(CIcondition\f(CW)?undef:1}\*(C'\fR.
1363For example:
1364.Sp
1365.Vb 4
1366\& command: save_command
1367\& | restore_command
1368\& | <reject: defined $::tolerant> { exit }
1369\& | <error: Unknown command. Ignored.>
1370.Ve
1371.Sp
1372A \f(CW\*(C`<reject>\*(C'\fR directive never succeeds (and hence has no
1373associated value). A conditional rejection may succeed (if its
1374condition is not satisfied), in which case its value is 1.
1375.Sp
1376As an extra optimization, \f(CW\*(C`Parse::RecDescent\*(C'\fR ignores any production
1377which \fIbegins\fR with an unconditional \f(CW\*(C`<reject>\*(C'\fR directive,
1378since any such production can never successfully match or have any
1379useful side\-effects. A level 1 warning is issued in all such cases.
1380.Sp
1381Note that productions beginning with conditional
1382\&\f(CW\*(C`<reject:...>\*(C'\fR directives are \fInever\fR \*(L"optimized away\*(R" in
1383this manner, even if they are always guaranteed to fail (for example:
1384\&\f(CW\*(C`<reject:1>\*(C'\fR).
1385.Sp
1386Due to the way grammars are parsed, there is a minor restriction on the
1387condition of a conditional \f(CW\*(C`<reject:...>\*(C'\fR: it cannot
1388contain any raw '<' or '>' characters. For example:
1389.Sp
1390.Vb 1
1391\& line: cmd <reject: $thiscolumn > max> data
1392.Ve
1393.Sp
1394results in an error when a parser is built from this grammar (since the
1395grammar parser has no way of knowing whether the first > is a \*(L"greater than\*(R"
1396or the end of the \f(CW\*(C`<reject:...>\*(C'\fR).
1397.Sp
1398To overcome this problem, put the condition inside a do{} block:
1399.Sp
1400.Vb 1
1401\& line: cmd <reject: do{$thiscolumn > max}> data
1402.Ve
1403.Sp
1404Note that the same problem may occur in other directives that take
1405arguments. The same solution will work in all cases.
1406.IP "Skipping between terminals" 4
1407.IX Item "Skipping between terminals"
1408The \f(CW\*(C`<skip>\*(C'\fR directive enables the terminal prefix used in
1409a production to be changed. For example:
1410.Sp
1411.Vb 1
1412\& OneLiner: Command <skip:'[ \et]*'> Arg(s) /;/
1413.Ve
1414.Sp
1415causes only blanks and tabs to be skipped before terminals in the \f(CW\*(C`Arg\*(C'\fR
1416subrule (and any of \fIits\fR subrules), and also before the final \f(CW\*(C`/;/\*(C'\fR terminal.
1417Once the production is complete, the previous terminal prefix is
1418reinstated. Note that this implies that distinct productions of a rule
1419must reset their terminal prefixes individually.
1420.Sp
1421The \f(CW\*(C`<skip>\*(C'\fR directive evaluates to the \fIprevious\fR terminal prefix,
1422so it's easy to reinstate a prefix later in a production:
1423.Sp
1424.Vb 1
1425\& Command: <skip:","> CSV(s) <skip:$item[1]> Modifier
1426.Ve
1427.Sp
1428The value specified after the colon is interpolated into a pattern, so all of
1429the following are equivalent (though their efficiency increases down the list):
1430.Sp
1431.Vb 1
1432\& <skip: "$colon|$comma"> # ASSUMING THE VARS HOLD THE OBVIOUS VALUES
1433.Ve
1434.Sp
1435.Vb 1
1436\& <skip: ':|,'>
1437.Ve
1438.Sp
1439.Vb 1
1440\& <skip: q{[:,]}>
1441.Ve
1442.Sp
1443.Vb 1
1444\& <skip: qr/[:,]/>
1445.Ve
1446.Sp
1447There is no way of directly setting the prefix for
1448an entire rule, except as follows:
1449.Sp
1450.Vb 3
1451\& Rule: <skip: '[ \et]*'> Prod1
1452\& | <skip: '[ \et]*'> Prod2a Prod2b
1453\& | <skip: '[ \et]*'> Prod3
1454.Ve
1455.Sp
1456or, better:
1457.Sp
1458.Vb 6
1459\& Rule: <skip: '[ \et]*'>
1460\& (
1461\& Prod1
1462\& | Prod2a Prod2b
1463\& | Prod3
1464\& )
1465.Ve
1466.Sp
1467\&\fBNote: Up to release 1.51 of Parse::RecDescent, an entirely different
1468mechanism was used for specifying terminal prefixes. The current method
1469is not backwards-compatible with that early approach. The current approach
1470is stable and will not change again.\fR
1471.IP "Resynchronization" 4
1472.IX Item "Resynchronization"
1473The \f(CW\*(C`<resync>\*(C'\fR directive provides a visually distinctive
1474means of consuming some of the text being parsed, usually to skip an
1475erroneous input. In its simplest form \f(CW\*(C`<resync>\*(C'\fR simply
1476consumes text up to and including the next newline (\f(CW"\en"\fR)
1477character, succeeding only if the newline is found, in which case it
1478causes its surrounding rule to return zero on success.
1479.Sp
1480In other words, a \f(CW\*(C`<resync>\*(C'\fR is exactly equivalent to the token
1481\&\f(CW\*(C`/[^\en]*\en/\*(C'\fR followed by the action \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR (except that
1482productions beginning with a \f(CW\*(C`<resync>\*(C'\fR are ignored when generating
1483error messages). A typical use might be:
1484.Sp
1485.Vb 1
1486\& script : command(s)
1487.Ve
1488.Sp
1489.Vb 3
1490\& command: save_command
1491\& | restore_command
1492\& | <resync> # TRY NEXT LINE, IF POSSIBLE
1493.Ve
1494.Sp
1495It is also possible to explicitly specify a resynchronization
1496pattern, using the \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR variant. This version
1497succeeds only if the specified pattern matches (and consumes) the
1498parsed text. In other words, \f(CW\*(C`<resync:\f(CIpattern\f(CW>\*(C'\fR is exactly
1499equivalent to the token \f(CW\*(C`/\f(CIpattern\f(CW/\*(C'\fR (followed by a \f(CW\*(C`{\ $return\ =\ 0\ }\*(C'\fR
1500action). For example, if commands were terminated by newlines or semi\-colons:
1501.Sp
1502.Vb 3
1503\& command: save_command
1504\& | restore_command
1505\& | <resync:[^;\en]*[;\en]>
1506.Ve
1507.Sp
1508The value of a successfully matched \f(CW\*(C`<resync>\*(C'\fR directive (of either
1509type) is the text that it consumed. Note, however, that since the
1510directive also sets \f(CW$return\fR, a production consisting of a lone
1511\&\f(CW\*(C`<resync>\*(C'\fR succeeds but returns the value zero (which a calling rule
1512may find useful to distinguish between \*(L"true\*(R" matches and \*(L"tolerant\*(R" matches).
1513Remember that returning a zero value indicates that the rule \fIsucceeded\fR (since
1514only an \f(CW\*(C`undef\*(C'\fR denotes failure within \f(CW\*(C`Parse::RecDescent\*(C'\fR parsers).
1515.IP "Error handling" 4
1516.IX Item "Error handling"
1517The \f(CW\*(C`<error>\*(C'\fR directive provides automatic or user-defined
1518generation of error messages during a parse. In its simplest form
1519\&\f(CW\*(C`<error>\*(C'\fR prepares an error message based on
1520the mismatch between the last item expected and the text which caused
1521it to fail. For example, given the rule:
1522.Sp
1523.Vb 3
1524\& McCoy: curse ',' name ', I'm a doctor, not a' a_profession '!'
1525\& | pronoun 'dead,' name '!'
1526\& | <error>
1527.Ve
1528.Sp
1529the following strings would produce the following messages:
1530.RS 4
1531.ie n .IP """Amen, Jim!""" 4
1532.el .IP "``Amen, Jim!''" 4
1533.IX Item "Amen, Jim!"
1534.Vb 2
1535\& ERROR (line 1): Invalid McCoy: Expected curse or pronoun
1536\& not found
1537.Ve
1538.ie n .IP """Dammit, Jim, I'm a doctor!""" 4
1539.el .IP "``Dammit, Jim, I'm a doctor!''" 4
1540.IX Item "Dammit, Jim, I'm a doctor!"
1541.Vb 2
1542\& ERROR (line 1): Invalid McCoy: Expected ", I'm a doctor, not a"
1543\& but found ", I'm a doctor!" instead
1544.Ve
1545.ie n .IP """He's dead,\en""" 4
1546.el .IP "``He's dead,\en''" 4
1547.IX Item "He's dead,n"
1548.Vb 1
1549\& ERROR (line 2): Invalid McCoy: Expected name not found
1550.Ve
1551.ie n .IP """He's alive!""" 4
1552.el .IP "``He's alive!''" 4
1553.IX Item "He's alive!"
1554.Vb 2
1555\& ERROR (line 1): Invalid McCoy: Expected 'dead,' but found
1556\& "alive!" instead
1557.Ve
1558.ie n .IP """Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!""" 4
1559.el .IP "``Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!''" 4
1560.IX Item "Dammit, Jim, I'm a doctor, not a pointy-eared Vulcan!"
1561.Vb 2
1562\& ERROR (line 1): Invalid McCoy: Expected a profession but found
1563\& "pointy-eared Vulcan!" instead
1564.Ve
1565.RE
1566.RS 4
1567.Sp
1568Note that, when autogenerating error messages, all underscores in any
1569rule name used in a message are replaced by single spaces (for example
1570\&\*(L"a_production\*(R" becomes \*(L"a production\*(R"). Judicious choice of rule
1571names can therefore considerably improve the readability of automatic
1572error messages (as well as the maintainability of the original
1573grammar).
1574.Sp
1575If the automatically generated error is not sufficient, it is possible to
1576provide an explicit message as part of the error directive. For example:
1577.Sp
1578.Vb 3
1579\& Spock: "Fascinating" ',' (name | 'Captain') '.'
1580\& | "Highly illogical, doctor."
1581\& | <error: He never said that!>
1582.Ve
1583.Sp
1584which would result in \fIall\fR failures to parse a \*(L"Spock\*(R" subrule printing the
1585following message:
1586.Sp
1587.Vb 1
1588\& ERROR (line <N>): Invalid Spock: He never said that!
1589.Ve
1590.Sp
1591The error message is treated as a \*(L"qq{...}\*(R" string and interpolated
1592when the error is generated (\fInot\fR when the directive is specified!).
1593Hence:
1594.Sp
1595.Vb 1
1596\& <error: Mystical error near "$text">
1597.Ve
1598.Sp
1599would correctly insert the ambient text string which caused the error.
1600.Sp
1601There are two other forms of error directive: \f(CW\*(C`<error?>\*(C'\fR and
1602\&\f(CW\*(C`<error?:\ msg>\*(C'\fR. These behave just like \f(CW\*(C`<error>\*(C'\fR
1603and \f(CW\*(C`<error:\ msg>\*(C'\fR respectively, except that they are
1604only triggered if the rule is \*(L"committed\*(R" at the time they are
1605encountered. For example:
1606.Sp
1607.Vb 3
1608\& Scotty: "Ya kenna change the Laws of Phusics," <commit> name
1609\& | name <commit> ',' 'she's goanta blaw!'
1610\& | <error?>
1611.Ve
1612.Sp
1613will only generate an error for a string beginning with \*(L"Ya kenna
1614change the Laws of Phusics,\*(R" or a valid name, but which still fails to match the
1615corresponding production. That is, \f(CW\*(C`$parser\->Scotty("Aye, Cap'ain")\*(C'\fR will
1616fail silently (since neither production will \*(L"commit\*(R" the rule on that
1617input), whereas \f(CW\*(C`$parser\->Scotty("Mr\ Spock,\ ah\ jest\ kenna\ do'ut!")\*(C'\fR
1618will fail with the error message:
1619.Sp
1620.Vb 2
1621\& ERROR (line 1): Invalid Scotty: expected 'she's goanta blaw!'
1622\& but found 'ah jest kenna do'ut!' instead.
1623.Ve
1624.Sp
1625since in that case the second production would commit after matching
1626the leading name.
1627.Sp
1628Note that to allow this behaviour, all \f(CW\*(C`<error>\*(C'\fR directives which are
1629the first item in a production automatically uncommit the rule just
1630long enough to allow their production to be attempted (that is, when
1631their production fails, the commitment is reinstated so that
1632subsequent productions are skipped).
1633.Sp
1634In order to \fIpermanently\fR uncommit the rule before an error message,
1635it is necessary to put an explicit \f(CW\*(C`<uncommit>\*(C'\fR before the
1636\&\f(CW\*(C`<error>\*(C'\fR. For example:
1637.Sp
1638.Vb 5
1639\& line: 'Kirk:' <commit> Kirk
1640\& | 'Spock:' <commit> Spock
1641\& | 'McCoy:' <commit> McCoy
1642\& | <uncommit> <error?> <reject>
1643\& | <resync>
1644.Ve
1645.Sp
1646Error messages generated by the various \f(CW\*(C`<error...>\*(C'\fR directives
1647are not displayed immediately. Instead, they are \*(L"queued\*(R" in a buffer and
1648are only displayed once parsing ultimately fails. Moreover,
1649\&\f(CW\*(C`<error...>\*(C'\fR directives that cause one production of a rule
1650to fail are automatically removed from the message queue
1651if another production subsequently causes the entire rule to succeed.
1652This means that you can put
1653\&\f(CW\*(C`<error...>\*(C'\fR directives wherever useful diagnosis can be done,
1654and only those associated with actual parser failure will ever be
1655displayed. Also see \*(L"Gotchas\*(R".
1656.Sp
1657As a general rule, the most useful diagnostics are usually generated
1658either at the very lowest level within the grammar, or at the very
1659highest. A good rule of thumb is to identify those subrules which
1660consist mainly (or entirely) of terminals, and then put an
1661\&\f(CW\*(C`<error...>\*(C'\fR directive at the end of any other rule which calls
1662one or more of those subrules.
1663.Sp
1664There is one other situation in which the output of the various types of
1665error directive is suppressed; namely, when the rule containing them
1666is being parsed as part of a \*(L"look\-ahead\*(R" (see \*(L"Look\-ahead\*(R"). In this
1667case, the error directive will still cause the rule to fail, but will do
1668so silently.
1669.Sp
1670An unconditional \f(CW\*(C`<error>\*(C'\fR directive always fails (and hence has no
1671associated value). This means that encountering such a directive
1672always causes the production containing it to fail. Hence an
1673\&\f(CW\*(C`<error>\*(C'\fR directive will inevitably be the last (useful) item of a
1674rule (a level 3 warning is issued if a production contains items after an unconditional
1675\&\f(CW\*(C`<error>\*(C'\fR directive).
1676.Sp
1677An \f(CW\*(C`<error?>\*(C'\fR directive will \fIsucceed\fR (that is: fail to fail :\-), if
1678the current rule is uncommitted when the directive is encountered. In
1679that case the directive's associated value is zero. Hence, this type
1680of error directive \fIcan\fR be used before the end of a
1681production. For example:
1682.Sp
1683.Vb 3
1684\& command: 'do' <commit> something
1685\& | 'report' <commit> something
1686\& | <error?: Syntax error> <error: Unknown command>
1687.Ve
1688.Sp
1689\&\fBWarning:\fR The \f(CW\*(C`<error?>\*(C'\fR directive does \fInot\fR mean \*(L"always fail (but
1690do so silently unless committed)\*(R". It actually means \*(L"only fail (and report) if
1691committed, otherwise \fIsucceed\fR\*(R". To achieve the \*(L"fail silently if uncommitted\*(R"
1692semantics, it is necessary to use:
1693.Sp
1694.Vb 2
1695\& rule: item <commit> item(s)
1696\& | <error?> <reject> # FAIL SILENTLY UNLESS COMMITTED
1697.Ve
1698.Sp
1699However, because people seem to expect a lone \f(CW\*(C`<error?>\*(C'\fR directive
1700to work like this:
1701.Sp
1702.Vb 3
1703\& rule: item <commit> item(s)
1704\& | <error?: Error message if committed>
1705\& | <error: Error message if uncommitted>
1706.Ve
1707.Sp
1708Parse::RecDescent automatically appends a
1709\&\f(CW\*(C`<reject>\*(C'\fR directive if the \f(CW\*(C`<error?>\*(C'\fR directive
1710is the only item in a production. A level 2 warning (see below)
1711is issued when this happens.
1712.Sp
1713The level of error reporting during both parser construction and
1714parsing is controlled by the presence or absence of four global
1715variables: \f(CW$::RD_ERRORS\fR, \f(CW$::RD_WARN\fR, \f(CW$::RD_HINT\fR, and
1716\&\f(CW$::RD_TRACE\fR. If \f(CW$::RD_ERRORS\fR is defined (and, by default, it is)
1717then fatal errors are reported.
1718.Sp
1719Whenever \f(CW$::RD_WARN\fR is defined, certain non-fatal problems are also reported.
1720Warnings have an associated \*(L"level\*(R": 1, 2, or 3. The higher the level,
1721the more serious the warning. The value of the corresponding global
1722variable (\f(CW$::RD_WARN\fR) determines the \fIlowest\fR level of warning to
1723be displayed. Hence, to see \fIall\fR warnings, set \f(CW$::RD_WARN\fR to 1.
1724To see only the most serious warnings set \f(CW$::RD_WARN\fR to 3.
1725By default \f(CW$::RD_WARN\fR is initialized to 3, ensuring that serious but
1726non-fatal errors are automatically reported.
1727.Sp
1728See \fI\*(L"\s-1DIAGNOSTICS\s0\*(R"\fR for a list of the various error and warning messages
1729that Parse::RecDescent generates when these two variables are defined.
1730.Sp
1731Defining any of the remaining variables (which are not defined by
1732default) further increases the amount of information reported.
1733Defining \f(CW$::RD_HINT\fR causes the parser generator to offer
1734more detailed analyses and hints on both errors and warnings.
1735Note that setting \f(CW$::RD_HINT\fR at any point automagically
1736sets \f(CW$::RD_WARN\fR to 1.
1737.Sp
1738Defining \f(CW$::RD_TRACE\fR causes the parser generator and the parser to
1739report their progress to \s-1STDERR\s0 in excruciating detail (although, without hints
1740unless \f(CW$::RD_HINT\fR is separately defined). This detail
1741can be moderated in only one respect: if \f(CW$::RD_TRACE\fR has an
1742integer value (\fIN\fR) greater than 1, only the \fIN\fR characters of
1743the \*(L"current parsing context\*(R" (that is, where in the input string we
1744are at any point in the parse) is reported at any time.
1745.Sp
1746\&\f(CW$::RD_TRACE\fR is mainly useful for debugging a grammar that isn't
1747behaving as you expected it to. To this end, if \f(CW$::RD_TRACE\fR is
1748defined when a parser is built, any actual parser code which is
1749generated is also written to a file named \*(L"\s-1RD_TRACE\s0\*(R" in the local
1750directory.
1751.Sp
1752Note that the four variables belong to the \*(L"main\*(R" package, which
1753makes them easier to refer to in the code controlling the parser, and
1754also makes it easy to turn them into command line flags (\*(L"\-RD_ERRORS\*(R",
1755\&\*(L"\-RD_WARN\*(R", \*(L"\-RD_HINT\*(R", \*(L"\-RD_TRACE\*(R") under \fBperl \-s\fR.
1756.RE
1757.IP "Specifying local variables" 4
1758.IX Item "Specifying local variables"
1759It is occasionally convenient to specify variables which are local
1760to a single rule. This may be achieved by including a
1761\&\f(CW\*(C`<rulevar:...>\*(C'\fR directive anywhere in the rule. For example:
1762.Sp
1763.Vb 1
1764\& markup: <rulevar: $tag>
1765.Ve
1766.Sp
1767.Vb 1
1768\& markup: tag {($tag=$item[1]) =~ s/^<|>$//g} body[$tag]
1769.Ve
1770.Sp
1771The example \f(CW\*(C`<rulevar: $tag>\*(C'\fR directive causes a \*(L"my\*(R" variable named
1772\&\f(CW$tag\fR to be declared at the start of the subroutine implementing the
1773\&\f(CW\*(C`markup\*(C'\fR rule (that is, \fIbefore\fR the first production, regardless of
1774where in the rule it is specified).
1775.Sp
1776Specifically, any directive of the form:
1777\&\f(CW\*(C`<rulevar:\f(CItext\f(CW>\*(C'\fR causes a line of the form \f(CW\*(C`my \f(CItext\f(CW;\*(C'\fR
1778to be added at the beginning of the rule subroutine, immediately after
1779the definitions of the following local variables:
1780.Sp
1781.Vb 4
1782\& $thisparser $commit
1783\& $thisrule @item
1784\& $thisline @arg
1785\& $text %arg
1786.Ve
1787.Sp
1788This means that the following \f(CW\*(C`<rulevar>\*(C'\fR directives work
1789as expected:
1790.Sp
1791.Vb 1
1792\& <rulevar: $count = 0 >
1793.Ve
1794.Sp
1795.Vb 1
1796\& <rulevar: $firstarg = $arg[0] || '' >
1797.Ve
1798.Sp
1799.Vb 1
1800\& <rulevar: $myItems = \e@item >
1801.Ve
1802.Sp
1803.Vb 1
1804\& <rulevar: @context = ( $thisline, $text, @arg ) >
1805.Ve
1806.Sp
1807.Vb 1
1808\& <rulevar: ($name,$age) = @arg{"name","age"} >
1809.Ve
1810.Sp
1811Note however that, because all such variables are \*(L"my\*(R" variables, their
1812values \fIdo not persist\fR between match attempts on a given rule. To
1813preserve values between match attempts, values can be stored within the
1814\&\*(L"local\*(R" member of the \f(CW$thisrule\fR object:
1815.Sp
1816.Vb 6
1817\& countedrule: { $thisrule->{"local"}{"count"}++ }
1818\& <reject>
1819\& | subrule1
1820\& | subrule2
1821\& | <reject: $thisrule->{"local"}{"count"} == 1>
1822\& subrule3
1823.Ve
1824.Sp
1825When matching a rule, each \f(CW\*(C`<rulevar>\*(C'\fR directive is matched as
1826if it were an unconditional \f(CW\*(C`<reject>\*(C'\fR directive (that is, it
1827causes any production in which it appears to immediately fail to match).
1828For this reason (and to improve readability) it is usual to specify any
1829\&\f(CW\*(C`<rulevar>\*(C'\fR directive in a separate production at the start of
1830the rule (this has the added advantage that it enables
1831\&\f(CW\*(C`Parse::RecDescent\*(C'\fR to optimize away such productions, just as it does
1832for the \f(CW\*(C`<reject>\*(C'\fR directive).
1833.IP "Dynamically matched rules" 4
1834.IX Item "Dynamically matched rules"
1835Because regexes and double-quoted strings are interpolated, it is relatively
1836easy to specify productions with \*(L"context sensitive\*(R" tokens. For example:
1837.Sp
1838.Vb 1
1839\& command: keyword body "end $item[1]"
1840.Ve
1841.Sp
1842which ensures that a command block is bounded by a
1843"\fI<keyword>\fR...end \fI<same keyword>\fR" pair.
1844.Sp
1845Building productions in which subrules are context sensitive is also possible,
1846via the \f(CW\*(C`<matchrule:...>\*(C'\fR directive. This directive behaves
1847identically to a subrule item, except that the rule which is invoked to match
1848it is determined by the string specified after the colon. For example, we could
1849rewrite the \f(CW\*(C`command\*(C'\fR rule like this:
1850.Sp
1851.Vb 1
1852\& command: keyword <matchrule:body> "end $item[1]"
1853.Ve
1854.Sp
1855Whatever appears after the colon in the directive is treated as an interpolated
1856string (that is, as if it appeared in a \f(CW\*(C`qq{...}\*(C'\fR operator) and the value of
1857that interpolated string is the name of the subrule to be matched.
1858.Sp
1859Of course, just putting a constant string like \f(CW\*(C`body\*(C'\fR in a
1860\&\f(CW\*(C`<matchrule:...>\*(C'\fR directive is of little interest or benefit.
1861The power of the directive is seen when we use a string that interpolates
1862to something interesting. For example:
1863.Sp
1864.Vb 1
1865\& command: keyword <matchrule:$item[1]_body> "end $item[1]"
1866.Ve
1867.Sp
1868.Vb 1
1869\& keyword: 'while' | 'if' | 'function'
1870.Ve
1871.Sp
1872.Vb 1
1873\& while_body: condition block
1874.Ve
1875.Sp
1876.Vb 1
1877\& if_body: condition block ('else' block)(?)
1878.Ve
1879.Sp
1880.Vb 1
1881\& function_body: arglist block
1882.Ve
1883.Sp
1884Now the \f(CW\*(C`command\*(C'\fR rule selects how to proceed on the basis of the keyword
1885that is found. It is as if \f(CW\*(C`command\*(C'\fR were declared:
1886.Sp
1887.Vb 3
1888\& command: 'while' while_body "end while"
1889\& | 'if' if_body "end if"
1890\& | 'function' function_body "end function"
1891.Ve
1892.Sp
1893When a \f(CW\*(C`<matchrule:...>\*(C'\fR directive is used as a repeated
1894subrule, the rule name expression is \*(L"late\-bound\*(R". That is, the name of
1895the rule to be called is re-evaluated \fIeach time\fR a match attempt is
1896made. Hence, the following grammar:
1897.Sp
1898.Vb 1
1899\& { $::species = 'dogs' }
1900.Ve
1901.Sp
1902.Vb 1
1903\& pair: 'two' <matchrule:$::species>(s)
1904.Ve
1905.Sp
1906.Vb 1
1907\& dogs: /dogs/ { $::species = 'cats' }
1908.Ve
1909.Sp
1910.Vb 1
1911\& cats: /cats/
1912.Ve
1913.Sp
1914will match the string \*(L"two dogs cats cats\*(R" completely, whereas it will
1915only match the string \*(L"two dogs dogs dogs\*(R" up to the eighth letter. If
1916the rule name were \*(L"early bound\*(R" (that is, evaluated only the first
1917time the directive is encountered in a production), the reverse
1918behaviour would be expected.
1919.IP "Deferred actions" 4
1920.IX Item "Deferred actions"
1921The \f(CW\*(C`<defer:...>\*(C'\fR directive is used to specify an action to be
1922performed when (and only if!) the current production ultimately succeeds.
1923.Sp
1924Whenever a \f(CW\*(C`<defer:...>\*(C'\fR directive appears, the code it specifies
1925is converted to a closure (an anonymous subroutine reference) which is
1926queued within the active parser object. Note that,
1927because the deferred code is converted to a closure, the values of any
1928\&\*(L"local\*(R" variables (such as \f(CW$text\fR, \f(CW@item\fR, etc.) are preserved
1929until the deferred code is actually executed.
1930.Sp
1931If the parse ultimately succeeds
1932\&\fIand\fR the production in which the \f(CW\*(C`<defer:...>\*(C'\fR directive was
1933evaluated formed part of the successful parse, then the deferred code is
1934executed immediately before the parse returns. If however the production
1935which queued a deferred action fails, or one of the higher-level
1936rules which called that production fails, then the deferred action is
1937removed from the queue, and hence is never executed.
1938.Sp
1939For example, given the grammar:
1940.Sp
1941.Vb 2
1942\& sentence: noun trans noun
1943\& | noun intrans
1944.Ve
1945.Sp
1946.Vb 4
1947\& noun: 'the dog'
1948\& { print "$item[1]\et(noun)\en" }
1949\& | 'the meat'
1950\& { print "$item[1]\et(noun)\en" }
1951.Ve
1952.Sp
1953.Vb 2
1954\& trans: 'ate'
1955\& { print "$item[1]\et(transitive)\en" }
1956.Ve
1957.Sp
1958.Vb 4
1959\& intrans: 'ate'
1960\& { print "$item[1]\et(intransitive)\en" }
1961\& | 'barked'
1962\& { print "$item[1]\et(intransitive)\en" }
1963.Ve
1964.Sp
1965then parsing the sentence \f(CW"the dog ate"\fR would produce the output:
1966.Sp
1967.Vb 4
1968\& the dog (noun)
1969\& ate (transitive)
1970\& the dog (noun)
1971\& ate (intransitive)
1972.Ve
1973.Sp
1974This is because, even though the first production of \f(CW\*(C`sentence\*(C'\fR
1975ultimately fails, its initial subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR do match,
1976and hence they execute their associated actions.
1977Then the second production of \f(CW\*(C`sentence\*(C'\fR succeeds, causing the
1978actions of the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`intrans\*(C'\fR to be executed as well.
1979.Sp
1980On the other hand, if the actions were replaced by \f(CW\*(C`<defer:...>\*(C'\fR
1981directives:
1982.Sp
1983.Vb 2
1984\& sentence: noun trans noun
1985\& | noun intrans
1986.Ve
1987.Sp
1988.Vb 4
1989\& noun: 'the dog'
1990\& <defer: print "$item[1]\et(noun)\en" >
1991\& | 'the meat'
1992\& <defer: print "$item[1]\et(noun)\en" >
1993.Ve
1994.Sp
1995.Vb 2
1996\& trans: 'ate'
1997\& <defer: print "$item[1]\et(transitive)\en" >
1998.Ve
1999.Sp
2000.Vb 4
2001\& intrans: 'ate'
2002\& <defer: print "$item[1]\et(intransitive)\en" >
2003\& | 'barked'
2004\& <defer: print "$item[1]\et(intransitive)\en" >
2005.Ve
2006.Sp
2007the output would be:
2008.Sp
2009.Vb 2
2010\& the dog (noun)
2011\& ate (intransitive)
2012.Ve
2013.Sp
2014since deferred actions are only executed if they were evaluated in
2015a production which ultimately contributes to the successful parse.
2016.Sp
2017In this case, even though the first production of \f(CW\*(C`sentence\*(C'\fR caused
2018the subrules \f(CW\*(C`noun\*(C'\fR and \f(CW\*(C`trans\*(C'\fR to match, that production ultimately
2019failed and so the deferred actions queued by those subrules were subsequently
2020discarded. The second production then succeeded, causing the entire
2021parse to succeed, and so the deferred actions queued by the (second) match of
2022the \f(CW\*(C`noun\*(C'\fR subrule and the subsequent match of \f(CW\*(C`intrans\*(C'\fR \fIare\fR preserved and
2023eventually executed.
2024.Sp
2025Deferred actions provide a means of improving the performance of a parser,
2026by only executing those actions which are part of the final parse-tree
2027for the input data.
2028.Sp
2029Alternatively, deferred actions can be viewed as a mechanism for building
2030(and executing) a
2031customized subroutine corresponding to the given input data, much in the
2032same way that autoactions (see \*(L"Autoactions\*(R") can be used to build a
2033customized data structure for specific input.
2034.Sp
2035Whether or not the action it specifies is ever executed,
2036a \f(CW\*(C`<defer:...>\*(C'\fR directive always succeeds, returning the
2037number of deferred actions currently queued at that point.
2038.IP "Parsing Perl" 4
2039.IX Item "Parsing Perl"
2040Parse::RecDescent provides limited support for parsing subsets of Perl,
2041namely: quote-like operators, Perl variables, and complete code blocks.
2042.Sp
2043The \f(CW\*(C`<perl_quotelike>\*(C'\fR directive can be used to parse any Perl
2044quote-like operator: \f(CW'a string'\fR, \f(CW\*(C`m/a pattern/\*(C'\fR, \f(CW\*(C`tr{ans}{lation}\*(C'\fR,
2045etc. It does this by calling \fIText::Balanced::extract_quotelike()\fR.
2046.Sp
2047If a quote-like operator is found, a reference to an array of eight elements
2048is returned. Those elements are identical to the last eight elements returned
2049by \fIText::Balanced::extract_quotelike()\fR in an array context, namely:
2050.RS 4
2051.IP "[0]" 4
2052.IX Item "[0]"
2053the name of the quotelike operator \*(-- 'q', 'qq', 'm', 's', 'tr' \*(-- if the
2054operator was named; otherwise \f(CW\*(C`undef\*(C'\fR,
2055.IP "[1]" 4
2056.IX Item "[1]"
2057the left delimiter of the first block of the operation,
2058.IP "[2]" 4
2059.IX Item "[2]"
2060the text of the first block of the operation
2061(that is, the contents of
2062a quote, the regex of a match or substitution, or the target list of a
2063translation),
2064.IP "[3]" 4
2065.IX Item "[3]"
2066the right delimiter of the first block of the operation,
2067.IP "[4]" 4
2068.IX Item "[4]"
2069the left delimiter of the second block of the operation if there is one
2070(that is, if it is a \f(CW\*(C`s\*(C'\fR, \f(CW\*(C`tr\*(C'\fR, or \f(CW\*(C`y\*(C'\fR); otherwise \f(CW\*(C`undef\*(C'\fR,
2071.IP "[5]" 4
2072.IX Item "[5]"
2073the text of the second block of the operation if there is one
2074(that is, the replacement of a substitution or the translation list
2075of a translation); otherwise \f(CW\*(C`undef\*(C'\fR,
2076.IP "[6]" 4
2077.IX Item "[6]"
2078the right delimiter of the second block of the operation (if any);
2079otherwise \f(CW\*(C`undef\*(C'\fR,
2080.IP "[7]" 4
2081.IX Item "[7]"
2082the trailing modifiers on the operation (if any); otherwise \f(CW\*(C`undef\*(C'\fR.
2083.RE
2084.RS 4
2085.Sp
2086If a quote-like expression is not found, the directive fails with the usual
2087\&\f(CW\*(C`undef\*(C'\fR value.
2088.Sp
2089The \f(CW\*(C`<perl_variable>\*(C'\fR directive can be used to parse any Perl
2090variable: \f(CW$scalar\fR, \f(CW@array\fR, \f(CW%hash\fR, \f(CW\*(C`$ref\->{field}[$index]\*(C'\fR, etc.
2091It does this by calling \fIText::Balanced::extract_variable()\fR.
2092.Sp
2093If the directive matches text representing a valid Perl variable
2094specification, it returns that text. Otherwise it fails with the usual
2095\&\f(CW\*(C`undef\*(C'\fR value.
2096.Sp
2097The \f(CW\*(C`<perl_codeblock>\*(C'\fR directive can be used to parse any curly-brace-delimited block of Perl code, such as: \f(CW\*(C`{ $a = 1; f() =~ m/pat/; }\*(C'\fR.
2098It does this by calling \fIText::Balanced::extract_codeblock()\fR.
2099.Sp
2100If the directive matches text representing a valid Perl code block,
2101it returns that text. Otherwise it fails with the usual \f(CW\*(C`undef\*(C'\fR value.
2102.RE
2103.IP "Constructing tokens" 4
2104.IX Item "Constructing tokens"
2105Eventually, Parse::RecDescent will be able to parse tokenized input, as
2106well as ordinary strings. In preparation for this joyous day, the
2107\&\f(CW\*(C`<token:...>\*(C'\fR directive has been provided.
2108This directive creates a token which will be suitable for
2109input to a Parse::RecDescent parser (when it eventually supports
2110tokenized input).
2111.Sp
2112The text of the token is the value of the
2113immediately preceding item in the production. A
2114\&\f(CW\*(C`<token:...>\*(C'\fR directive always succeeds with a return
2115value which is the hash reference that is the new token. It also
2116sets the return value for the production to that hash ref.
2117.Sp
2118The \f(CW\*(C`<token:...>\*(C'\fR directive makes it easy to build
2119a Parse::RecDescent\-compatible lexer in Parse::RecDescent:
2120.Sp
2121.Vb 3
2122\& my $lexer = new Parse::RecDescent q
2123\& {
2124\& lex: token(s)
2125.Ve
2126.Sp
2127.Vb 5
2128\& token: /a\eb/ <token:INDEF>
2129\& | /the\eb/ <token:DEF>
2130\& | /fly\eb/ <token:NOUN,VERB>
2131\& | /[a-z]+/i { lc $item[1] } <token:ALPHA>
2132\& | <error: Unknown token>
2133.Ve
2134.Sp
2135.Vb 1
2136\& };
2137.Ve
2138.Sp
2139which will eventually be able to be used with a regular Parse::RecDescent
2140grammar:
2141.Sp
2142.Vb 3
2143\& my $parser = new Parse::RecDescent q
2144\& {
2145\& startrule: subrule1 subrule2
2146.Ve
2147.Sp
2148.Vb 2
2149\& # ETC...
2150\& };
2151.Ve
2152.Sp
2153either with a pre-lexing phase:
2154.Sp
2155.Vb 1
2156\& $parser->startrule( $lexer->lex($data) );
2157.Ve
2158.Sp
2159or with a lex-on-demand approach:
2160.Sp
2161.Vb 1
2162\& $parser->startrule( sub{$lexer->token(\e$data)} );
2163.Ve
2164.Sp
2165But at present, only the \f(CW\*(C`<token:...>\*(C'\fR directive is
2166actually implemented. The rest is vapourware.
2167.IP "Specifying operations" 4
2168.IX Item "Specifying operations"
2169One of the commonest requirements when building a parser is to specify
2170binary operators. Unfortunately, in a normal grammar, the rules for
2171such things are awkward:
2172.Sp
2173.Vb 2
2174\& disjunction: conjunction ('or' conjunction)(s?)
2175\& { $return = [ $item[1], @{$item[2]} ] }
2176.Ve
2177.Sp
2178.Vb 2
2179\& conjunction: atom ('and' atom)(s?)
2180\& { $return = [ $item[1], @{$item[2]} ] }
2181.Ve
2182.Sp
2183or inefficient:
2184.Sp
2185.Vb 4
2186\& disjunction: conjunction 'or' disjunction
2187\& { $return = [ $item[1], @{$item[3]} ] }
2188\& | conjunction
2189\& { $return = [ $item[1] ] }
2190.Ve
2191.Sp
2192.Vb 4
2193\& conjunction: atom 'and' conjunction
2194\& { $return = [ $item[1], @{$item[3]} ] }
2195\& | atom
2196\& { $return = [ $item[1] ] }
2197.Ve
2198.Sp
2199and either way is ugly and hard to get right.
2200.Sp
2201The \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives provide an
2202easier way of specifying such operations. Using \f(CW\*(C`<leftop:...>\*(C'\fR the
2203above examples become:
2204.Sp
2205.Vb 2
2206\& disjunction: <leftop: conjunction 'or' conjunction>
2207\& conjunction: <leftop: atom 'and' atom>
2208.Ve
2209.Sp
2210The \f(CW\*(C`<leftop:...>\*(C'\fR directive specifies a left-associative binary operator.
2211It is specified around three other grammar elements
2212(typically subrules or terminals), which match the left operand,
2213the operator itself, and the right operand respectively.
2214.Sp
2215A \f(CW\*(C`<leftop:...>\*(C'\fR directive such as:
2216.Sp
2217.Vb 1
2218\& disjunction: <leftop: conjunction 'or' conjunction>
2219.Ve
2220.Sp
2221is converted to the following:
2222.Sp
2223.Vb 2
2224\& disjunction: ( conjunction ('or' conjunction)(s?)
2225\& { $return = [ $item[1], @{$item[2]} ] } )
2226.Ve
2227.Sp
2228In other words, a \f(CW\*(C`<leftop:...>\*(C'\fR directive matches the left operand followed by zero
2229or more repetitions of both the operator and the right operand. It then
2230flattens the matched items into an anonymous array which becomes the
2231(single) value of the entire \f(CW\*(C`<leftop:...>\*(C'\fR directive.
2232.Sp
2233For example, an \f(CW\*(C`<leftop:...>\*(C'\fR directive such as:
2234.Sp
2235.Vb 1
2236\& output: <leftop: ident '<<' expr >
2237.Ve
2238.Sp
2239when given a string such as:
2240.Sp
2241.Vb 1
2242\& cout << var << "str" << 3
2243.Ve
2244.Sp
2245would match, and \f(CW$item[1]\fR would be set to:
2246.Sp
2247.Vb 1
2248\& [ 'cout', 'var', '"str"', '3' ]
2249.Ve
2250.Sp
2251In other words:
2252.Sp
2253.Vb 1
2254\& output: <leftop: ident '<<' expr >
2255.Ve
2256.Sp
2257is equivalent to a left-associative operator:
2258.Sp
2259.Vb 5
2260\& output: ident { $return = [$item[1]] }
2261\& | ident '<<' expr { $return = [@item[1,3]] }
2262\& | ident '<<' expr '<<' expr { $return = [@item[1,3,5]] }
2263\& | ident '<<' expr '<<' expr '<<' expr { $return = [@item[1,3,5,7]] }
2264\& # ...etc...
2265.Ve
2266.Sp
2267Similarly, the \f(CW\*(C`<rightop:...>\*(C'\fR directive takes a left operand, an operator, and a right operand:
2268.Sp
2269.Vb 1
2270\& assign: <rightop: var '=' expr >
2271.Ve
2272.Sp
2273and converts them to:
2274.Sp
2275.Vb 2
2276\& assign: ( (var '=' {$return=$item[1]})(s?) expr
2277\& { $return = [ @{$item[1]}, $item[2] ] } )
2278.Ve
2279.Sp
2280which is equivalent to a right-associative operator:
2281.Sp
2282.Vb 5
2283\& assign: var { $return = [$item[1]] }
2284\& | var '=' expr { $return = [@item[1,3]] }
2285\& | var '=' var '=' expr { $return = [@item[1,3,5]] }
2286\& | var '=' var '=' var '=' expr { $return = [@item[1,3,5,7]] }
2287\& # ...etc...
2288.Ve
2289.Sp
2290Note that for both the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives, the directive does not normally
2291return the operator itself, just a list of the operands involved. This is
2292particularly handy for specifying lists:
2293.Sp
2294.Vb 2
2295\& list: '(' <leftop: list_item ',' list_item> ')'
2296\& { $return = $item[2] }
2297.Ve
2298.Sp
2299There is, however, a problem: sometimes the operator is itself significant.
2300For example, in a Perl list a comma and a \f(CW\*(C`=>\*(C'\fR are both
2301valid separators, but the \f(CW\*(C`=>\*(C'\fR has additional stringification semantics.
2302Hence it's important to know which was used in each case.
2303.Sp
2304To solve this problem the
2305\&\f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives
2306\&\fIdo\fR return the operator(s) as well, under two circumstances.
2307The first case is where the operator is specified as a subrule. In that instance,
2308whatever the operator matches is returned (on the assumption that if the operator
2309is important enough to have its own subrule, then it's important enough to return).
2310.Sp
2311The second case is where the operator is specified as a regular
2312expression. In that case, if the first bracketed subpattern of the
2313regular expression matches, that matching value is returned (this is analogous to
2314the behaviour of the Perl \f(CW\*(C`split\*(C'\fR function, except that only the first subpattern
2315is returned).
2316.Sp
2317In other words, given the input:
2318.Sp
2319.Vb 1
2320\& ( a=>1, b=>2 )
2321.Ve
2322.Sp
2323the specifications:
2324.Sp
2325.Vb 1
2326\& list: '(' <leftop: list_item separator list_item> ')'
2327.Ve
2328.Sp
2329.Vb 1
2330\& separator: ',' | '=>'
2331.Ve
2332.Sp
2333or:
2334.Sp
2335.Vb 1
2336\& list: '(' <leftop: list_item /(,|=>)/ list_item> ')'
2337.Ve
2338.Sp
2339cause the list separators to be interleaved with the operands in the
2340anonymous array in \f(CW$item[2]\fR:
2341.Sp
2342.Vb 1
2343\& [ 'a', '=>', '1', ',', 'b', '=>', '2' ]
2344.Ve
2345.Sp
2346But the following version:
2347.Sp
2348.Vb 1
2349\& list: '(' <leftop: list_item /,|=>/ list_item> ')'
2350.Ve
2351.Sp
2352returns only the operands:
2353.Sp
2354.Vb 1
2355\& [ 'a', '1', 'b', '2' ]
2356.Ve
2357.Sp
2358Of course, none of the above specifications handle the case of an empty
2359list, since the \f(CW\*(C`<leftop:...>\*(C'\fR and \f(CW\*(C`<rightop:...>\*(C'\fR directives
2360require at least a single right or left operand to match. To specify
2361that the operator can match \*(L"trivially\*(R",
2362it's necessary to add a \f(CW\*(C`(?)\*(C'\fR qualifier to the directive:
2363.Sp
2364.Vb 1
2365\& list: '(' <leftop: list_item /(,|=>)/ list_item>(?) ')'
2366.Ve
2367.Sp
2368Note that in almost all the above examples, the first and third arguments
2369of the \f(CW\*(C`<leftop:...>\*(C'\fR directive were the same subrule. That is because
2370\&\f(CW\*(C`<leftop:...>\*(C'\fR's are frequently used to specify \*(L"separated\*(R" lists of the
2371same type of item. To make such lists easier to specify, the following
2372syntax:
2373.Sp
2374.Vb 1
2375\& list: element(s /,/)
2376.Ve
2377.Sp
2378is exactly equivalent to:
2379.Sp
2380.Vb 1
2381\& list: <leftop: element /,/ element>
2382.Ve
2383.Sp
2384Note that the separator must be specified as a raw pattern (i.e.
2385not a string or subrule).
2386.IP "Scored productions" 4
2387.IX Item "Scored productions"
2388By default, Parse::RecDescent grammar rules always accept the first
2389production that matches the input. But if two or more productions may
2390potentially match the same input, choosing the first that does so may
2391not be optimal.
2392.Sp
2393For example, if you were parsing the sentence \*(L"time flies like an arrow\*(R",
2394you might use a rule like this:
2395.Sp
2396.Vb 3
2397\& sentence: verb noun preposition article noun { [@item] }
2398\& | adjective noun verb article noun { [@item] }
2399\& | noun verb preposition article noun { [@item] }
2400.Ve
2401.Sp
2402Each of these productions matches the sentence, but the third one
2403is the most likely interpretation. However, if the sentence had been
2404\&\*(L"fruit flies like a banana\*(R", then the second production is probably
2405the right match.
2406.Sp
2407To cater for such situations, the \f(CW\*(C`<score:...>\*(C'\fR directive can be used.
2408The directive is equivalent to an unconditional \f(CW\*(C`<reject>\*(C'\fR,
2409except that it allows you to specify a \*(L"score\*(R" for the current
2410production. If that score is numerically greater than the best
2411score of any preceding production, the current production is cached for later
2412consideration. If no later production matches, then the cached
2413production is treated as having matched, and the value of the
2414item immediately before its \f(CW\*(C`<score:...>\*(C'\fR directive is returned as the
2415result.
2416.Sp
2417In other words, by putting a \f(CW\*(C`<score:...>\*(C'\fR directive at the end of
2418each production, you can select which production matches using
2419criteria other than specification order. For example:
2420.Sp
2421.Vb 3
2422\& sentence: verb noun preposition article noun { [@item] } <score: sensible(@item)>
2423\& | adjective noun verb article noun { [@item] } <score: sensible(@item)>
2424\& | noun verb preposition article noun { [@item] } <score: sensible(@item)>
2425.Ve
2426.Sp
2427Now, when each production reaches its respective \f(CW\*(C`<score:...>\*(C'\fR
2428directive, the subroutine \f(CW\*(C`sensible\*(C'\fR will be called to evaluate the
2429matched items (somehow). Once all productions have been tried, the
2430one which \f(CW\*(C`sensible\*(C'\fR scored most highly will be the one that is
2431accepted as a match for the rule.
2432.Sp
2433The variable \f(CW$score\fR always holds the current best score of any production,
2434and the variable \f(CW$score_return\fR holds the corresponding return value.
2435.Sp
2436As another example, the following grammar matches lines that may be
2437separated by commas, colons, or semi\-colons. This can be tricky if
2438a colon-separated line also contains commas, or vice versa. The grammar
2439resolves the ambiguity by selecting the rule that results in the
2440fewest fields:
2441.Sp
2442.Vb 3
2443\& line: seplist[sep=>','] <score: -@{$item[1]}>
2444\& | seplist[sep=>':'] <score: -@{$item[1]}>
2445\& | seplist[sep=>" "] <score: -@{$item[1]}>
2446.Ve
2447.Sp
2448.Vb 1
2449\& seplist: <skip:""> <leftop: /[^$arg{sep}]*/ "$arg{sep}" /[^$arg{sep}]*/>
2450.Ve
2451.Sp
2452Note the use of negation within the \f(CW\*(C`<score:...>\*(C'\fR directive
2453to ensure that the seplist with the most items gets the lowest score.
2454.Sp
2455As the above examples indicate, it is often the case that all productions
2456in a rule use exactly the same \f(CW\*(C`<score:...>\*(C'\fR directive. It is
2457tedious to have to repeat this identical directive in every production, so
2458Parse::RecDescent also provides the \f(CW\*(C`<autoscore:...>\*(C'\fR directive.
2459.Sp
2460If an \f(CW\*(C`<autoscore:...>\*(C'\fR directive appears in any
2461production of a rule, the code it specifies is used as the scoring
2462code for every production of that rule, except productions that already
2463end with an explicit \f(CW\*(C`<score:...>\*(C'\fR directive. Thus the rules above could
2464be rewritten:
2465.Sp
2466.Vb 4
2467\& line: <autoscore: -@{$item[1]}>
2468\& line: seplist[sep=>',']
2469\& | seplist[sep=>':']
2470\& | seplist[sep=>" "]
2471.Ve
2472.Sp
2473.Vb 4
2474\& sentence: <autoscore: sensible(@item)>
2475\& | verb noun preposition article noun { [@item] }
2476\& | adjective noun verb article noun { [@item] }
2477\& | noun verb preposition article noun { [@item] }
2478.Ve
2479.Sp
2480Note that the \f(CW\*(C`<autoscore:...>\*(C'\fR directive itself acts as an
2481unconditional \f(CW\*(C`<reject>\*(C'\fR, and (like the \f(CW\*(C`<rulevar:...>\*(C'\fR
2482directive) is pruned at compile-time wherever possible.
2483.IP "Dispensing with grammar checks" 4
2484.IX Item "Dispensing with grammar checks"
2485During the compilation phase of parser construction, Parse::RecDescent performs
2486a small number of checks on the grammar it's given. Specifically it checks that
2487the grammar is not left\-recursive, that there are no \*(L"insatiable\*(R" constructs of
2488the form:
2489.Sp
2490.Vb 1
2491\& rule: subrule(s) subrule
2492.Ve
2493.Sp
2494and that there are no rules missing (i.e. referred to, but never defined).
2495.Sp
2496These checks are important during development, but can slow down parser
2497construction in stable code. So Parse::RecDescent provides the
2498\&\f(CW\*(C`<nocheck>\*(C'\fR directive to turn them off. The directive can only appear
2499before the first rule definition, and switches off checking throughout the rest
2500of the current grammar.
2501.Sp
2502Typically, this directive would be added when a parser has been thoroughly
2503tested and is ready for release.
2504.Sh "Subrule argument lists"
2505.IX Subsection "Subrule argument lists"
2506It is occasionally useful to pass data to a subrule which is being invoked. For
2507example, consider the following grammar fragment:
2508.PP
2509.Vb 1
2510\& classdecl: keyword decl
2511.Ve
2512.PP
2513.Vb 1
2514\& keyword: 'struct' | 'class';
2515.Ve
2516.PP
2517.Vb 1
2518\& decl: # WHATEVER
2519.Ve
2520.PP
2521The \f(CW\*(C`decl\*(C'\fR rule might wish to know which of the two keywords was used
2522(since it may affect some aspect of the way the subsequent declaration
2523is interpreted). \f(CW\*(C`Parse::RecDescent\*(C'\fR allows the grammar designer to
2524pass data into a rule, by placing that data in an \fIargument list\fR
2525(that is, in square brackets) immediately after any subrule item in a
2526production. Hence, we could pass the keyword to \f(CW\*(C`decl\*(C'\fR as follows:
2527.PP
2528.Vb 1
2529\& classdecl: keyword decl[ $item[1] ]
2530.Ve
2531.PP
2532.Vb 1
2533\& keyword: 'struct' | 'class';
2534.Ve
2535.PP
2536.Vb 1
2537\& decl: # WHATEVER
2538.Ve
2539.PP
2540The argument list can consist of any number (including zero!) of comma-separated
2541Perl expressions. In other words, it looks exactly like a Perl anonymous
2542array reference. For example, we could pass the keyword, the name of the
2543surrounding rule, and the literal 'keyword' to \f(CW\*(C`decl\*(C'\fR like so:
2544.PP
2545.Vb 1
2546\& classdecl: keyword decl[$item[1],$item[0],'keyword']
2547.Ve
2548.PP
2549.Vb 1
2550\& keyword: 'struct' | 'class';
2551.Ve
2552.PP
2553.Vb 1
2554\& decl: # WHATEVER
2555.Ve
2556.PP
2557Within the rule to which the data is passed (\f(CW\*(C`decl\*(C'\fR in the above examples)
2558that data is available as the elements of a local variable \f(CW@arg\fR. Hence
2559\&\f(CW\*(C`decl\*(C'\fR might report its intentions as follows:
2560.PP
2561.Vb 1
2562\& classdecl: keyword decl[$item[1],$item[0],'keyword']
2563.Ve
2564.PP
2565.Vb 1
2566\& keyword: 'struct' | 'class';
2567.Ve
2568.PP
2569.Vb 2
2570\& decl: { print "Declaring $arg[0] (a $arg[2])\en";
2571\& print "(this rule called by $arg[1])" }
2572.Ve
2573.PP
2574Subrule argument lists can also be interpreted as hashes, simply by using
2575the local variable \f(CW%arg\fR instead of \f(CW@arg\fR. Hence we could rewrite the
2576previous example:
2577.PP
2578.Vb 3
2579\& classdecl: keyword decl[keyword => $item[1],
2580\& caller => $item[0],
2581\& type => 'keyword']
2582.Ve
2583.PP
2584.Vb 1
2585\& keyword: 'struct' | 'class';
2586.Ve
2587.PP
2588.Vb 2
2589\& decl: { print "Declaring $arg{keyword} (a $arg{type})\en";
2590\& print "(this rule called by $arg{caller})" }
2591.Ve
2592.PP
2593Both \f(CW@arg\fR and \f(CW%arg\fR are always available, so the grammar designer may
2594choose whichever convention (or combination of conventions) suits best.
2595.PP
2596Subrule argument lists are also useful for creating \*(L"rule templates\*(R"
2597(especially when used in conjunction with the \f(CW\*(C`<matchrule:...>\*(C'\fR
2598directive). For example, the subrule:
2599.PP
2600.Vb 4
2601\& list: <matchrule:$arg{rule}> /$arg{sep}/ list[%arg]
2602\& { $return = [ $item[1], @{$item[3]} ] }
2603\& | <matchrule:$arg{rule}>
2604\& { $return = [ $item[1]] }
2605.Ve
2606.PP
2607is a handy template for the common problem of matching a separated list.
2608For example:
2609.PP
2610.Vb 1
2611\& function: 'func' name '(' list[rule=>'param',sep=>';'] ')'
2612.Ve
2613.PP
2614.Vb 1
2615\& param: list[rule=>'name',sep=>','] ':' typename
2616.Ve
2617.PP
2618.Vb 1
2619\& name: /\ew+/
2620.Ve
2621.PP
2622.Vb 1
2623\& typename: name
2624.Ve
2625.PP
2626When a subrule argument list is used with a repeated subrule, the argument list
2627goes \fIbefore\fR the repetition specifier:
2628.PP
2629.Vb 1
2630\& list: /some|many/ thing[ $item[1] ](s)
2631.Ve
2632.PP
2633The argument list is \*(L"late bound\*(R". That is, it is re-evaluated for every
2634repetition of the repeated subrule.
2635This means that each repeated attempt to match the subrule may be
2636passed a completely different set of arguments if the value of the
2637expression in the argument list changes between attempts. So, for
2638example, the grammar:
2639.PP
2640.Vb 1
2641\& { $::species = 'dogs' }
2642.Ve
2643.PP
2644.Vb 1
2645\& pair: 'two' animal[$::species](s)
2646.Ve
2647.PP
2648.Vb 1
2649\& animal: /$arg[0]/ { $::species = 'cats' }
2650.Ve
2651.PP
2652will match the string \*(L"two dogs cats cats\*(R" completely, whereas
2653it will only match the string \*(L"two dogs dogs dogs\*(R" up to the
2654eighth letter. If the value of the argument list were \*(L"early bound\*(R"
2655(that is, evaluated only the first time a repeated subrule match is
2656attempted), one would expect the matching behaviours to be reversed.
2657.PP
2658Of course, it is possible to effectively \*(L"early bind\*(R" such argument lists
2659by passing them a value which does not change on each repetition. For example:
2660.PP
2661.Vb 1
2662\& { $::species = 'dogs' }
2663.Ve
2664.PP
2665.Vb 1
2666\& pair: 'two' { $::species } animal[$item[2]](s)
2667.Ve
2668.PP
2669.Vb 1
2670\& animal: /$arg[0]/ { $::species = 'cats' }
2671.Ve
2672.PP
2673Arguments can also be passed to the start rule, simply by appending them
2674to the argument list with which the start rule is called (\fIafter\fR the
2675\&\*(L"line number\*(R" parameter). For example, given:
2676.PP
2677.Vb 1
2678\& $parser = new Parse::RecDescent ( $grammar );
2679.Ve
2680.PP
2681.Vb 1
2682\& $parser->data($text, 1, "str", 2, \e@arr);
2683.Ve
2684.PP
2685.Vb 5
2686\& # ^^^^^ ^ ^^^^^^^^^^^^^^^
2687\& # | | |
2688\& # TEXT TO BE PARSED | |
2689\& # STARTING LINE NUMBER |
2690\& # ELEMENTS OF @arg WHICH IS PASSED TO RULE data
2691.Ve
2692.PP
2693then within the productions of the rule \f(CW\*(C`data\*(C'\fR, the array \f(CW@arg\fR will contain
2694\&\f(CW\*(C`("str", 2, \e@arr)\*(C'\fR.
2695.Sh "Alternations"
2696.IX Subsection "Alternations"
2697Alternations are implicit (unnamed) rules defined as part of a production. An
2698alternation is defined as a series of '|'\-separated productions inside a
2699pair of round brackets. For example:
2700.PP
2701.Vb 1
2702\& character: 'the' ( good | bad | ugly ) /dude/
2703.Ve
2704.PP
2705Every alternation implicitly defines a new subrule, whose
2706automatically-generated name indicates its origin:
2707\&\*(L"_alternation_<I>_of_production_<P>_of_rule_<R>\*(R" for the appropriate
2708values of <I>, <P>, and <R>. A call to this implicit subrule is then
2709inserted in place of the brackets. Hence the above example is merely a
2710convenient short-hand for:
2711.PP
2712.Vb 3
2713\& character: 'the'
2714\& _alternation_1_of_production_1_of_rule_character
2715\& /dude/
2716.Ve
2717.PP
2718.Vb 2
2719\& _alternation_1_of_production_1_of_rule_character:
2720\& good | bad | ugly
2721.Ve
2722.PP
2723Since alternations are parsed by recursively calling the parser generator,
2724any type(s) of item can appear in an alternation. For example:
2725.PP
2726.Vb 5
2727\& character: 'the' ( 'high' "plains" # Silent, with poncho
2728\& | /no[- ]name/ # Silent, no poncho
2729\& | vengeance_seeking # Poncho-optional
2730\& | <error>
2731\& ) drifter
2732.Ve
2733.PP
2734In this case, if an error occurred, the automatically generated
2735message would be:
2736.PP
2737.Vb 3
2738\& ERROR (line <N>): Invalid implicit subrule: Expected
2739\& 'high' or /no[- ]name/ or vengeance_seeking,
2740\& but found "pacifist" instead
2741.Ve
2742.PP
2743Since every alternation actually has a name, it's even possible
2744to extend or replace them:
2745.PP
2746.Vb 4
2747\& $parser->Replace(
2748\& "_alternation_1_of_production_1_of_rule_character:
2749\& 'generic Eastwood'"
2750\& );
2751.Ve
2752.PP
2753More importantly, since alternations are a form of subrule, they can be given
2754repetition specifiers:
2755.PP
2756.Vb 1
2757\& character: 'the' ( good | bad | ugly )(?) /dude/
2758.Ve
2759.Sh "Incremental Parsing"
2760.IX Subsection "Incremental Parsing"
2761\&\f(CW\*(C`Parse::RecDescent\*(C'\fR provides two methods \- \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR \- which
2762can be used to alter the grammar matched by a parser. Both methods
2763take the same argument as \f(CW\*(C`Parse::RecDescent::new\*(C'\fR, namely a
2764grammar specification string.
2765.PP
2766\&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR interprets the grammar specification and adds any
2767productions it finds to the end of the rules for which they are specified. For
2768example:
2769.PP
2770.Vb 2
2771\& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/";
2772\& $parser->Extend($add);
2773.Ve
2774.PP
2775adds two productions to the rule \*(L"name\*(R" (creating it if necessary) and one
2776production to the rule \*(L"desc\*(R".
2777.PP
2778\&\f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR is identical, except that it first resets any
2779rule specified in the additional grammar, removing any existing productions.
2780Hence after:
2781.PP
2782.Vb 2
2783\& $add = "name: 'Jimmy-Bob' | 'Bobby-Jim'\endesc: colour /necks?/";
2784\& $parser->Replace($add);
2785.Ve
2786.PP
2787there are \fIonly\fR two valid \*(L"name\*(R"s and the one possible description.
2788.PP
2789A more interesting use of the \f(CW\*(C`Extend\*(C'\fR and \f(CW\*(C`Replace\*(C'\fR methods is to call them
2790inside the action of an executing parser. For example:
2791.PP
2792.Vb 3
2793\& typedef: 'typedef' type_name identifier ';'
2794\& { $thisparser->Extend("type_name: '$item[3]'") }
2795\& | <error>
2796.Ve
2797.PP
2798.Vb 1
2799\& identifier: ...!type_name /[A-Za-z_]\ew*/
2800.Ve
2801.PP
2802which automatically prevents type names from being typedef'd, or:
2803.PP
2804.Vb 6
2805\& command: 'map' key_name 'to' abort_key
2806\& { $thisparser->Replace("abort_key: '$item[2]'") }
2807\& | 'map' key_name 'to' key_name
2808\& { map_key($item[2],$item[4]) }
2809\& | abort_key
2810\& { exit if confirm("abort?") }
2811.Ve
2812.PP
2813.Vb 1
2814\& abort_key: 'q'
2815.Ve
2816.PP
2817.Vb 1
2818\& key_name: ...!abort_key /[A-Za-z]/
2819.Ve
2820.PP
2821which allows the user to change the abort key binding, but not to unbind it.
2822.PP
2823The careful use of such constructs makes it possible to reconfigure a
2824running parser, eliminating the need for semantic feedback by
2825providing syntactic feedback instead. However, as currently implemented,
2826\&\f(CW\*(C`Replace()\*(C'\fR and \f(CW\*(C`Extend()\*(C'\fR have to regenerate and re\-\f(CW\*(C`eval\*(C'\fR the
2827entire parser whenever they are called. This makes them quite slow for
2828large grammars.
2829.PP
2830In such cases, the judicious use of an interpolated regex is likely to
2831be far more efficient:
2832.PP
2833.Vb 3
2834\& typedef: 'typedef' type_name identifier ';'
2835\& { $thisparser->{local}{type_name} .= "|$item[3]" }
2836\& | <error>
2837.Ve
2838.PP
2839.Vb 1
2840\& identifier: ...!type_name /[A-Za-z_]\ew*/
2841.Ve
2842.PP
2843.Vb 1
2844\& type_name: /$thisparser->{local}{type_name}/
2845.Ve
2846.Sh "Precompiling parsers"
2847.IX Subsection "Precompiling parsers"
2848Normally Parse::RecDescent builds a parser from a grammar at run\-time.
2849That approach simplifies the design and implementation of parsing code,
2850but has the disadvantage that it slows the parsing process down \- you
2851have to wait for Parse::RecDescent to build the parser every time the
2852program runs. Long or complex grammars can be particularly slow to
2853build, leading to unacceptable delays at start\-up.
2854.PP
2855To overcome this, the module provides a way of \*(L"pre\-building\*(R" a parser
2856object and saving it in a separate module. That module can then be used
2857to create clones of the original parser.
2858.PP
2859A grammar may be precompiled using the \f(CW\*(C`Precompile\*(C'\fR class method.
2860For example, to precompile a grammar stored in the scalar \f(CW$grammar\fR,
2861and produce a class named PreGrammar in a module file named PreGrammar.pm,
2862you could use:
2863.PP
2864.Vb 1
2865\& use Parse::RecDescent;
2866.Ve
2867.PP
2868.Vb 1
2869\& Parse::RecDescent->Precompile($grammar, "PreGrammar");
2870.Ve
2871.PP
2872The first argument is the grammar string, the second is the name of the class
2873to be built. The name of the module file is generated automatically by
2874appending \*(L".pm\*(R" to the last element of the class name. Thus
2875.PP
2876.Vb 1
2877\& Parse::RecDescent->Precompile($grammar, "My::New::Parser");
2878.Ve
2879.PP
2880would produce a module file named Parser.pm.
2881.PP
2882It is somewhat tedious to have to write a small Perl program just to
2883generate a precompiled grammar class, so Parse::RecDescent has some special
2884magic that allows you to do the job directly from the command\-line.
2885.PP
2886If your grammar is specified in a file named \fIgrammar\fR, you can generate
2887a class named Yet::Another::Grammar like so:
2888.PP
2889.Vb 1
2890\& > perl -MParse::RecDescent - grammar Yet::Another::Grammar
2891.Ve
2892.PP
2893This would produce a file named \fIGrammar.pm\fR containing the full
2894definition of a class called Yet::Another::Grammar. Of course, to use
2895that class, you would need to put the \fIGrammar.pm\fR file in a
2896directory named \fIYet/Another\fR, somewhere in your Perl include path.
2897.PP
2898Having created the new class, it's very easy to use it to build
2899a parser. You simply \f(CW\*(C`use\*(C'\fR the new module, and then call its
2900\&\f(CW\*(C`new\*(C'\fR method to create a parser object. For example:
2901.PP
2902.Vb 2
2903\& use Yet::Another::Grammar;
2904\& my $parser = Yet::Another::Grammar->new();
2905.Ve
2906.PP
2907The effect of these two lines is exactly the same as:
2908.PP
2909.Vb 1
2910\& use Parse::RecDescent;
2911.Ve
2912.PP
2913.Vb 3
2914\& open GRAMMAR_FILE, "grammar" or die;
2915\& local $/;
2916\& my $grammar = <GRAMMAR_FILE>;
2917.Ve
2918.PP
2919.Vb 1
2920\& my $parser = Parse::RecDescent->new($grammar);
2921.Ve
2922.PP
2923only considerably faster.
2924.PP
2925Note however that the parsers produced by either approach are exactly
2926the same, so whilst precompilation has an effect on \fIset-up\fR speed,
2927it has no effect on \fIparsing\fR speed. RecDescent 2.0 will address that
2928problem.
2929.ie n .Sh "A Metagrammar for ""Parse::RecDescent"""
2930.el .Sh "A Metagrammar for \f(CWParse::RecDescent\fP"
2931.IX Subsection "A Metagrammar for Parse::RecDescent"
2932The following is a specification of the grammar format accepted by
2933\&\f(CW\*(C`Parse::RecDescent::new\*(C'\fR (specified in the \f(CW\*(C`Parse::RecDescent\*(C'\fR grammar format!):
2934.PP
2935.Vb 1
2936\& grammar : component(s)
2937.Ve
2938.PP
2939.Vb 1
2940\& component : rule | comment
2941.Ve
2942.PP
2943.Vb 1
2944\& rule : "\en" identifier ":" production(s?)
2945.Ve
2946.PP
2947.Vb 1
2948\& production : item(s)
2949.Ve
2950.PP
2951.Vb 3
2952\& item : lookahead(?) simpleitem
2953\& | directive
2954\& | comment
2955.Ve
2956.PP
2957.Vb 1
2958\& lookahead : '...' | '...!' # +'ve or -'ve lookahead
2959.Ve
2960.PP
2961.Vb 5
2962\& simpleitem : subrule args(?) # match another rule
2963\& | repetition # match repeated subrules
2964\& | terminal # match the next input
2965\& | bracket args(?) # match alternative items
2966\& | action # do something
2967.Ve
2968.PP
2969.Vb 1
2970\& subrule : identifier # the name of the rule
2971.Ve
2972.PP
2973.Vb 1
2974\& args : {extract_codeblock($text,'[]')} # just like a [...] array ref
2975.Ve
2976.PP
2977.Vb 1
2978\& repetition : subrule args(?) howoften
2979.Ve
2980.PP
2981.Vb 6
2982\& howoften : '(?)' # 0 or 1 times
2983\& | '(s?)' # 0 or more times
2984\& | '(s)' # 1 or more times
2985\& | /(\ed+)[.][.](\ed+)/ # $1 to $2 times
2986\& | /[.][.](\ed*)/ # at most $1 times
2987\& | /(\ed*)[.][.]/ # at least $1 times
2988.Ve
2989.PP
2990.Vb 3
2991\& terminal : /[/]([\e][/]|[^/])*[/]/ # interpolated pattern
2992\& | /"([\e]"|[^"])*"/ # interpolated literal
2993\& | /'([\e]'|[^'])*'/ # uninterpolated literal
2994.Ve
2995.PP
2996.Vb 1
2997\& action : { extract_codeblock($text) } # embedded Perl code
2998.Ve
2999.PP
3000.Vb 1
3001\& bracket : '(' item(s) production(s?) ')' # alternative subrules
3002.Ve
3003.PP
3004.Vb 12
3005\& directive : '<commit>' # commit to production
3006\& | '<uncommit>' # cancel commitment
3007\& | '<resync>' # skip to newline
3008\& | '<resync:' pattern '>' # skip <pattern>
3009\& | '<reject>' # fail this production
3010\& | '<reject:' condition '>' # fail if <condition>
3011\& | '<error>' # report an error
3012\& | '<error:' string '>' # report error as "<string>"
3013\& | '<error?>' # error only if committed
3014\& | '<error?:' string '>' # " " " "
3015\& | '<rulevar:' /[^>]+/ '>' # define rule-local variable
3016\& | '<matchrule:' string '>' # invoke rule named in string
3017.Ve
3018.PP
3019.Vb 1
3020\& identifier : /[a-z]\ew*/i # must start with alpha
3021.Ve
3022.PP
3023.Vb 1
3024\& comment : /#[^\en]*/ # same as Perl
3025.Ve
3026.PP
3027.Vb 1
3028\& pattern : {extract_bracketed($text,'<')} # allow embedded "<..>"
3029.Ve
3030.PP
3031.Vb 1
3032\& condition : {extract_codeblock($text,'{<')} # full Perl expression
3033.Ve
3034.PP
3035.Vb 3
3036\& string : {extract_variable($text)} # any Perl variable
3037\& | {extract_quotelike($text)} # or quotelike string
3038\& | {extract_bracketed($text,'<')} # or balanced brackets
3039.Ve
3040.SH "GOTCHAS"
3041.IX Header "GOTCHAS"
3042This section describes common mistakes that grammar writers seem to
3043make on a regular basis.
3044.Sh "1. Expecting an error to always invalidate a parse"
3045.IX Subsection "1. Expecting an error to always invalidate a parse"
3046A common mistake when using error messages is to write the grammar like this:
3047.PP
3048.Vb 1
3049\& file: line(s)
3050.Ve
3051.PP
3052.Vb 4
3053\& line: line_type_1
3054\& | line_type_2
3055\& | line_type_3
3056\& | <error>
3057.Ve
3058.PP
3059The expectation seems to be that any line that is not of type 1, 2 or 3 will
3060invoke the \f(CW\*(C`<error>\*(C'\fR directive and thereby cause the parse to fail.
3061.PP
3062Unfortunately, that only happens if the error occurs in the very first line.
3063The first rule states that a \f(CW\*(C`file\*(C'\fR is matched by one or more lines, so if
3064even a single line succeeds, the first rule is completely satisfied and the
3065parse as a whole succeeds. That means that any error messages generated by
3066subsequent failures in the \f(CW\*(C`line\*(C'\fR rule are quietly ignored.
3067.PP
3068Typically what's really needed is this:
3069.PP
3070.Vb 1
3071\& file: line(s) eofile { $return = $item[1] }
3072.Ve
3073.PP
3074.Vb 4
3075\& line: line_type_1
3076\& | line_type_2
3077\& | line_type_3
3078\& | <error>
3079.Ve
3080.PP
3081.Vb 1
3082\& eofile: /^\eZ/
3083.Ve
3084.PP
3085The addition of the \f(CW\*(C`eofile\*(C'\fR subrule to the first production means that
3086a file only matches a series of successful \f(CW\*(C`line\*(C'\fR matches \fIthat consume the
3087complete input text\fR. If any input text remains after the lines are matched,
3088there must have been an error in the last \f(CW\*(C`line\*(C'\fR. In that case the \f(CW\*(C`eofile\*(C'\fR
3089rule will fail, causing the entire \f(CW\*(C`file\*(C'\fR rule to fail too.
3090.PP
3091Note too that \f(CW\*(C`eofile\*(C'\fR must match \f(CW\*(C`/^\eZ/\*(C'\fR (end\-of\-text), \fInot\fR
3092\&\f(CW\*(C`/^\ecZ/\*(C'\fR or \f(CW\*(C`/^\ecD/\*(C'\fR (end\-of\-file).
3093.PP
3094And don't forget the action at the end of the production. If you just
3095write:
3096.PP
3097.Vb 1
3098\& file: line(s) eofile
3099.Ve
3100.PP
3101then the value returned by the \f(CW\*(C`file\*(C'\fR rule will be the value of its
3102last item: \f(CW\*(C`eofile\*(C'\fR. Since \f(CW\*(C`eofile\*(C'\fR always returns an empty string
3103on success, that will cause the \f(CW\*(C`file\*(C'\fR rule to return that empty
3104string. Apart from returning the wrong value, returning an empty string
3105will trip up code such as:
3106.PP
3107.Vb 1
3108\& $parser->file($filetext) || die;
3109.Ve
3110.PP
3111(since "" is false).
3112.PP
3113Remember that Parse::RecDescent returns undef on failure,
3114so the only safe test for failure is:
3115.PP
3116.Vb 1
3117\& defined($parser->file($filetext)) || die;
3118.Ve
3119.SH "DIAGNOSTICS"
3120.IX Header "DIAGNOSTICS"
3121Diagnostics are intended to be self-explanatory (particularly if you
3122use \fB\-RD_HINT\fR (under \fBperl \-s\fR) or define \f(CW$::RD_HINT\fR inside the program).
3123.PP
3124\&\f(CW\*(C`Parse::RecDescent\*(C'\fR currently diagnoses the following:
3125.IP "\(bu" 4
3126Invalid regular expressions used as pattern terminals (fatal error).
3127.IP "\(bu" 4
3128Invalid Perl code in code blocks (fatal error).
3129.IP "\(bu" 4
3130Lookahead used in the wrong place or in a nonsensical way (fatal error).
3131.IP "\(bu" 4
3132\&\*(L"Obvious\*(R" cases of left-recursion (fatal error).
3133.IP "\(bu" 4
3134Missing or extra components in a \f(CW\*(C`<leftop>\*(C'\fR or \f(CW\*(C`<rightop>\*(C'\fR
3135directive.
3136.IP "\(bu" 4
3137Unrecognisable components in the grammar specification (fatal error).
3138.IP "\(bu" 4
3139\&\*(L"Orphaned\*(R" rule components specified before the first rule (fatal error)
3140or after an \f(CW\*(C`<error>\*(C'\fR directive (level 3 warning).
3141.IP "\(bu" 4
3142Missing rule definitions (this only generates a level 3 warning, since you
3143may be providing them later via \f(CW\*(C`Parse::RecDescent::Extend()\*(C'\fR).
3144.IP "\(bu" 4
3145Instances where greedy repetition behaviour will almost certainly
3146cause the failure of a production (a level 3 warning \- see
3147\&\*(L"\s-1ON\-GOING\s0 \s-1ISSUES\s0 \s-1AND\s0 \s-1FUTURE\s0 \s-1DIRECTIONS\s0\*(R" below).
3148.IP "\(bu" 4
3149Attempts to define rules named 'Replace' or 'Extend', which cannot be
3150called directly through the parser object because of the predefined
3151meaning of \f(CW\*(C`Parse::RecDescent::Replace\*(C'\fR and
3152\&\f(CW\*(C`Parse::RecDescent::Extend\*(C'\fR. (Only a level 2 warning is generated, since
3153such rules \fIcan\fR still be used as subrules).
3154.IP "\(bu" 4
3155Productions which consist of a single \f(CW\*(C`<error?>\*(C'\fR
3156directive, and which therefore may succeed unexpectedly
3157(a level 2 warning, since this might conceivably be the desired effect).
3158.IP "\(bu" 4
3159Multiple consecutive lookahead specifiers (a level 1 warning only, since their
3160effects simply accumulate).
3161.IP "\(bu" 4
3162Productions which start with a \f(CW\*(C`<reject>\*(C'\fR or \f(CW\*(C`<rulevar:...>\*(C'\fR
3163directive. Such productions are optimized away (a level 1 warning).
3164.IP "\(bu" 4
3165Rules which are autogenerated under \f(CW$::AUTOSTUB\fR (a level 1 warning).
3166.SH "AUTHOR"
3167.IX Header "AUTHOR"
3168Damian Conway (damian@conway.org)
3169.SH "BUGS AND IRRITATIONS"
3170.IX Header "BUGS AND IRRITATIONS"
3171There are undoubtedly serious bugs lurking somewhere in this much code :\-)
3172Bug reports and other feedback are most welcome.
3173.PP
3174Ongoing annoyances include:
3175.IP "\(bu" 4
3176There's no support for parsing directly from an input stream.
3177If and when the Perl Gods give us regular expressions on streams,
3178this should be trivial (ahem!) to implement.
3179.IP "\(bu" 4
3180The parser generator can get confused if actions aren't properly
3181closed or if they contain particularly nasty Perl syntax errors
3182(especially unmatched curly brackets).
3183.IP "\(bu" 4
3184The generator only detects the most obvious form of left recursion
3185(potential recursion on the first subrule in a rule). More subtle
3186forms of left recursion (for example, through the second item in a
3187rule after a \*(L"zero\*(R" match of a preceding \*(L"zero\-or\-more\*(R" repetition,
3188or after a match of a subrule with an empty production) are not found.
3189.IP "\(bu" 4
3190Instead of complaining about left\-recursion, the generator should
3191silently transform the grammar to remove it. Don't expect this
3192feature any time soon as it would require a more sophisticated
3193approach to parser generation than is currently used.
3194.IP "\(bu" 4
3195The generated parsers don't always run as fast as might be wished.
3196.IP "\(bu" 4
3197The meta-parser should be bootstrapped using \f(CW\*(C`Parse::RecDescent\*(C'\fR :\-)
3198.SH "ON-GOING ISSUES AND FUTURE DIRECTIONS"
3199.IX Header "ON-GOING ISSUES AND FUTURE DIRECTIONS"
3200.IP "1." 4
3201Repetitions are \*(L"incorrigibly greedy\*(R" in that they will eat everything they can
3202and won't backtrack if that behaviour causes a production to fail needlessly.
3203So, for example:
3204.Sp
3205.Vb 1
3206\& rule: subrule(s) subrule
3207.Ve
3208.Sp
3209will \fInever\fR succeed, because the repetition will eat all the
3210subrules it finds, leaving none to match the second item. Such
3211constructions are relatively rare (and \f(CW\*(C`Parse::RecDescent::new\*(C'\fR generates a
3212warning whenever they occur) so this may not be a problem, especially
3213since the insatiable behaviour can be overcome \*(L"manually\*(R" by writing:
3214.Sp
3215.Vb 1
3216\& rule: penultimate_subrule(s) subrule
3217.Ve
3218.Sp
3219.Vb 1
3220\& penultimate_subrule: subrule ...subrule
3221.Ve
3222.Sp
3223The issue is that this construction is exactly twice as expensive as the
3224original, whereas backtracking would add only 1/\fIN\fR to the cost (for
3225matching \fIN\fR repetitions of \f(CW\*(C`subrule\*(C'\fR). I would welcome feedback on
3226the need for backtracking; particularly on cases where the lack of it
3227makes parsing performance problematical.
3228.IP "2." 4
3229Having opened that can of worms, it's also necessary to consider whether there
3230is a need for non-greedy repetition specifiers. Again, it's possible (at some
3231cost) to manually provide the required functionality:
3232.Sp
3233.Vb 1
3234\& rule: nongreedy_subrule(s) othersubrule
3235.Ve
3236.Sp
3237.Vb 1
3238\& nongreedy_subrule: subrule ...!othersubrule
3239.Ve
3240.Sp
3241Overall, the issue is whether the benefit of this extra functionality
3242outweighs the drawbacks of further complicating the (currently
3243minimalist) grammar specification syntax, and (worse) introducing more overhead
3244into the generated parsers.
3245.IP "3." 4
3246An \f(CW\*(C`<autocommit>\*(C'\fR directive would be nice. That is, it would be useful to be
3247able to say:
3248.Sp
3249.Vb 7
3250\& command: <autocommit>
3251\& command: 'find' name
3252\& | 'find' address
3253\& | 'do' command 'at' time 'if' condition
3254\& | 'do' command 'at' time
3255\& | 'do' command
3256\& | unusual_command
3257.Ve
3258.Sp
3259and have the generator work out that this should be \*(L"pruned\*(R" thus:
3260.Sp
3261.Vb 9
3262\& command: 'find' name
3263\& | 'find' <commit> address
3264\& | 'do' <commit> command <uncommit>
3265\& 'at' time
3266\& 'if' <commit> condition
3267\& | 'do' <commit> command <uncommit>
3268\& 'at' <commit> time
3269\& | 'do' <commit> command
3270\& | unusual_command
3271.Ve
3272.Sp
3273There are several issues here. Firstly, should the
3274\&\f(CW\*(C`<autocommit>\*(C'\fR automatically install an \f(CW\*(C`<uncommit>\*(C'\fR
3275at the start of the last production (on the grounds that the \*(L"command\*(R"
3276rule doesn't know whether an \*(L"unusual_command\*(R" might start with \*(L"find\*(R"
3277or \*(L"do\*(R") or should the \*(L"unusual_command\*(R" subgraph be analysed (to see
3278if it \fImight\fR be viable after a \*(L"find\*(R" or \*(L"do\*(R")?
3279.Sp
3280The second issue is how regular expressions should be treated. The simplest
3281approach would be simply to uncommit before them (on the grounds that they
3282\&\fImight\fR match). Better efficiency would be obtained by analyzing all preceding
3283literal tokens to determine whether the pattern would match them.
3284.Sp
3285Overall, the issues are: can such automated \*(L"pruning\*(R" approach a hand-tuned
3286version sufficiently closely to warrant the extra set-up expense, and (more
3287importantly) is the problem important enough to even warrant the non-trivial
3288effort of building an automated solution?
3289.SH "COPYRIGHT"
3290.IX Header "COPYRIGHT"
3291Copyright (c) 1997\-2000, Damian Conway. All Rights Reserved.
3292This module is free software. It may be used, redistributed
3293and/or modified under the terms of the Perl Artistic License
3294 (see http://www.perl.com/perl/misc/Artistic.html)