Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / man / mann / re_syntax.n
CommitLineData
920dae64
AT
1'\"
2'\" Copyright (c) 1998 Sun Microsystems, Inc.
3'\" Copyright (c) 1999 Scriptics Corporation
4'\"
5'\" See the file "license.terms" for information on usage and redistribution
6'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
7'\"
8'\" RCS: @(#) $Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp $
9'\"
10'\" The definitions below are for supplemental macros used in Tcl/Tk
11'\" manual entries.
12'\"
13'\" .AP type name in/out ?indent?
14'\" Start paragraph describing an argument to a library procedure.
15'\" type is type of argument (int, etc.), in/out is either "in", "out",
16'\" or "in/out" to describe whether procedure reads or modifies arg,
17'\" and indent is equivalent to second arg of .IP (shouldn't ever be
18'\" needed; use .AS below instead)
19'\"
20'\" .AS ?type? ?name?
21'\" Give maximum sizes of arguments for setting tab stops. Type and
22'\" name are examples of largest possible arguments that will be passed
23'\" to .AP later. If args are omitted, default tab stops are used.
24'\"
25'\" .BS
26'\" Start box enclosure. From here until next .BE, everything will be
27'\" enclosed in one large box.
28'\"
29'\" .BE
30'\" End of box enclosure.
31'\"
32'\" .CS
33'\" Begin code excerpt.
34'\"
35'\" .CE
36'\" End code excerpt.
37'\"
38'\" .VS ?version? ?br?
39'\" Begin vertical sidebar, for use in marking newly-changed parts
40'\" of man pages. The first argument is ignored and used for recording
41'\" the version when the .VS was added, so that the sidebars can be
42'\" found and removed when they reach a certain age. If another argument
43'\" is present, then a line break is forced before starting the sidebar.
44'\"
45'\" .VE
46'\" End of vertical sidebar.
47'\"
48'\" .DS
49'\" Begin an indented unfilled display.
50'\"
51'\" .DE
52'\" End of indented unfilled display.
53'\"
54'\" .SO
55'\" Start of list of standard options for a Tk widget. The
56'\" options follow on successive lines, in four columns separated
57'\" by tabs.
58'\"
59'\" .SE
60'\" End of list of standard options for a Tk widget.
61'\"
62'\" .OP cmdName dbName dbClass
63'\" Start of description of a specific option. cmdName gives the
64'\" option's name as specified in the class command, dbName gives
65'\" the option's name in the option database, and dbClass gives
66'\" the option's class in the option database.
67'\"
68'\" .UL arg1 arg2
69'\" Print arg1 underlined, then print arg2 normally.
70'\"
71'\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $
72'\"
73'\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages.
'\" # In troff mode only (".if t"), plant a trap 1.3i above the bottom
'\" # of the page that springs the ^B macro.
74.if t .wh -1.3i ^B
'\" # Remember the current line length (\n(.l) in register ^l; the
'\" # box and sidebar macros use ^l to position their rules.
75.nr ^l \n(.l
'\" # Adjust text at both margins.
76.ad b
77'\" # Start an argument description
'\" # Usage: .AP type name in/out ?indent?
'\" # Paragraph indent: if a 4th argument is given it is passed to .TP;
'\" # otherwise, if a 2nd argument is given, .TP uses the width held in
'\" # register )C; otherwise .TP uses 15.
'\" # Tab stops are then set from registers )A and )B.
'\" # Tag line: with a 3rd argument the tag is "type name (in/out)" with
'\" # the name in italics; without it, a break is forced and the tag is
'\" # "type name" (name in italics) when a 2nd argument exists, or just
'\" # the 1st argument in italics when it does not.
78.de AP
79.ie !"\\$4"" .TP \\$4
80.el \{\
81. ie !"\\$2"" .TP \\n()Cu
82. el .TP 15
83.\}
84.ta \\n()Au \\n()Bu
85.ie !"\\$3"" \{\
86\&\\$1 \\fI\\$2\\fP (\\$3)
87.\".b
88.\}
89.el \{\
90.br
91.ie !"\\$2"" \{\
92\&\\$1 \\fI\\$2\\fP
93.\}
94.el \{\
95\&\\fI\\$1\\fP
96.\}
97.\}
98..
99'\" # define tabbing values for .AP
'\" # Usage: .AS ?type? ?name?
'\" # Register )A: 10n by default, or the width of the 1st argument
'\" # plus 3n when it is given.
'\" # Register )B: )A plus 15n by default, or the width of the 2nd
'\" # argument plus )A plus 3n when it is given.
'\" # Register )C: )B plus the width of the string "(in/out)" plus 2n.
100.de AS
101.nr )A 10n
102.if !"\\$1"" .nr )A \\w'\\$1'u+3n
103.nr )B \\n()Au+15n
104.\"
105.if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
106.nr )C \\n()Bu+\\w'(in/out)'u+2n
107..
'\" # Establish initial tab values from sample arguments. The macro
'\" # body reads only its first two arguments, so the third one here
'\" # ("in/out") has no effect.
108.AS Tcl_Interp Tcl_CreateInterp in/out
109'\" # BS - start boxed text
110'\" # ^y = starting y location
111'\" # ^b = 1
'\" # Forces a break, records the current vertical position in ^y and
'\" # sets ^b to 1 to flag that a box is open. In nroff mode only, it
'\" # also draws a full-width underscore rule (length \n(.l) in no-fill
'\" # mode with no temporary indent, then restores fill mode.
112.de BS
113.br
114.mk ^y
115.nr ^b 1u
116.if n .nf
117.if n .ti 0
118.if n \l'\\n(.lu\(ul'
119.if n .fi
120..
121'\" # BE - end boxed text (draw box now)
'\" # Switches to no-fill mode with no temporary indent and records the
'\" # current vertical position in ^t.
'\" # nroff: draws a single underscore rule whose length is register ^l.
'\" # troff: draws the box between the position saved in ^y and ^t.
'\" # When ^b is 1 (the value set by BS) all four sides are drawn; for
'\" # any other value (^B sets ^b to 2 after a page break) the top
'\" # edge is replaced by a horizontal motion, so only three sides are
'\" # drawn.
'\" # Finally fill mode is restored, a break is forced and ^b is reset
'\" # to 0.
122.de BE
123.nf
124.ti 0
125.mk ^t
126.ie n \l'\\n(^lu\(ul'
127.el \{\
128.\" Draw four-sided box normally, but don't draw top of
129.\" box if the box started on an earlier page.
130.ie !\\n(^b-1 \{\
131\h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
132.\}
.\" The else branch opens its own block here; it is closed by the
.\" first of the two closing braces that follow the drawing line.
133.el \{\
134\h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
135.\}
136.\}
137.fi
138.br
139.nr ^b 0
140..
141'\" # VS - start vertical sidebar
142'\" # ^Y = starting y location
143'\" # ^v = 1 (for troff; for nroff this doesn't matter)
'\" # Usage: .VS ?version? ?br?
'\" # The 1st argument is never read by the macro body. A non-empty
'\" # 2nd argument forces a line break first. The current vertical
'\" # position is then recorded in ^Y.
'\" # nroff: turns on a margin character (\(br at point size 12).
'\" # troff: sets ^v to 1 so that VE and ^B know a sidebar is open.
144.de VS
145.if !"\\$2"" .br
146.mk ^Y
147.ie n 'mc \s12\(br\s0
148.el .nr ^v 1u
149..
150'\" # VE - end of vertical sidebar
'\" # nroff: switches the margin character off again.
'\" # troff: works in environment 2, in no-fill mode with no temporary
'\" # indent. It records the current vertical position in ^t, moves to
'\" # horizontal position ^l+3n and draws a vertical line of \(bv
'\" # glyphs up to one line above the position saved in ^Y. It then
'\" # moves back down and left, backs up one line (.sp -1), restores
'\" # fill mode and returns to the previous environment.
'\" # In both modes ^v is reset to 0 at the end.
151.de VE
152.ie n 'mc
153.el \{\
154.ev 2
155.nf
156.ti 0
157.mk ^t
158\h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
159.sp -1
160.fi
161.ev
162.\}
163.nr ^v 0
164..
165'\" # Special macro to handle page bottom: finish off the current
166'\" # box/sidebar if in box/sidebar mode, then invoke the standard
167'\" # page bottom macro.
'\" # Steps, all carried out in environment 2:
'\" #  1. Clear the temporary indent and switch to no-fill mode without
'\" #     causing a break, then record the vertical position in ^t.
'\" #  2. If ^b is non-zero (a box is open), draw the box sides from
'\" #     ^y down to ^t; the top edge is drawn only when ^b is 1.
'\" #  3. If ^v is non-zero (a sidebar is open), draw the sidebar
'\" #     segment from ^Y down to ^t at horizontal position ^l+3n.
'\" #  4. Issue .bp, then restore fill mode and the previous
'\" #     environment.
'\" #  5. For an open box, re-mark ^y and set ^b to 2, so that later
'\" #     drawing treats the box as continued from an earlier page.
'\" #     For an open sidebar, re-mark ^Y.
168.de ^B
169.ev 2
170'ti 0
171'nf
172.mk ^t
173.if \\n(^b \{\
174.\" Draw three-sided box if this is the box's first page,
175.\" draw two sides but no top otherwise.
176.ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
177.el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
178.\}
179.if \\n(^v \{\
180.nr ^x \\n(^tu+1v-\\n(^Yu
181\kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
182.\}
183.bp
184'fi
185.ev
186.if \\n(^b \{\
187.mk ^y
188.nr ^b 2
189.\}
190.if \\n(^v \{\
191.mk ^Y
192.\}
193..
194'\" # DS - begin display
'\" # Starts a relative indent (.RS), switches to no-fill mode and
'\" # emits one blank line. Closed by .DE.
195.de DS
196.RS
197.nf
198.sp
199..
200'\" # DE - end display
'\" # Restores fill mode, ends the relative indent opened by .DS (.RE)
'\" # and emits one blank line.
201.de DE
202.fi
203.RE
204.sp
205..
206'\" # SO - start of list of standard options
'\" # Emits the "STANDARD OPTIONS" section heading, starts a paragraph,
'\" # switches to no-fill mode, sets tab stops at 5.5c and 11c and
'\" # selects the bold font for the option names that follow.
207.de SO
208.SH "STANDARD OPTIONS"
209.LP
210.nf
211.ta 5.5c 11c
212.ft B
213..
214'\" # SE - end of list of standard options
'\" # Restores fill mode and the roman font, starts a new paragraph and
'\" # prints a fixed sentence pointing at the "options" manual entry.
215.de SE
216.fi
217.ft R
218.LP
219See the \\fBoptions\\fR manual entry for details on the standard options.
220..
221'\" # OP - start of full description for a single option
'\" # Usage: .OP cmdName dbName dbClass
'\" # Starts a paragraph and, in no-fill mode with a tab stop at 4c,
'\" # prints three labelled lines showing the arguments in bold:
'\" # command-line name, database name and database class. Fill mode
'\" # is then restored and .IP opens the indented paragraph that holds
'\" # the option's description.
222.de OP
223.LP
224.nf
225.ta 4c
226Command-Line Name: \\fB\\$1\\fR
227Database Name: \\fB\\$2\\fR
228Database Class: \\fB\\$3\\fR
229.fi
230.IP
231..
232'\" # CS - begin code excerpt
'\" # Starts a relative indent (.RS), switches to no-fill mode and sets
'\" # tab stops every quarter inch up to 1i. Closed by .CE.
233.de CS
234.RS
235.nf
236.ta .25i .5i .75i 1i
237..
238'\" # CE - end code excerpt
'\" # Restores fill mode and ends the relative indent opened by .CS.
239.de CE
240.fi
241.RE
242..
'\" # UL - underline one argument
'\" # Usage: .UL arg1 arg2
'\" # Prints arg1, draws an underscore rule from the current position
'\" # back to horizontal position 0 (\l'|0\(ul'), then prints arg2
'\" # immediately after arg1 with no underline.
243.de UL
244\\$1\l'|0\(ul'\\$2
245..
246.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
247.BS
248.SH NAME
249re_syntax \- Syntax of Tcl regular expressions.
250.BE
251
252.SH DESCRIPTION
253.PP
254A \fIregular expression\fR describes strings of characters.
255It's a pattern that matches certain strings and doesn't match others.
256
257.SH "DIFFERENT FLAVORS OF REs"
258Regular expressions (``RE''s), as defined by POSIX, come in two
259flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
260EREs are roughly those of the traditional \fIegrep\fR, while BREs are
261roughly those of the traditional \fIed\fR. This implementation adds
262a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
263some significant extensions.
264.PP
265This manual page primarily describes AREs. BREs mostly exist for
266backward compatibility in some old programs; they will be discussed at
267the end. POSIX EREs are almost an exact subset of AREs. Features of
268AREs that are not present in EREs will be indicated.
269
270.SH "REGULAR EXPRESSION SYNTAX"
271.PP
272Tcl regular expressions are implemented using the package written by
273Henry Spencer, based on the 1003.2 spec and some (not quite all) of
274the Perl5 extensions (thanks, Henry!). Much of the description of
275regular expressions below is copied verbatim from his manual entry.
276.PP
277An ARE is one or more \fIbranches\fR,
278separated by `\fB|\fR',
279matching anything that matches any of the branches.
280.PP
281A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
282concatenated.
283It matches a match for the first, followed by a match for the second, etc;
284an empty branch matches the empty string.
285.PP
286A quantified atom is an \fIatom\fR possibly followed
287by a single \fIquantifier\fR.
288Without a quantifier, it matches a match for the atom.
289The quantifiers,
290and what a so-quantified atom matches, are:
291.RS 2
292.TP 6
293\fB*\fR
294a sequence of 0 or more matches of the atom
295.TP
296\fB+\fR
297a sequence of 1 or more matches of the atom
298.TP
299\fB?\fR
300a sequence of 0 or 1 matches of the atom
301.TP
302\fB{\fIm\fB}\fR
303a sequence of exactly \fIm\fR matches of the atom
304.TP
305\fB{\fIm\fB,}\fR
306a sequence of \fIm\fR or more matches of the atom
307.TP
308\fB{\fIm\fB,\fIn\fB}\fR
309a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
310\fIm\fR may not exceed \fIn\fR
311.TP
312\fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR
313\fInon-greedy\fR quantifiers,
314which match the same possibilities,
315but prefer the smallest number rather than the largest number
316of matches (see MATCHING)
317.RE
318.PP
319The forms using
320\fB{\fR and \fB}\fR
321are known as \fIbound\fRs.
322The numbers
323\fIm\fR and \fIn\fR are unsigned decimal integers
324with permissible values from 0 to 255 inclusive.
325.PP
326An atom is one of:
327.RS 2
328.TP 6
329\fB(\fIre\fB)\fR
330(where \fIre\fR is any regular expression)
331matches a match for
332\fIre\fR, with the match noted for possible reporting
333.TP
334\fB(?:\fIre\fB)\fR
335as previous,
336but does no reporting
337(a ``non-capturing'' set of parentheses)
338.TP
339\fB()\fR
340matches an empty string,
341noted for possible reporting
342.TP
343\fB(?:)\fR
344matches an empty string,
345without reporting
346.TP
347\fB[\fIchars\fB]\fR
348a \fIbracket expression\fR,
349matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
350.TP
351 \fB.\fR
352matches any single character
353.TP
354\fB\e\fIk\fR
355(where \fIk\fR is a non-alphanumeric character)
356matches that character taken as an ordinary character,
357e.g. \e\e matches a backslash character
358.TP
359\fB\e\fIc\fR
360where \fIc\fR is alphanumeric
361(possibly followed by other characters),
362an \fIescape\fR (AREs only),
363see ESCAPES below
364.TP
365\fB{\fR
366when followed by a character other than a digit,
367matches the left-brace character `\fB{\fR';
368when followed by a digit, it is the beginning of a
369\fIbound\fR (see above)
370.TP
371\fIx\fR
372where \fIx\fR is
373a single character with no other significance, matches that character.
374.RE
375.PP
376A \fIconstraint\fR matches an empty string when specific conditions
377are met.
378A constraint may not be followed by a quantifier.
379The simple constraints are as follows; some more constraints are
380described later, under ESCAPES.
381.RS 2
382.TP 8
383\fB^\fR
384matches at the beginning of a line
385.TP
386\fB$\fR
387matches at the end of a line
388.TP
389\fB(?=\fIre\fB)\fR
390\fIpositive lookahead\fR (AREs only), matches at any point
391where a substring matching \fIre\fR begins
392.TP
393\fB(?!\fIre\fB)\fR
394\fInegative lookahead\fR (AREs only), matches at any point
395where no substring matching \fIre\fR begins
396.RE
397.PP
398The lookahead constraints may not contain back references (see later),
399and all parentheses within them are considered non-capturing.
400.PP
401An RE may not end with `\fB\e\fR'.
402
403.SH "BRACKET EXPRESSIONS"
404A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
405It normally matches any single character from the list (but see below).
406If the list begins with `\fB^\fR',
407it matches any single character
408(but see below) \fInot\fR from the rest of the list.
409.PP
410If two characters in the list are separated by `\fB\-\fR',
411this is shorthand
412for the full \fIrange\fR of characters between those two (inclusive) in the
413collating sequence,
414e.g.
415\fB[0\-9]\fR
416in ASCII matches any decimal digit.
417Two ranges may not share an
418endpoint, so e.g.
419\fBa\-c\-e\fR
420is illegal.
421Ranges are very collating-sequence-dependent,
422and portable programs should avoid relying on them.
423.PP
424To include a literal
425\fB]\fR
426or
427\fB\-\fR
428in the list,
429the simplest method is to
430enclose it in
431\fB[.\fR and \fB.]\fR
432to make it a collating element (see below).
433Alternatively,
434make it the first character
435(following a possible `\fB^\fR'),
436or (AREs only) precede it with `\fB\e\fR'.
437Alternatively, for `\fB\-\fR',
438make it the last character,
439or the second endpoint of a range.
440To use a literal
441\fB\-\fR
442as the first endpoint of a range,
443make it a collating element
444or (AREs only) precede it with `\fB\e\fR'.
445With the exception of these, some combinations using
446\fB[\fR
447(see next
448paragraphs), and escapes,
449all other special characters lose their
450special significance within a bracket expression.
451.PP
452Within a bracket expression, a collating element (a character,
453a multi-character sequence that collates as if it were a single character,
454or a collating-sequence name for either)
455enclosed in
456\fB[.\fR and \fB.]\fR
457stands for the
458sequence of characters of that collating element.
459The sequence is a single element of the bracket expression's list.
460A bracket expression in a locale that has
461multi-character collating elements
462can thus match more than one character.
463.VS 8.2
464So (insidiously), a bracket expression that starts with \fB^\fR
465can match multi-character collating elements even if none of them
466appear in the bracket expression!
467(\fINote:\fR Tcl currently has no multi-character collating elements.
468This information is only for illustration.)
469.PP
470For example, assume the collating sequence includes a \fBch\fR
471multi-character collating element.
472Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
473matches the first five characters of `\fBchchcc\fR'.
474Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
475(because \fB[^c]\fR matches the multi-character \fBch\fR).
476.VE 8.2
477.PP
478Within a bracket expression, a collating element enclosed in
479\fB[=\fR
480and
481\fB=]\fR
482is an equivalence class, standing for the sequences of characters
483of all collating elements equivalent to that one, including itself.
484(If there are no other equivalent collating elements,
485the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
486and `\fB.]\fR'.)
487For example, if
488\fBo\fR
489and
490\fB\o'o^'\fR
491are the members of an equivalence class,
492then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
493and `\fB[o\o'o^']\fR'\&
494are all synonymous.
495An equivalence class may not be an endpoint
496of a range.
497.VS 8.2
498(\fINote:\fR
499Tcl currently implements only the Unicode locale.
500It doesn't define any equivalence classes.
501The examples above are just illustrations.)
502.VE 8.2
503.PP
504Within a bracket expression, the name of a \fIcharacter class\fR enclosed
505in
506\fB[:\fR
507and
508\fB:]\fR
509stands for the list of all characters
510(not all collating elements!)
511belonging to that
512class.
513Standard character classes are:
514.PP
515.RS
516.ne 5
517.nf
518.ta 3c
519\fBalpha\fR A letter.
520\fBupper\fR An upper-case letter.
521\fBlower\fR A lower-case letter.
522\fBdigit\fR A decimal digit.
523\fBxdigit\fR A hexadecimal digit.
524\fBalnum\fR An alphanumeric (letter or digit).
525\fBprint\fR An alphanumeric (same as alnum).
526\fBblank\fR A space or tab character.
527\fBspace\fR A character producing white space in displayed text.
528\fBpunct\fR A punctuation character.
529\fBgraph\fR A character with a visible representation.
530\fBcntrl\fR A control character.
531.fi
532.RE
533.PP
534A locale may provide others.
535.VS 8.2
536(Note that the current Tcl implementation has only one locale:
537the Unicode locale.)
538.VE 8.2
539A character class may not be used as an endpoint of a range.
540.PP
541There are two special cases of bracket expressions:
542the bracket expressions
543\fB[[:<:]]\fR
544and
545\fB[[:>:]]\fR
546are constraints, matching empty strings at
547the beginning and end of a word respectively.
548'\" note, discussion of escapes below references this definition of word
549A word is defined as a sequence of
550word characters
551that is neither preceded nor followed by
552word characters.
553A word character is an
554\fIalnum\fR
555character
556or an underscore
557(\fB_\fR).
558These special bracket expressions are deprecated;
559users of AREs should use constraint escapes instead (see below).
560.SH ESCAPES
561Escapes (AREs only), which begin with a
562\fB\e\fR
563followed by an alphanumeric character,
564come in several varieties:
565character entry, class shorthands, constraint escapes, and back references.
566A
567\fB\e\fR
568followed by an alphanumeric character but not constituting
569a valid escape is illegal in AREs.
570In EREs, there are no escapes:
571outside a bracket expression,
572a
573\fB\e\fR
574followed by an alphanumeric character merely stands for that
575character as an ordinary character,
576and inside a bracket expression,
577\fB\e\fR
578is an ordinary character.
579(The latter is the one actual incompatibility between EREs and AREs.)
580.PP
581Character-entry escapes (AREs only) exist to make it easier to specify
582non-printing and otherwise inconvenient characters in REs:
583.RS 2
584.TP 5
585\fB\ea\fR
586alert (bell) character, as in C
587.TP
588\fB\eb\fR
589backspace, as in C
590.TP
591\fB\eB\fR
592synonym for
593\fB\e\fR
594to help reduce backslash doubling in some
595applications where there are multiple levels of backslash processing
596.TP
597\fB\ec\fIX\fR
598(where \fIX\fR is any character) the character whose
599low-order 5 bits are the same as those of
600\fIX\fR,
601and whose other bits are all zero
602.TP
603\fB\ee\fR
604the character whose collating-sequence name
605is `\fBESC\fR',
606or failing that, the character with octal value 033
607.TP
608\fB\ef\fR
609formfeed, as in C
610.TP
611\fB\en\fR
612newline, as in C
613.TP
614\fB\er\fR
615carriage return, as in C
616.TP
617\fB\et\fR
618horizontal tab, as in C
619.TP
620\fB\eu\fIwxyz\fR
621(where
622\fIwxyz\fR
623is exactly four hexadecimal digits)
624the Unicode character
625\fBU+\fIwxyz\fR
626in the local byte ordering
627.TP
628\fB\eU\fIstuvwxyz\fR
629(where
630\fIstuvwxyz\fR
631is exactly eight hexadecimal digits)
632reserved for a somewhat-hypothetical Unicode extension to 32 bits
633.TP
634\fB\ev\fR
635vertical tab, as in C
637.TP
638\fB\ex\fIhhh\fR
639(where
640\fIhhh\fR
641is any sequence of hexadecimal digits)
642the character whose hexadecimal value is
643\fB0x\fIhhh\fR
644(a single character no matter how many hexadecimal digits are used).
645.TP
646\fB\e0\fR
647the character whose value is
648\fB0\fR
649.TP
650\fB\e\fIxy\fR
651(where
652\fIxy\fR
653is exactly two octal digits,
654and is not a
655\fIback reference\fR (see below))
656the character whose octal value is
657\fB0\fIxy\fR
658.TP
659\fB\e\fIxyz\fR
660(where
661\fIxyz\fR
662is exactly three octal digits,
663and is not a
664back reference (see below))
665the character whose octal value is
666\fB0\fIxyz\fR
667.RE
668.PP
669Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
670and `\fBA\fR'-`\fBF\fR'.
671Octal digits are `\fB0\fR'-`\fB7\fR'.
672.PP
673The character-entry escapes are always taken as ordinary characters.
674For example,
675\fB\e135\fR
676is
677\fB]\fR
678in ASCII,
679but
680\fB\e135\fR
681does not terminate a bracket expression.
682Beware, however, that some applications (e.g., C compilers) interpret
683such sequences themselves before the regular-expression package
684gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
685.PP
686Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
687character classes:
688.RS 2
689.TP 10
690\fB\ed\fR
691\fB[[:digit:]]\fR
692.TP
693\fB\es\fR
694\fB[[:space:]]\fR
695.TP
696\fB\ew\fR
697\fB[[:alnum:]_]\fR
698(note underscore)
699.TP
700\fB\eD\fR
701\fB[^[:digit:]]\fR
702.TP
703\fB\eS\fR
704\fB[^[:space:]]\fR
705.TP
706\fB\eW\fR
707\fB[^[:alnum:]_]\fR
708(note underscore)
709.RE
710.PP
711Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
712and `\fB\ew\fR'\&
713lose their outer brackets,
714and `\fB\eD\fR', `\fB\eS\fR',
715and `\fB\eW\fR'\&
716are illegal.
717.VS 8.2
718(So, for example, \fB[a\-c\ed]\fR is equivalent to \fB[a\-c[:digit:]]\fR.
719Also, \fB[a\-c\eD]\fR, which is equivalent to \fB[a\-c^[:digit:]]\fR, is illegal.)
720.VE 8.2
721.PP
722A constraint escape (AREs only) is a constraint,
723matching the empty string if specific conditions are met,
724written as an escape:
725.RS 2
726.TP 6
727\fB\eA\fR
728matches only at the beginning of the string
729(see MATCHING, below, for how this differs from `\fB^\fR')
730.TP
731\fB\em\fR
732matches only at the beginning of a word
733.TP
734\fB\eM\fR
735matches only at the end of a word
736.TP
737\fB\ey\fR
738matches only at the beginning or end of a word
739.TP
740\fB\eY\fR
741matches only at a point that is not the beginning or end of a word
742.TP
743\fB\eZ\fR
744matches only at the end of the string
745(see MATCHING, below, for how this differs from `\fB$\fR')
746.TP
747\fB\e\fIm\fR
748(where
749\fIm\fR
750is a nonzero digit) a \fIback reference\fR, see below
751.TP
752\fB\e\fImnn\fR
753(where
754\fIm\fR
755is a nonzero digit, and
756\fInn\fR
757is some more digits,
758and the decimal value
759\fImnn\fR
760is not greater than the number of closing capturing parentheses seen so far)
761a \fIback reference\fR, see below
762.RE
763.PP
764A word is defined as in the specification of
765\fB[[:<:]]\fR
766and
767\fB[[:>:]]\fR
768above.
769Constraint escapes are illegal within bracket expressions.
770.PP
771A back reference (AREs only) matches the same string matched by the parenthesized
772subexpression specified by the number,
773so that (e.g.)
774\fB([bc])\e1\fR
775matches
776\fBbb\fR
777or
778\fBcc\fR
779but not `\fBbc\fR'.
780The subexpression must entirely precede the back reference in the RE.
781Subexpressions are numbered in the order of their leading parentheses.
782Non-capturing parentheses do not define subexpressions.
783.PP
784There is an inherent historical ambiguity between octal character-entry
785escapes and back references, which is resolved by heuristics,
786as hinted at above.
787A leading zero always indicates an octal escape.
788A single non-zero digit, not followed by another digit,
789is always taken as a back reference.
790A multi-digit sequence not starting with a zero is taken as a back
791reference if it comes after a suitable subexpression
792(i.e. the number is in the legal range for a back reference),
793and otherwise is taken as octal.
794.SH "METASYNTAX"
795In addition to the main syntax described above, there are some special
796forms and miscellaneous syntactic facilities available.
797.PP
798Normally the flavor of RE being used is specified by
799application-dependent means.
800However, this can be overridden by a \fIdirector\fR.
801If an RE of any flavor begins with `\fB***:\fR',
802the rest of the RE is an ARE.
803If an RE of any flavor begins with `\fB***=\fR',
804the rest of the RE is taken to be a literal string,
805with all characters considered ordinary characters.
806.PP
807An ARE may begin with \fIembedded options\fR:
808a sequence
809\fB(?\fIxyz\fB)\fR
810(where
811\fIxyz\fR
812is one or more alphabetic characters)
813specifies options affecting the rest of the RE.
814These supplement, and can override,
815any options specified by the application.
816The available option letters are:
817.RS 2
818.TP 3
819\fBb\fR
820rest of RE is a BRE
821.TP 3
822\fBc\fR
823case-sensitive matching (usual default)
824.TP 3
825\fBe\fR
826rest of RE is an ERE
827.TP 3
828\fBi\fR
829case-insensitive matching (see MATCHING, below)
830.TP 3
831\fBm\fR
832historical synonym for
833\fBn\fR
834.TP 3
835\fBn\fR
836newline-sensitive matching (see MATCHING, below)
837.TP 3
838\fBp\fR
839partial newline-sensitive matching (see MATCHING, below)
840.TP 3
841\fBq\fR
842rest of RE is a literal (``quoted'') string, all ordinary characters
843.TP 3
844\fBs\fR
845non-newline-sensitive matching (usual default)
846.TP 3
847\fBt\fR
848tight syntax (usual default; see below)
849.TP 3
850\fBw\fR
851inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
852.TP 3
853\fBx\fR
854expanded syntax (see below)
855.RE
856.PP
857Embedded options take effect at the
858\fB)\fR
859terminating the sequence.
860They are available only at the start of an ARE,
861and may not be used later within it.
862.PP
863In addition to the usual (\fItight\fR) RE syntax, in which all characters are
864significant, there is an \fIexpanded\fR syntax,
865available in all flavors of RE
866with the \fB\-expanded\fR switch, or in AREs with the embedded \fBx\fR option.
867In the expanded syntax,
868white-space characters are ignored
869and all characters between a
870\fB#\fR
871and the following newline (or the end of the RE) are ignored,
872permitting paragraphing and commenting a complex RE.
873There are three exceptions to that basic rule:
874.RS 2
875.PP
876a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
877.PP
878white space or `\fB#\fR' within a bracket expression is retained
879.PP
880white space and comments are illegal within multi-character symbols
881like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
882.RE
883.PP
884Expanded-syntax white-space characters are blank, tab, newline, and
885.VS 8.2
886any character that belongs to the \fIspace\fR character class.
887.VE 8.2
888.PP
889Finally, in an ARE,
890outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
891(where
892\fIttt\fR
893is any text not containing a `\fB)\fR')
894is a comment,
895completely ignored.
896Again, this is not allowed between the characters of
897multi-character symbols like `\fB(?:\fR'.
898Such comments are more a historical artifact than a useful facility,
899and their use is deprecated;
900use the expanded syntax instead.
901.PP
902\fINone\fR of these metasyntax extensions is available if the application
903(or an initial
904\fB***=\fR
905director)
906has specified that the user's input be treated as a literal string
907rather than as an RE.
908.SH MATCHING
909In the event that an RE could match more than one substring of a given
910string,
911the RE matches the one starting earliest in the string.
912If the RE could match more than one substring starting at that point,
913its choice is determined by its \fIpreference\fR:
914either the longest substring, or the shortest.
915.PP
916Most atoms, and all constraints, have no preference.
917A parenthesized RE has the same preference (possibly none) as the RE.
918A quantified atom with quantifier
919\fB{\fIm\fB}\fR
920or
921\fB{\fIm\fB}?\fR
922has the same preference (possibly none) as the atom itself.
923A quantified atom with other normal quantifiers (including
924\fB{\fIm\fB,\fIn\fB}\fR
925with
926\fIm\fR
927equal to
928\fIn\fR)
929prefers longest match.
930A quantified atom with other non-greedy quantifiers (including
931\fB{\fIm\fB,\fIn\fB}?\fR
932with
933\fIm\fR
934equal to
935\fIn\fR)
936prefers shortest match.
937A branch has the same preference as the first quantified atom in it
938which has a preference.
939An RE consisting of two or more branches connected by the
940\fB|\fR
941operator prefers longest match.
942.PP
943Subject to the constraints imposed by the rules for matching the whole RE,
944subexpressions also match the longest or shortest possible substrings,
945based on their preferences,
946with subexpressions starting earlier in the RE taking priority over
947ones starting later.
948Note that outer subexpressions thus take priority over
949their component subexpressions.
950.PP
951Note that the quantifiers
952\fB{1,1}\fR
953and
954\fB{1,1}?\fR
955can be used to force longest and shortest preference, respectively,
956on a subexpression or a whole RE.
957.PP
958Match lengths are measured in characters, not collating elements.
959An empty string is considered longer than no match at all.
960For example,
961\fBbb*\fR
962matches the three middle characters of `\fBabbbc\fR',
963\fB(week|wee)(night|knights)\fR
964matches all ten characters of `\fBweeknights\fR',
965when
966\fB(.*).*\fR
967is matched against
968\fBabc\fR
969the parenthesized subexpression
970matches all three characters, and
971when
972\fB(a*)*\fR
973is matched against
974\fBbc\fR
975both the whole RE and the parenthesized
976subexpression match an empty string.
977.PP
978If case-independent matching is specified,
979the effect is much as if all case distinctions had vanished from the
980alphabet.
981When an alphabetic that exists in multiple cases appears as an
982ordinary character outside a bracket expression, it is effectively
983transformed into a bracket expression containing both cases,
984so that
985\fBx\fR
986becomes `\fB[xX]\fR'.
987When it appears inside a bracket expression, all case counterparts
988of it are added to the bracket expression, so that
989\fB[x]\fR
990becomes
991\fB[xX]\fR
992and
993\fB[^x]\fR
994becomes `\fB[^xX]\fR'.
995.PP
996If newline-sensitive matching is specified, \fB.\fR
997and bracket expressions using
998\fB^\fR
999will never match the newline character
1000(so that matches will never cross newlines unless the RE
1001explicitly arranges it)
1002and
1003\fB^\fR
1004and
1005\fB$\fR
1006will match the empty string after and before a newline
1007respectively, in addition to matching at beginning and end of string
1008respectively.
1009ARE
1010\fB\eA\fR
1011and
1012\fB\eZ\fR
1013continue to match beginning or end of string \fIonly\fR.
1014.PP
1015If partial newline-sensitive matching is specified,
1016this affects \fB.\fR
1017and bracket expressions
1018as with newline-sensitive matching, but not
1019\fB^\fR
1020and `\fB$\fR'.
1021.PP
1022If inverse partial newline-sensitive matching is specified,
1023this affects
1024\fB^\fR
1025and
1026\fB$\fR
1027as with
1028newline-sensitive matching,
1029but not \fB.\fR
1030and bracket expressions.
1031This isn't very useful but is provided for symmetry.
1032.SH "LIMITS AND COMPATIBILITY"
1033No particular limit is imposed on the length of REs.
1034Programs intended to be highly portable should not employ REs longer
1035than 256 bytes,
1036as a POSIX-compliant implementation can refuse to accept such REs.
1037.PP
1038The only feature of AREs that is actually incompatible with
1039POSIX EREs is that
1040\fB\e\fR
1041does not lose its special
1042significance inside bracket expressions.
1043All other ARE features use syntax which is illegal or has
1044undefined or unspecified effects in POSIX EREs;
1045the
1046\fB***\fR
1047syntax of directors likewise is outside the POSIX
1048syntax for both BREs and EREs.
1049.PP
1050Many of the ARE extensions are borrowed from Perl, but some have
1051been changed to clean them up, and a few Perl extensions are not present.
1052Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
1053the lack of special treatment for a trailing newline,
1054the addition of complemented bracket expressions to the things
1055affected by newline-sensitive matching,
1056the restrictions on parentheses and back references in lookahead constraints,
1057and the longest/shortest-match (rather than first-match) matching semantics.
1058.PP
1059The matching rules for REs containing both normal and non-greedy quantifiers
1060have changed since early beta-test versions of this package.
1061(The new rules are much simpler and cleaner,
1062but don't work as hard at guessing the user's real intentions.)
1063.PP
1064Henry Spencer's original 1986 \fIregexp\fR package,
1065still in widespread use (e.g., in pre-8.1 releases of Tcl),
1066implemented an early version of today's EREs.
1067There are four incompatibilities between \fIregexp\fR's near-EREs
1068(`RREs' for short) and AREs.
1069In roughly increasing order of significance:
1070.PP
1071.RS
1072In AREs,
1073\fB\e\fR
1074followed by an alphanumeric character is either an
1075escape or an error,
1076while in RREs, it was just another way of writing the
1077alphanumeric.
1078This should not be a problem because there was no reason to write
1079such a sequence in RREs.
1080.PP
1081\fB{\fR
1082followed by a digit in an ARE is the beginning of a bound,
1083while in RREs,
1084\fB{\fR
1085was always an ordinary character.
1086Such sequences should be rare,
1087and will often result in an error because following characters
1088will not look like a valid bound.
1089.PP
1090In AREs,
1091\fB\e\fR
1092remains a special character within `\fB[\|]\fR',
1093so a literal
1094\fB\e\fR
1095within
1096\fB[\|]\fR
1097must be written `\fB\e\e\fR'.
1098\fB\e\e\fR
1099also gives a literal
1100\fB\e\fR
1101within
1102\fB[\|]\fR
1103in RREs,
1104but only truly paranoid programmers routinely doubled the backslash.
1105.PP
1106AREs report the longest/shortest match for the RE,
1107rather than the first found in a specified search order.
1108This may affect some RREs which were written in the expectation that
1109the first match would be reported.
1110(The careful crafting of RREs to optimize the search order for fast
1111matching is obsolete (AREs examine all possible matches
1112in parallel, and their performance is largely insensitive to their
1113complexity) but cases where the search order was exploited to deliberately
1114find a match which was \fInot\fR the longest/shortest will need rewriting.)
1115.RE
1116
1117.SH "BASIC REGULAR EXPRESSIONS"
1118BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR',
1119and
1120\fB?\fR
1121are ordinary characters and there is no equivalent
1122for their functionality.
1123The delimiters for bounds are
1124\fB\e{\fR
1125and `\fB\e}\fR',
1126with
1127\fB{\fR
1128and
1129\fB}\fR
1130by themselves ordinary characters.
1131The parentheses for nested subexpressions are
1132\fB\e(\fR
1133and `\fB\e)\fR',
1134with
1135\fB(\fR
1136and
1137\fB)\fR
1138by themselves ordinary characters.
1139\fB^\fR
1140is an ordinary character except at the beginning of the
1141RE or the beginning of a parenthesized subexpression,
1142\fB$\fR
1143is an ordinary character except at the end of the
1144RE or the end of a parenthesized subexpression,
1145and
1146\fB*\fR
1147is an ordinary character if it appears at the beginning of the
1148RE or the beginning of a parenthesized subexpression
1149(after a possible leading `\fB^\fR').
1150Finally,
1151single-digit back references are available,
1152and
1153\fB\e<\fR
1154and
1155\fB\e>\fR
1156are synonyms for
1157\fB[[:<:]]\fR
1158and
1159\fB[[:>:]]\fR
1160respectively;
1161no other escapes are available.
1162
1163.SH "SEE ALSO"
1164RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
1165
1166.SH KEYWORDS
1167match, regular expression, string