Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | '\" |
2 | '\" Copyright (c) 1998 Sun Microsystems, Inc. | |
3 | '\" Copyright (c) 1999 Scriptics Corporation | |
4 | '\" | |
5 | '\" See the file "license.terms" for information on usage and redistribution | |
6 | '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. | |
7 | '\" | |
8 | '\" RCS: @(#) $Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp $ | |
9 | '\" | |
10 | '\" The definitions below are for supplemental macros used in Tcl/Tk | |
11 | '\" manual entries. | |
12 | '\" | |
13 | '\" .AP type name in/out ?indent? | |
14 | '\" Start paragraph describing an argument to a library procedure. | |
15 | '\" type is type of argument (int, etc.), in/out is either "in", "out", | |
16 | '\" or "in/out" to describe whether procedure reads or modifies arg, | |
17 | '\" and indent is equivalent to second arg of .IP (shouldn't ever be | |
18 | '\" needed; use .AS below instead) | |
19 | '\" | |
20 | '\" .AS ?type? ?name? | |
21 | '\" Give maximum sizes of arguments for setting tab stops. Type and | |
22 | '\" name are examples of largest possible arguments that will be passed | |
23 | '\" to .AP later. If args are omitted, default tab stops are used. | |
24 | '\" | |
25 | '\" .BS | |
26 | '\" Start box enclosure. From here until next .BE, everything will be | |
27 | '\" enclosed in one large box. | |
28 | '\" | |
29 | '\" .BE | |
30 | '\" End of box enclosure. | |
31 | '\" | |
32 | '\" .CS | |
33 | '\" Begin code excerpt. | |
34 | '\" | |
35 | '\" .CE | |
36 | '\" End code excerpt. | |
37 | '\" | |
38 | '\" .VS ?version? ?br? | |
39 | '\" Begin vertical sidebar, for use in marking newly-changed parts | |
40 | '\" of man pages. The first argument is ignored and used for recording | |
41 | '\" the version when the .VS was added, so that the sidebars can be | |
42 | '\" found and removed when they reach a certain age. If another argument | |
43 | '\" is present, then a line break is forced before starting the sidebar. | |
44 | '\" | |
45 | '\" .VE | |
46 | '\" End of vertical sidebar. | |
47 | '\" | |
48 | '\" .DS | |
49 | '\" Begin an indented unfilled display. | |
50 | '\" | |
51 | '\" .DE | |
52 | '\" End of indented unfilled display. | |
53 | '\" | |
54 | '\" .SO | |
55 | '\" Start of list of standard options for a Tk widget. The | |
56 | '\" options follow on successive lines, in four columns separated | |
57 | '\" by tabs. | |
58 | '\" | |
59 | '\" .SE | |
60 | '\" End of list of standard options for a Tk widget. | |
61 | '\" | |
62 | '\" .OP cmdName dbName dbClass | |
63 | '\" Start of description of a specific option. cmdName gives the | |
64 | '\" option's name as specified in the class command, dbName gives | |
65 | '\" the option's name in the option database, and dbClass gives | |
66 | '\" the option's class in the option database. | |
67 | '\" | |
68 | '\" .UL arg1 arg2 | |
69 | '\" Print arg1 underlined, then print arg2 normally. | |
70 | '\" | |
71 | '\" RCS: @(#) $Id: man.macros,v 1.4 2000/08/25 06:18:32 ericm Exp $ | |
72 | '\" | |
73 | '\" # Global setup for Tcl/Tk man pages: in troff mode plant a trap 1.3i above the page bottom that runs ^B, save the current line length in register ^l, and adjust text to both margins. | |
74 | .if t .wh -1.3i ^B | |
75 | .nr ^l \n(.l | |
76 | .ad b | |
77 | '\" # AP type name in/out ?indent? - start an argument description: open a tagged paragraph (indent $4 if given, else register )C when a name is given, else 15), set tab stops from registers )A and )B, then print the type, the name in italics and, if given, the in/out tag in parentheses; with only one argument it is printed in italics by itself. | |
78 | .de AP | |
79 | .ie !"\\$4"" .TP \\$4 | |
80 | .el \{\ | |
81 | . ie !"\\$2"" .TP \\n()Cu | |
82 | . el .TP 15 | |
83 | .\} | |
84 | .ta \\n()Au \\n()Bu | |
85 | .ie !"\\$3"" \{\ | |
86 | \&\\$1 \\fI\\$2\\fP (\\$3) | |
87 | .\".b | |
88 | .\} | |
89 | .el \{\ | |
90 | .br | |
91 | .ie !"\\$2"" \{\ | |
92 | \&\\$1 \\fI\\$2\\fP | |
93 | .\} | |
94 | .el \{\ | |
95 | \&\\fI\\$1\\fP | |
96 | .\} | |
97 | .\} | |
98 | .. | |
99 | '\" # AS ?type? ?name? - define tabbing values for .AP: register )A is the width of type plus 3n (10n if omitted), )B is )A plus the width of name plus 3n (plus 15n if omitted), and )C is )B plus the width of "(in/out)" plus 2n. The call right after the definition installs defaults sized for Tcl_Interp and Tcl_CreateInterp. | |
100 | .de AS | |
101 | .nr )A 10n | |
102 | .if !"\\$1"" .nr )A \\w'\\$1'u+3n | |
103 | .nr )B \\n()Au+15n | |
104 | .\" | |
105 | .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n | |
106 | .nr )C \\n()Bu+\\w'(in/out)'u+2n | |
107 | .. | |
108 | .AS Tcl_Interp Tcl_CreateInterp in/out | |
109 | '\" # BS - start boxed text: break the current line, mark the vertical position, and (nroff only) draw an unindented rule as wide as the line length | |
110 | '\" # ^y = starting y location of the box | |
111 | '\" # ^b = 1 (box open and still on its first page; ^B changes it to 2, BE resets it to 0) | |
112 | .de BS | |
113 | .br | |
114 | .mk ^y | |
115 | .nr ^b 1u | |
116 | .if n .nf | |
117 | .if n .ti 0 | |
118 | .if n \l'\\n(.lu\(ul' | |
119 | .if n .fi | |
120 | .. | |
121 | '\" # BE - end boxed text (draw box now): nroff draws a closing rule of length ^l; troff draws the box from the position saved in ^y down to the current position (^t), then ^b is reset to 0 | |
122 | .de BE | |
123 | .nf | |
124 | .ti 0 | |
125 | .mk ^t | |
126 | .ie n \l'\\n(^lu\(ul' | |
127 | .el \{\ | |
128 | .\" Draw four-sided box normally (^b = 1), but don't draw top of | |
129 | .\" box if the box started on an earlier page (^b = 2). | |
130 | .ie !\\n(^b-1 \{\ | |
131 | \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' | |
132 | .\} | |
133 | .el \{\ | |
134 | \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' | |
135 | .\} | |
136 | .\} | |
137 | .fi | |
138 | .br | |
139 | .nr ^b 0 | |
140 | .. | |
141 | '\" # VS ?version? ?br? - start vertical sidebar: force a line break first if a second argument is present, mark the vertical position, then turn on the margin character (nroff) or set ^v (troff) | |
142 | '\" # ^Y = starting y location of the sidebar | |
143 | '\" # ^v = 1 (for troff; for nroff this doesn't matter) | |
144 | .de VS | |
145 | .if !"\\$2"" .br | |
146 | .mk ^Y | |
147 | .ie n 'mc \s12\(br\s0 | |
148 | .el .nr ^v 1u | |
149 | .. | |
150 | '\" # VE - end of vertical sidebar: nroff turns the margin character off; troff switches to environment 2 and draws the bar 3n beyond the saved line length ^l, between the current position (^t) and the position saved in ^Y. Resets ^v to 0. | |
151 | .de VE | |
152 | .ie n 'mc | |
153 | .el \{\ | |
154 | .ev 2 | |
155 | .nf | |
156 | .ti 0 | |
157 | .mk ^t | |
158 | \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n' | |
159 | .sp -1 | |
160 | .fi | |
161 | .ev | |
162 | .\} | |
163 | .nr ^v 0 | |
164 | .. | |
165 | '\" # ^B - special macro to handle page bottom: finish off the current | |
166 | '\" # box/sidebar if in box/sidebar mode (^b / ^v nonzero), then start a new page with .bp | |
167 | '\" # and re-mark the start position (^y / ^Y) on the new page; an open box gets ^b = 2. | |
168 | .de ^B | |
169 | .ev 2 | |
170 | 'ti 0 | |
171 | 'nf | |
172 | .mk ^t | |
173 | .if \\n(^b \{\ | |
174 | .\" Draw three-sided box if this is the box's first page, | |
175 | .\" draw two sides but no top otherwise. | |
176 | .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c | |
177 | .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c | |
178 | .\} | |
179 | .if \\n(^v \{\ | |
180 | .nr ^x \\n(^tu+1v-\\n(^Yu | |
181 | \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c | |
182 | .\} | |
183 | .bp | |
184 | 'fi | |
185 | .ev | |
186 | .if \\n(^b \{\ | |
187 | .mk ^y | |
188 | .nr ^b 2 | |
189 | .\} | |
190 | .if \\n(^v \{\ | |
191 | .mk ^Y | |
192 | .\} | |
193 | .. | |
194 | '\" # DS - begin an indented unfilled display: indent with .RS, switch to no-fill mode, and leave one blank line | |
195 | .de DS | |
196 | .RS | |
197 | .nf | |
198 | .sp | |
199 | .. | |
200 | '\" # DE - end of indented unfilled display: restore fill mode, undo the .RS indent, and leave one blank line | |
201 | .de DE | |
202 | .fi | |
203 | .RE | |
204 | .sp | |
205 | .. | |
206 | '\" # SO - start of list of standard options: emit the "STANDARD OPTIONS" section heading, then switch to no-fill mode and the bold font with tab stops at 5.5c and 11c | |
207 | .de SO | |
208 | .SH "STANDARD OPTIONS" | |
209 | .LP | |
210 | .nf | |
211 | .ta 5.5c 11c | |
212 | .ft B | |
213 | .. | |
214 | '\" # SE - end of list of standard options: restore fill mode and the roman font, then print a pointer to the options manual entry | |
215 | .de SE | |
216 | .fi | |
217 | .ft R | |
218 | .LP | |
219 | See the \\fBoptions\\fR manual entry for details on the standard options. | |
220 | .. | |
221 | '\" # OP cmdName dbName dbClass - start of full description for a single option: print the three names in bold on separate unfilled lines (tab stop at 4c), then open an indented paragraph for the description text | |
222 | .de OP | |
223 | .LP | |
224 | .nf | |
225 | .ta 4c | |
226 | Command-Line Name: \\fB\\$1\\fR | |
227 | Database Name: \\fB\\$2\\fR | |
228 | Database Class: \\fB\\$3\\fR | |
229 | .fi | |
230 | .IP | |
231 | .. | |
232 | '\" # CS - begin code excerpt: indent with .RS, switch to no-fill mode, and set tab stops every quarter inch up to 1i | |
233 | .de CS | |
234 | .RS | |
235 | .nf | |
236 | .ta .25i .5i .75i 1i | |
237 | .. | |
238 | '\" # CE - end code excerpt: restore fill mode and undo the .RS indent | |
239 | .de CE | |
240 | .fi | |
241 | .RE | |
242 | .. | |
 | '\" # UL arg1 arg2 - print arg1, underline it by drawing a rule back to horizontal position 0, then print arg2 normally | |
243 | .de UL | |
244 | \\$1\l'|0\(ul'\\$2 | |
245 | .. | |
246 | .TH re_syntax n "8.1" Tcl "Tcl Built-In Commands" | |
247 | .BS | |
248 | .SH NAME | |
249 | re_syntax \- Syntax of Tcl regular expressions. | |
250 | .BE | |
251 | ||
252 | .SH DESCRIPTION | |
253 | .PP | |
254 | A \fIregular expression\fR describes strings of characters. | |
255 | It's a pattern that matches certain strings and doesn't match others. | |
256 | ||
257 | .SH "DIFFERENT FLAVORS OF REs" | |
258 | Regular expressions (``RE''s), as defined by POSIX, come in two | |
259 | flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs''). | |
260 | EREs are roughly those of the traditional \fIegrep\fR, while BREs are | |
261 | roughly those of the traditional \fIed\fR. This implementation adds | |
262 | a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with | |
263 | some significant extensions. | |
264 | .PP | |
265 | This manual page primarily describes AREs. BREs mostly exist for | |
266 | backward compatibility in some old programs; they will be discussed at | |
267 | the end. POSIX EREs are almost an exact subset of AREs. Features of | |
268 | AREs that are not present in EREs will be indicated. | |
269 | ||
270 | .SH "REGULAR EXPRESSION SYNTAX" | |
271 | .PP | |
272 | Tcl regular expressions are implemented using the package written by | |
273 | Henry Spencer, based on the 1003.2 spec and some (not quite all) of | |
274 | the Perl5 extensions (thanks, Henry!). Much of the description of | |
275 | regular expressions below is copied verbatim from his manual entry. | |
276 | .PP | |
277 | An ARE is one or more \fIbranches\fR, | |
278 | separated by `\fB|\fR', | |
279 | matching anything that matches any of the branches. | |
280 | .PP | |
281 | A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR, | |
282 | concatenated. | |
283 | It matches a match for the first, followed by a match for the second, etc.; | |
284 | an empty branch matches the empty string. | |
285 | .PP | |
286 | A quantified atom is an \fIatom\fR possibly followed | |
287 | by a single \fIquantifier\fR. | |
288 | Without a quantifier, it matches a match for the atom. | |
289 | The quantifiers, | |
290 | and what a so-quantified atom matches, are: | |
291 | .RS 2 | |
292 | .TP 6 | |
293 | \fB*\fR | |
294 | a sequence of 0 or more matches of the atom | |
295 | .TP | |
296 | \fB+\fR | |
297 | a sequence of 1 or more matches of the atom | |
298 | .TP | |
299 | \fB?\fR | |
300 | a sequence of 0 or 1 matches of the atom | |
301 | .TP | |
302 | \fB{\fIm\fB}\fR | |
303 | a sequence of exactly \fIm\fR matches of the atom | |
304 | .TP | |
305 | \fB{\fIm\fB,}\fR | |
306 | a sequence of \fIm\fR or more matches of the atom | |
307 | .TP | |
308 | \fB{\fIm\fB,\fIn\fB}\fR | |
309 | a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom; | |
310 | \fIm\fR may not exceed \fIn\fR | |
311 | .TP | |
312 | \fB*? +? ?? {\fIm\fB}? {\fIm\fB,}? {\fIm\fB,\fIn\fB}?\fR | |
313 | \fInon-greedy\fR quantifiers, | |
314 | which match the same possibilities, | |
315 | but prefer the smallest number rather than the largest number | |
316 | of matches (see MATCHING) | |
317 | .RE | |
318 | .PP | |
319 | The forms using | |
320 | \fB{\fR and \fB}\fR | |
321 | are known as \fIbound\fRs. | |
322 | The numbers | |
323 | \fIm\fR and \fIn\fR are unsigned decimal integers | |
324 | with permissible values from 0 to 255 inclusive. | |
325 | .PP | |
326 | An atom is one of: | |
327 | .RS 2 | |
328 | .TP 6 | |
329 | \fB(\fIre\fB)\fR | |
330 | (where \fIre\fR is any regular expression) | |
331 | matches a match for | |
332 | \fIre\fR, with the match noted for possible reporting | |
333 | .TP | |
334 | \fB(?:\fIre\fB)\fR | |
335 | as previous, | |
336 | but does no reporting | |
337 | (a ``non-capturing'' set of parentheses) | |
338 | .TP | |
339 | \fB()\fR | |
340 | matches an empty string, | |
341 | noted for possible reporting | |
342 | .TP | |
343 | \fB(?:)\fR | |
344 | matches an empty string, | |
345 | without reporting | |
346 | .TP | |
347 | \fB[\fIchars\fB]\fR | |
348 | a \fIbracket expression\fR, | |
349 | matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail) | |
350 | .TP | |
351 | \fB.\fR | |
352 | matches any single character | |
353 | .TP | |
354 | \fB\e\fIk\fR | |
355 | (where \fIk\fR is a non-alphanumeric character) | |
356 | matches that character taken as an ordinary character, | |
357 | e.g. \fB\e\e\fR matches a backslash character | |
358 | .TP | |
359 | \fB\e\fIc\fR | |
360 | where \fIc\fR is alphanumeric | |
361 | (possibly followed by other characters), | |
362 | an \fIescape\fR (AREs only), | |
363 | see ESCAPES below | |
364 | .TP | |
365 | \fB{\fR | |
366 | when followed by a character other than a digit, | |
367 | matches the left-brace character `\fB{\fR'; | |
368 | when followed by a digit, it is the beginning of a | |
369 | \fIbound\fR (see above) | |
370 | .TP | |
371 | \fIx\fR | |
372 | where \fIx\fR is | |
373 | a single character with no other significance, matches that character. | |
374 | .RE | |
375 | .PP | |
376 | A \fIconstraint\fR matches an empty string when specific conditions | |
377 | are met. | |
378 | A constraint may not be followed by a quantifier. | |
379 | The simple constraints are as follows; some more constraints are | |
380 | described later, under ESCAPES. | |
381 | .RS 2 | |
382 | .TP 8 | |
383 | \fB^\fR | |
384 | matches at the beginning of a line | |
385 | .TP | |
386 | \fB$\fR | |
387 | matches at the end of a line | |
388 | .TP | |
389 | \fB(?=\fIre\fB)\fR | |
390 | \fIpositive lookahead\fR (AREs only), matches at any point | |
391 | where a substring matching \fIre\fR begins | |
392 | .TP | |
393 | \fB(?!\fIre\fB)\fR | |
394 | \fInegative lookahead\fR (AREs only), matches at any point | |
395 | where no substring matching \fIre\fR begins | |
396 | .RE | |
397 | .PP | |
398 | The lookahead constraints may not contain back references (see later), | |
399 | and all parentheses within them are considered non-capturing. | |
400 | .PP | |
401 | An RE may not end with `\fB\e\fR'. | |
402 | ||
403 | .SH "BRACKET EXPRESSIONS" | |
404 | A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'. | |
405 | It normally matches any single character from the list (but see below). | |
406 | If the list begins with `\fB^\fR', | |
407 | it matches any single character | |
408 | (but see below) \fInot\fR from the rest of the list. | |
409 | .PP | |
410 | If two characters in the list are separated by `\fB\-\fR', | |
411 | this is shorthand | |
412 | for the full \fIrange\fR of characters between those two (inclusive) in the | |
413 | collating sequence, | |
414 | e.g. | |
415 | \fB[0\-9]\fR | |
416 | in ASCII matches any decimal digit. | |
417 | Two ranges may not share an | |
418 | endpoint, so e.g. | |
419 | \fBa\-c\-e\fR | |
420 | is illegal. | |
421 | Ranges are very collating-sequence-dependent, | |
422 | and portable programs should avoid relying on them. | |
423 | .PP | |
424 | To include a literal | |
425 | \fB]\fR | |
426 | or | |
427 | \fB\-\fR | |
428 | in the list, | |
429 | the simplest method is to | |
430 | enclose it in | |
431 | \fB[.\fR and \fB.]\fR | |
432 | to make it a collating element (see below). | |
433 | Alternatively, | |
434 | make it the first character | |
435 | (following a possible `\fB^\fR'), | |
436 | or (AREs only) precede it with `\fB\e\fR'. | |
437 | Alternatively, for `\fB\-\fR', | |
438 | make it the last character, | |
439 | or the second endpoint of a range. | |
440 | To use a literal | |
441 | \fB\-\fR | |
442 | as the first endpoint of a range, | |
443 | make it a collating element | |
444 | or (AREs only) precede it with `\fB\e\fR'. | |
445 | With the exception of these, some combinations using | |
446 | \fB[\fR | |
447 | (see next | |
448 | paragraphs), and escapes, | |
449 | all other special characters lose their | |
450 | special significance within a bracket expression. | |
451 | .PP | |
452 | Within a bracket expression, a collating element (a character, | |
453 | a multi-character sequence that collates as if it were a single character, | |
454 | or a collating-sequence name for either) | |
455 | enclosed in | |
456 | \fB[.\fR and \fB.]\fR | |
457 | stands for the | |
458 | sequence of characters of that collating element. | |
459 | The sequence is a single element of the bracket expression's list. | |
460 | A bracket expression in a locale that has | |
461 | multi-character collating elements | |
462 | can thus match more than one character. | |
463 | .VS 8.2 | |
464 | So (insidiously), a bracket expression that starts with \fB^\fR | |
465 | can match multi-character collating elements even if none of them | |
466 | appear in the bracket expression! | |
467 | (\fINote:\fR Tcl currently has no multi-character collating elements. | |
468 | This information is only for illustration.) | |
469 | .PP | |
470 | For example, assume the collating sequence includes a \fBch\fR | |
471 | multi-character collating element. | |
472 | Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP) | |
473 | matches the first five characters of `\fBchchcc\fR'. | |
474 | Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR' | |
475 | (because \fB[^c]\fR matches the multi-character \fBch\fR). | |
476 | .VE 8.2 | |
477 | .PP | |
478 | Within a bracket expression, a collating element enclosed in | |
479 | \fB[=\fR | |
480 | and | |
481 | \fB=]\fR | |
482 | is an equivalence class, standing for the sequences of characters | |
483 | of all collating elements equivalent to that one, including itself. | |
484 | (If there are no other equivalent collating elements, | |
485 | the treatment is as if the enclosing delimiters were `\fB[.\fR'\& | |
486 | and `\fB.]\fR'.) | |
487 | For example, if | |
488 | \fBo\fR | |
489 | and | |
490 | \fB\o'o^'\fR | |
491 | are the members of an equivalence class, | |
492 | then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR', | |
493 | and `\fB[o\o'o^']\fR'\& | |
494 | are all synonymous. | |
495 | An equivalence class may not be an endpoint | |
496 | of a range. | |
497 | .VS 8.2 | |
498 | (\fINote:\fR | |
499 | Tcl currently implements only the Unicode locale. | |
500 | It doesn't define any equivalence classes. | |
501 | The examples above are just illustrations.) | |
502 | .VE 8.2 | |
503 | .PP | |
504 | Within a bracket expression, the name of a \fIcharacter class\fR enclosed | |
505 | in | |
506 | \fB[:\fR | |
507 | and | |
508 | \fB:]\fR | |
509 | stands for the list of all characters | |
510 | (not all collating elements!) | |
511 | belonging to that | |
512 | class. | |
513 | Standard character classes are: | |
514 | .PP | |
515 | .RS | |
516 | .ne 5 | |
517 | .nf | |
518 | .ta 3c | |
519 | \fBalpha\fR A letter. | |
520 | \fBupper\fR An upper-case letter. | |
521 | \fBlower\fR A lower-case letter. | |
522 | \fBdigit\fR A decimal digit. | |
523 | \fBxdigit\fR A hexadecimal digit. | |
524 | \fBalnum\fR An alphanumeric (letter or digit). | |
525 | \fBprint\fR An alphanumeric (same as alnum). | |
526 | \fBblank\fR A space or tab character. | |
527 | \fBspace\fR A character producing white space in displayed text. | |
528 | \fBpunct\fR A punctuation character. | |
529 | \fBgraph\fR A character with a visible representation. | |
530 | \fBcntrl\fR A control character. | |
531 | .fi | |
532 | .RE | |
533 | .PP | |
534 | A locale may provide others. | |
535 | .VS 8.2 | |
536 | (Note that the current Tcl implementation has only one locale: | |
537 | the Unicode locale.) | |
538 | .VE 8.2 | |
539 | A character class may not be used as an endpoint of a range. | |
540 | .PP | |
541 | There are two special cases of bracket expressions: | |
542 | the bracket expressions | |
543 | \fB[[:<:]]\fR | |
544 | and | |
545 | \fB[[:>:]]\fR | |
546 | are constraints, matching empty strings at | |
547 | the beginning and end of a word respectively. | |
548 | '\" note, discussion of escapes below references this definition of word | |
549 | A word is defined as a sequence of | |
550 | word characters | |
551 | that is neither preceded nor followed by | |
552 | word characters. | |
553 | A word character is an | |
554 | \fIalnum\fR | |
555 | character | |
556 | or an underscore | |
557 | (\fB_\fR). | |
558 | These special bracket expressions are deprecated; | |
559 | users of AREs should use constraint escapes instead (see below). | |
560 | .SH ESCAPES | |
561 | Escapes (AREs only), which begin with a | |
562 | \fB\e\fR | |
563 | followed by an alphanumeric character, | |
564 | come in several varieties: | |
565 | character entry, class shorthands, constraint escapes, and back references. | |
566 | A | |
567 | \fB\e\fR | |
568 | followed by an alphanumeric character but not constituting | |
569 | a valid escape is illegal in AREs. | |
570 | In EREs, there are no escapes: | |
571 | outside a bracket expression, | |
572 | a | |
573 | \fB\e\fR | |
574 | followed by an alphanumeric character merely stands for that | |
575 | character as an ordinary character, | |
576 | and inside a bracket expression, | |
577 | \fB\e\fR | |
578 | is an ordinary character. | |
579 | (The latter is the one actual incompatibility between EREs and AREs.) | |
580 | .PP | |
581 | Character-entry escapes (AREs only) exist to make it easier to specify | |
582 | non-printing and otherwise inconvenient characters in REs: | |
583 | .RS 2 | |
584 | .TP 5 | |
585 | \fB\ea\fR | |
586 | alert (bell) character, as in C | |
587 | .TP | |
588 | \fB\eb\fR | |
589 | backspace, as in C | |
590 | .TP | |
591 | \fB\eB\fR | |
592 | synonym for | |
593 | \fB\e\fR | |
594 | to help reduce backslash doubling in some | |
595 | applications where there are multiple levels of backslash processing | |
596 | .TP | |
597 | \fB\ec\fIX\fR | |
598 | (where \fIX\fR is any character) the character whose | |
599 | low-order 5 bits are the same as those of | |
600 | \fIX\fR, | |
601 | and whose other bits are all zero | |
602 | .TP | |
603 | \fB\ee\fR | |
604 | the character whose collating-sequence name | |
605 | is `\fBESC\fR', | |
606 | or failing that, the character with octal value 033 | |
607 | .TP | |
608 | \fB\ef\fR | |
609 | formfeed, as in C | |
610 | .TP | |
611 | \fB\en\fR | |
612 | newline, as in C | |
613 | .TP | |
614 | \fB\er\fR | |
615 | carriage return, as in C | |
616 | .TP | |
617 | \fB\et\fR | |
618 | horizontal tab, as in C | |
619 | .TP | |
620 | \fB\eu\fIwxyz\fR | |
621 | (where | |
622 | \fIwxyz\fR | |
623 | is exactly four hexadecimal digits) | |
624 | the Unicode character | |
625 | \fBU+\fIwxyz\fR | |
626 | in the local byte ordering | |
627 | .TP | |
628 | \fB\eU\fIstuvwxyz\fR | |
629 | (where | |
630 | \fIstuvwxyz\fR | |
631 | is exactly eight hexadecimal digits) | |
632 | reserved for a somewhat-hypothetical Unicode extension to 32 bits | |
633 | .TP | |
634 | \fB\ev\fR | |
635 | vertical tab, as in C | |
637 | .TP | |
638 | \fB\ex\fIhhh\fR | |
639 | (where | |
640 | \fIhhh\fR | |
641 | is any sequence of hexadecimal digits) | |
642 | the character whose hexadecimal value is | |
643 | \fB0x\fIhhh\fR | |
644 | (a single character no matter how many hexadecimal digits are used). | |
645 | .TP | |
646 | \fB\e0\fR | |
647 | the character whose value is | |
648 | \fB0\fR | |
649 | .TP | |
650 | \fB\e\fIxy\fR | |
651 | (where | |
652 | \fIxy\fR | |
653 | is exactly two octal digits, | |
654 | and is not a | |
655 | \fIback reference\fR (see below)) | |
656 | the character whose octal value is | |
657 | \fB0\fIxy\fR | |
658 | .TP | |
659 | \fB\e\fIxyz\fR | |
660 | (where | |
661 | \fIxyz\fR | |
662 | is exactly three octal digits, | |
663 | and is not a | |
664 | back reference (see below)) | |
665 | the character whose octal value is | |
666 | \fB0\fIxyz\fR | |
667 | .RE | |
668 | .PP | |
669 | Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR', | |
670 | and `\fBA\fR'-`\fBF\fR'. | |
671 | Octal digits are `\fB0\fR'-`\fB7\fR'. | |
672 | .PP | |
673 | The character-entry escapes are always taken as ordinary characters. | |
674 | For example, | |
675 | \fB\e135\fR | |
676 | is | |
677 | \fB]\fR | |
678 | in ASCII, | |
679 | but | |
680 | \fB\e135\fR | |
681 | does not terminate a bracket expression. | |
682 | Beware, however, that some applications (e.g., C compilers) interpret | |
683 | such sequences themselves before the regular-expression package | |
684 | gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'. | |
685 | .PP | |
686 | Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used | |
687 | character classes: | |
688 | .RS 2 | |
689 | .TP 10 | |
690 | \fB\ed\fR | |
691 | \fB[[:digit:]]\fR | |
692 | .TP | |
693 | \fB\es\fR | |
694 | \fB[[:space:]]\fR | |
695 | .TP | |
696 | \fB\ew\fR | |
697 | \fB[[:alnum:]_]\fR | |
698 | (note underscore) | |
699 | .TP | |
700 | \fB\eD\fR | |
701 | \fB[^[:digit:]]\fR | |
702 | .TP | |
703 | \fB\eS\fR | |
704 | \fB[^[:space:]]\fR | |
705 | .TP | |
706 | \fB\eW\fR | |
707 | \fB[^[:alnum:]_]\fR | |
708 | (note underscore) | |
709 | .RE | |
710 | .PP | |
711 | Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', | |
712 | and `\fB\ew\fR'\& | |
713 | lose their outer brackets, | |
714 | and `\fB\eD\fR', `\fB\eS\fR', | |
715 | and `\fB\eW\fR'\& | |
716 | are illegal. | |
717 | .VS 8.2 | |
718 | (So, for example, \fB[a\-c\ed]\fR is equivalent to \fB[a\-c[:digit:]]\fR. | |
719 | Also, \fB[a\-c\eD]\fR, which is equivalent to \fB[a\-c^[:digit:]]\fR, is illegal.) | |
720 | .VE 8.2 | |
721 | .PP | |
722 | A constraint escape (AREs only) is a constraint, | |
723 | matching the empty string if specific conditions are met, | |
724 | written as an escape: | |
725 | .RS 2 | |
726 | .TP 6 | |
727 | \fB\eA\fR | |
728 | matches only at the beginning of the string | |
729 | (see MATCHING, below, for how this differs from `\fB^\fR') | |
730 | .TP | |
731 | \fB\em\fR | |
732 | matches only at the beginning of a word | |
733 | .TP | |
734 | \fB\eM\fR | |
735 | matches only at the end of a word | |
736 | .TP | |
737 | \fB\ey\fR | |
738 | matches only at the beginning or end of a word | |
739 | .TP | |
740 | \fB\eY\fR | |
741 | matches only at a point that is not the beginning or end of a word | |
742 | .TP | |
743 | \fB\eZ\fR | |
744 | matches only at the end of the string | |
745 | (see MATCHING, below, for how this differs from `\fB$\fR') | |
746 | .TP | |
747 | \fB\e\fIm\fR | |
748 | (where | |
749 | \fIm\fR | |
750 | is a nonzero digit) a \fIback reference\fR, see below | |
751 | .TP | |
752 | \fB\e\fImnn\fR | |
753 | (where | |
754 | \fIm\fR | |
755 | is a nonzero digit, and | |
756 | \fInn\fR | |
757 | is some more digits, | |
758 | and the decimal value | |
759 | \fImnn\fR | |
760 | is not greater than the number of closing capturing parentheses seen so far) | |
761 | a \fIback reference\fR, see below | |
762 | .RE | |
763 | .PP | |
764 | A word is defined as in the specification of | |
765 | \fB[[:<:]]\fR | |
766 | and | |
767 | \fB[[:>:]]\fR | |
768 | above. | |
769 | Constraint escapes are illegal within bracket expressions. | |
770 | .PP | |
771 | A back reference (AREs only) matches the same string matched by the parenthesized | |
772 | subexpression specified by the number, | |
773 | so that (e.g.) | |
774 | \fB([bc])\e1\fR | |
775 | matches | |
776 | \fBbb\fR | |
777 | or | |
778 | \fBcc\fR | |
779 | but not `\fBbc\fR'. | |
780 | The subexpression must entirely precede the back reference in the RE. | |
781 | Subexpressions are numbered in the order of their leading parentheses. | |
782 | Non-capturing parentheses do not define subexpressions. | |
783 | .PP | |
784 | There is an inherent historical ambiguity between octal character-entry | |
785 | escapes and back references, which is resolved by heuristics, | |
786 | as hinted at above. | |
787 | A leading zero always indicates an octal escape. | |
788 | A single non-zero digit, not followed by another digit, | |
789 | is always taken as a back reference. | |
790 | A multi-digit sequence not starting with a zero is taken as a back | |
791 | reference if it comes after a suitable subexpression | |
792 | (i.e. the number is in the legal range for a back reference), | |
793 | and otherwise is taken as octal. | |
794 | .SH "METASYNTAX" | |
795 | In addition to the main syntax described above, there are some special | |
796 | forms and miscellaneous syntactic facilities available. | |
797 | .PP | |
798 | Normally the flavor of RE being used is specified by | |
799 | application-dependent means. | |
800 | However, this can be overridden by a \fIdirector\fR. | |
801 | If an RE of any flavor begins with `\fB***:\fR', | |
802 | the rest of the RE is an ARE. | |
803 | If an RE of any flavor begins with `\fB***=\fR', | |
804 | the rest of the RE is taken to be a literal string, | |
805 | with all characters considered ordinary characters. | |
806 | .PP | |
807 | An ARE may begin with \fIembedded options\fR: | |
808 | a sequence | |
809 | \fB(?\fIxyz\fB)\fR | |
810 | (where | |
811 | \fIxyz\fR | |
812 | is one or more alphabetic characters) | |
813 | specifies options affecting the rest of the RE. | |
814 | These supplement, and can override, | |
815 | any options specified by the application. | |
816 | The available option letters are: | |
817 | .RS 2 | |
818 | .TP 3 | |
819 | \fBb\fR | |
820 | rest of RE is a BRE | |
821 | .TP 3 | |
822 | \fBc\fR | |
823 | case-sensitive matching (usual default) | |
824 | .TP 3 | |
825 | \fBe\fR | |
826 | rest of RE is an ERE | |
827 | .TP 3 | |
828 | \fBi\fR | |
829 | case-insensitive matching (see MATCHING, below) | |
830 | .TP 3 | |
831 | \fBm\fR | |
832 | historical synonym for | |
833 | \fBn\fR | |
834 | .TP 3 | |
835 | \fBn\fR | |
836 | newline-sensitive matching (see MATCHING, below) | |
837 | .TP 3 | |
838 | \fBp\fR | |
839 | partial newline-sensitive matching (see MATCHING, below) | |
840 | .TP 3 | |
841 | \fBq\fR | |
842 | rest of RE is a literal (``quoted'') string, all ordinary characters | |
843 | .TP 3 | |
844 | \fBs\fR | |
845 | non-newline-sensitive matching (usual default) | |
846 | .TP 3 | |
847 | \fBt\fR | |
848 | tight syntax (usual default; see below) | |
849 | .TP 3 | |
850 | \fBw\fR | |
851 | inverse partial newline-sensitive (``weird'') matching (see MATCHING, below) | |
852 | .TP 3 | |
853 | \fBx\fR | |
854 | expanded syntax (see below) | |
855 | .RE | |
856 | .PP | |
857 | Embedded options take effect at the | |
858 | \fB)\fR | |
859 | terminating the sequence. | |
860 | They are available only at the start of an ARE, | |
861 | and may not be used later within it. | |
862 | .PP | |
863 | In addition to the usual (\fItight\fR) RE syntax, in which all characters are | |
864 | significant, there is an \fIexpanded\fR syntax, | |
865 | available in all flavors of RE | |
866 | with the \fB\-expanded\fR switch, or in AREs with the embedded \fBx\fR option. | |
867 | In the expanded syntax, | |
868 | white-space characters are ignored | |
869 | and all characters between a | |
870 | \fB#\fR | |
871 | and the following newline (or the end of the RE) are ignored, | |
872 | permitting paragraphing and commenting a complex RE. | |
873 | There are three exceptions to that basic rule: | |
874 | .RS 2 | |
875 | .PP | |
876 | a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained | |
877 | .PP | |
878 | white space or `\fB#\fR' within a bracket expression is retained | |
879 | .PP | |
880 | white space and comments are illegal within multi-character symbols | |
881 | like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR' | |
882 | .RE | |
883 | .PP | |
884 | Expanded-syntax white-space characters are blank, tab, newline, and | |
885 | .VS 8.2 | |
886 | any character that belongs to the \fIspace\fR character class. | |
887 | .VE 8.2 | |
888 | .PP | |
889 | Finally, in an ARE, | |
890 | outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR' | |
891 | (where | |
892 | \fIttt\fR | |
893 | is any text not containing a `\fB)\fR') | |
894 | is a comment, | |
895 | completely ignored. | |
896 | Again, this is not allowed between the characters of | |
897 | multi-character symbols like `\fB(?:\fR'. | |
898 | Such comments are more a historical artifact than a useful facility, | |
899 | and their use is deprecated; | |
900 | use the expanded syntax instead. | |
901 | .PP | |
902 | \fINone\fR of these metasyntax extensions is available if the application | |
903 | (or an initial | |
904 | \fB***=\fR | |
905 | director) | |
906 | has specified that the user's input be treated as a literal string | |
907 | rather than as an RE. | |
908 | .SH MATCHING | |
909 | In the event that an RE could match more than one substring of a given | |
910 | string, | |
911 | the RE matches the one starting earliest in the string. | |
912 | If the RE could match more than one substring starting at that point, | |
913 | its choice is determined by its \fIpreference\fR: | |
914 | either the longest substring, or the shortest. | |
915 | .PP | |
916 | Most atoms, and all constraints, have no preference. | |
917 | A parenthesized RE has the same preference (possibly none) as the RE. | |
918 | A quantified atom with quantifier | |
919 | \fB{\fIm\fB}\fR | |
920 | or | |
921 | \fB{\fIm\fB}?\fR | |
922 | has the same preference (possibly none) as the atom itself. | |
923 | A quantified atom with other normal quantifiers (including | |
924 | \fB{\fIm\fB,\fIn\fB}\fR | |
925 | with | |
926 | \fIm\fR | |
927 | equal to | |
928 | \fIn\fR) | |
929 | prefers longest match. | |
930 | A quantified atom with other non-greedy quantifiers (including | |
931 | \fB{\fIm\fB,\fIn\fB}?\fR | |
932 | with | |
933 | \fIm\fR | |
934 | equal to | |
935 | \fIn\fR) | |
936 | prefers shortest match. | |
937 | A branch has the same preference as the first quantified atom in it | |
938 | which has a preference. | |
939 | An RE consisting of two or more branches connected by the | |
940 | \fB|\fR | |
941 | operator prefers longest match. | |
942 | .PP | |
943 | Subject to the constraints imposed by the rules for matching the whole RE, | |
944 | subexpressions also match the longest or shortest possible substrings, | |
945 | based on their preferences, | |
946 | with subexpressions starting earlier in the RE taking priority over | |
947 | ones starting later. | |
948 | Note that outer subexpressions thus take priority over | |
949 | their component subexpressions. | |
950 | .PP | |
951 | Note that the quantifiers | |
952 | \fB{1,1}\fR | |
953 | and | |
954 | \fB{1,1}?\fR | |
955 | can be used to force longest and shortest preference, respectively, | |
956 | on a subexpression or a whole RE. | |
957 | .PP | |
958 | Match lengths are measured in characters, not collating elements. | |
959 | An empty string is considered longer than no match at all. | |
960 | For example, | |
961 | \fBbb*\fR | |
962 | matches the three middle characters of `\fBabbbc\fR'; | |
963 | \fB(week|wee)(night|knights)\fR | |
964 | matches all ten characters of `\fBweeknights\fR'; | |
965 | when | |
966 | \fB(.*).*\fR | |
967 | is matched against | |
968 | \fBabc\fR, | |
969 | the parenthesized subexpression | |
970 | matches all three characters; and | |
971 | when | |
972 | \fB(a*)*\fR | |
973 | is matched against | |
974 | \fBbc\fR, | |
975 | both the whole RE and the parenthesized | |
976 | subexpression match an empty string. | |
977 | .PP | |
978 | If case-independent matching is specified, | |
979 | the effect is much as if all case distinctions had vanished from the | |
980 | alphabet. | |
981 | When an alphabetic character that exists in multiple cases appears as an | |
982 | ordinary character outside a bracket expression, it is effectively | |
983 | transformed into a bracket expression containing both cases, | |
984 | so that | |
985 | \fBx\fR | |
986 | becomes `\fB[xX]\fR'. | |
987 | When it appears inside a bracket expression, all case counterparts | |
988 | of it are added to the bracket expression, so that | |
989 | \fB[x]\fR | |
990 | becomes | |
991 | \fB[xX]\fR | |
992 | and | |
993 | \fB[^x]\fR | |
994 | becomes `\fB[^xX]\fR'. | |
995 | .PP | |
996 | If newline-sensitive matching is specified, \fB.\fR | |
997 | and bracket expressions using | |
998 | \fB^\fR | |
999 | will never match the newline character | |
1000 | (so that matches will never cross newlines unless the RE | |
1001 | explicitly arranges it) | |
1002 | and | |
1003 | \fB^\fR | |
1004 | and | |
1005 | \fB$\fR | |
1006 | will match the empty string after and before a newline | |
1007 | respectively, in addition to matching at beginning and end of string | |
1008 | respectively. | |
1009 | The ARE escapes | |
1010 | \fB\eA\fR | |
1011 | and | |
1012 | \fB\eZ\fR | |
1013 | continue to match beginning or end of string \fIonly\fR. | |
1014 | .PP | |
1015 | If partial newline-sensitive matching is specified, | |
1016 | this affects \fB.\fR | |
1017 | and bracket expressions | |
1018 | as with newline-sensitive matching, but not | |
1019 | \fB^\fR | |
1020 | and `\fB$\fR'. | |
1021 | .PP | |
1022 | If inverse partial newline-sensitive matching is specified, | |
1023 | this affects | |
1024 | \fB^\fR | |
1025 | and | |
1026 | \fB$\fR | |
1027 | as with | |
1028 | newline-sensitive matching, | |
1029 | but not \fB.\fR | |
1030 | and bracket expressions. | |
1031 | This isn't very useful but is provided for symmetry. | |
1032 | .SH "LIMITS AND COMPATIBILITY" | |
1033 | No particular limit is imposed on the length of REs. | |
1034 | Programs intended to be highly portable should not employ REs longer | |
1035 | than 256 bytes, | |
1036 | as a POSIX-compliant implementation can refuse to accept such REs. | |
1037 | .PP | |
1038 | The only feature of AREs that is actually incompatible with | |
1039 | POSIX EREs is that | |
1040 | \fB\e\fR | |
1041 | does not lose its special | |
1042 | significance inside bracket expressions. | |
1043 | All other ARE features use syntax which is illegal or has | |
1044 | undefined or unspecified effects in POSIX EREs; | |
1045 | the | |
1046 | \fB***\fR | |
1047 | syntax of directors likewise is outside the POSIX | |
1048 | syntax for both BREs and EREs. | |
1049 | .PP | |
1050 | Many of the ARE extensions are borrowed from Perl, but some have | |
1051 | been changed to clean them up, and a few Perl extensions are not present. | |
1052 | Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', | |
1053 | the lack of special treatment for a trailing newline, | |
1054 | the addition of complemented bracket expressions to the things | |
1055 | affected by newline-sensitive matching, | |
1056 | the restrictions on parentheses and back references in lookahead constraints, | |
1057 | and the longest/shortest-match (rather than first-match) matching semantics. | |
1058 | .PP | |
1059 | The matching rules for REs containing both normal and non-greedy quantifiers | |
1060 | have changed since early beta-test versions of this package. | |
1061 | (The new rules are much simpler and cleaner, | |
1062 | but don't work as hard at guessing the user's real intentions.) | |
1063 | .PP | |
1064 | Henry Spencer's original 1986 \fIregexp\fR package, | |
1065 | still in widespread use (e.g., in pre-8.1 releases of Tcl), | |
1066 | implemented an early version of today's EREs. | |
1067 | There are four incompatibilities between \fIregexp\fR's near-EREs | |
1068 | (`RREs' for short) and AREs. | |
1069 | In roughly increasing order of significance: | |
1070 | .PP | |
1071 | .RS | |
1072 | In AREs, | |
1073 | \fB\e\fR | |
1074 | followed by an alphanumeric character is either an | |
1075 | escape or an error, | |
1076 | while in RREs, it was just another way of writing the | |
1077 | alphanumeric. | |
1078 | This should not be a problem because there was no reason to write | |
1079 | such a sequence in RREs. | |
1080 | .PP | |
1081 | \fB{\fR | |
1082 | followed by a digit in an ARE is the beginning of a bound, | |
1083 | while in RREs, | |
1084 | \fB{\fR | |
1085 | was always an ordinary character. | |
1086 | Such sequences should be rare, | |
1087 | and will often result in an error because following characters | |
1088 | will not look like a valid bound. | |
1089 | .PP | |
1090 | In AREs, | |
1091 | \fB\e\fR | |
1092 | remains a special character within `\fB[\|]\fR', | |
1093 | so a literal | |
1094 | \fB\e\fR | |
1095 | within | |
1096 | \fB[\|]\fR | |
1097 | must be written `\fB\e\e\fR'. | |
1098 | \fB\e\e\fR | |
1099 | also gives a literal | |
1100 | \fB\e\fR | |
1101 | within | |
1102 | \fB[\|]\fR | |
1103 | in RREs, | |
1104 | but only truly paranoid programmers routinely doubled the backslash. | |
1105 | .PP | |
1106 | AREs report the longest/shortest match for the RE, | |
1107 | rather than the first found in a specified search order. | |
1108 | This may affect some RREs which were written in the expectation that | |
1109 | the first match would be reported. | |
1110 | (The careful crafting of RREs to optimize the search order for fast | |
1111 | matching is obsolete (AREs examine all possible matches | |
1112 | in parallel, and their performance is largely insensitive to their | |
1113 | complexity) but cases where the search order was exploited to deliberately | |
1114 | find a match which was \fInot\fR the longest/shortest will need rewriting.) | |
1115 | .RE | |
1116 | ||
1117 | .SH "BASIC REGULAR EXPRESSIONS" | |
1118 | BREs differ from EREs in several respects. `\fB|\fR', `\fB+\fR', | |
1119 | and | |
1120 | \fB?\fR | |
1121 | are ordinary characters and there is no equivalent | |
1122 | for their functionality. | |
1123 | The delimiters for bounds are | |
1124 | \fB\e{\fR | |
1125 | and `\fB\e}\fR', | |
1126 | with | |
1127 | \fB{\fR | |
1128 | and | |
1129 | \fB}\fR | |
1130 | by themselves ordinary characters. | |
1131 | The parentheses for nested subexpressions are | |
1132 | \fB\e(\fR | |
1133 | and `\fB\e)\fR', | |
1134 | with | |
1135 | \fB(\fR | |
1136 | and | |
1137 | \fB)\fR | |
1138 | by themselves ordinary characters. | |
1139 | \fB^\fR | |
1140 | is an ordinary character except at the beginning of the | |
1141 | RE or the beginning of a parenthesized subexpression, | |
1142 | \fB$\fR | |
1143 | is an ordinary character except at the end of the | |
1144 | RE or the end of a parenthesized subexpression, | |
1145 | and | |
1146 | \fB*\fR | |
1147 | is an ordinary character if it appears at the beginning of the | |
1148 | RE or the beginning of a parenthesized subexpression | |
1149 | (after a possible leading `\fB^\fR'). | |
1150 | Finally, | |
1151 | single-digit back references are available, | |
1152 | and | |
1153 | \fB\e<\fR | |
1154 | and | |
1155 | \fB\e>\fR | |
1156 | are synonyms for | |
1157 | \fB[[:<:]]\fR | |
1158 | and | |
1159 | \fB[[:>:]]\fR | |
1160 | respectively; | |
1161 | no other escapes are available. | |
1162 | ||
1163 | .SH "SEE ALSO" | |
1164 | RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n) | |
1165 | ||
1166 | .SH KEYWORDS | |
1167 | match, regular expression, string |