Commit | Line | Data |
---|---|---|
15637ed4 RG |
1 | .\" Copyright 1991 The Regents of the University of California. |
2 | .\" All rights reserved. | |
3 | .\" | |
4 | .\" Redistribution and use in source and binary forms, with or without | |
5 | .\" modification, are permitted provided that the following conditions | |
6 | .\" are met: | |
7 | .\" 1. Redistributions of source code must retain the above copyright | |
8 | .\" notice, this list of conditions and the following disclaimer. | |
9 | .\" 2. Redistributions in binary form must reproduce the above copyright | |
10 | .\" notice, this list of conditions and the following disclaimer in the | |
11 | .\" documentation and/or other materials provided with the distribution. | |
12 | .\" 3. All advertising materials mentioning features or use of this software | |
13 | .\" must display the following acknowledgement: | |
14 | .\" This product includes software developed by the University of | |
15 | .\" California, Berkeley and its contributors. | |
16 | .\" 4. Neither the name of the University nor the names of its contributors | |
17 | .\" may be used to endorse or promote products derived from this software | |
18 | .\" without specific prior written permission. | |
19 | .\" | |
20 | .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
21 | .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
22 | .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
23 | .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
24 | .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
25 | .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
26 | .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
27 | .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
28 | .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
29 | .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
30 | .\" SUCH DAMAGE. | |
31 | .\" | |
32 | .\" @(#)regexp.3 5.2 (Berkeley) 4/19/91 | |
33 | .\" | |
34 | .Dd April 19, 1991 | |
35 | .Dt REGEXP 3 | |
36 | .Os | |
37 | .Sh NAME | |
38 | .Nm regcomp , | |
39 | .Nm regexec , | |
40 | .Nm regsub , | |
41 | .Nm regerror | |
42 | .Nd regular expression handlers | |
43 | .Sh SYNOPSIS | |
44 | .Fd #include <regexp.h> | |
45 | .Ft regexp * | |
46 | .Fn regcomp "const char *exp" | |
47 | .Ft int | |
48 | .Fn regexec "const regexp *prog" "const char *string" | |
49 | .Ft void | |
50 | .Fn regsub "const regexp *prog" "const char *source" "char *dest" | |
51 | .Sh DESCRIPTION | |
52 | The | |
53 | .Fn regcomp , | |
54 | .Fn regexec , | |
55 | .Fn regsub , | |
56 | and | |
57 | .Fn regerror | |
58 | functions | |
59 | implement | |
60 | .Xr egrep 1 Ns -style | |
61 | regular expressions and supporting facilities. | |
62 | .Pp | |
63 | The | |
64 | .Fn regcomp | |
65 | function | |
66 | compiles a regular expression into a structure of type | |
67 | .Vt regexp , | |
68 | and returns a pointer to it. | |
69 | The space has been allocated using | |
70 | .Xr malloc 3 | |
71 | and may be released by | |
72 | .Xr free 3 . | |
73 | .Pp | |
74 | The | |
75 | .Fn regexec | |
76 | function | |
77 | matches a | |
78 | .Dv NUL Ns -terminated | |
79 | .Fa string | |
80 | against the compiled regular expression | |
81 | in | |
82 | .Fa prog . | |
83 | It returns 1 for success and 0 for failure, and adjusts the contents of | |
84 | .Fa prog Ns 's | |
85 | .Em startp | |
86 | and | |
87 | .Em endp | |
88 | (see below) accordingly. | |
89 | .Pp | |
90 | The members of a | |
91 | .Vt regexp | |
92 | structure include at least the following (not necessarily in order): | |
93 | .Bd -literal -offset indent | |
94 | char *startp[NSUBEXP]; | |
95 | char *endp[NSUBEXP]; | |
96 | .Ed | |
97 | .Pp | |
98 | where | |
99 | .Dv NSUBEXP | |
100 | is defined (as 10) in the header file. | |
101 | Once a successful | |
102 | .Fn regexec | |
103 | has been done using the | |
104 | .Vt regexp , | |
105 | each | |
106 | .Em startp Ns - Em endp | |
107 | pair describes one substring | |
108 | within the | |
109 | .Fa string , | |
110 | with the | |
111 | .Em startp | |
112 | pointing to the first character of the substring and | |
113 | the | |
114 | .Em endp | |
115 | pointing to the first character following the substring. | |
116 | The 0th substring is the substring of | |
117 | .Fa string | |
118 | that matched the whole | |
119 | regular expression. | |
120 | The others are those substrings that matched parenthesized expressions | |
121 | within the regular expression, with parenthesized expressions numbered | |
122 | in left-to-right order of their opening parentheses. | |
123 | .Pp | |
124 | The | |
125 | .Fn regsub | |
126 | function | |
127 | copies | |
128 | .Fa source | |
129 | to | |
130 | .Fa dest , | |
131 | making substitutions according to the | |
132 | most recent | |
133 | .Fn regexec | |
134 | performed using | |
135 | .Fa prog . | |
136 | Each instance of `&' in | |
137 | .Fa source | |
138 | is replaced by the substring | |
139 | indicated by | |
140 | .Em startp Ns Bq 0 | |
141 | and | |
142 | .Em endp Ns Bq 0 . | |
143 | Each instance of | |
144 | .Sq \e Ns Em n , | |
145 | where | |
146 | .Em n | |
147 | is a digit, is replaced by | |
148 | the substring indicated by | |
149 | .Em startp Ns Bq Em n | |
150 | and | |
151 | .Em endp Ns Bq Em n . | |
152 | To get a literal `&' or | |
153 | .Sq \e Ns Em n | |
154 | into | |
155 | .Fa dest , | |
156 | prefix it with `\e'; | |
157 | to get a literal `\e' preceding `&' or | |
158 | .Sq \e Ns Em n , | |
159 | prefix it with | |
160 | another `\e'. | |
161 | .Pp | |
162 | The | |
163 | .Fn regerror | |
164 | function | |
165 | is called whenever an error is detected in | |
166 | .Fn regcomp , | |
167 | .Fn regexec , | |
168 | or | |
169 | .Fn regsub . | |
170 | The default | |
171 | .Fn regerror | |
172 | writes the string | |
173 | .Fa msg , | |
174 | with a suitable indicator of origin, | |
175 | on the standard | |
176 | error output | |
177 | and invokes | |
178 | .Xr exit 3 . | |
179 | The | |
180 | .Fn regerror | |
181 | function | |
182 | can be replaced by the user if other actions are desirable. | |
183 | .Sh REGULAR EXPRESSION SYNTAX | |
184 | A regular expression is zero or more | |
185 | .Em branches , | |
186 | separated by `|'. | |
187 | It matches anything that matches one of the branches. | |
188 | .Pp | |
189 | A branch is zero or more | |
190 | .Em pieces , | |
191 | concatenated. | |
192 | It matches a match for the first, followed by a match for the second, etc. | |
193 | .Pp | |
194 | A piece is an | |
195 | .Em atom | |
196 | possibly followed by `*', `+', or `?'. | |
197 | An atom followed by `*' matches a sequence of 0 or more matches of the atom. | |
198 | An atom followed by `+' matches a sequence of 1 or more matches of the atom. | |
199 | An atom followed by `?' matches a match of the atom, or the null string. | |
200 | .Pp | |
201 | An atom is a regular expression in parentheses (matching a match for the | |
202 | regular expression), a | |
203 | .Em range | |
204 | (see below), `.' | |
205 | (matching any single character), `^' (matching the null string at the | |
206 | beginning of the input string), `$' (matching the null string at the | |
207 | end of the input string), a `\e' followed by a single character (matching | |
208 | that character), or a single character with no other significance | |
209 | (matching that character). | |
210 | .Pp | |
211 | A | |
212 | .Em range | |
213 | is a sequence of characters enclosed in `[]'. | |
214 | It normally matches any single character from the sequence. | |
215 | If the sequence begins with `^', | |
216 | it matches any single character | |
217 | .Em not | |
218 | from the rest of the sequence. | |
219 | If two characters in the sequence are separated by `\-', this is shorthand | |
220 | for the full list of | |
221 | .Tn ASCII | |
222 | characters between them | |
223 | (e.g. `[0-9]' matches any decimal digit). | |
224 | To include a literal `]' in the sequence, make it the first character | |
225 | (following a possible `^'). | |
226 | To include a literal `\-', make it the first or last character. | |
227 | .Sh AMBIGUITY | |
228 | If a regular expression could match two different parts of the input string, | |
229 | it will match the one which begins earliest. | |
230 | If both begin in the same place but match different lengths, or match | |
231 | the same length in different ways, life gets messier, as follows. | |
232 | .Pp | |
233 | In general, the possibilities in a list of branches are considered in | |
234 | left-to-right order, the possibilities for `*', `+', and `?' are | |
235 | considered longest-first, nested constructs are considered from the | |
236 | outermost in, and concatenated constructs are considered leftmost-first. | |
237 | The match that will be chosen is the one that uses the earliest | |
238 | possibility in the first choice that has to be made. | |
239 | If there is more than one choice, the next will be made in the same manner | |
240 | (earliest possibility) subject to the decision on the first choice. | |
241 | And so forth. | |
242 | .Pp | |
243 | For example, | |
244 | .Sq Li (ab|a)b*c | |
245 | could match | |
246 | `abc' in one of two ways. | |
247 | The first choice is between `ab' and `a'; since `ab' is earlier, and does | |
248 | lead to a successful overall match, it is chosen. | |
249 | Since the `b' is already spoken for, | |
250 | the `b*' must match its last possibility\(emthe empty string\(emsince | |
251 | it must respect the earlier choice. | |
252 | .Pp | |
253 | In the particular case where no `|'s are present and there is only one | |
254 | `*', `+', or `?', the net effect is that the longest possible | |
255 | match will be chosen. | |
256 | So | |
257 | .Sq Li ab* , | |
258 | presented with `xabbbby', will match `abbbb'. | |
259 | Note that if | |
260 | .Sq Li ab* | |
261 | is tried against `xabyabbbz', it | |
262 | will match `ab' just after `x', due to the begins-earliest rule. | |
263 | (In effect, the decision on where to start the match is the first choice | |
264 | to be made, hence subsequent choices must respect it even if this leads them | |
265 | to less-preferred alternatives.) | |
266 | .Sh RETURN VALUES | |
267 | The | |
268 | .Fn regcomp | |
269 | function | |
270 | returns | |
271 | .Dv NULL | |
272 | for a failure | |
273 | .Pf ( Fn regerror | |
274 | permitting), | |
275 | where failures are syntax errors, exceeding implementation limits, | |
276 | or applying `+' or `*' to a possibly-null operand. | |
277 | .Sh SEE ALSO | |
278 | .Xr ed 1 , | |
279 | .Xr ex 1 , | |
280 | .Xr expr 1 , | |
281 | .Xr egrep 1 , | |
282 | .Xr fgrep 1 , | |
283 | .Xr grep 1 , | |
284 | .Xr regex 3 | |
285 | .Sh HISTORY | |
286 | Both code and manual page for | |
287 | .Fn regcomp , | |
288 | .Fn regexec , | |
289 | .Fn regsub , | |
290 | and | |
291 | .Fn regerror | |
292 | were written at the University of Toronto | |
293 | and appeared in | |
294 | .Bx 4.3 tahoe . | |
295 | They are intended to be compatible with the Bell V8 | |
296 | .Xr regexp 3 , | |
297 | but are not derived from Bell code. | |
298 | .Sh BUGS | |
299 | Empty branches and empty regular expressions are not portable to V8. | |
300 | .Pp | |
301 | The restriction against | |
302 | applying `*' or `+' to a possibly-null operand is an artifact of the | |
303 | simplistic implementation. | |
304 | .Pp | |
305 | Does not support | |
306 | .Xr egrep 1 Ns 's | |
307 | newline-separated branches; | |
308 | neither does the V8 | |
309 | .Xr regexp 3 , | |
310 | though. | |
311 | .Pp | |
312 | Due to emphasis on | |
313 | compactness and simplicity, | |
314 | it's not strikingly fast. | |
315 | It does give special attention to handling simple cases quickly. |