Commit | Line | Data |
---|---|---|
bb0cfa24 DF |
1 | /* |
2 | * Copyright (c) 1980 Regents of the University of California. | |
3 | * All rights reserved. The Berkeley software License Agreement | |
4 | * specifies the terms and conditions for redistribution. | |
5 | */ | |
6 | ||
2ce81398 | 7 | #if defined(LIBC_SCCS) && !defined(lint) |
c5980113 | 8 | static char sccsid[] = "@(#)regex.c 5.4 (Berkeley) %G%"; |
2ce81398 | 9 | #endif LIBC_SCCS and not lint |
bb0cfa24 | 10 | |
c5980113 DS |
11 | #include <unistd.h> |
12 | ||
13 | static int advance(); | |
14 | static int backref(); | |
15 | static int cclass(); | |
af509ab3 BJ |
16 | |
17 | /* | |
18 | * routines to do regular expression matching | |
19 | * | |
20 | * Entry points: | |
21 | * | |
22 | * re_comp(s) | |
23 | * char *s; | |
24 | * ... returns 0 if the string s was compiled successfully, | |
25 | * a pointer to an error message otherwise. | |
26 | * If passed 0 or a null string returns without changing | |
27 | * the currently compiled re (see note 11 below). | |
28 | * | |
29 | * re_exec(s) | |
30 | * char *s; | |
31 | * ... returns 1 if the string s matches the last compiled regular | |
32 | * expression, | |
33 | * 0 if the string s failed to match the last compiled | |
34 | * regular expression, and | |
35 | * -1 if the compiled regular expression was invalid | |
36 | * (indicating an internal error). | |
37 | * | |
38 | * The strings passed to both re_comp and re_exec may have trailing or | |
39 | * embedded newline characters; they are terminated by nulls. | |
40 | * | |
41 | * The identity of the author of these routines is lost in antiquity; | |
42 | * this is essentially the same as the re code in the original V6 ed. | |
43 | * | |
44 | * The regular expressions recognized are described below. This description | |
45 | * is essentially the same as that for ed. | |
46 | * | |
47 | * A regular expression specifies a set of strings of characters. | |
48 | * A member of this set of strings is said to be matched by | |
49 | * the regular expression. In the following specification for | |
50 | * regular expressions the word `character' means any character but NUL. | |
51 | * | |
52 | * 1. Any character except a special character matches itself. | |
53 | * Special characters are the regular expression delimiter plus | |
54 | * \ [ . and sometimes ^ * $. | |
55 | * 2. A . matches any character. | |
56 | * 3. A \ followed by any character except a digit or ( ) | |
57 | * matches that character. | |
58 | * 4. A nonempty string s bracketed [s] (or [^s]) matches any | |
59 | * character in (or not in) s. In s, \ has no special meaning, | |
60 | * and ] may only appear as the first letter. A substring | |
61 | * a-b, with a and b in ascending ASCII order, stands for | |
62 | * the inclusive range of ASCII characters. | |
63 | * 5. A regular expression of form 1-4 followed by * matches a | |
64 | * sequence of 0 or more matches of the regular expression. | |
65 | * 6. A regular expression, x, of form 1-8, bracketed \(x\) | |
66 | * matches what x matches. | |
67 | * 7. A \ followed by a digit n matches a copy of the string that the | |
68 | * bracketed regular expression beginning with the nth \( matched. | |
69 | * 8. A regular expression of form 1-8, x, followed by a regular | |
70 | * expression of form 1-7, y matches a match for x followed by | |
71 | * a match for y, with the x match being as long as possible | |
72 | * while still permitting a y match. | |
73 | * 9. A regular expression of form 1-8 preceded by ^ (or followed | |
74 | * by $), is constrained to matches that begin at the left | |
75 | * (or end at the right) end of a line. | |
76 | * 10. A regular expression of form 1-9 picks out the longest among | |
77 | * the leftmost matches in a line. | |
78 | * 11. An empty regular expression stands for a copy of the last | |
79 | * regular expression encountered. | |
80 | */ | |
81 | ||
82 | /* | |
83 | * constants for re's | |
84 | */ | |
85 | #define CBRA 1 | |
86 | #define CCHR 2 | |
87 | #define CDOT 4 | |
88 | #define CCL 6 | |
89 | #define NCCL 8 | |
90 | #define CDOL 10 | |
91 | #define CEOF 11 | |
92 | #define CKET 12 | |
93 | #define CBACK 18 | |
94 | ||
95 | #define CSTAR 01 | |
96 | ||
97 | #define ESIZE 512 | |
98 | #define NBRA 9 | |
99 | ||
100 | static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; | |
101 | static char circf; | |
102 | ||
103 | /* | |
104 | * compile the regular expression argument into a dfa | |
105 | */ | |
106 | char * | |
107 | re_comp(sp) | |
c5980113 | 108 | register const char *sp; |
af509ab3 BJ |
109 | { |
110 | register int c; | |
111 | register char *ep = expbuf; | |
112 | int cclcnt, numbra = 0; | |
113 | char *lastep = 0; | |
114 | char bracket[NBRA]; | |
115 | char *bracketp = &bracket[0]; | |
116 | static char *retoolong = "Regular expression too long"; | |
117 | ||
118 | #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } | |
119 | ||
120 | if (sp == 0 || *sp == '\0') { | |
121 | if (*ep == 0) | |
122 | return("No previous regular expression"); | |
123 | return(0); | |
124 | } | |
125 | if (*sp == '^') { | |
126 | circf = 1; | |
127 | sp++; | |
128 | } | |
129 | else | |
130 | circf = 0; | |
131 | for (;;) { | |
132 | if (ep >= &expbuf[ESIZE]) | |
133 | comerr(retoolong); | |
134 | if ((c = *sp++) == '\0') { | |
135 | if (bracketp != bracket) | |
136 | comerr("unmatched \\("); | |
137 | *ep++ = CEOF; | |
138 | *ep++ = 0; | |
139 | return(0); | |
140 | } | |
141 | if (c != '*') | |
142 | lastep = ep; | |
143 | switch (c) { | |
144 | ||
145 | case '.': | |
146 | *ep++ = CDOT; | |
147 | continue; | |
148 | ||
149 | case '*': | |
150 | if (lastep == 0 || *lastep == CBRA || *lastep == CKET) | |
151 | goto defchar; | |
152 | *lastep |= CSTAR; | |
153 | continue; | |
154 | ||
155 | case '$': | |
156 | if (*sp != '\0') | |
157 | goto defchar; | |
158 | *ep++ = CDOL; | |
159 | continue; | |
160 | ||
161 | case '[': | |
162 | *ep++ = CCL; | |
163 | *ep++ = 0; | |
164 | cclcnt = 1; | |
165 | if ((c = *sp++) == '^') { | |
166 | c = *sp++; | |
167 | ep[-2] = NCCL; | |
168 | } | |
169 | do { | |
170 | if (c == '\0') | |
171 | comerr("missing ]"); | |
172 | if (c == '-' && ep [-1] != 0) { | |
173 | if ((c = *sp++) == ']') { | |
174 | *ep++ = '-'; | |
175 | cclcnt++; | |
176 | break; | |
177 | } | |
178 | while (ep[-1] < c) { | |
179 | *ep = ep[-1] + 1; | |
180 | ep++; | |
181 | cclcnt++; | |
182 | if (ep >= &expbuf[ESIZE]) | |
183 | comerr(retoolong); | |
184 | } | |
185 | } | |
186 | *ep++ = c; | |
187 | cclcnt++; | |
188 | if (ep >= &expbuf[ESIZE]) | |
189 | comerr(retoolong); | |
190 | } while ((c = *sp++) != ']'); | |
191 | lastep[1] = cclcnt; | |
192 | continue; | |
193 | ||
194 | case '\\': | |
195 | if ((c = *sp++) == '(') { | |
196 | if (numbra >= NBRA) | |
197 | comerr("too many \\(\\) pairs"); | |
198 | *bracketp++ = numbra; | |
199 | *ep++ = CBRA; | |
200 | *ep++ = numbra++; | |
201 | continue; | |
202 | } | |
203 | if (c == ')') { | |
204 | if (bracketp <= bracket) | |
205 | comerr("unmatched \\)"); | |
206 | *ep++ = CKET; | |
207 | *ep++ = *--bracketp; | |
208 | continue; | |
209 | } | |
210 | if (c >= '1' && c < ('1' + NBRA)) { | |
211 | *ep++ = CBACK; | |
212 | *ep++ = c - '1'; | |
213 | continue; | |
214 | } | |
215 | *ep++ = CCHR; | |
216 | *ep++ = c; | |
217 | continue; | |
218 | ||
219 | defchar: | |
220 | default: | |
221 | *ep++ = CCHR; | |
222 | *ep++ = c; | |
223 | } | |
224 | } | |
225 | } | |
226 | ||
227 | /* | |
228 | * match the argument string against the compiled re | |
229 | */ | |
230 | int | |
231 | re_exec(p1) | |
c5980113 | 232 | register const char *p1; |
af509ab3 BJ |
233 | { |
234 | register char *p2 = expbuf; | |
235 | register int c; | |
236 | int rv; | |
237 | ||
238 | for (c = 0; c < NBRA; c++) { | |
239 | braslist[c] = 0; | |
240 | braelist[c] = 0; | |
241 | } | |
242 | if (circf) | |
243 | return((advance(p1, p2))); | |
244 | /* | |
245 | * fast check for first character | |
246 | */ | |
247 | if (*p2 == CCHR) { | |
248 | c = p2[1]; | |
249 | do { | |
250 | if (*p1 != c) | |
251 | continue; | |
252 | if (rv = advance(p1, p2)) | |
253 | return(rv); | |
254 | } while (*p1++); | |
255 | return(0); | |
256 | } | |
257 | /* | |
258 | * regular algorithm | |
259 | */ | |
260 | do | |
261 | if (rv = advance(p1, p2)) | |
262 | return(rv); | |
263 | while (*p1++); | |
264 | return(0); | |
265 | } | |
266 | ||
267 | /* | |
268 | * try to match the next thing in the dfa | |
269 | */ | |
270 | static int | |
271 | advance(lp, ep) | |
272 | register char *lp, *ep; | |
273 | { | |
274 | register char *curlp; | |
275 | int ct, i; | |
276 | int rv; | |
277 | ||
278 | for (;;) | |
279 | switch (*ep++) { | |
280 | ||
281 | case CCHR: | |
282 | if (*ep++ == *lp++) | |
283 | continue; | |
284 | return(0); | |
285 | ||
286 | case CDOT: | |
287 | if (*lp++) | |
288 | continue; | |
289 | return(0); | |
290 | ||
291 | case CDOL: | |
292 | if (*lp == '\0') | |
293 | continue; | |
294 | return(0); | |
295 | ||
296 | case CEOF: | |
297 | return(1); | |
298 | ||
299 | case CCL: | |
300 | if (cclass(ep, *lp++, 1)) { | |
301 | ep += *ep; | |
302 | continue; | |
303 | } | |
304 | return(0); | |
305 | ||
306 | case NCCL: | |
307 | if (cclass(ep, *lp++, 0)) { | |
308 | ep += *ep; | |
309 | continue; | |
310 | } | |
311 | return(0); | |
312 | ||
313 | case CBRA: | |
314 | braslist[*ep++] = lp; | |
315 | continue; | |
316 | ||
317 | case CKET: | |
318 | braelist[*ep++] = lp; | |
319 | continue; | |
320 | ||
321 | case CBACK: | |
322 | if (braelist[i = *ep++] == 0) | |
323 | return(-1); | |
324 | if (backref(i, lp)) { | |
325 | lp += braelist[i] - braslist[i]; | |
326 | continue; | |
327 | } | |
328 | return(0); | |
329 | ||
330 | case CBACK|CSTAR: | |
331 | if (braelist[i = *ep++] == 0) | |
332 | return(-1); | |
333 | curlp = lp; | |
334 | ct = braelist[i] - braslist[i]; | |
335 | while (backref(i, lp)) | |
336 | lp += ct; | |
337 | while (lp >= curlp) { | |
338 | if (rv = advance(lp, ep)) | |
339 | return(rv); | |
340 | lp -= ct; | |
341 | } | |
342 | continue; | |
343 | ||
344 | case CDOT|CSTAR: | |
345 | curlp = lp; | |
346 | while (*lp++) | |
347 | ; | |
348 | goto star; | |
349 | ||
350 | case CCHR|CSTAR: | |
351 | curlp = lp; | |
352 | while (*lp++ == *ep) | |
353 | ; | |
354 | ep++; | |
355 | goto star; | |
356 | ||
357 | case CCL|CSTAR: | |
358 | case NCCL|CSTAR: | |
359 | curlp = lp; | |
360 | while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) | |
361 | ; | |
362 | ep += *ep; | |
363 | goto star; | |
364 | ||
365 | star: | |
366 | do { | |
367 | lp--; | |
368 | if (rv = advance(lp, ep)) | |
369 | return(rv); | |
370 | } while (lp > curlp); | |
371 | return(0); | |
372 | ||
373 | default: | |
374 | return(-1); | |
375 | } | |
376 | } | |
377 | ||
90f51f9a | 378 | static |
af509ab3 BJ |
379 | backref(i, lp) |
380 | register int i; | |
381 | register char *lp; | |
382 | { | |
383 | register char *bp; | |
384 | ||
385 | bp = braslist[i]; | |
386 | while (*bp++ == *lp++) | |
387 | if (bp >= braelist[i]) | |
388 | return(1); | |
389 | return(0); | |
390 | } | |
391 | ||
90f51f9a | 392 | static int |
af509ab3 BJ |
393 | cclass(set, c, af) |
394 | register char *set, c; | |
395 | int af; | |
396 | { | |
397 | register int n; | |
398 | ||
399 | if (c == 0) | |
400 | return(0); | |
401 | n = *set++; | |
402 | while (--n) | |
403 | if (*set++ == c) | |
404 | return(af); | |
405 | return(! af); | |
406 | } |