Commit | Line | Data |
---|---|---|
af509ab3 BJ |
1 | /* @(#)regex.c 4.1 (Berkeley) %G% */ |
2 | # | |
3 | ||
4 | /* | |
5 | * routines to do regular expression matching | |
6 | * | |
7 | * Entry points: | |
8 | * | |
9 | * re_comp(s) | |
10 | * char *s; | |
11 | * ... returns 0 if the string s was compiled successfully, | |
12 | * a pointer to an error message otherwise. | |
13 | * If passed 0 or a null string returns without changing | |
14 | * the currently compiled re (see note 11 below). | |
15 | * | |
16 | * re_exec(s) | |
17 | * char *s; | |
18 | * ... returns 1 if the string s matches the last compiled regular | |
19 | * expression, | |
20 | * 0 if the string s failed to match the last compiled | |
21 | * regular expression, and | |
22 | * -1 if the compiled regular expression was invalid | |
23 | * (indicating an internal error). | |
24 | * | |
25 | * The strings passed to both re_comp and re_exec may have trailing or | |
26 | * embedded newline characters; they are terminated by nulls. | |
27 | * | |
28 | * The identity of the author of these routines is lost in antiquity; | |
29 | * this is essentially the same as the re code in the original V6 ed. | |
30 | * | |
31 | * The regular expressions recognized are described below. This description | |
32 | * is essentially the same as that for ed. | |
33 | * | |
34 | * A regular expression specifies a set of strings of characters. | |
35 | * A member of this set of strings is said to be matched by | |
36 | * the regular expression. In the following specification for | |
37 | * regular expressions the word `character' means any character but NUL. | |
38 | * | |
39 | * 1. Any character except a special character matches itself. | |
40 | * Special characters are the regular expression delimiter plus | |
41 | * \ [ . and sometimes ^ * $. | |
42 | * 2. A . matches any character. | |
43 | * 3. A \ followed by any character except a digit or ( ) | |
44 | * matches that character. | |
45 | * 4. A nonempty string s bracketed [s] (or [^s]) matches any | |
46 | * character in (or not in) s. In s, \ has no special meaning, | |
47 | * and ] may only appear as the first letter. A substring | |
48 | * a-b, with a and b in ascending ASCII order, stands for | |
49 | * the inclusive range of ASCII characters. | |
50 | * 5. A regular expression of form 1-4 followed by * matches a | |
51 | * sequence of 0 or more matches of the regular expression. | |
52 | * 6. A regular expression, x, of form 1-8, bracketed \(x\) | |
53 | * matches what x matches. | |
54 | * 7. A \ followed by a digit n matches a copy of the string that the | |
55 | * bracketed regular expression beginning with the nth \( matched. | |
56 | * 8. A regular expression of form 1-8, x, followed by a regular | |
57 | * expression of form 1-7, y matches a match for x followed by | |
58 | * a match for y, with the x match being as long as possible | |
59 | * while still permitting a y match. | |
60 | * 9. A regular expression of form 1-8 preceded by ^ (or followed | |
61 | * by $), is constrained to matches that begin at the left | |
62 | * (or end at the right) end of a line. | |
63 | * 10. A regular expression of form 1-9 picks out the longest among | |
64 | * the leftmost matches in a line. | |
65 | * 11. An empty regular expression stands for a copy of the last | |
66 | * regular expression encountered. | |
67 | */ | |
68 | ||
69 | /* | |
70 | * constants for re's | |
71 | */ | |
72 | #define CBRA 1 | |
73 | #define CCHR 2 | |
74 | #define CDOT 4 | |
75 | #define CCL 6 | |
76 | #define NCCL 8 | |
77 | #define CDOL 10 | |
78 | #define CEOF 11 | |
79 | #define CKET 12 | |
80 | #define CBACK 18 | |
81 | ||
82 | #define CSTAR 01 | |
83 | ||
84 | #define ESIZE 512 | |
85 | #define NBRA 9 | |
86 | ||
87 | static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; | |
88 | static char circf; | |
89 | ||
90 | /* | |
91 | * compile the regular expression argument into a dfa | |
92 | */ | |
93 | char * | |
94 | re_comp(sp) | |
95 | register char *sp; | |
96 | { | |
97 | register int c; | |
98 | register char *ep = expbuf; | |
99 | int cclcnt, numbra = 0; | |
100 | char *lastep = 0; | |
101 | char bracket[NBRA]; | |
102 | char *bracketp = &bracket[0]; | |
103 | static char *retoolong = "Regular expression too long"; | |
104 | ||
105 | #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } | |
106 | ||
107 | if (sp == 0 || *sp == '\0') { | |
108 | if (*ep == 0) | |
109 | return("No previous regular expression"); | |
110 | return(0); | |
111 | } | |
112 | if (*sp == '^') { | |
113 | circf = 1; | |
114 | sp++; | |
115 | } | |
116 | else | |
117 | circf = 0; | |
118 | for (;;) { | |
119 | if (ep >= &expbuf[ESIZE]) | |
120 | comerr(retoolong); | |
121 | if ((c = *sp++) == '\0') { | |
122 | if (bracketp != bracket) | |
123 | comerr("unmatched \\("); | |
124 | *ep++ = CEOF; | |
125 | *ep++ = 0; | |
126 | return(0); | |
127 | } | |
128 | if (c != '*') | |
129 | lastep = ep; | |
130 | switch (c) { | |
131 | ||
132 | case '.': | |
133 | *ep++ = CDOT; | |
134 | continue; | |
135 | ||
136 | case '*': | |
137 | if (lastep == 0 || *lastep == CBRA || *lastep == CKET) | |
138 | goto defchar; | |
139 | *lastep |= CSTAR; | |
140 | continue; | |
141 | ||
142 | case '$': | |
143 | if (*sp != '\0') | |
144 | goto defchar; | |
145 | *ep++ = CDOL; | |
146 | continue; | |
147 | ||
148 | case '[': | |
149 | *ep++ = CCL; | |
150 | *ep++ = 0; | |
151 | cclcnt = 1; | |
152 | if ((c = *sp++) == '^') { | |
153 | c = *sp++; | |
154 | ep[-2] = NCCL; | |
155 | } | |
156 | do { | |
157 | if (c == '\0') | |
158 | comerr("missing ]"); | |
159 | if (c == '-' && ep [-1] != 0) { | |
160 | if ((c = *sp++) == ']') { | |
161 | *ep++ = '-'; | |
162 | cclcnt++; | |
163 | break; | |
164 | } | |
165 | while (ep[-1] < c) { | |
166 | *ep = ep[-1] + 1; | |
167 | ep++; | |
168 | cclcnt++; | |
169 | if (ep >= &expbuf[ESIZE]) | |
170 | comerr(retoolong); | |
171 | } | |
172 | } | |
173 | *ep++ = c; | |
174 | cclcnt++; | |
175 | if (ep >= &expbuf[ESIZE]) | |
176 | comerr(retoolong); | |
177 | } while ((c = *sp++) != ']'); | |
178 | lastep[1] = cclcnt; | |
179 | continue; | |
180 | ||
181 | case '\\': | |
182 | if ((c = *sp++) == '(') { | |
183 | if (numbra >= NBRA) | |
184 | comerr("too many \\(\\) pairs"); | |
185 | *bracketp++ = numbra; | |
186 | *ep++ = CBRA; | |
187 | *ep++ = numbra++; | |
188 | continue; | |
189 | } | |
190 | if (c == ')') { | |
191 | if (bracketp <= bracket) | |
192 | comerr("unmatched \\)"); | |
193 | *ep++ = CKET; | |
194 | *ep++ = *--bracketp; | |
195 | continue; | |
196 | } | |
197 | if (c >= '1' && c < ('1' + NBRA)) { | |
198 | *ep++ = CBACK; | |
199 | *ep++ = c - '1'; | |
200 | continue; | |
201 | } | |
202 | *ep++ = CCHR; | |
203 | *ep++ = c; | |
204 | continue; | |
205 | ||
206 | defchar: | |
207 | default: | |
208 | *ep++ = CCHR; | |
209 | *ep++ = c; | |
210 | } | |
211 | } | |
212 | } | |
213 | ||
214 | /* | |
215 | * match the argument string against the compiled re | |
216 | */ | |
217 | int | |
218 | re_exec(p1) | |
219 | register char *p1; | |
220 | { | |
221 | register char *p2 = expbuf; | |
222 | register int c; | |
223 | int rv; | |
224 | ||
225 | for (c = 0; c < NBRA; c++) { | |
226 | braslist[c] = 0; | |
227 | braelist[c] = 0; | |
228 | } | |
229 | if (circf) | |
230 | return((advance(p1, p2))); | |
231 | /* | |
232 | * fast check for first character | |
233 | */ | |
234 | if (*p2 == CCHR) { | |
235 | c = p2[1]; | |
236 | do { | |
237 | if (*p1 != c) | |
238 | continue; | |
239 | if (rv = advance(p1, p2)) | |
240 | return(rv); | |
241 | } while (*p1++); | |
242 | return(0); | |
243 | } | |
244 | /* | |
245 | * regular algorithm | |
246 | */ | |
247 | do | |
248 | if (rv = advance(p1, p2)) | |
249 | return(rv); | |
250 | while (*p1++); | |
251 | return(0); | |
252 | } | |
253 | ||
254 | /* | |
255 | * try to match the next thing in the dfa | |
256 | */ | |
257 | static int | |
258 | advance(lp, ep) | |
259 | register char *lp, *ep; | |
260 | { | |
261 | register char *curlp; | |
262 | int ct, i; | |
263 | int rv; | |
264 | ||
265 | for (;;) | |
266 | switch (*ep++) { | |
267 | ||
268 | case CCHR: | |
269 | if (*ep++ == *lp++) | |
270 | continue; | |
271 | return(0); | |
272 | ||
273 | case CDOT: | |
274 | if (*lp++) | |
275 | continue; | |
276 | return(0); | |
277 | ||
278 | case CDOL: | |
279 | if (*lp == '\0') | |
280 | continue; | |
281 | return(0); | |
282 | ||
283 | case CEOF: | |
284 | return(1); | |
285 | ||
286 | case CCL: | |
287 | if (cclass(ep, *lp++, 1)) { | |
288 | ep += *ep; | |
289 | continue; | |
290 | } | |
291 | return(0); | |
292 | ||
293 | case NCCL: | |
294 | if (cclass(ep, *lp++, 0)) { | |
295 | ep += *ep; | |
296 | continue; | |
297 | } | |
298 | return(0); | |
299 | ||
300 | case CBRA: | |
301 | braslist[*ep++] = lp; | |
302 | continue; | |
303 | ||
304 | case CKET: | |
305 | braelist[*ep++] = lp; | |
306 | continue; | |
307 | ||
308 | case CBACK: | |
309 | if (braelist[i = *ep++] == 0) | |
310 | return(-1); | |
311 | if (backref(i, lp)) { | |
312 | lp += braelist[i] - braslist[i]; | |
313 | continue; | |
314 | } | |
315 | return(0); | |
316 | ||
317 | case CBACK|CSTAR: | |
318 | if (braelist[i = *ep++] == 0) | |
319 | return(-1); | |
320 | curlp = lp; | |
321 | ct = braelist[i] - braslist[i]; | |
322 | while (backref(i, lp)) | |
323 | lp += ct; | |
324 | while (lp >= curlp) { | |
325 | if (rv = advance(lp, ep)) | |
326 | return(rv); | |
327 | lp -= ct; | |
328 | } | |
329 | continue; | |
330 | ||
331 | case CDOT|CSTAR: | |
332 | curlp = lp; | |
333 | while (*lp++) | |
334 | ; | |
335 | goto star; | |
336 | ||
337 | case CCHR|CSTAR: | |
338 | curlp = lp; | |
339 | while (*lp++ == *ep) | |
340 | ; | |
341 | ep++; | |
342 | goto star; | |
343 | ||
344 | case CCL|CSTAR: | |
345 | case NCCL|CSTAR: | |
346 | curlp = lp; | |
347 | while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) | |
348 | ; | |
349 | ep += *ep; | |
350 | goto star; | |
351 | ||
352 | star: | |
353 | do { | |
354 | lp--; | |
355 | if (rv = advance(lp, ep)) | |
356 | return(rv); | |
357 | } while (lp > curlp); | |
358 | return(0); | |
359 | ||
360 | default: | |
361 | return(-1); | |
362 | } | |
363 | } | |
364 | ||
365 | backref(i, lp) | |
366 | register int i; | |
367 | register char *lp; | |
368 | { | |
369 | register char *bp; | |
370 | ||
371 | bp = braslist[i]; | |
372 | while (*bp++ == *lp++) | |
373 | if (bp >= braelist[i]) | |
374 | return(1); | |
375 | return(0); | |
376 | } | |
377 | ||
378 | int | |
379 | cclass(set, c, af) | |
380 | register char *set, c; | |
381 | int af; | |
382 | { | |
383 | register int n; | |
384 | ||
385 | if (c == 0) | |
386 | return(0); | |
387 | n = *set++; | |
388 | while (--n) | |
389 | if (*set++ == c) | |
390 | return(af); | |
391 | return(! af); | |
392 | } |