Commit | Line | Data |
---|---|---|
bb0cfa24 DF |
1 | /* |
2 | * Copyright (c) 1980 Regents of the University of California. | |
3 | * All rights reserved. The Berkeley software License Agreement | |
4 | * specifies the terms and conditions for redistribution. | |
5 | */ | |
6 | ||
2ce81398 DS |
7 | #if defined(LIBC_SCCS) && !defined(lint) |
8 | static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) %G%"; | |
9 | #endif LIBC_SCCS and not lint | |
bb0cfa24 | 10 | |
af509ab3 BJ |
11 | # |
12 | ||
13 | /* | |
14 | * routines to do regular expression matching | |
15 | * | |
16 | * Entry points: | |
17 | * | |
18 | * re_comp(s) | |
19 | * char *s; | |
20 | * ... returns 0 if the string s was compiled successfully, | |
21 | * a pointer to an error message otherwise. | |
22 | * If passed 0 or a null string returns without changing | |
23 | * the currently compiled re (see note 11 below). | |
24 | * | |
25 | * re_exec(s) | |
26 | * char *s; | |
27 | * ... returns 1 if the string s matches the last compiled regular | |
28 | * expression, | |
29 | * 0 if the string s failed to match the last compiled | |
30 | * regular expression, and | |
31 | * -1 if the compiled regular expression was invalid | |
32 | * (indicating an internal error). | |
33 | * | |
34 | * The strings passed to both re_comp and re_exec may have trailing or | |
35 | * embedded newline characters; they are terminated by nulls. | |
36 | * | |
37 | * The identity of the author of these routines is lost in antiquity; | |
38 | * this is essentially the same as the re code in the original V6 ed. | |
39 | * | |
40 | * The regular expressions recognized are described below. This description | |
41 | * is essentially the same as that for ed. | |
42 | * | |
43 | * A regular expression specifies a set of strings of characters. | |
44 | * A member of this set of strings is said to be matched by | |
45 | * the regular expression. In the following specification for | |
46 | * regular expressions the word `character' means any character but NUL. | |
47 | * | |
48 | * 1. Any character except a special character matches itself. | |
49 | * Special characters are the regular expression delimiter plus | |
50 | * \ [ . and sometimes ^ * $. | |
51 | * 2. A . matches any character. | |
52 | * 3. A \ followed by any character except a digit or ( ) | |
53 | * matches that character. | |
54 | * 4. A nonempty string s bracketed [s] (or [^s]) matches any | |
55 | * character in (or not in) s. In s, \ has no special meaning, | |
56 | * and ] may only appear as the first letter. A substring | |
57 | * a-b, with a and b in ascending ASCII order, stands for | |
58 | * the inclusive range of ASCII characters. | |
59 | * 5. A regular expression of form 1-4 followed by * matches a | |
60 | * sequence of 0 or more matches of the regular expression. | |
61 | * 6. A regular expression, x, of form 1-8, bracketed \(x\) | |
62 | * matches what x matches. | |
63 | * 7. A \ followed by a digit n matches a copy of the string that the | |
64 | * bracketed regular expression beginning with the nth \( matched. | |
65 | * 8. A regular expression of form 1-8, x, followed by a regular | |
66 | * expression of form 1-7, y matches a match for x followed by | |
67 | * a match for y, with the x match being as long as possible | |
68 | * while still permitting a y match. | |
69 | * 9. A regular expression of form 1-8 preceded by ^ (or followed | |
70 | * by $), is constrained to matches that begin at the left | |
71 | * (or end at the right) end of a line. | |
72 | * 10. A regular expression of form 1-9 picks out the longest among | |
73 | * the leftmost matches in a line. | |
74 | * 11. An empty regular expression stands for a copy of the last | |
75 | * regular expression encountered. | |
76 | */ | |
77 | ||
78 | /* | |
79 | * constants for re's | |
80 | */ | |
81 | #define CBRA 1 | |
82 | #define CCHR 2 | |
83 | #define CDOT 4 | |
84 | #define CCL 6 | |
85 | #define NCCL 8 | |
86 | #define CDOL 10 | |
87 | #define CEOF 11 | |
88 | #define CKET 12 | |
89 | #define CBACK 18 | |
90 | ||
91 | #define CSTAR 01 | |
92 | ||
93 | #define ESIZE 512 | |
94 | #define NBRA 9 | |
95 | ||
96 | static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; | |
97 | static char circf; | |
98 | ||
99 | /* | |
100 | * compile the regular expression argument into a dfa | |
101 | */ | |
102 | char * | |
103 | re_comp(sp) | |
104 | register char *sp; | |
105 | { | |
106 | register int c; | |
107 | register char *ep = expbuf; | |
108 | int cclcnt, numbra = 0; | |
109 | char *lastep = 0; | |
110 | char bracket[NBRA]; | |
111 | char *bracketp = &bracket[0]; | |
112 | static char *retoolong = "Regular expression too long"; | |
113 | ||
114 | #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } | |
115 | ||
116 | if (sp == 0 || *sp == '\0') { | |
117 | if (*ep == 0) | |
118 | return("No previous regular expression"); | |
119 | return(0); | |
120 | } | |
121 | if (*sp == '^') { | |
122 | circf = 1; | |
123 | sp++; | |
124 | } | |
125 | else | |
126 | circf = 0; | |
127 | for (;;) { | |
128 | if (ep >= &expbuf[ESIZE]) | |
129 | comerr(retoolong); | |
130 | if ((c = *sp++) == '\0') { | |
131 | if (bracketp != bracket) | |
132 | comerr("unmatched \\("); | |
133 | *ep++ = CEOF; | |
134 | *ep++ = 0; | |
135 | return(0); | |
136 | } | |
137 | if (c != '*') | |
138 | lastep = ep; | |
139 | switch (c) { | |
140 | ||
141 | case '.': | |
142 | *ep++ = CDOT; | |
143 | continue; | |
144 | ||
145 | case '*': | |
146 | if (lastep == 0 || *lastep == CBRA || *lastep == CKET) | |
147 | goto defchar; | |
148 | *lastep |= CSTAR; | |
149 | continue; | |
150 | ||
151 | case '$': | |
152 | if (*sp != '\0') | |
153 | goto defchar; | |
154 | *ep++ = CDOL; | |
155 | continue; | |
156 | ||
157 | case '[': | |
158 | *ep++ = CCL; | |
159 | *ep++ = 0; | |
160 | cclcnt = 1; | |
161 | if ((c = *sp++) == '^') { | |
162 | c = *sp++; | |
163 | ep[-2] = NCCL; | |
164 | } | |
165 | do { | |
166 | if (c == '\0') | |
167 | comerr("missing ]"); | |
168 | if (c == '-' && ep [-1] != 0) { | |
169 | if ((c = *sp++) == ']') { | |
170 | *ep++ = '-'; | |
171 | cclcnt++; | |
172 | break; | |
173 | } | |
174 | while (ep[-1] < c) { | |
175 | *ep = ep[-1] + 1; | |
176 | ep++; | |
177 | cclcnt++; | |
178 | if (ep >= &expbuf[ESIZE]) | |
179 | comerr(retoolong); | |
180 | } | |
181 | } | |
182 | *ep++ = c; | |
183 | cclcnt++; | |
184 | if (ep >= &expbuf[ESIZE]) | |
185 | comerr(retoolong); | |
186 | } while ((c = *sp++) != ']'); | |
187 | lastep[1] = cclcnt; | |
188 | continue; | |
189 | ||
190 | case '\\': | |
191 | if ((c = *sp++) == '(') { | |
192 | if (numbra >= NBRA) | |
193 | comerr("too many \\(\\) pairs"); | |
194 | *bracketp++ = numbra; | |
195 | *ep++ = CBRA; | |
196 | *ep++ = numbra++; | |
197 | continue; | |
198 | } | |
199 | if (c == ')') { | |
200 | if (bracketp <= bracket) | |
201 | comerr("unmatched \\)"); | |
202 | *ep++ = CKET; | |
203 | *ep++ = *--bracketp; | |
204 | continue; | |
205 | } | |
206 | if (c >= '1' && c < ('1' + NBRA)) { | |
207 | *ep++ = CBACK; | |
208 | *ep++ = c - '1'; | |
209 | continue; | |
210 | } | |
211 | *ep++ = CCHR; | |
212 | *ep++ = c; | |
213 | continue; | |
214 | ||
215 | defchar: | |
216 | default: | |
217 | *ep++ = CCHR; | |
218 | *ep++ = c; | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
223 | /* | |
224 | * match the argument string against the compiled re | |
225 | */ | |
226 | int | |
227 | re_exec(p1) | |
228 | register char *p1; | |
229 | { | |
230 | register char *p2 = expbuf; | |
231 | register int c; | |
232 | int rv; | |
233 | ||
234 | for (c = 0; c < NBRA; c++) { | |
235 | braslist[c] = 0; | |
236 | braelist[c] = 0; | |
237 | } | |
238 | if (circf) | |
239 | return((advance(p1, p2))); | |
240 | /* | |
241 | * fast check for first character | |
242 | */ | |
243 | if (*p2 == CCHR) { | |
244 | c = p2[1]; | |
245 | do { | |
246 | if (*p1 != c) | |
247 | continue; | |
248 | if (rv = advance(p1, p2)) | |
249 | return(rv); | |
250 | } while (*p1++); | |
251 | return(0); | |
252 | } | |
253 | /* | |
254 | * regular algorithm | |
255 | */ | |
256 | do | |
257 | if (rv = advance(p1, p2)) | |
258 | return(rv); | |
259 | while (*p1++); | |
260 | return(0); | |
261 | } | |
262 | ||
263 | /* | |
264 | * try to match the next thing in the dfa | |
265 | */ | |
266 | static int | |
267 | advance(lp, ep) | |
268 | register char *lp, *ep; | |
269 | { | |
270 | register char *curlp; | |
271 | int ct, i; | |
272 | int rv; | |
273 | ||
274 | for (;;) | |
275 | switch (*ep++) { | |
276 | ||
277 | case CCHR: | |
278 | if (*ep++ == *lp++) | |
279 | continue; | |
280 | return(0); | |
281 | ||
282 | case CDOT: | |
283 | if (*lp++) | |
284 | continue; | |
285 | return(0); | |
286 | ||
287 | case CDOL: | |
288 | if (*lp == '\0') | |
289 | continue; | |
290 | return(0); | |
291 | ||
292 | case CEOF: | |
293 | return(1); | |
294 | ||
295 | case CCL: | |
296 | if (cclass(ep, *lp++, 1)) { | |
297 | ep += *ep; | |
298 | continue; | |
299 | } | |
300 | return(0); | |
301 | ||
302 | case NCCL: | |
303 | if (cclass(ep, *lp++, 0)) { | |
304 | ep += *ep; | |
305 | continue; | |
306 | } | |
307 | return(0); | |
308 | ||
309 | case CBRA: | |
310 | braslist[*ep++] = lp; | |
311 | continue; | |
312 | ||
313 | case CKET: | |
314 | braelist[*ep++] = lp; | |
315 | continue; | |
316 | ||
317 | case CBACK: | |
318 | if (braelist[i = *ep++] == 0) | |
319 | return(-1); | |
320 | if (backref(i, lp)) { | |
321 | lp += braelist[i] - braslist[i]; | |
322 | continue; | |
323 | } | |
324 | return(0); | |
325 | ||
326 | case CBACK|CSTAR: | |
327 | if (braelist[i = *ep++] == 0) | |
328 | return(-1); | |
329 | curlp = lp; | |
330 | ct = braelist[i] - braslist[i]; | |
331 | while (backref(i, lp)) | |
332 | lp += ct; | |
333 | while (lp >= curlp) { | |
334 | if (rv = advance(lp, ep)) | |
335 | return(rv); | |
336 | lp -= ct; | |
337 | } | |
338 | continue; | |
339 | ||
340 | case CDOT|CSTAR: | |
341 | curlp = lp; | |
342 | while (*lp++) | |
343 | ; | |
344 | goto star; | |
345 | ||
346 | case CCHR|CSTAR: | |
347 | curlp = lp; | |
348 | while (*lp++ == *ep) | |
349 | ; | |
350 | ep++; | |
351 | goto star; | |
352 | ||
353 | case CCL|CSTAR: | |
354 | case NCCL|CSTAR: | |
355 | curlp = lp; | |
356 | while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) | |
357 | ; | |
358 | ep += *ep; | |
359 | goto star; | |
360 | ||
361 | star: | |
362 | do { | |
363 | lp--; | |
364 | if (rv = advance(lp, ep)) | |
365 | return(rv); | |
366 | } while (lp > curlp); | |
367 | return(0); | |
368 | ||
369 | default: | |
370 | return(-1); | |
371 | } | |
372 | } | |
373 | ||
374 | backref(i, lp) | |
375 | register int i; | |
376 | register char *lp; | |
377 | { | |
378 | register char *bp; | |
379 | ||
380 | bp = braslist[i]; | |
381 | while (*bp++ == *lp++) | |
382 | if (bp >= braelist[i]) | |
383 | return(1); | |
384 | return(0); | |
385 | } | |
386 | ||
387 | int | |
388 | cclass(set, c, af) | |
389 | register char *set, c; | |
390 | int af; | |
391 | { | |
392 | register int n; | |
393 | ||
394 | if (c == 0) | |
395 | return(0); | |
396 | n = *set++; | |
397 | while (--n) | |
398 | if (*set++ == c) | |
399 | return(af); | |
400 | return(! af); | |
401 | } |