advance snd_max even when persisting
[unix-history] / usr / src / lib / libcompat / 4.3 / regex.c
CommitLineData
af509ab3
BJ
1/* @(#)regex.c 4.1 (Berkeley) %G% */
2#
3
4/*
5 * routines to do regular expression matching
6 *
7 * Entry points:
8 *
9 * re_comp(s)
10 * char *s;
11 * ... returns 0 if the string s was compiled successfully,
12 * a pointer to an error message otherwise.
13 * If passed 0 or a null string returns without changing
14 * the currently compiled re (see note 11 below).
15 *
16 * re_exec(s)
17 * char *s;
18 * ... returns 1 if the string s matches the last compiled regular
19 * expression,
20 * 0 if the string s failed to match the last compiled
21 * regular expression, and
22 * -1 if the compiled regular expression was invalid
23 * (indicating an internal error).
24 *
25 * The strings passed to both re_comp and re_exec may have trailing or
26 * embedded newline characters; they are terminated by nulls.
27 *
28 * The identity of the author of these routines is lost in antiquity;
29 * this is essentially the same as the re code in the original V6 ed.
30 *
31 * The regular expressions recognized are described below. This description
32 * is essentially the same as that for ed.
33 *
34 * A regular expression specifies a set of strings of characters.
35 * A member of this set of strings is said to be matched by
36 * the regular expression. In the following specification for
37 * regular expressions the word `character' means any character but NUL.
38 *
39 * 1. Any character except a special character matches itself.
40 * Special characters are the regular expression delimiter plus
41 * \ [ . and sometimes ^ * $.
42 * 2. A . matches any character.
43 * 3. A \ followed by any character except a digit or ( )
44 * matches that character.
45 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
46 * character in (or not in) s. In s, \ has no special meaning,
47 * and ] may only appear as the first letter. A substring
48 * a-b, with a and b in ascending ASCII order, stands for
49 * the inclusive range of ASCII characters.
50 * 5. A regular expression of form 1-4 followed by * matches a
51 * sequence of 0 or more matches of the regular expression.
52 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
53 * matches what x matches.
54 * 7. A \ followed by a digit n matches a copy of the string that the
55 * bracketed regular expression beginning with the nth \( matched.
56 * 8. A regular expression of form 1-8, x, followed by a regular
57 * expression of form 1-7, y matches a match for x followed by
58 * a match for y, with the x match being as long as possible
59 * while still permitting a y match.
60 * 9. A regular expression of form 1-8 preceded by ^ (or followed
61 * by $), is constrained to matches that begin at the left
62 * (or end at the right) end of a line.
63 * 10. A regular expression of form 1-9 picks out the longest among
64 * the leftmost matches in a line.
65 * 11. An empty regular expression stands for a copy of the last
66 * regular expression encountered.
67 */
68
69/*
70 * constants for re's
71 */
72#define CBRA 1
73#define CCHR 2
74#define CDOT 4
75#define CCL 6
76#define NCCL 8
77#define CDOL 10
78#define CEOF 11
79#define CKET 12
80#define CBACK 18
81
82#define CSTAR 01
83
84#define ESIZE 512
85#define NBRA 9
86
87static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
88static char circf;
89
90/*
91 * compile the regular expression argument into a dfa
92 */
93char *
94re_comp(sp)
95 register char *sp;
96{
97 register int c;
98 register char *ep = expbuf;
99 int cclcnt, numbra = 0;
100 char *lastep = 0;
101 char bracket[NBRA];
102 char *bracketp = &bracket[0];
103 static char *retoolong = "Regular expression too long";
104
105#define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
106
107 if (sp == 0 || *sp == '\0') {
108 if (*ep == 0)
109 return("No previous regular expression");
110 return(0);
111 }
112 if (*sp == '^') {
113 circf = 1;
114 sp++;
115 }
116 else
117 circf = 0;
118 for (;;) {
119 if (ep >= &expbuf[ESIZE])
120 comerr(retoolong);
121 if ((c = *sp++) == '\0') {
122 if (bracketp != bracket)
123 comerr("unmatched \\(");
124 *ep++ = CEOF;
125 *ep++ = 0;
126 return(0);
127 }
128 if (c != '*')
129 lastep = ep;
130 switch (c) {
131
132 case '.':
133 *ep++ = CDOT;
134 continue;
135
136 case '*':
137 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
138 goto defchar;
139 *lastep |= CSTAR;
140 continue;
141
142 case '$':
143 if (*sp != '\0')
144 goto defchar;
145 *ep++ = CDOL;
146 continue;
147
148 case '[':
149 *ep++ = CCL;
150 *ep++ = 0;
151 cclcnt = 1;
152 if ((c = *sp++) == '^') {
153 c = *sp++;
154 ep[-2] = NCCL;
155 }
156 do {
157 if (c == '\0')
158 comerr("missing ]");
159 if (c == '-' && ep [-1] != 0) {
160 if ((c = *sp++) == ']') {
161 *ep++ = '-';
162 cclcnt++;
163 break;
164 }
165 while (ep[-1] < c) {
166 *ep = ep[-1] + 1;
167 ep++;
168 cclcnt++;
169 if (ep >= &expbuf[ESIZE])
170 comerr(retoolong);
171 }
172 }
173 *ep++ = c;
174 cclcnt++;
175 if (ep >= &expbuf[ESIZE])
176 comerr(retoolong);
177 } while ((c = *sp++) != ']');
178 lastep[1] = cclcnt;
179 continue;
180
181 case '\\':
182 if ((c = *sp++) == '(') {
183 if (numbra >= NBRA)
184 comerr("too many \\(\\) pairs");
185 *bracketp++ = numbra;
186 *ep++ = CBRA;
187 *ep++ = numbra++;
188 continue;
189 }
190 if (c == ')') {
191 if (bracketp <= bracket)
192 comerr("unmatched \\)");
193 *ep++ = CKET;
194 *ep++ = *--bracketp;
195 continue;
196 }
197 if (c >= '1' && c < ('1' + NBRA)) {
198 *ep++ = CBACK;
199 *ep++ = c - '1';
200 continue;
201 }
202 *ep++ = CCHR;
203 *ep++ = c;
204 continue;
205
206 defchar:
207 default:
208 *ep++ = CCHR;
209 *ep++ = c;
210 }
211 }
212}
213
214/*
215 * match the argument string against the compiled re
216 */
217int
218re_exec(p1)
219 register char *p1;
220{
221 register char *p2 = expbuf;
222 register int c;
223 int rv;
224
225 for (c = 0; c < NBRA; c++) {
226 braslist[c] = 0;
227 braelist[c] = 0;
228 }
229 if (circf)
230 return((advance(p1, p2)));
231 /*
232 * fast check for first character
233 */
234 if (*p2 == CCHR) {
235 c = p2[1];
236 do {
237 if (*p1 != c)
238 continue;
239 if (rv = advance(p1, p2))
240 return(rv);
241 } while (*p1++);
242 return(0);
243 }
244 /*
245 * regular algorithm
246 */
247 do
248 if (rv = advance(p1, p2))
249 return(rv);
250 while (*p1++);
251 return(0);
252}
253
254/*
255 * try to match the next thing in the dfa
256 */
257static int
258advance(lp, ep)
259 register char *lp, *ep;
260{
261 register char *curlp;
262 int ct, i;
263 int rv;
264
265 for (;;)
266 switch (*ep++) {
267
268 case CCHR:
269 if (*ep++ == *lp++)
270 continue;
271 return(0);
272
273 case CDOT:
274 if (*lp++)
275 continue;
276 return(0);
277
278 case CDOL:
279 if (*lp == '\0')
280 continue;
281 return(0);
282
283 case CEOF:
284 return(1);
285
286 case CCL:
287 if (cclass(ep, *lp++, 1)) {
288 ep += *ep;
289 continue;
290 }
291 return(0);
292
293 case NCCL:
294 if (cclass(ep, *lp++, 0)) {
295 ep += *ep;
296 continue;
297 }
298 return(0);
299
300 case CBRA:
301 braslist[*ep++] = lp;
302 continue;
303
304 case CKET:
305 braelist[*ep++] = lp;
306 continue;
307
308 case CBACK:
309 if (braelist[i = *ep++] == 0)
310 return(-1);
311 if (backref(i, lp)) {
312 lp += braelist[i] - braslist[i];
313 continue;
314 }
315 return(0);
316
317 case CBACK|CSTAR:
318 if (braelist[i = *ep++] == 0)
319 return(-1);
320 curlp = lp;
321 ct = braelist[i] - braslist[i];
322 while (backref(i, lp))
323 lp += ct;
324 while (lp >= curlp) {
325 if (rv = advance(lp, ep))
326 return(rv);
327 lp -= ct;
328 }
329 continue;
330
331 case CDOT|CSTAR:
332 curlp = lp;
333 while (*lp++)
334 ;
335 goto star;
336
337 case CCHR|CSTAR:
338 curlp = lp;
339 while (*lp++ == *ep)
340 ;
341 ep++;
342 goto star;
343
344 case CCL|CSTAR:
345 case NCCL|CSTAR:
346 curlp = lp;
347 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
348 ;
349 ep += *ep;
350 goto star;
351
352 star:
353 do {
354 lp--;
355 if (rv = advance(lp, ep))
356 return(rv);
357 } while (lp > curlp);
358 return(0);
359
360 default:
361 return(-1);
362 }
363}
364
365backref(i, lp)
366 register int i;
367 register char *lp;
368{
369 register char *bp;
370
371 bp = braslist[i];
372 while (*bp++ == *lp++)
373 if (bp >= braelist[i])
374 return(1);
375 return(0);
376}
377
378int
379cclass(set, c, af)
380 register char *set, c;
381 int af;
382{
383 register int n;
384
385 if (c == 0)
386 return(0);
387 n = *set++;
388 while (--n)
389 if (*set++ == c)
390 return(af);
391 return(! af);
392}