Add include files to get prototype declarations, and fix bugs found.
[unix-history] / usr / src / lib / libcompat / 4.3 / regex.c
CommitLineData
bb0cfa24
DF
1/*
2 * Copyright (c) 1980 Regents of the University of California.
3 * All rights reserved. The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 */
6
2ce81398 7#if defined(LIBC_SCCS) && !defined(lint)
c5980113 8static char sccsid[] = "@(#)regex.c 5.4 (Berkeley) %G%";
2ce81398 9#endif LIBC_SCCS and not lint
bb0cfa24 10
c5980113
DS
11#include <unistd.h>
12
13static int advance();
14static int backref();
15static int cclass();
af509ab3
BJ
16
17/*
18 * routines to do regular expression matching
19 *
20 * Entry points:
21 *
22 * re_comp(s)
23 * char *s;
24 * ... returns 0 if the string s was compiled successfully,
25 * a pointer to an error message otherwise.
26 * If passed 0 or a null string returns without changing
27 * the currently compiled re (see note 11 below).
28 *
29 * re_exec(s)
30 * char *s;
31 * ... returns 1 if the string s matches the last compiled regular
32 * expression,
33 * 0 if the string s failed to match the last compiled
34 * regular expression, and
35 * -1 if the compiled regular expression was invalid
36 * (indicating an internal error).
37 *
38 * The strings passed to both re_comp and re_exec may have trailing or
39 * embedded newline characters; they are terminated by nulls.
40 *
41 * The identity of the author of these routines is lost in antiquity;
42 * this is essentially the same as the re code in the original V6 ed.
43 *
44 * The regular expressions recognized are described below. This description
45 * is essentially the same as that for ed.
46 *
47 * A regular expression specifies a set of strings of characters.
48 * A member of this set of strings is said to be matched by
49 * the regular expression. In the following specification for
50 * regular expressions the word `character' means any character but NUL.
51 *
52 * 1. Any character except a special character matches itself.
53 * Special characters are the regular expression delimiter plus
54 * \ [ . and sometimes ^ * $.
55 * 2. A . matches any character.
56 * 3. A \ followed by any character except a digit or ( )
57 * matches that character.
58 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
59 * character in (or not in) s. In s, \ has no special meaning,
60 * and ] may only appear as the first letter. A substring
61 * a-b, with a and b in ascending ASCII order, stands for
62 * the inclusive range of ASCII characters.
63 * 5. A regular expression of form 1-4 followed by * matches a
64 * sequence of 0 or more matches of the regular expression.
65 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
66 * matches what x matches.
67 * 7. A \ followed by a digit n matches a copy of the string that the
68 * bracketed regular expression beginning with the nth \( matched.
69 * 8. A regular expression of form 1-8, x, followed by a regular
70 * expression of form 1-7, y matches a match for x followed by
71 * a match for y, with the x match being as long as possible
72 * while still permitting a y match.
73 * 9. A regular expression of form 1-8 preceded by ^ (or followed
74 * by $), is constrained to matches that begin at the left
75 * (or end at the right) end of a line.
76 * 10. A regular expression of form 1-9 picks out the longest among
77 * the leftmost matches in a line.
78 * 11. An empty regular expression stands for a copy of the last
79 * regular expression encountered.
80 */
81
82/*
83 * constants for re's
84 */
85#define CBRA 1
86#define CCHR 2
87#define CDOT 4
88#define CCL 6
89#define NCCL 8
90#define CDOL 10
91#define CEOF 11
92#define CKET 12
93#define CBACK 18
94
95#define CSTAR 01
96
97#define ESIZE 512
98#define NBRA 9
99
100static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
101static char circf;
102
103/*
104 * compile the regular expression argument into a dfa
105 */
106char *
107re_comp(sp)
c5980113 108 register const char *sp;
af509ab3
BJ
109{
110 register int c;
111 register char *ep = expbuf;
112 int cclcnt, numbra = 0;
113 char *lastep = 0;
114 char bracket[NBRA];
115 char *bracketp = &bracket[0];
116 static char *retoolong = "Regular expression too long";
117
118#define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
119
120 if (sp == 0 || *sp == '\0') {
121 if (*ep == 0)
122 return("No previous regular expression");
123 return(0);
124 }
125 if (*sp == '^') {
126 circf = 1;
127 sp++;
128 }
129 else
130 circf = 0;
131 for (;;) {
132 if (ep >= &expbuf[ESIZE])
133 comerr(retoolong);
134 if ((c = *sp++) == '\0') {
135 if (bracketp != bracket)
136 comerr("unmatched \\(");
137 *ep++ = CEOF;
138 *ep++ = 0;
139 return(0);
140 }
141 if (c != '*')
142 lastep = ep;
143 switch (c) {
144
145 case '.':
146 *ep++ = CDOT;
147 continue;
148
149 case '*':
150 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
151 goto defchar;
152 *lastep |= CSTAR;
153 continue;
154
155 case '$':
156 if (*sp != '\0')
157 goto defchar;
158 *ep++ = CDOL;
159 continue;
160
161 case '[':
162 *ep++ = CCL;
163 *ep++ = 0;
164 cclcnt = 1;
165 if ((c = *sp++) == '^') {
166 c = *sp++;
167 ep[-2] = NCCL;
168 }
169 do {
170 if (c == '\0')
171 comerr("missing ]");
172 if (c == '-' && ep [-1] != 0) {
173 if ((c = *sp++) == ']') {
174 *ep++ = '-';
175 cclcnt++;
176 break;
177 }
178 while (ep[-1] < c) {
179 *ep = ep[-1] + 1;
180 ep++;
181 cclcnt++;
182 if (ep >= &expbuf[ESIZE])
183 comerr(retoolong);
184 }
185 }
186 *ep++ = c;
187 cclcnt++;
188 if (ep >= &expbuf[ESIZE])
189 comerr(retoolong);
190 } while ((c = *sp++) != ']');
191 lastep[1] = cclcnt;
192 continue;
193
194 case '\\':
195 if ((c = *sp++) == '(') {
196 if (numbra >= NBRA)
197 comerr("too many \\(\\) pairs");
198 *bracketp++ = numbra;
199 *ep++ = CBRA;
200 *ep++ = numbra++;
201 continue;
202 }
203 if (c == ')') {
204 if (bracketp <= bracket)
205 comerr("unmatched \\)");
206 *ep++ = CKET;
207 *ep++ = *--bracketp;
208 continue;
209 }
210 if (c >= '1' && c < ('1' + NBRA)) {
211 *ep++ = CBACK;
212 *ep++ = c - '1';
213 continue;
214 }
215 *ep++ = CCHR;
216 *ep++ = c;
217 continue;
218
219 defchar:
220 default:
221 *ep++ = CCHR;
222 *ep++ = c;
223 }
224 }
225}
226
227/*
228 * match the argument string against the compiled re
229 */
230int
231re_exec(p1)
c5980113 232 register const char *p1;
af509ab3
BJ
233{
234 register char *p2 = expbuf;
235 register int c;
236 int rv;
237
238 for (c = 0; c < NBRA; c++) {
239 braslist[c] = 0;
240 braelist[c] = 0;
241 }
242 if (circf)
243 return((advance(p1, p2)));
244 /*
245 * fast check for first character
246 */
247 if (*p2 == CCHR) {
248 c = p2[1];
249 do {
250 if (*p1 != c)
251 continue;
252 if (rv = advance(p1, p2))
253 return(rv);
254 } while (*p1++);
255 return(0);
256 }
257 /*
258 * regular algorithm
259 */
260 do
261 if (rv = advance(p1, p2))
262 return(rv);
263 while (*p1++);
264 return(0);
265}
266
267/*
268 * try to match the next thing in the dfa
269 */
270static int
271advance(lp, ep)
272 register char *lp, *ep;
273{
274 register char *curlp;
275 int ct, i;
276 int rv;
277
278 for (;;)
279 switch (*ep++) {
280
281 case CCHR:
282 if (*ep++ == *lp++)
283 continue;
284 return(0);
285
286 case CDOT:
287 if (*lp++)
288 continue;
289 return(0);
290
291 case CDOL:
292 if (*lp == '\0')
293 continue;
294 return(0);
295
296 case CEOF:
297 return(1);
298
299 case CCL:
300 if (cclass(ep, *lp++, 1)) {
301 ep += *ep;
302 continue;
303 }
304 return(0);
305
306 case NCCL:
307 if (cclass(ep, *lp++, 0)) {
308 ep += *ep;
309 continue;
310 }
311 return(0);
312
313 case CBRA:
314 braslist[*ep++] = lp;
315 continue;
316
317 case CKET:
318 braelist[*ep++] = lp;
319 continue;
320
321 case CBACK:
322 if (braelist[i = *ep++] == 0)
323 return(-1);
324 if (backref(i, lp)) {
325 lp += braelist[i] - braslist[i];
326 continue;
327 }
328 return(0);
329
330 case CBACK|CSTAR:
331 if (braelist[i = *ep++] == 0)
332 return(-1);
333 curlp = lp;
334 ct = braelist[i] - braslist[i];
335 while (backref(i, lp))
336 lp += ct;
337 while (lp >= curlp) {
338 if (rv = advance(lp, ep))
339 return(rv);
340 lp -= ct;
341 }
342 continue;
343
344 case CDOT|CSTAR:
345 curlp = lp;
346 while (*lp++)
347 ;
348 goto star;
349
350 case CCHR|CSTAR:
351 curlp = lp;
352 while (*lp++ == *ep)
353 ;
354 ep++;
355 goto star;
356
357 case CCL|CSTAR:
358 case NCCL|CSTAR:
359 curlp = lp;
360 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
361 ;
362 ep += *ep;
363 goto star;
364
365 star:
366 do {
367 lp--;
368 if (rv = advance(lp, ep))
369 return(rv);
370 } while (lp > curlp);
371 return(0);
372
373 default:
374 return(-1);
375 }
376}
377
90f51f9a 378static
af509ab3
BJ
379backref(i, lp)
380 register int i;
381 register char *lp;
382{
383 register char *bp;
384
385 bp = braslist[i];
386 while (*bp++ == *lp++)
387 if (bp >= braelist[i])
388 return(1);
389 return(0);
390}
391
90f51f9a 392static int
af509ab3
BJ
393cclass(set, c, af)
394 register char *set, c;
395 int af;
396{
397 register int n;
398
399 if (c == 0)
400 return(0);
401 n = *set++;
402 while (--n)
403 if (*set++ == c)
404 return(af);
405 return(! af);
406}