added LIBC_SCCS condition for sccs ids
[unix-history] / usr / src / lib / libcompat / 4.3 / regex.c
CommitLineData
bb0cfa24
DF
1/*
2 * Copyright (c) 1980 Regents of the University of California.
3 * All rights reserved. The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 */
6
2ce81398
DS
7#if defined(LIBC_SCCS) && !defined(lint)
8static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) %G%";
9#endif LIBC_SCCS and not lint
bb0cfa24 10
af509ab3
BJ
11#
12
13/*
14 * routines to do regular expression matching
15 *
16 * Entry points:
17 *
18 * re_comp(s)
19 * char *s;
20 * ... returns 0 if the string s was compiled successfully,
21 * a pointer to an error message otherwise.
22 * If passed 0 or a null string returns without changing
23 * the currently compiled re (see note 11 below).
24 *
25 * re_exec(s)
26 * char *s;
27 * ... returns 1 if the string s matches the last compiled regular
28 * expression,
29 * 0 if the string s failed to match the last compiled
30 * regular expression, and
31 * -1 if the compiled regular expression was invalid
32 * (indicating an internal error).
33 *
34 * The strings passed to both re_comp and re_exec may have trailing or
35 * embedded newline characters; they are terminated by nulls.
36 *
37 * The identity of the author of these routines is lost in antiquity;
38 * this is essentially the same as the re code in the original V6 ed.
39 *
40 * The regular expressions recognized are described below. This description
41 * is essentially the same as that for ed.
42 *
43 * A regular expression specifies a set of strings of characters.
44 * A member of this set of strings is said to be matched by
45 * the regular expression. In the following specification for
46 * regular expressions the word `character' means any character but NUL.
47 *
48 * 1. Any character except a special character matches itself.
49 * Special characters are the regular expression delimiter plus
50 * \ [ . and sometimes ^ * $.
51 * 2. A . matches any character.
52 * 3. A \ followed by any character except a digit or ( )
53 * matches that character.
54 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
55 * character in (or not in) s. In s, \ has no special meaning,
56 * and ] may only appear as the first letter. A substring
57 * a-b, with a and b in ascending ASCII order, stands for
58 * the inclusive range of ASCII characters.
59 * 5. A regular expression of form 1-4 followed by * matches a
60 * sequence of 0 or more matches of the regular expression.
61 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
62 * matches what x matches.
63 * 7. A \ followed by a digit n matches a copy of the string that the
64 * bracketed regular expression beginning with the nth \( matched.
65 * 8. A regular expression of form 1-8, x, followed by a regular
66 * expression of form 1-7, y matches a match for x followed by
67 * a match for y, with the x match being as long as possible
68 * while still permitting a y match.
69 * 9. A regular expression of form 1-8 preceded by ^ (or followed
70 * by $), is constrained to matches that begin at the left
71 * (or end at the right) end of a line.
72 * 10. A regular expression of form 1-9 picks out the longest among
73 * the leftmost matches in a line.
74 * 11. An empty regular expression stands for a copy of the last
75 * regular expression encountered.
76 */
77
78/*
79 * constants for re's
80 */
81#define CBRA 1
82#define CCHR 2
83#define CDOT 4
84#define CCL 6
85#define NCCL 8
86#define CDOL 10
87#define CEOF 11
88#define CKET 12
89#define CBACK 18
90
91#define CSTAR 01
92
93#define ESIZE 512
94#define NBRA 9
95
96static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
97static char circf;
98
99/*
100 * compile the regular expression argument into a dfa
101 */
102char *
103re_comp(sp)
104 register char *sp;
105{
106 register int c;
107 register char *ep = expbuf;
108 int cclcnt, numbra = 0;
109 char *lastep = 0;
110 char bracket[NBRA];
111 char *bracketp = &bracket[0];
112 static char *retoolong = "Regular expression too long";
113
114#define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
115
116 if (sp == 0 || *sp == '\0') {
117 if (*ep == 0)
118 return("No previous regular expression");
119 return(0);
120 }
121 if (*sp == '^') {
122 circf = 1;
123 sp++;
124 }
125 else
126 circf = 0;
127 for (;;) {
128 if (ep >= &expbuf[ESIZE])
129 comerr(retoolong);
130 if ((c = *sp++) == '\0') {
131 if (bracketp != bracket)
132 comerr("unmatched \\(");
133 *ep++ = CEOF;
134 *ep++ = 0;
135 return(0);
136 }
137 if (c != '*')
138 lastep = ep;
139 switch (c) {
140
141 case '.':
142 *ep++ = CDOT;
143 continue;
144
145 case '*':
146 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
147 goto defchar;
148 *lastep |= CSTAR;
149 continue;
150
151 case '$':
152 if (*sp != '\0')
153 goto defchar;
154 *ep++ = CDOL;
155 continue;
156
157 case '[':
158 *ep++ = CCL;
159 *ep++ = 0;
160 cclcnt = 1;
161 if ((c = *sp++) == '^') {
162 c = *sp++;
163 ep[-2] = NCCL;
164 }
165 do {
166 if (c == '\0')
167 comerr("missing ]");
168 if (c == '-' && ep [-1] != 0) {
169 if ((c = *sp++) == ']') {
170 *ep++ = '-';
171 cclcnt++;
172 break;
173 }
174 while (ep[-1] < c) {
175 *ep = ep[-1] + 1;
176 ep++;
177 cclcnt++;
178 if (ep >= &expbuf[ESIZE])
179 comerr(retoolong);
180 }
181 }
182 *ep++ = c;
183 cclcnt++;
184 if (ep >= &expbuf[ESIZE])
185 comerr(retoolong);
186 } while ((c = *sp++) != ']');
187 lastep[1] = cclcnt;
188 continue;
189
190 case '\\':
191 if ((c = *sp++) == '(') {
192 if (numbra >= NBRA)
193 comerr("too many \\(\\) pairs");
194 *bracketp++ = numbra;
195 *ep++ = CBRA;
196 *ep++ = numbra++;
197 continue;
198 }
199 if (c == ')') {
200 if (bracketp <= bracket)
201 comerr("unmatched \\)");
202 *ep++ = CKET;
203 *ep++ = *--bracketp;
204 continue;
205 }
206 if (c >= '1' && c < ('1' + NBRA)) {
207 *ep++ = CBACK;
208 *ep++ = c - '1';
209 continue;
210 }
211 *ep++ = CCHR;
212 *ep++ = c;
213 continue;
214
215 defchar:
216 default:
217 *ep++ = CCHR;
218 *ep++ = c;
219 }
220 }
221}
222
223/*
224 * match the argument string against the compiled re
225 */
226int
227re_exec(p1)
228 register char *p1;
229{
230 register char *p2 = expbuf;
231 register int c;
232 int rv;
233
234 for (c = 0; c < NBRA; c++) {
235 braslist[c] = 0;
236 braelist[c] = 0;
237 }
238 if (circf)
239 return((advance(p1, p2)));
240 /*
241 * fast check for first character
242 */
243 if (*p2 == CCHR) {
244 c = p2[1];
245 do {
246 if (*p1 != c)
247 continue;
248 if (rv = advance(p1, p2))
249 return(rv);
250 } while (*p1++);
251 return(0);
252 }
253 /*
254 * regular algorithm
255 */
256 do
257 if (rv = advance(p1, p2))
258 return(rv);
259 while (*p1++);
260 return(0);
261}
262
263/*
264 * try to match the next thing in the dfa
265 */
266static int
267advance(lp, ep)
268 register char *lp, *ep;
269{
270 register char *curlp;
271 int ct, i;
272 int rv;
273
274 for (;;)
275 switch (*ep++) {
276
277 case CCHR:
278 if (*ep++ == *lp++)
279 continue;
280 return(0);
281
282 case CDOT:
283 if (*lp++)
284 continue;
285 return(0);
286
287 case CDOL:
288 if (*lp == '\0')
289 continue;
290 return(0);
291
292 case CEOF:
293 return(1);
294
295 case CCL:
296 if (cclass(ep, *lp++, 1)) {
297 ep += *ep;
298 continue;
299 }
300 return(0);
301
302 case NCCL:
303 if (cclass(ep, *lp++, 0)) {
304 ep += *ep;
305 continue;
306 }
307 return(0);
308
309 case CBRA:
310 braslist[*ep++] = lp;
311 continue;
312
313 case CKET:
314 braelist[*ep++] = lp;
315 continue;
316
317 case CBACK:
318 if (braelist[i = *ep++] == 0)
319 return(-1);
320 if (backref(i, lp)) {
321 lp += braelist[i] - braslist[i];
322 continue;
323 }
324 return(0);
325
326 case CBACK|CSTAR:
327 if (braelist[i = *ep++] == 0)
328 return(-1);
329 curlp = lp;
330 ct = braelist[i] - braslist[i];
331 while (backref(i, lp))
332 lp += ct;
333 while (lp >= curlp) {
334 if (rv = advance(lp, ep))
335 return(rv);
336 lp -= ct;
337 }
338 continue;
339
340 case CDOT|CSTAR:
341 curlp = lp;
342 while (*lp++)
343 ;
344 goto star;
345
346 case CCHR|CSTAR:
347 curlp = lp;
348 while (*lp++ == *ep)
349 ;
350 ep++;
351 goto star;
352
353 case CCL|CSTAR:
354 case NCCL|CSTAR:
355 curlp = lp;
356 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
357 ;
358 ep += *ep;
359 goto star;
360
361 star:
362 do {
363 lp--;
364 if (rv = advance(lp, ep))
365 return(rv);
366 } while (lp > curlp);
367 return(0);
368
369 default:
370 return(-1);
371 }
372}
373
374backref(i, lp)
375 register int i;
376 register char *lp;
377{
378 register char *bp;
379
380 bp = braslist[i];
381 while (*bp++ == *lp++)
382 if (bp >= braelist[i])
383 return(1);
384 return(0);
385}
386
387int
388cclass(set, c, af)
389 register char *set, c;
390 int af;
391{
392 register int n;
393
394 if (c == 0)
395 return(0);
396 n = *set++;
397 while (--n)
398 if (*set++ == c)
399 return(af);
400 return(! af);
401}