BSD 4_3_Net_2 release
[unix-history] / usr / src / usr.bin / vgrind / regexp.c
CommitLineData
76797561 1/*
616af90a
KB
2 * Copyright (c) 1980 The Regents of the University of California.
3 * All rights reserved.
4 *
af359dea
C
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
76797561
DF
32 */
33
34#ifndef lint
1c15e888 35static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90";
616af90a 36#endif /* not lint */
23e9541a
KM
37
38#include <ctype.h>
39
40typedef int boolean;
41#define TRUE 1
42#define FALSE 0
43#define NIL 0
44
45boolean l_onecase; /* true if upper and lower equivalent */
46
47#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
48
49/* STRNCMP - like strncmp except that we convert the
50 * first string to lower case before comparing
51 * if l_onecase is set.
52 */
53
54STRNCMP(s1, s2, len)
55 register char *s1,*s2;
56 register int len;
57{
58 if (l_onecase) {
59 do
60 if (*s2 - makelower(*s1))
61 return (*s2 - makelower(*s1));
62 else {
63 s2++;
64 s1++;
65 }
66 while (--len);
67 } else {
68 do
69 if (*s2 - *s1)
70 return (*s2 - *s1);
71 else {
72 s2++;
73 s1++;
74 }
75 while (--len);
76 }
77 return(0);
78}
79
80/* The following routine converts an irregular expression to
81 * internal format.
82 *
83 * Either meta symbols (\a \d or \p) or character strings or
84 * operations ( alternation or perenthesizing ) can be
85 * specified. Each starts with a descriptor byte. The descriptor
86 * byte has STR set for strings, META set for meta symbols
87 * and OPER set for operations.
88 * The descriptor byte can also have the OPT bit set if the object
89 * defined is optional. Also ALT can be set to indicate an alternation.
90 *
91 * For metasymbols the byte following the descriptor byte identities
92 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
93 * strings the byte after the descriptor is a character count for
94 * the string:
95 *
96 * meta symbols := descriptor
97 * symbol
98 *
99 * strings := descriptor
100 * character count
101 * the string
102 *
103 * operatins := descriptor
104 * symbol
105 * character count
106 */
107
108/*
109 * handy macros for accessing parts of match blocks
110 */
111#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
112#define MNEXT(A) (A+2) /* character following a metasymbol block */
113
114#define OSYM(A) (*(A+1)) /* symbol in an operation block */
115#define OCNT(A) (*(A+2)) /* character count */
116#define ONEXT(A) (A+3) /* next character after the operation */
117#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
118
119#define SCNT(A) (*(A+1)) /* byte count of a string */
120#define SSTR(A) (A+2) /* address of the string */
121#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
122
123/*
124 * bit flags in the descriptor
125 */
126#define OPT 1
127#define STR 2
128#define META 4
129#define ALT 8
130#define OPER 16
131
132char *ure; /* pointer current position in unconverted exp */
133char *ccre; /* pointer to current position in converted exp*/
134char *malloc();
135
136char *
137convexp(re)
138 char *re; /* unconverted irregular expression */
139{
140 register char *cre; /* pointer to converted regular expression */
141
142 /* allocate room for the converted expression */
143 if (re == NIL)
144 return (NIL);
145 if (*re == '\0')
146 return (NIL);
147 cre = malloc (4 * strlen(re) + 3);
148 ccre = cre;
149 ure = re;
150
151 /* start the conversion with a \a */
152 *cre = META | OPT;
153 MSYM(cre) = 'a';
154 ccre = MNEXT(cre);
155
156 /* start the conversion (its recursive) */
157 expconv ();
158 *ccre = 0;
159 return (cre);
160}
161
162expconv()
163{
164 register char *cs; /* pointer to current symbol in converted exp */
165 register char c; /* character being processed */
166 register char *acs; /* pinter to last alternate */
167 register int temp;
168
169 /* let the conversion begin */
170 acs = NIL;
1a8c1b81 171 cs = NIL;
23e9541a
KM
172 while (*ure != NIL) {
173 switch (c = *ure++) {
174
175 case '\\':
176 switch (c = *ure++) {
177
178 /* escaped characters are just characters */
179 default:
1a8c1b81 180 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
181 cs = ccre;
182 *cs = STR;
183 SCNT(cs) = 1;
184 ccre += 2;
185 } else
186 SCNT(cs)++;
187 *ccre++ = c;
188 break;
189
190 /* normal(?) metacharacters */
191 case 'a':
192 case 'd':
193 case 'e':
194 case 'p':
195 if (acs != NIL && acs != cs) {
196 do {
197 temp = OCNT(acs);
198 OCNT(acs) = ccre - acs;
199 acs -= temp;
200 } while (temp != 0);
201 acs = NIL;
202 }
203 cs = ccre;
204 *cs = META;
205 MSYM(cs) = c;
206 ccre = MNEXT(cs);
207 break;
208 }
209 break;
210
211 /* just put the symbol in */
212 case '^':
213 case '$':
214 if (acs != NIL && acs != cs) {
215 do {
216 temp = OCNT(acs);
217 OCNT(acs) = ccre - acs;
218 acs -= temp;
219 } while (temp != 0);
220 acs = NIL;
221 }
222 cs = ccre;
223 *cs = META;
224 MSYM(cs) = c;
225 ccre = MNEXT(cs);
226 break;
227
228 /* mark the last match sequence as optional */
229 case '?':
1a8c1b81
RC
230 if (cs)
231 *cs = *cs | OPT;
23e9541a
KM
232 break;
233
234 /* recurse and define a subexpression */
235 case '(':
236 if (acs != NIL && acs != cs) {
237 do {
238 temp = OCNT(acs);
239 OCNT(acs) = ccre - acs;
240 acs -= temp;
241 } while (temp != 0);
242 acs = NIL;
243 }
244 cs = ccre;
245 *cs = OPER;
246 OSYM(cs) = '(';
247 ccre = ONEXT(cs);
248 expconv ();
249 OCNT(cs) = ccre - cs; /* offset to next symbol */
250 break;
251
252 /* return from a recursion */
253 case ')':
254 if (acs != NIL) {
255 do {
256 temp = OCNT(acs);
257 OCNT(acs) = ccre - acs;
258 acs -= temp;
259 } while (temp != 0);
260 acs = NIL;
261 }
262 cs = ccre;
263 *cs = META;
264 MSYM(cs) = c;
265 ccre = MNEXT(cs);
266 return;
267
268 /* mark the last match sequence as having an alternate */
269 /* the third byte will contain an offset to jump over the */
270 /* alternate match in case the first did not fail */
271 case '|':
272 if (acs != NIL && acs != cs)
273 OCNT(ccre) = ccre - acs; /* make a back pointer */
274 else
275 OCNT(ccre) = 0;
276 *cs |= ALT;
277 cs = ccre;
278 *cs = OPER;
279 OSYM(cs) = '|';
280 ccre = ONEXT(cs);
281 acs = cs; /* remember that the pointer is to be filles */
282 break;
283
284 /* if its not a metasymbol just build a scharacter string */
285 default:
1a8c1b81 286 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
287 cs = ccre;
288 *cs = STR;
289 SCNT(cs) = 1;
290 ccre = SSTR(cs);
291 } else
292 SCNT(cs)++;
293 *ccre++ = c;
294 break;
295 }
296 }
297 if (acs != NIL) {
298 do {
299 temp = OCNT(acs);
300 OCNT(acs) = ccre - acs;
301 acs -= temp;
302 } while (temp != 0);
303 acs = NIL;
304 }
305 return;
306}
307/* end of convertre */
308
309
310/*
311 * The following routine recognises an irregular expresion
312 * with the following special characters:
313 *
314 * \? - means last match was optional
315 * \a - matches any number of characters
316 * \d - matches any number of spaces and tabs
317 * \p - matches any number of alphanumeric
318 * characters. The
319 * characters matched will be copied into
320 * the area pointed to by 'name'.
321 * \| - alternation
322 * \( \) - grouping used mostly for alternation and
323 * optionality
324 *
325 * The irregular expression must be translated to internal form
326 * prior to calling this routine
327 *
328 * The value returned is the pointer to the first non \a
329 * character matched.
330 */
331
332boolean _escaped; /* true if we are currently _escaped */
333char *_start; /* start of string */
334
335char *
336expmatch (s, re, mstring)
337 register char *s; /* string to check for a match in */
338 register char *re; /* a converted irregular expression */
339 register char *mstring; /* where to put whatever matches a \p */
340{
341 register char *cs; /* the current symbol */
342 register char *ptr,*s1; /* temporary pointer */
343 boolean matched; /* a temporary boolean */
344
345 /* initial conditions */
346 if (re == NIL)
347 return (NIL);
348 cs = re;
349 matched = FALSE;
350
351 /* loop till expression string is exhausted (or at least pretty tired) */
352 while (*cs) {
353 switch (*cs & (OPER | STR | META)) {
354
355 /* try to match a string */
356 case STR:
357 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
358 if (matched) {
359
360 /* hoorah it matches */
361 s += SCNT(cs);
362 cs = SNEXT(cs);
363 } else if (*cs & ALT) {
364
365 /* alternation, skip to next expression */
366 cs = SNEXT(cs);
367 } else if (*cs & OPT) {
368
369 /* the match is optional */
370 cs = SNEXT(cs);
371 matched = 1; /* indicate a successful match */
372 } else {
373
374 /* no match, error return */
375 return (NIL);
376 }
377 break;
378
379 /* an operator, do something fancy */
380 case OPER:
381 switch (OSYM(cs)) {
382
383 /* this is an alternation */
384 case '|':
385 if (matched)
386
387 /* last thing in the alternation was a match, skip ahead */
388 cs = OPTR(cs);
389 else
390
391 /* no match, keep trying */
392 cs = ONEXT(cs);
393 break;
394
395 /* this is a grouping, recurse */
396 case '(':
397 ptr = expmatch (s, ONEXT(cs), mstring);
398 if (ptr != NIL) {
399
400 /* the subexpression matched */
401 matched = 1;
402 s = ptr;
403 } else if (*cs & ALT) {
404
405 /* alternation, skip to next expression */
406 matched = 0;
407 } else if (*cs & OPT) {
408
409 /* the match is optional */
410 matched = 1; /* indicate a successful match */
411 } else {
412
413 /* no match, error return */
414 return (NIL);
415 }
416 cs = OPTR(cs);
417 break;
418 }
419 break;
420
421 /* try to match a metasymbol */
422 case META:
423 switch (MSYM(cs)) {
424
425 /* try to match anything and remember what was matched */
426 case 'p':
427 /*
428 * This is really the same as trying the match the
429 * remaining parts of the expression to any subset
430 * of the string.
431 */
432 s1 = s;
433 do {
434 ptr = expmatch (s1, MNEXT(cs), mstring);
435 if (ptr != NIL && s1 != s) {
436
437 /* we have a match, remember the match */
438 strncpy (mstring, s, s1 - s);
439 mstring[s1 - s] = '\0';
440 return (ptr);
441 } else if (ptr != NIL && (*cs & OPT)) {
442
443 /* it was aoptional so no match is ok */
444 return (ptr);
445 } else if (ptr != NIL) {
446
447 /* not optional and we still matched */
448 return (NIL);
449 }
450 if (!isalnum(*s1) && *s1 != '_')
451 return (NIL);
452 if (*s1 == '\\')
453 _escaped = _escaped ? FALSE : TRUE;
454 else
455 _escaped = FALSE;
456 } while (*s1++);
457 return (NIL);
458
459 /* try to match anything */
460 case 'a':
461 /*
462 * This is really the same as trying the match the
463 * remaining parts of the expression to any subset
464 * of the string.
465 */
466 s1 = s;
467 do {
468 ptr = expmatch (s1, MNEXT(cs), mstring);
469 if (ptr != NIL && s1 != s) {
470
471 /* we have a match */
472 return (ptr);
473 } else if (ptr != NIL && (*cs & OPT)) {
474
475 /* it was aoptional so no match is ok */
476 return (ptr);
477 } else if (ptr != NIL) {
478
479 /* not optional and we still matched */
480 return (NIL);
481 }
482 if (*s1 == '\\')
483 _escaped = _escaped ? FALSE : TRUE;
484 else
485 _escaped = FALSE;
486 } while (*s1++);
487 return (NIL);
488
489 /* fail if we are currently _escaped */
490 case 'e':
491 if (_escaped)
492 return(NIL);
493 cs = MNEXT(cs);
494 break;
495
496 /* match any number of tabs and spaces */
497 case 'd':
498 ptr = s;
499 while (*s == ' ' || *s == '\t')
500 s++;
501 if (s != ptr || s == _start) {
502
503 /* match, be happy */
504 matched = 1;
505 cs = MNEXT(cs);
506 } else if (*s == '\n' || *s == '\0') {
507
508 /* match, be happy */
509 matched = 1;
510 cs = MNEXT(cs);
511 } else if (*cs & ALT) {
512
513 /* try the next part */
514 matched = 0;
515 cs = MNEXT(cs);
516 } else if (*cs & OPT) {
517
518 /* doesn't matter */
519 matched = 1;
520 cs = MNEXT(cs);
521 } else
522
523 /* no match, error return */
524 return (NIL);
525 break;
526
527 /* check for end of line */
528 case '$':
529 if (*s == '\0' || *s == '\n') {
530
531 /* match, be happy */
532 s++;
533 matched = 1;
534 cs = MNEXT(cs);
535 } else if (*cs & ALT) {
536
537 /* try the next part */
538 matched = 0;
539 cs = MNEXT(cs);
540 } else if (*cs & OPT) {
541
542 /* doesn't matter */
543 matched = 1;
544 cs = MNEXT(cs);
545 } else
546
547 /* no match, error return */
548 return (NIL);
549 break;
550
551 /* check for start of line */
552 case '^':
553 if (s == _start) {
554
555 /* match, be happy */
556 matched = 1;
557 cs = MNEXT(cs);
558 } else if (*cs & ALT) {
559
560 /* try the next part */
561 matched = 0;
562 cs = MNEXT(cs);
563 } else if (*cs & OPT) {
564
565 /* doesn't matter */
566 matched = 1;
567 cs = MNEXT(cs);
568 } else
569
570 /* no match, error return */
571 return (NIL);
572 break;
573
574 /* end of a subexpression, return success */
575 case ')':
576 return (s);
577 }
578 break;
579 }
580 }
581 return (s);
582}