date and time created 82/10/19 00:52:04 by mckusick
[unix-history] / usr / src / usr.bin / vgrind / regexp.c
CommitLineData
23e9541a
KM
1static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) %G%";
2
3#include <ctype.h>
4
5typedef int boolean;
6#define TRUE 1
7#define FALSE 0
8#define NIL 0
9
10boolean l_onecase; /* true if upper and lower equivalent */
11
12#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
13
14/* STRNCMP - like strncmp except that we convert the
15 * first string to lower case before comparing
16 * if l_onecase is set.
17 */
18
19STRNCMP(s1, s2, len)
20 register char *s1,*s2;
21 register int len;
22{
23 if (l_onecase) {
24 do
25 if (*s2 - makelower(*s1))
26 return (*s2 - makelower(*s1));
27 else {
28 s2++;
29 s1++;
30 }
31 while (--len);
32 } else {
33 do
34 if (*s2 - *s1)
35 return (*s2 - *s1);
36 else {
37 s2++;
38 s1++;
39 }
40 while (--len);
41 }
42 return(0);
43}
44
45/* The following routine converts an irregular expression to
46 * internal format.
47 *
48 * Either meta symbols (\a \d or \p) or character strings or
49 * operations ( alternation or perenthesizing ) can be
50 * specified. Each starts with a descriptor byte. The descriptor
51 * byte has STR set for strings, META set for meta symbols
52 * and OPER set for operations.
53 * The descriptor byte can also have the OPT bit set if the object
54 * defined is optional. Also ALT can be set to indicate an alternation.
55 *
56 * For metasymbols the byte following the descriptor byte identities
57 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
58 * strings the byte after the descriptor is a character count for
59 * the string:
60 *
61 * meta symbols := descriptor
62 * symbol
63 *
64 * strings := descriptor
65 * character count
66 * the string
67 *
68 * operatins := descriptor
69 * symbol
70 * character count
71 */
72
73/*
74 * handy macros for accessing parts of match blocks
75 */
76#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
77#define MNEXT(A) (A+2) /* character following a metasymbol block */
78
79#define OSYM(A) (*(A+1)) /* symbol in an operation block */
80#define OCNT(A) (*(A+2)) /* character count */
81#define ONEXT(A) (A+3) /* next character after the operation */
82#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
83
84#define SCNT(A) (*(A+1)) /* byte count of a string */
85#define SSTR(A) (A+2) /* address of the string */
86#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
87
88/*
89 * bit flags in the descriptor
90 */
91#define OPT 1
92#define STR 2
93#define META 4
94#define ALT 8
95#define OPER 16
96
97char *ure; /* pointer current position in unconverted exp */
98char *ccre; /* pointer to current position in converted exp*/
99char *malloc();
100
101char *
102convexp(re)
103 char *re; /* unconverted irregular expression */
104{
105 register char *cre; /* pointer to converted regular expression */
106
107 /* allocate room for the converted expression */
108 if (re == NIL)
109 return (NIL);
110 if (*re == '\0')
111 return (NIL);
112 cre = malloc (4 * strlen(re) + 3);
113 ccre = cre;
114 ure = re;
115
116 /* start the conversion with a \a */
117 *cre = META | OPT;
118 MSYM(cre) = 'a';
119 ccre = MNEXT(cre);
120
121 /* start the conversion (its recursive) */
122 expconv ();
123 *ccre = 0;
124 return (cre);
125}
126
127expconv()
128{
129 register char *cs; /* pointer to current symbol in converted exp */
130 register char c; /* character being processed */
131 register char *acs; /* pinter to last alternate */
132 register int temp;
133
134 /* let the conversion begin */
135 acs = NIL;
136 while (*ure != NIL) {
137 switch (c = *ure++) {
138
139 case '\\':
140 switch (c = *ure++) {
141
142 /* escaped characters are just characters */
143 default:
144 if ((*cs & STR) == 0) {
145 cs = ccre;
146 *cs = STR;
147 SCNT(cs) = 1;
148 ccre += 2;
149 } else
150 SCNT(cs)++;
151 *ccre++ = c;
152 break;
153
154 /* normal(?) metacharacters */
155 case 'a':
156 case 'd':
157 case 'e':
158 case 'p':
159 if (acs != NIL && acs != cs) {
160 do {
161 temp = OCNT(acs);
162 OCNT(acs) = ccre - acs;
163 acs -= temp;
164 } while (temp != 0);
165 acs = NIL;
166 }
167 cs = ccre;
168 *cs = META;
169 MSYM(cs) = c;
170 ccre = MNEXT(cs);
171 break;
172 }
173 break;
174
175 /* just put the symbol in */
176 case '^':
177 case '$':
178 if (acs != NIL && acs != cs) {
179 do {
180 temp = OCNT(acs);
181 OCNT(acs) = ccre - acs;
182 acs -= temp;
183 } while (temp != 0);
184 acs = NIL;
185 }
186 cs = ccre;
187 *cs = META;
188 MSYM(cs) = c;
189 ccre = MNEXT(cs);
190 break;
191
192 /* mark the last match sequence as optional */
193 case '?':
194 *cs = *cs | OPT;
195 break;
196
197 /* recurse and define a subexpression */
198 case '(':
199 if (acs != NIL && acs != cs) {
200 do {
201 temp = OCNT(acs);
202 OCNT(acs) = ccre - acs;
203 acs -= temp;
204 } while (temp != 0);
205 acs = NIL;
206 }
207 cs = ccre;
208 *cs = OPER;
209 OSYM(cs) = '(';
210 ccre = ONEXT(cs);
211 expconv ();
212 OCNT(cs) = ccre - cs; /* offset to next symbol */
213 break;
214
215 /* return from a recursion */
216 case ')':
217 if (acs != NIL) {
218 do {
219 temp = OCNT(acs);
220 OCNT(acs) = ccre - acs;
221 acs -= temp;
222 } while (temp != 0);
223 acs = NIL;
224 }
225 cs = ccre;
226 *cs = META;
227 MSYM(cs) = c;
228 ccre = MNEXT(cs);
229 return;
230
231 /* mark the last match sequence as having an alternate */
232 /* the third byte will contain an offset to jump over the */
233 /* alternate match in case the first did not fail */
234 case '|':
235 if (acs != NIL && acs != cs)
236 OCNT(ccre) = ccre - acs; /* make a back pointer */
237 else
238 OCNT(ccre) = 0;
239 *cs |= ALT;
240 cs = ccre;
241 *cs = OPER;
242 OSYM(cs) = '|';
243 ccre = ONEXT(cs);
244 acs = cs; /* remember that the pointer is to be filles */
245 break;
246
247 /* if its not a metasymbol just build a scharacter string */
248 default:
249 if ((*cs & STR) == 0) {
250 cs = ccre;
251 *cs = STR;
252 SCNT(cs) = 1;
253 ccre = SSTR(cs);
254 } else
255 SCNT(cs)++;
256 *ccre++ = c;
257 break;
258 }
259 }
260 if (acs != NIL) {
261 do {
262 temp = OCNT(acs);
263 OCNT(acs) = ccre - acs;
264 acs -= temp;
265 } while (temp != 0);
266 acs = NIL;
267 }
268 return;
269}
270/* end of convertre */
271
272
273/*
274 * The following routine recognises an irregular expresion
275 * with the following special characters:
276 *
277 * \? - means last match was optional
278 * \a - matches any number of characters
279 * \d - matches any number of spaces and tabs
280 * \p - matches any number of alphanumeric
281 * characters. The
282 * characters matched will be copied into
283 * the area pointed to by 'name'.
284 * \| - alternation
285 * \( \) - grouping used mostly for alternation and
286 * optionality
287 *
288 * The irregular expression must be translated to internal form
289 * prior to calling this routine
290 *
291 * The value returned is the pointer to the first non \a
292 * character matched.
293 */
294
295boolean _escaped; /* true if we are currently _escaped */
296char *_start; /* start of string */
297
298char *
299expmatch (s, re, mstring)
300 register char *s; /* string to check for a match in */
301 register char *re; /* a converted irregular expression */
302 register char *mstring; /* where to put whatever matches a \p */
303{
304 register char *cs; /* the current symbol */
305 register char *ptr,*s1; /* temporary pointer */
306 boolean matched; /* a temporary boolean */
307
308 /* initial conditions */
309 if (re == NIL)
310 return (NIL);
311 cs = re;
312 matched = FALSE;
313
314 /* loop till expression string is exhausted (or at least pretty tired) */
315 while (*cs) {
316 switch (*cs & (OPER | STR | META)) {
317
318 /* try to match a string */
319 case STR:
320 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
321 if (matched) {
322
323 /* hoorah it matches */
324 s += SCNT(cs);
325 cs = SNEXT(cs);
326 } else if (*cs & ALT) {
327
328 /* alternation, skip to next expression */
329 cs = SNEXT(cs);
330 } else if (*cs & OPT) {
331
332 /* the match is optional */
333 cs = SNEXT(cs);
334 matched = 1; /* indicate a successful match */
335 } else {
336
337 /* no match, error return */
338 return (NIL);
339 }
340 break;
341
342 /* an operator, do something fancy */
343 case OPER:
344 switch (OSYM(cs)) {
345
346 /* this is an alternation */
347 case '|':
348 if (matched)
349
350 /* last thing in the alternation was a match, skip ahead */
351 cs = OPTR(cs);
352 else
353
354 /* no match, keep trying */
355 cs = ONEXT(cs);
356 break;
357
358 /* this is a grouping, recurse */
359 case '(':
360 ptr = expmatch (s, ONEXT(cs), mstring);
361 if (ptr != NIL) {
362
363 /* the subexpression matched */
364 matched = 1;
365 s = ptr;
366 } else if (*cs & ALT) {
367
368 /* alternation, skip to next expression */
369 matched = 0;
370 } else if (*cs & OPT) {
371
372 /* the match is optional */
373 matched = 1; /* indicate a successful match */
374 } else {
375
376 /* no match, error return */
377 return (NIL);
378 }
379 cs = OPTR(cs);
380 break;
381 }
382 break;
383
384 /* try to match a metasymbol */
385 case META:
386 switch (MSYM(cs)) {
387
388 /* try to match anything and remember what was matched */
389 case 'p':
390 /*
391 * This is really the same as trying the match the
392 * remaining parts of the expression to any subset
393 * of the string.
394 */
395 s1 = s;
396 do {
397 ptr = expmatch (s1, MNEXT(cs), mstring);
398 if (ptr != NIL && s1 != s) {
399
400 /* we have a match, remember the match */
401 strncpy (mstring, s, s1 - s);
402 mstring[s1 - s] = '\0';
403 return (ptr);
404 } else if (ptr != NIL && (*cs & OPT)) {
405
406 /* it was aoptional so no match is ok */
407 return (ptr);
408 } else if (ptr != NIL) {
409
410 /* not optional and we still matched */
411 return (NIL);
412 }
413 if (!isalnum(*s1) && *s1 != '_')
414 return (NIL);
415 if (*s1 == '\\')
416 _escaped = _escaped ? FALSE : TRUE;
417 else
418 _escaped = FALSE;
419 } while (*s1++);
420 return (NIL);
421
422 /* try to match anything */
423 case 'a':
424 /*
425 * This is really the same as trying the match the
426 * remaining parts of the expression to any subset
427 * of the string.
428 */
429 s1 = s;
430 do {
431 ptr = expmatch (s1, MNEXT(cs), mstring);
432 if (ptr != NIL && s1 != s) {
433
434 /* we have a match */
435 return (ptr);
436 } else if (ptr != NIL && (*cs & OPT)) {
437
438 /* it was aoptional so no match is ok */
439 return (ptr);
440 } else if (ptr != NIL) {
441
442 /* not optional and we still matched */
443 return (NIL);
444 }
445 if (*s1 == '\\')
446 _escaped = _escaped ? FALSE : TRUE;
447 else
448 _escaped = FALSE;
449 } while (*s1++);
450 return (NIL);
451
452 /* fail if we are currently _escaped */
453 case 'e':
454 if (_escaped)
455 return(NIL);
456 cs = MNEXT(cs);
457 break;
458
459 /* match any number of tabs and spaces */
460 case 'd':
461 ptr = s;
462 while (*s == ' ' || *s == '\t')
463 s++;
464 if (s != ptr || s == _start) {
465
466 /* match, be happy */
467 matched = 1;
468 cs = MNEXT(cs);
469 } else if (*s == '\n' || *s == '\0') {
470
471 /* match, be happy */
472 matched = 1;
473 cs = MNEXT(cs);
474 } else if (*cs & ALT) {
475
476 /* try the next part */
477 matched = 0;
478 cs = MNEXT(cs);
479 } else if (*cs & OPT) {
480
481 /* doesn't matter */
482 matched = 1;
483 cs = MNEXT(cs);
484 } else
485
486 /* no match, error return */
487 return (NIL);
488 break;
489
490 /* check for end of line */
491 case '$':
492 if (*s == '\0' || *s == '\n') {
493
494 /* match, be happy */
495 s++;
496 matched = 1;
497 cs = MNEXT(cs);
498 } else if (*cs & ALT) {
499
500 /* try the next part */
501 matched = 0;
502 cs = MNEXT(cs);
503 } else if (*cs & OPT) {
504
505 /* doesn't matter */
506 matched = 1;
507 cs = MNEXT(cs);
508 } else
509
510 /* no match, error return */
511 return (NIL);
512 break;
513
514 /* check for start of line */
515 case '^':
516 if (s == _start) {
517
518 /* match, be happy */
519 matched = 1;
520 cs = MNEXT(cs);
521 } else if (*cs & ALT) {
522
523 /* try the next part */
524 matched = 0;
525 cs = MNEXT(cs);
526 } else if (*cs & OPT) {
527
528 /* doesn't matter */
529 matched = 1;
530 cs = MNEXT(cs);
531 } else
532
533 /* no match, error return */
534 return (NIL);
535 break;
536
537 /* end of a subexpression, return success */
538 case ')':
539 return (s);
540 }
541 break;
542 }
543 }
544 return (s);
545}