sign-extension corrections (from donn@utah-cs)
[unix-history] / usr / src / usr.bin / vgrind / regexp.c
CommitLineData
1a8c1b81 1static char sccsid[] = "@(#)regexp.c 4.2 (Berkeley) %G%";
23e9541a
KM
2
3#include <ctype.h>
4
5typedef int boolean;
6#define TRUE 1
7#define FALSE 0
8#define NIL 0
9
10boolean l_onecase; /* true if upper and lower equivalent */
11
12#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
13
14/* STRNCMP - like strncmp except that we convert the
15 * first string to lower case before comparing
16 * if l_onecase is set.
17 */
18
19STRNCMP(s1, s2, len)
20 register char *s1,*s2;
21 register int len;
22{
23 if (l_onecase) {
24 do
25 if (*s2 - makelower(*s1))
26 return (*s2 - makelower(*s1));
27 else {
28 s2++;
29 s1++;
30 }
31 while (--len);
32 } else {
33 do
34 if (*s2 - *s1)
35 return (*s2 - *s1);
36 else {
37 s2++;
38 s1++;
39 }
40 while (--len);
41 }
42 return(0);
43}
44
45/* The following routine converts an irregular expression to
46 * internal format.
47 *
48 * Either meta symbols (\a \d or \p) or character strings or
49 * operations ( alternation or perenthesizing ) can be
50 * specified. Each starts with a descriptor byte. The descriptor
51 * byte has STR set for strings, META set for meta symbols
52 * and OPER set for operations.
53 * The descriptor byte can also have the OPT bit set if the object
54 * defined is optional. Also ALT can be set to indicate an alternation.
55 *
56 * For metasymbols the byte following the descriptor byte identities
57 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
58 * strings the byte after the descriptor is a character count for
59 * the string:
60 *
61 * meta symbols := descriptor
62 * symbol
63 *
64 * strings := descriptor
65 * character count
66 * the string
67 *
68 * operatins := descriptor
69 * symbol
70 * character count
71 */
72
73/*
74 * handy macros for accessing parts of match blocks
75 */
76#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
77#define MNEXT(A) (A+2) /* character following a metasymbol block */
78
79#define OSYM(A) (*(A+1)) /* symbol in an operation block */
80#define OCNT(A) (*(A+2)) /* character count */
81#define ONEXT(A) (A+3) /* next character after the operation */
82#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
83
84#define SCNT(A) (*(A+1)) /* byte count of a string */
85#define SSTR(A) (A+2) /* address of the string */
86#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
87
88/*
89 * bit flags in the descriptor
90 */
91#define OPT 1
92#define STR 2
93#define META 4
94#define ALT 8
95#define OPER 16
96
97char *ure; /* pointer current position in unconverted exp */
98char *ccre; /* pointer to current position in converted exp*/
99char *malloc();
100
101char *
102convexp(re)
103 char *re; /* unconverted irregular expression */
104{
105 register char *cre; /* pointer to converted regular expression */
106
107 /* allocate room for the converted expression */
108 if (re == NIL)
109 return (NIL);
110 if (*re == '\0')
111 return (NIL);
112 cre = malloc (4 * strlen(re) + 3);
113 ccre = cre;
114 ure = re;
115
116 /* start the conversion with a \a */
117 *cre = META | OPT;
118 MSYM(cre) = 'a';
119 ccre = MNEXT(cre);
120
121 /* start the conversion (its recursive) */
122 expconv ();
123 *ccre = 0;
124 return (cre);
125}
126
127expconv()
128{
129 register char *cs; /* pointer to current symbol in converted exp */
130 register char c; /* character being processed */
131 register char *acs; /* pinter to last alternate */
132 register int temp;
133
134 /* let the conversion begin */
135 acs = NIL;
1a8c1b81 136 cs = NIL;
23e9541a
KM
137 while (*ure != NIL) {
138 switch (c = *ure++) {
139
140 case '\\':
141 switch (c = *ure++) {
142
143 /* escaped characters are just characters */
144 default:
1a8c1b81 145 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
146 cs = ccre;
147 *cs = STR;
148 SCNT(cs) = 1;
149 ccre += 2;
150 } else
151 SCNT(cs)++;
152 *ccre++ = c;
153 break;
154
155 /* normal(?) metacharacters */
156 case 'a':
157 case 'd':
158 case 'e':
159 case 'p':
160 if (acs != NIL && acs != cs) {
161 do {
162 temp = OCNT(acs);
163 OCNT(acs) = ccre - acs;
164 acs -= temp;
165 } while (temp != 0);
166 acs = NIL;
167 }
168 cs = ccre;
169 *cs = META;
170 MSYM(cs) = c;
171 ccre = MNEXT(cs);
172 break;
173 }
174 break;
175
176 /* just put the symbol in */
177 case '^':
178 case '$':
179 if (acs != NIL && acs != cs) {
180 do {
181 temp = OCNT(acs);
182 OCNT(acs) = ccre - acs;
183 acs -= temp;
184 } while (temp != 0);
185 acs = NIL;
186 }
187 cs = ccre;
188 *cs = META;
189 MSYM(cs) = c;
190 ccre = MNEXT(cs);
191 break;
192
193 /* mark the last match sequence as optional */
194 case '?':
1a8c1b81
RC
195 if (cs)
196 *cs = *cs | OPT;
23e9541a
KM
197 break;
198
199 /* recurse and define a subexpression */
200 case '(':
201 if (acs != NIL && acs != cs) {
202 do {
203 temp = OCNT(acs);
204 OCNT(acs) = ccre - acs;
205 acs -= temp;
206 } while (temp != 0);
207 acs = NIL;
208 }
209 cs = ccre;
210 *cs = OPER;
211 OSYM(cs) = '(';
212 ccre = ONEXT(cs);
213 expconv ();
214 OCNT(cs) = ccre - cs; /* offset to next symbol */
215 break;
216
217 /* return from a recursion */
218 case ')':
219 if (acs != NIL) {
220 do {
221 temp = OCNT(acs);
222 OCNT(acs) = ccre - acs;
223 acs -= temp;
224 } while (temp != 0);
225 acs = NIL;
226 }
227 cs = ccre;
228 *cs = META;
229 MSYM(cs) = c;
230 ccre = MNEXT(cs);
231 return;
232
233 /* mark the last match sequence as having an alternate */
234 /* the third byte will contain an offset to jump over the */
235 /* alternate match in case the first did not fail */
236 case '|':
237 if (acs != NIL && acs != cs)
238 OCNT(ccre) = ccre - acs; /* make a back pointer */
239 else
240 OCNT(ccre) = 0;
241 *cs |= ALT;
242 cs = ccre;
243 *cs = OPER;
244 OSYM(cs) = '|';
245 ccre = ONEXT(cs);
246 acs = cs; /* remember that the pointer is to be filles */
247 break;
248
249 /* if its not a metasymbol just build a scharacter string */
250 default:
1a8c1b81 251 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
252 cs = ccre;
253 *cs = STR;
254 SCNT(cs) = 1;
255 ccre = SSTR(cs);
256 } else
257 SCNT(cs)++;
258 *ccre++ = c;
259 break;
260 }
261 }
262 if (acs != NIL) {
263 do {
264 temp = OCNT(acs);
265 OCNT(acs) = ccre - acs;
266 acs -= temp;
267 } while (temp != 0);
268 acs = NIL;
269 }
270 return;
271}
272/* end of convertre */
273
274
275/*
276 * The following routine recognises an irregular expresion
277 * with the following special characters:
278 *
279 * \? - means last match was optional
280 * \a - matches any number of characters
281 * \d - matches any number of spaces and tabs
282 * \p - matches any number of alphanumeric
283 * characters. The
284 * characters matched will be copied into
285 * the area pointed to by 'name'.
286 * \| - alternation
287 * \( \) - grouping used mostly for alternation and
288 * optionality
289 *
290 * The irregular expression must be translated to internal form
291 * prior to calling this routine
292 *
293 * The value returned is the pointer to the first non \a
294 * character matched.
295 */
296
297boolean _escaped; /* true if we are currently _escaped */
298char *_start; /* start of string */
299
300char *
301expmatch (s, re, mstring)
302 register char *s; /* string to check for a match in */
303 register char *re; /* a converted irregular expression */
304 register char *mstring; /* where to put whatever matches a \p */
305{
306 register char *cs; /* the current symbol */
307 register char *ptr,*s1; /* temporary pointer */
308 boolean matched; /* a temporary boolean */
309
310 /* initial conditions */
311 if (re == NIL)
312 return (NIL);
313 cs = re;
314 matched = FALSE;
315
316 /* loop till expression string is exhausted (or at least pretty tired) */
317 while (*cs) {
318 switch (*cs & (OPER | STR | META)) {
319
320 /* try to match a string */
321 case STR:
322 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
323 if (matched) {
324
325 /* hoorah it matches */
326 s += SCNT(cs);
327 cs = SNEXT(cs);
328 } else if (*cs & ALT) {
329
330 /* alternation, skip to next expression */
331 cs = SNEXT(cs);
332 } else if (*cs & OPT) {
333
334 /* the match is optional */
335 cs = SNEXT(cs);
336 matched = 1; /* indicate a successful match */
337 } else {
338
339 /* no match, error return */
340 return (NIL);
341 }
342 break;
343
344 /* an operator, do something fancy */
345 case OPER:
346 switch (OSYM(cs)) {
347
348 /* this is an alternation */
349 case '|':
350 if (matched)
351
352 /* last thing in the alternation was a match, skip ahead */
353 cs = OPTR(cs);
354 else
355
356 /* no match, keep trying */
357 cs = ONEXT(cs);
358 break;
359
360 /* this is a grouping, recurse */
361 case '(':
362 ptr = expmatch (s, ONEXT(cs), mstring);
363 if (ptr != NIL) {
364
365 /* the subexpression matched */
366 matched = 1;
367 s = ptr;
368 } else if (*cs & ALT) {
369
370 /* alternation, skip to next expression */
371 matched = 0;
372 } else if (*cs & OPT) {
373
374 /* the match is optional */
375 matched = 1; /* indicate a successful match */
376 } else {
377
378 /* no match, error return */
379 return (NIL);
380 }
381 cs = OPTR(cs);
382 break;
383 }
384 break;
385
386 /* try to match a metasymbol */
387 case META:
388 switch (MSYM(cs)) {
389
390 /* try to match anything and remember what was matched */
391 case 'p':
392 /*
393 * This is really the same as trying the match the
394 * remaining parts of the expression to any subset
395 * of the string.
396 */
397 s1 = s;
398 do {
399 ptr = expmatch (s1, MNEXT(cs), mstring);
400 if (ptr != NIL && s1 != s) {
401
402 /* we have a match, remember the match */
403 strncpy (mstring, s, s1 - s);
404 mstring[s1 - s] = '\0';
405 return (ptr);
406 } else if (ptr != NIL && (*cs & OPT)) {
407
408 /* it was aoptional so no match is ok */
409 return (ptr);
410 } else if (ptr != NIL) {
411
412 /* not optional and we still matched */
413 return (NIL);
414 }
415 if (!isalnum(*s1) && *s1 != '_')
416 return (NIL);
417 if (*s1 == '\\')
418 _escaped = _escaped ? FALSE : TRUE;
419 else
420 _escaped = FALSE;
421 } while (*s1++);
422 return (NIL);
423
424 /* try to match anything */
425 case 'a':
426 /*
427 * This is really the same as trying the match the
428 * remaining parts of the expression to any subset
429 * of the string.
430 */
431 s1 = s;
432 do {
433 ptr = expmatch (s1, MNEXT(cs), mstring);
434 if (ptr != NIL && s1 != s) {
435
436 /* we have a match */
437 return (ptr);
438 } else if (ptr != NIL && (*cs & OPT)) {
439
440 /* it was aoptional so no match is ok */
441 return (ptr);
442 } else if (ptr != NIL) {
443
444 /* not optional and we still matched */
445 return (NIL);
446 }
447 if (*s1 == '\\')
448 _escaped = _escaped ? FALSE : TRUE;
449 else
450 _escaped = FALSE;
451 } while (*s1++);
452 return (NIL);
453
454 /* fail if we are currently _escaped */
455 case 'e':
456 if (_escaped)
457 return(NIL);
458 cs = MNEXT(cs);
459 break;
460
461 /* match any number of tabs and spaces */
462 case 'd':
463 ptr = s;
464 while (*s == ' ' || *s == '\t')
465 s++;
466 if (s != ptr || s == _start) {
467
468 /* match, be happy */
469 matched = 1;
470 cs = MNEXT(cs);
471 } else if (*s == '\n' || *s == '\0') {
472
473 /* match, be happy */
474 matched = 1;
475 cs = MNEXT(cs);
476 } else if (*cs & ALT) {
477
478 /* try the next part */
479 matched = 0;
480 cs = MNEXT(cs);
481 } else if (*cs & OPT) {
482
483 /* doesn't matter */
484 matched = 1;
485 cs = MNEXT(cs);
486 } else
487
488 /* no match, error return */
489 return (NIL);
490 break;
491
492 /* check for end of line */
493 case '$':
494 if (*s == '\0' || *s == '\n') {
495
496 /* match, be happy */
497 s++;
498 matched = 1;
499 cs = MNEXT(cs);
500 } else if (*cs & ALT) {
501
502 /* try the next part */
503 matched = 0;
504 cs = MNEXT(cs);
505 } else if (*cs & OPT) {
506
507 /* doesn't matter */
508 matched = 1;
509 cs = MNEXT(cs);
510 } else
511
512 /* no match, error return */
513 return (NIL);
514 break;
515
516 /* check for start of line */
517 case '^':
518 if (s == _start) {
519
520 /* match, be happy */
521 matched = 1;
522 cs = MNEXT(cs);
523 } else if (*cs & ALT) {
524
525 /* try the next part */
526 matched = 0;
527 cs = MNEXT(cs);
528 } else if (*cs & OPT) {
529
530 /* doesn't matter */
531 matched = 1;
532 cs = MNEXT(cs);
533 } else
534
535 /* no match, error return */
536 return (NIL);
537 break;
538
539 /* end of a subexpression, return success */
540 case ')':
541 return (s);
542 }
543 break;
544 }
545 }
546 return (s);
547}