added "more" command
[unix-history] / usr / src / usr.bin / vgrind / regexp.c
CommitLineData
76797561
DF
1/*
2 * Copyright (c) 1980 Regents of the University of California.
3 * All rights reserved. The Berkeley software License Agreement
4 * specifies the terms and conditions for redistribution.
5 */
6
7#ifndef lint
8static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) %G%";
9#endif not lint
23e9541a
KM
10
11#include <ctype.h>
12
13typedef int boolean;
14#define TRUE 1
15#define FALSE 0
16#define NIL 0
17
18boolean l_onecase; /* true if upper and lower equivalent */
19
20#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
21
22/* STRNCMP - like strncmp except that we convert the
23 * first string to lower case before comparing
24 * if l_onecase is set.
25 */
26
27STRNCMP(s1, s2, len)
28 register char *s1,*s2;
29 register int len;
30{
31 if (l_onecase) {
32 do
33 if (*s2 - makelower(*s1))
34 return (*s2 - makelower(*s1));
35 else {
36 s2++;
37 s1++;
38 }
39 while (--len);
40 } else {
41 do
42 if (*s2 - *s1)
43 return (*s2 - *s1);
44 else {
45 s2++;
46 s1++;
47 }
48 while (--len);
49 }
50 return(0);
51}
52
53/* The following routine converts an irregular expression to
54 * internal format.
55 *
56 * Either meta symbols (\a \d or \p) or character strings or
57 * operations ( alternation or perenthesizing ) can be
58 * specified. Each starts with a descriptor byte. The descriptor
59 * byte has STR set for strings, META set for meta symbols
60 * and OPER set for operations.
61 * The descriptor byte can also have the OPT bit set if the object
62 * defined is optional. Also ALT can be set to indicate an alternation.
63 *
64 * For metasymbols the byte following the descriptor byte identities
65 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
66 * strings the byte after the descriptor is a character count for
67 * the string:
68 *
69 * meta symbols := descriptor
70 * symbol
71 *
72 * strings := descriptor
73 * character count
74 * the string
75 *
76 * operatins := descriptor
77 * symbol
78 * character count
79 */
80
81/*
82 * handy macros for accessing parts of match blocks
83 */
84#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
85#define MNEXT(A) (A+2) /* character following a metasymbol block */
86
87#define OSYM(A) (*(A+1)) /* symbol in an operation block */
88#define OCNT(A) (*(A+2)) /* character count */
89#define ONEXT(A) (A+3) /* next character after the operation */
90#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
91
92#define SCNT(A) (*(A+1)) /* byte count of a string */
93#define SSTR(A) (A+2) /* address of the string */
94#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
95
96/*
97 * bit flags in the descriptor
98 */
99#define OPT 1
100#define STR 2
101#define META 4
102#define ALT 8
103#define OPER 16
104
105char *ure; /* pointer current position in unconverted exp */
106char *ccre; /* pointer to current position in converted exp*/
107char *malloc();
108
109char *
110convexp(re)
111 char *re; /* unconverted irregular expression */
112{
113 register char *cre; /* pointer to converted regular expression */
114
115 /* allocate room for the converted expression */
116 if (re == NIL)
117 return (NIL);
118 if (*re == '\0')
119 return (NIL);
120 cre = malloc (4 * strlen(re) + 3);
121 ccre = cre;
122 ure = re;
123
124 /* start the conversion with a \a */
125 *cre = META | OPT;
126 MSYM(cre) = 'a';
127 ccre = MNEXT(cre);
128
129 /* start the conversion (its recursive) */
130 expconv ();
131 *ccre = 0;
132 return (cre);
133}
134
135expconv()
136{
137 register char *cs; /* pointer to current symbol in converted exp */
138 register char c; /* character being processed */
139 register char *acs; /* pinter to last alternate */
140 register int temp;
141
142 /* let the conversion begin */
143 acs = NIL;
1a8c1b81 144 cs = NIL;
23e9541a
KM
145 while (*ure != NIL) {
146 switch (c = *ure++) {
147
148 case '\\':
149 switch (c = *ure++) {
150
151 /* escaped characters are just characters */
152 default:
1a8c1b81 153 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
154 cs = ccre;
155 *cs = STR;
156 SCNT(cs) = 1;
157 ccre += 2;
158 } else
159 SCNT(cs)++;
160 *ccre++ = c;
161 break;
162
163 /* normal(?) metacharacters */
164 case 'a':
165 case 'd':
166 case 'e':
167 case 'p':
168 if (acs != NIL && acs != cs) {
169 do {
170 temp = OCNT(acs);
171 OCNT(acs) = ccre - acs;
172 acs -= temp;
173 } while (temp != 0);
174 acs = NIL;
175 }
176 cs = ccre;
177 *cs = META;
178 MSYM(cs) = c;
179 ccre = MNEXT(cs);
180 break;
181 }
182 break;
183
184 /* just put the symbol in */
185 case '^':
186 case '$':
187 if (acs != NIL && acs != cs) {
188 do {
189 temp = OCNT(acs);
190 OCNT(acs) = ccre - acs;
191 acs -= temp;
192 } while (temp != 0);
193 acs = NIL;
194 }
195 cs = ccre;
196 *cs = META;
197 MSYM(cs) = c;
198 ccre = MNEXT(cs);
199 break;
200
201 /* mark the last match sequence as optional */
202 case '?':
1a8c1b81
RC
203 if (cs)
204 *cs = *cs | OPT;
23e9541a
KM
205 break;
206
207 /* recurse and define a subexpression */
208 case '(':
209 if (acs != NIL && acs != cs) {
210 do {
211 temp = OCNT(acs);
212 OCNT(acs) = ccre - acs;
213 acs -= temp;
214 } while (temp != 0);
215 acs = NIL;
216 }
217 cs = ccre;
218 *cs = OPER;
219 OSYM(cs) = '(';
220 ccre = ONEXT(cs);
221 expconv ();
222 OCNT(cs) = ccre - cs; /* offset to next symbol */
223 break;
224
225 /* return from a recursion */
226 case ')':
227 if (acs != NIL) {
228 do {
229 temp = OCNT(acs);
230 OCNT(acs) = ccre - acs;
231 acs -= temp;
232 } while (temp != 0);
233 acs = NIL;
234 }
235 cs = ccre;
236 *cs = META;
237 MSYM(cs) = c;
238 ccre = MNEXT(cs);
239 return;
240
241 /* mark the last match sequence as having an alternate */
242 /* the third byte will contain an offset to jump over the */
243 /* alternate match in case the first did not fail */
244 case '|':
245 if (acs != NIL && acs != cs)
246 OCNT(ccre) = ccre - acs; /* make a back pointer */
247 else
248 OCNT(ccre) = 0;
249 *cs |= ALT;
250 cs = ccre;
251 *cs = OPER;
252 OSYM(cs) = '|';
253 ccre = ONEXT(cs);
254 acs = cs; /* remember that the pointer is to be filles */
255 break;
256
257 /* if its not a metasymbol just build a scharacter string */
258 default:
1a8c1b81 259 if (cs == NIL || (*cs & STR) == 0) {
23e9541a
KM
260 cs = ccre;
261 *cs = STR;
262 SCNT(cs) = 1;
263 ccre = SSTR(cs);
264 } else
265 SCNT(cs)++;
266 *ccre++ = c;
267 break;
268 }
269 }
270 if (acs != NIL) {
271 do {
272 temp = OCNT(acs);
273 OCNT(acs) = ccre - acs;
274 acs -= temp;
275 } while (temp != 0);
276 acs = NIL;
277 }
278 return;
279}
280/* end of convertre */
281
282
283/*
284 * The following routine recognises an irregular expresion
285 * with the following special characters:
286 *
287 * \? - means last match was optional
288 * \a - matches any number of characters
289 * \d - matches any number of spaces and tabs
290 * \p - matches any number of alphanumeric
291 * characters. The
292 * characters matched will be copied into
293 * the area pointed to by 'name'.
294 * \| - alternation
295 * \( \) - grouping used mostly for alternation and
296 * optionality
297 *
298 * The irregular expression must be translated to internal form
299 * prior to calling this routine
300 *
301 * The value returned is the pointer to the first non \a
302 * character matched.
303 */
304
305boolean _escaped; /* true if we are currently _escaped */
306char *_start; /* start of string */
307
308char *
309expmatch (s, re, mstring)
310 register char *s; /* string to check for a match in */
311 register char *re; /* a converted irregular expression */
312 register char *mstring; /* where to put whatever matches a \p */
313{
314 register char *cs; /* the current symbol */
315 register char *ptr,*s1; /* temporary pointer */
316 boolean matched; /* a temporary boolean */
317
318 /* initial conditions */
319 if (re == NIL)
320 return (NIL);
321 cs = re;
322 matched = FALSE;
323
324 /* loop till expression string is exhausted (or at least pretty tired) */
325 while (*cs) {
326 switch (*cs & (OPER | STR | META)) {
327
328 /* try to match a string */
329 case STR:
330 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
331 if (matched) {
332
333 /* hoorah it matches */
334 s += SCNT(cs);
335 cs = SNEXT(cs);
336 } else if (*cs & ALT) {
337
338 /* alternation, skip to next expression */
339 cs = SNEXT(cs);
340 } else if (*cs & OPT) {
341
342 /* the match is optional */
343 cs = SNEXT(cs);
344 matched = 1; /* indicate a successful match */
345 } else {
346
347 /* no match, error return */
348 return (NIL);
349 }
350 break;
351
352 /* an operator, do something fancy */
353 case OPER:
354 switch (OSYM(cs)) {
355
356 /* this is an alternation */
357 case '|':
358 if (matched)
359
360 /* last thing in the alternation was a match, skip ahead */
361 cs = OPTR(cs);
362 else
363
364 /* no match, keep trying */
365 cs = ONEXT(cs);
366 break;
367
368 /* this is a grouping, recurse */
369 case '(':
370 ptr = expmatch (s, ONEXT(cs), mstring);
371 if (ptr != NIL) {
372
373 /* the subexpression matched */
374 matched = 1;
375 s = ptr;
376 } else if (*cs & ALT) {
377
378 /* alternation, skip to next expression */
379 matched = 0;
380 } else if (*cs & OPT) {
381
382 /* the match is optional */
383 matched = 1; /* indicate a successful match */
384 } else {
385
386 /* no match, error return */
387 return (NIL);
388 }
389 cs = OPTR(cs);
390 break;
391 }
392 break;
393
394 /* try to match a metasymbol */
395 case META:
396 switch (MSYM(cs)) {
397
398 /* try to match anything and remember what was matched */
399 case 'p':
400 /*
401 * This is really the same as trying the match the
402 * remaining parts of the expression to any subset
403 * of the string.
404 */
405 s1 = s;
406 do {
407 ptr = expmatch (s1, MNEXT(cs), mstring);
408 if (ptr != NIL && s1 != s) {
409
410 /* we have a match, remember the match */
411 strncpy (mstring, s, s1 - s);
412 mstring[s1 - s] = '\0';
413 return (ptr);
414 } else if (ptr != NIL && (*cs & OPT)) {
415
416 /* it was aoptional so no match is ok */
417 return (ptr);
418 } else if (ptr != NIL) {
419
420 /* not optional and we still matched */
421 return (NIL);
422 }
423 if (!isalnum(*s1) && *s1 != '_')
424 return (NIL);
425 if (*s1 == '\\')
426 _escaped = _escaped ? FALSE : TRUE;
427 else
428 _escaped = FALSE;
429 } while (*s1++);
430 return (NIL);
431
432 /* try to match anything */
433 case 'a':
434 /*
435 * This is really the same as trying the match the
436 * remaining parts of the expression to any subset
437 * of the string.
438 */
439 s1 = s;
440 do {
441 ptr = expmatch (s1, MNEXT(cs), mstring);
442 if (ptr != NIL && s1 != s) {
443
444 /* we have a match */
445 return (ptr);
446 } else if (ptr != NIL && (*cs & OPT)) {
447
448 /* it was aoptional so no match is ok */
449 return (ptr);
450 } else if (ptr != NIL) {
451
452 /* not optional and we still matched */
453 return (NIL);
454 }
455 if (*s1 == '\\')
456 _escaped = _escaped ? FALSE : TRUE;
457 else
458 _escaped = FALSE;
459 } while (*s1++);
460 return (NIL);
461
462 /* fail if we are currently _escaped */
463 case 'e':
464 if (_escaped)
465 return(NIL);
466 cs = MNEXT(cs);
467 break;
468
469 /* match any number of tabs and spaces */
470 case 'd':
471 ptr = s;
472 while (*s == ' ' || *s == '\t')
473 s++;
474 if (s != ptr || s == _start) {
475
476 /* match, be happy */
477 matched = 1;
478 cs = MNEXT(cs);
479 } else if (*s == '\n' || *s == '\0') {
480
481 /* match, be happy */
482 matched = 1;
483 cs = MNEXT(cs);
484 } else if (*cs & ALT) {
485
486 /* try the next part */
487 matched = 0;
488 cs = MNEXT(cs);
489 } else if (*cs & OPT) {
490
491 /* doesn't matter */
492 matched = 1;
493 cs = MNEXT(cs);
494 } else
495
496 /* no match, error return */
497 return (NIL);
498 break;
499
500 /* check for end of line */
501 case '$':
502 if (*s == '\0' || *s == '\n') {
503
504 /* match, be happy */
505 s++;
506 matched = 1;
507 cs = MNEXT(cs);
508 } else if (*cs & ALT) {
509
510 /* try the next part */
511 matched = 0;
512 cs = MNEXT(cs);
513 } else if (*cs & OPT) {
514
515 /* doesn't matter */
516 matched = 1;
517 cs = MNEXT(cs);
518 } else
519
520 /* no match, error return */
521 return (NIL);
522 break;
523
524 /* check for start of line */
525 case '^':
526 if (s == _start) {
527
528 /* match, be happy */
529 matched = 1;
530 cs = MNEXT(cs);
531 } else if (*cs & ALT) {
532
533 /* try the next part */
534 matched = 0;
535 cs = MNEXT(cs);
536 } else if (*cs & OPT) {
537
538 /* doesn't matter */
539 matched = 1;
540 cs = MNEXT(cs);
541 } else
542
543 /* no match, error return */
544 return (NIL);
545 break;
546
547 /* end of a subexpression, return success */
548 case ')':
549 return (s);
550 }
551 break;
552 }
553 }
554 return (s);
555}