new version from Chris Torek
[unix-history] / usr / src / old / regexp / regexp.c
CommitLineData
0df031ff
BJ
1#include <ctype.h>
2
3typedef int boolean;
4#define TRUE 1
5#define FALSE 0
6#define NIL 0
7
8boolean l_onecase; /* true if upper and lower equivalent */
9
10#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
11
12/* STRNCMP - like strncmp except that we convert the
13 * first string to lower case before comparing
14 * if l_onecase is set.
15 */
16
17STRNCMP(s1, s2, len)
18 register char *s1,*s2;
19 register int len;
20{
21 if (l_onecase) {
22 do
23 if (*s2 - makelower(*s1))
24 return (*s2 - makelower(*s1));
25 else {
26 s2++;
27 s1++;
28 }
29 while (--len);
30 } else {
31 do
32 if (*s2 - *s1)
33 return (*s2 - *s1);
34 else {
35 s2++;
36 s1++;
37 }
38 while (--len);
39 }
40 return(0);
41}
42
43/* The following routine converts an irregular expression to
44 * internal format.
45 *
46 * Either meta symbols (\a \d or \p) or character strings or
47 * operations ( alternation or perenthesizing ) can be
48 * specified. Each starts with a descriptor byte. The descriptor
49 * byte has STR set for strings, META set for meta symbols
50 * and OPER set for operations.
51 * The descriptor byte can also have the OPT bit set if the object
52 * defined is optional. Also ALT can be set to indicate an alternation.
53 *
54 * For metasymbols the byte following the descriptor byte identities
55 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
56 * strings the byte after the descriptor is a character count for
57 * the string:
58 *
59 * meta symbols := descriptor
60 * symbol
61 *
62 * strings := descriptor
63 * character count
64 * the string
65 *
66 * operatins := descriptor
67 * symbol
68 * character count
69 */
70
71/*
72 * handy macros for accessing parts of match blocks
73 */
74#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
75#define MNEXT(A) (A+2) /* character following a metasymbol block */
76
77#define OSYM(A) (*(A+1)) /* symbol in an operation block */
78#define OCNT(A) (*(A+2)) /* character count */
79#define ONEXT(A) (A+3) /* next character after the operation */
80#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
81
82#define SCNT(A) (*(A+1)) /* byte count of a string */
83#define SSTR(A) (A+2) /* address of the string */
84#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
85
86/*
87 * bit flags in the descriptor
88 */
89#define OPT 1
90#define STR 2
91#define META 4
92#define ALT 8
93#define OPER 16
94
95char *ure; /* pointer current position in unconverted exp */
96char *ccre; /* pointer to current position in converted exp*/
97char *malloc();
98
99char *
100convexp(re)
101 char *re; /* unconverted irregular expression */
102{
103 register char *cre; /* pointer to converted regular expression */
104
105 /* allocate room for the converted expression */
106 if (re == NIL)
107 return (NIL);
108 if (*re == '\0')
109 return (NIL);
110 cre = malloc (4 * strlen(re) + 3);
111 ccre = cre;
112 ure = re;
113
114 /* start the conversion with a \a */
115 *cre = META | OPT;
116 MSYM(cre) = 'a';
117 ccre = MNEXT(cre);
118
119 /* start the conversion (its recursive) */
120 expconv ();
121 *ccre = 0;
122 return (cre);
123}
124
125expconv()
126{
127 register char *cs; /* pointer to current symbol in converted exp */
128 register char c; /* character being processed */
129 register char *acs; /* pinter to last alternate */
130 register int temp;
131
132 /* let the conversion begin */
133 acs = NIL;
134 while (*ure != NIL) {
135 switch (c = *ure++) {
136
137 case '\\':
138 switch (c = *ure++) {
139
140 /* escaped characters are just characters */
141 default:
142 if ((*cs & STR) == 0) {
143 cs = ccre;
144 *cs = STR;
145 SCNT(cs) = 1;
146 ccre += 2;
147 } else
148 SCNT(cs)++;
149 *ccre++ = c;
150 break;
151
152 /* normal(?) metacharacters */
153 case 'a':
154 case 'd':
155 case 'e':
156 case 'p':
157 if (acs != NIL && acs != cs) {
158 do {
159 temp = OCNT(acs);
160 OCNT(acs) = ccre - acs;
161 acs -= temp;
162 } while (temp != 0);
163 acs = NIL;
164 }
165 cs = ccre;
166 *cs = META;
167 MSYM(cs) = c;
168 ccre = MNEXT(cs);
169 break;
170 }
171 break;
172
173 /* just put the symbol in */
174 case '^':
175 case '$':
176 if (acs != NIL && acs != cs) {
177 do {
178 temp = OCNT(acs);
179 OCNT(acs) = ccre - acs;
180 acs -= temp;
181 } while (temp != 0);
182 acs = NIL;
183 }
184 cs = ccre;
185 *cs = META;
186 MSYM(cs) = c;
187 ccre = MNEXT(cs);
188 break;
189
190 /* mark the last match sequence as optional */
191 case '?':
192 *cs = *cs | OPT;
193 break;
194
195 /* recurse and define a subexpression */
196 case '(':
197 if (acs != NIL && acs != cs) {
198 do {
199 temp = OCNT(acs);
200 OCNT(acs) = ccre - acs;
201 acs -= temp;
202 } while (temp != 0);
203 acs = NIL;
204 }
205 cs = ccre;
206 *cs = OPER;
207 OSYM(cs) = '(';
208 ccre = ONEXT(cs);
209 expconv ();
210 OCNT(cs) = ccre - cs; /* offset to next symbol */
211 break;
212
213 /* return from a recursion */
214 case ')':
215 if (acs != NIL) {
216 do {
217 temp = OCNT(acs);
218 OCNT(acs) = ccre - acs;
219 acs -= temp;
220 } while (temp != 0);
221 acs = NIL;
222 }
223 cs = ccre;
224 *cs = META;
225 MSYM(cs) = c;
226 ccre = MNEXT(cs);
227 return;
228
229 /* mark the last match sequence as having an alternate */
230 /* the third byte will contain an offset to jump over the */
231 /* alternate match in case the first did not fail */
232 case '|':
233 if (acs != NIL && acs != cs)
234 OCNT(ccre) = ccre - acs; /* make a back pointer */
235 else
236 OCNT(ccre) = 0;
237 *cs |= ALT;
238 cs = ccre;
239 *cs = OPER;
240 OSYM(cs) = '|';
241 ccre = ONEXT(cs);
242 acs = cs; /* remember that the pointer is to be filles */
243 break;
244
245 /* if its not a metasymbol just build a scharacter string */
246 default:
247 if ((*cs & STR) == 0) {
248 cs = ccre;
249 *cs = STR;
250 SCNT(cs) = 1;
251 ccre = SSTR(cs);
252 } else
253 SCNT(cs)++;
254 *ccre++ = c;
255 break;
256 }
257 }
258 if (acs != NIL) {
259 do {
260 temp = OCNT(acs);
261 OCNT(acs) = ccre - acs;
262 acs -= temp;
263 } while (temp != 0);
264 acs = NIL;
265 }
266 return;
267}
268/* end of convertre */
269
270
271/*
272 * The following routine recognises an irregular expresion
273 * with the following special characters:
274 *
275 * \? - means last match was optional
276 * \a - matches any number of characters
277 * \d - matches any number of spaces and tabs
278 * \p - matches any number of alphanumeric
279 * characters. The
280 * characters matched will be copied into
281 * the area pointed to by 'name'.
282 * \| - alternation
283 * \( \) - grouping used mostly for alternation and
284 * optionality
285 *
286 * The irregular expression must be translated to internal form
287 * prior to calling this routine
288 *
289 * The value returned is the pointer to the first non \a
290 * character matched.
291 */
292
293boolean _escaped; /* true if we are currently _escaped */
294char *_start; /* start of string */
295
296char *
297expmatch (s, re, mstring)
298 register char *s; /* string to check for a match in */
299 register char *re; /* a converted irregular expression */
300 register char *mstring; /* where to put whatever matches a \p */
301{
302 register char *cs; /* the current symbol */
303 register char *ptr,*s1; /* temporary pointer */
304 boolean matched; /* a temporary boolean */
305
306 /* initial conditions */
307 if (re == NIL)
308 return (NIL);
309 cs = re;
310 matched = FALSE;
311
312 /* loop till expression string is exhausted (or at least pretty tired) */
313 while (*cs) {
314 switch (*cs & (OPER | STR | META)) {
315
316 /* try to match a string */
317 case STR:
318 matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
319 if (matched) {
320
321 /* hoorah it matches */
322 s += SCNT(cs);
323 cs = SNEXT(cs);
324 } else if (*cs & ALT) {
325
326 /* alternation, skip to next expression */
327 cs = SNEXT(cs);
328 } else if (*cs & OPT) {
329
330 /* the match is optional */
331 cs = SNEXT(cs);
332 matched = 1; /* indicate a successful match */
333 } else {
334
335 /* no match, error return */
336 return (NIL);
337 }
338 break;
339
340 /* an operator, do something fancy */
341 case OPER:
342 switch (OSYM(cs)) {
343
344 /* this is an alternation */
345 case '|':
346 if (matched)
347
348 /* last thing in the alternation was a match, skip ahead */
349 cs = OPTR(cs);
350 else
351
352 /* no match, keep trying */
353 cs = ONEXT(cs);
354 break;
355
356 /* this is a grouping, recurse */
357 case '(':
358 ptr = expmatch (s, ONEXT(cs), mstring);
359 if (ptr != NIL) {
360
361 /* the subexpression matched */
362 matched = 1;
363 s = ptr;
364 } else if (*cs & ALT) {
365
366 /* alternation, skip to next expression */
367 matched = 0;
368 } else if (*cs & OPT) {
369
370 /* the match is optional */
371 matched = 1; /* indicate a successful match */
372 } else {
373
374 /* no match, error return */
375 return (NIL);
376 }
377 cs = OPTR(cs);
378 break;
379 }
380 break;
381
382 /* try to match a metasymbol */
383 case META:
384 switch (MSYM(cs)) {
385
386 /* try to match anything and remember what was matched */
387 case 'p':
388 /*
389 * This is really the same as trying the match the
390 * remaining parts of the expression to any subset
391 * of the string.
392 */
393 s1 = s;
394 do {
395 ptr = expmatch (s1, MNEXT(cs), mstring);
396 if (ptr != NIL && s1 != s) {
397
398 /* we have a match, remember the match */
399 strncpy (mstring, s, s1 - s);
400 mstring[s1 - s] = '\0';
401 return (ptr);
402 } else if (ptr != NIL && (*cs & OPT)) {
403
404 /* it was aoptional so no match is ok */
405 return (ptr);
406 } else if (ptr != NIL) {
407
408 /* not optional and we still matched */
409 return (NIL);
410 }
411 if (!isalnum(*s1) && *s1 != '_')
412 return (NIL);
413 if (*s1 == '\\')
414 _escaped = _escaped ? FALSE : TRUE;
415 else
416 _escaped = FALSE;
417 } while (*s1++);
418 return (NIL);
419
420 /* try to match anything */
421 case 'a':
422 /*
423 * This is really the same as trying the match the
424 * remaining parts of the expression to any subset
425 * of the string.
426 */
427 s1 = s;
428 do {
429 ptr = expmatch (s1, MNEXT(cs), mstring);
430 if (ptr != NIL && s1 != s) {
431
432 /* we have a match */
433 return (ptr);
434 } else if (ptr != NIL && (*cs & OPT)) {
435
436 /* it was aoptional so no match is ok */
437 return (ptr);
438 } else if (ptr != NIL) {
439
440 /* not optional and we still matched */
441 return (NIL);
442 }
443 if (*s1 == '\\')
444 _escaped = _escaped ? FALSE : TRUE;
445 else
446 _escaped = FALSE;
447 } while (*s1++);
448 return (NIL);
449
450 /* fail if we are currently _escaped */
451 case 'e':
452 if (_escaped)
453 return(NIL);
454 cs = MNEXT(cs);
455 break;
456
457 /* match any number of tabs and spaces */
458 case 'd':
459 ptr = s;
460 while (*s == ' ' || *s == '\t')
461 s++;
462 if (s != ptr || s == _start) {
463
464 /* match, be happy */
465 matched = 1;
466 cs = MNEXT(cs);
467 } else if (*s == '\n' || *s == '\0') {
468
469 /* match, be happy */
470 matched = 1;
471 cs = MNEXT(cs);
472 } else if (*cs & ALT) {
473
474 /* try the next part */
475 matched = 0;
476 cs = MNEXT(cs);
477 } else if (*cs & OPT) {
478
479 /* doesn't matter */
480 matched = 1;
481 cs = MNEXT(cs);
482 } else
483
484 /* no match, error return */
485 return (NIL);
486 break;
487
488 /* check for end of line */
489 case '$':
490 if (*s == '\0' || *s == '\n') {
491
492 /* match, be happy */
493 s++;
494 matched = 1;
495 cs = MNEXT(cs);
496 } else if (*cs & ALT) {
497
498 /* try the next part */
499 matched = 0;
500 cs = MNEXT(cs);
501 } else if (*cs & OPT) {
502
503 /* doesn't matter */
504 matched = 1;
505 cs = MNEXT(cs);
506 } else
507
508 /* no match, error return */
509 return (NIL);
510 break;
511
512 /* check for start of line */
513 case '^':
514 if (s == _start) {
515
516 /* match, be happy */
517 matched = 1;
518 cs = MNEXT(cs);
519 } else if (*cs & ALT) {
520
521 /* try the next part */
522 matched = 0;
523 cs = MNEXT(cs);
524 } else if (*cs & OPT) {
525
526 /* doesn't matter */
527 matched = 1;
528 cs = MNEXT(cs);
529 } else
530
531 /* no match, error return */
532 return (NIL);
533 break;
534
535 /* end of a subexpression, return success */
536 case ')':
537 return (s);
538 }
539 break;
540 }
541 }
542 return (s);
543}