Commit | Line | Data |
---|---|---|
23e9541a KM |
1 | static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) %G%"; |
2 | ||
3 | #include <ctype.h> | |
4 | ||
5 | typedef int boolean; | |
6 | #define TRUE 1 | |
7 | #define FALSE 0 | |
8 | #define NIL 0 | |
9 | ||
10 | boolean l_onecase; /* true if upper and lower equivalent */ | |
11 | ||
12 | #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) | |
13 | ||
14 | /* STRNCMP - like strncmp except that we convert the | |
15 | * first string to lower case before comparing | |
16 | * if l_onecase is set. | |
17 | */ | |
18 | ||
19 | STRNCMP(s1, s2, len) | |
20 | register char *s1,*s2; | |
21 | register int len; | |
22 | { | |
23 | if (l_onecase) { | |
24 | do | |
25 | if (*s2 - makelower(*s1)) | |
26 | return (*s2 - makelower(*s1)); | |
27 | else { | |
28 | s2++; | |
29 | s1++; | |
30 | } | |
31 | while (--len); | |
32 | } else { | |
33 | do | |
34 | if (*s2 - *s1) | |
35 | return (*s2 - *s1); | |
36 | else { | |
37 | s2++; | |
38 | s1++; | |
39 | } | |
40 | while (--len); | |
41 | } | |
42 | return(0); | |
43 | } | |
44 | ||
45 | /* The following routine converts an irregular expression to | |
46 | * internal format. | |
47 | * | |
48 | * Either meta symbols (\a \d or \p) or character strings or | |
49 | * operations ( alternation or perenthesizing ) can be | |
50 | * specified. Each starts with a descriptor byte. The descriptor | |
51 | * byte has STR set for strings, META set for meta symbols | |
52 | * and OPER set for operations. | |
53 | * The descriptor byte can also have the OPT bit set if the object | |
54 | * defined is optional. Also ALT can be set to indicate an alternation. | |
55 | * | |
56 | * For metasymbols the byte following the descriptor byte identities | |
57 | * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For | |
58 | * strings the byte after the descriptor is a character count for | |
59 | * the string: | |
60 | * | |
61 | * meta symbols := descriptor | |
62 | * symbol | |
63 | * | |
64 | * strings := descriptor | |
65 | * character count | |
66 | * the string | |
67 | * | |
68 | * operatins := descriptor | |
69 | * symbol | |
70 | * character count | |
71 | */ | |
72 | ||
73 | /* | |
74 | * handy macros for accessing parts of match blocks | |
75 | */ | |
76 | #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ | |
77 | #define MNEXT(A) (A+2) /* character following a metasymbol block */ | |
78 | ||
79 | #define OSYM(A) (*(A+1)) /* symbol in an operation block */ | |
80 | #define OCNT(A) (*(A+2)) /* character count */ | |
81 | #define ONEXT(A) (A+3) /* next character after the operation */ | |
82 | #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ | |
83 | ||
84 | #define SCNT(A) (*(A+1)) /* byte count of a string */ | |
85 | #define SSTR(A) (A+2) /* address of the string */ | |
86 | #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ | |
87 | ||
88 | /* | |
89 | * bit flags in the descriptor | |
90 | */ | |
91 | #define OPT 1 | |
92 | #define STR 2 | |
93 | #define META 4 | |
94 | #define ALT 8 | |
95 | #define OPER 16 | |
96 | ||
97 | char *ure; /* pointer current position in unconverted exp */ | |
98 | char *ccre; /* pointer to current position in converted exp*/ | |
99 | char *malloc(); | |
100 | ||
101 | char * | |
102 | convexp(re) | |
103 | char *re; /* unconverted irregular expression */ | |
104 | { | |
105 | register char *cre; /* pointer to converted regular expression */ | |
106 | ||
107 | /* allocate room for the converted expression */ | |
108 | if (re == NIL) | |
109 | return (NIL); | |
110 | if (*re == '\0') | |
111 | return (NIL); | |
112 | cre = malloc (4 * strlen(re) + 3); | |
113 | ccre = cre; | |
114 | ure = re; | |
115 | ||
116 | /* start the conversion with a \a */ | |
117 | *cre = META | OPT; | |
118 | MSYM(cre) = 'a'; | |
119 | ccre = MNEXT(cre); | |
120 | ||
121 | /* start the conversion (its recursive) */ | |
122 | expconv (); | |
123 | *ccre = 0; | |
124 | return (cre); | |
125 | } | |
126 | ||
127 | expconv() | |
128 | { | |
129 | register char *cs; /* pointer to current symbol in converted exp */ | |
130 | register char c; /* character being processed */ | |
131 | register char *acs; /* pinter to last alternate */ | |
132 | register int temp; | |
133 | ||
134 | /* let the conversion begin */ | |
135 | acs = NIL; | |
136 | while (*ure != NIL) { | |
137 | switch (c = *ure++) { | |
138 | ||
139 | case '\\': | |
140 | switch (c = *ure++) { | |
141 | ||
142 | /* escaped characters are just characters */ | |
143 | default: | |
144 | if ((*cs & STR) == 0) { | |
145 | cs = ccre; | |
146 | *cs = STR; | |
147 | SCNT(cs) = 1; | |
148 | ccre += 2; | |
149 | } else | |
150 | SCNT(cs)++; | |
151 | *ccre++ = c; | |
152 | break; | |
153 | ||
154 | /* normal(?) metacharacters */ | |
155 | case 'a': | |
156 | case 'd': | |
157 | case 'e': | |
158 | case 'p': | |
159 | if (acs != NIL && acs != cs) { | |
160 | do { | |
161 | temp = OCNT(acs); | |
162 | OCNT(acs) = ccre - acs; | |
163 | acs -= temp; | |
164 | } while (temp != 0); | |
165 | acs = NIL; | |
166 | } | |
167 | cs = ccre; | |
168 | *cs = META; | |
169 | MSYM(cs) = c; | |
170 | ccre = MNEXT(cs); | |
171 | break; | |
172 | } | |
173 | break; | |
174 | ||
175 | /* just put the symbol in */ | |
176 | case '^': | |
177 | case '$': | |
178 | if (acs != NIL && acs != cs) { | |
179 | do { | |
180 | temp = OCNT(acs); | |
181 | OCNT(acs) = ccre - acs; | |
182 | acs -= temp; | |
183 | } while (temp != 0); | |
184 | acs = NIL; | |
185 | } | |
186 | cs = ccre; | |
187 | *cs = META; | |
188 | MSYM(cs) = c; | |
189 | ccre = MNEXT(cs); | |
190 | break; | |
191 | ||
192 | /* mark the last match sequence as optional */ | |
193 | case '?': | |
194 | *cs = *cs | OPT; | |
195 | break; | |
196 | ||
197 | /* recurse and define a subexpression */ | |
198 | case '(': | |
199 | if (acs != NIL && acs != cs) { | |
200 | do { | |
201 | temp = OCNT(acs); | |
202 | OCNT(acs) = ccre - acs; | |
203 | acs -= temp; | |
204 | } while (temp != 0); | |
205 | acs = NIL; | |
206 | } | |
207 | cs = ccre; | |
208 | *cs = OPER; | |
209 | OSYM(cs) = '('; | |
210 | ccre = ONEXT(cs); | |
211 | expconv (); | |
212 | OCNT(cs) = ccre - cs; /* offset to next symbol */ | |
213 | break; | |
214 | ||
215 | /* return from a recursion */ | |
216 | case ')': | |
217 | if (acs != NIL) { | |
218 | do { | |
219 | temp = OCNT(acs); | |
220 | OCNT(acs) = ccre - acs; | |
221 | acs -= temp; | |
222 | } while (temp != 0); | |
223 | acs = NIL; | |
224 | } | |
225 | cs = ccre; | |
226 | *cs = META; | |
227 | MSYM(cs) = c; | |
228 | ccre = MNEXT(cs); | |
229 | return; | |
230 | ||
231 | /* mark the last match sequence as having an alternate */ | |
232 | /* the third byte will contain an offset to jump over the */ | |
233 | /* alternate match in case the first did not fail */ | |
234 | case '|': | |
235 | if (acs != NIL && acs != cs) | |
236 | OCNT(ccre) = ccre - acs; /* make a back pointer */ | |
237 | else | |
238 | OCNT(ccre) = 0; | |
239 | *cs |= ALT; | |
240 | cs = ccre; | |
241 | *cs = OPER; | |
242 | OSYM(cs) = '|'; | |
243 | ccre = ONEXT(cs); | |
244 | acs = cs; /* remember that the pointer is to be filles */ | |
245 | break; | |
246 | ||
247 | /* if its not a metasymbol just build a scharacter string */ | |
248 | default: | |
249 | if ((*cs & STR) == 0) { | |
250 | cs = ccre; | |
251 | *cs = STR; | |
252 | SCNT(cs) = 1; | |
253 | ccre = SSTR(cs); | |
254 | } else | |
255 | SCNT(cs)++; | |
256 | *ccre++ = c; | |
257 | break; | |
258 | } | |
259 | } | |
260 | if (acs != NIL) { | |
261 | do { | |
262 | temp = OCNT(acs); | |
263 | OCNT(acs) = ccre - acs; | |
264 | acs -= temp; | |
265 | } while (temp != 0); | |
266 | acs = NIL; | |
267 | } | |
268 | return; | |
269 | } | |
270 | /* end of convertre */ | |
271 | ||
272 | ||
273 | /* | |
274 | * The following routine recognises an irregular expresion | |
275 | * with the following special characters: | |
276 | * | |
277 | * \? - means last match was optional | |
278 | * \a - matches any number of characters | |
279 | * \d - matches any number of spaces and tabs | |
280 | * \p - matches any number of alphanumeric | |
281 | * characters. The | |
282 | * characters matched will be copied into | |
283 | * the area pointed to by 'name'. | |
284 | * \| - alternation | |
285 | * \( \) - grouping used mostly for alternation and | |
286 | * optionality | |
287 | * | |
288 | * The irregular expression must be translated to internal form | |
289 | * prior to calling this routine | |
290 | * | |
291 | * The value returned is the pointer to the first non \a | |
292 | * character matched. | |
293 | */ | |
294 | ||
295 | boolean _escaped; /* true if we are currently _escaped */ | |
296 | char *_start; /* start of string */ | |
297 | ||
298 | char * | |
299 | expmatch (s, re, mstring) | |
300 | register char *s; /* string to check for a match in */ | |
301 | register char *re; /* a converted irregular expression */ | |
302 | register char *mstring; /* where to put whatever matches a \p */ | |
303 | { | |
304 | register char *cs; /* the current symbol */ | |
305 | register char *ptr,*s1; /* temporary pointer */ | |
306 | boolean matched; /* a temporary boolean */ | |
307 | ||
308 | /* initial conditions */ | |
309 | if (re == NIL) | |
310 | return (NIL); | |
311 | cs = re; | |
312 | matched = FALSE; | |
313 | ||
314 | /* loop till expression string is exhausted (or at least pretty tired) */ | |
315 | while (*cs) { | |
316 | switch (*cs & (OPER | STR | META)) { | |
317 | ||
318 | /* try to match a string */ | |
319 | case STR: | |
320 | matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); | |
321 | if (matched) { | |
322 | ||
323 | /* hoorah it matches */ | |
324 | s += SCNT(cs); | |
325 | cs = SNEXT(cs); | |
326 | } else if (*cs & ALT) { | |
327 | ||
328 | /* alternation, skip to next expression */ | |
329 | cs = SNEXT(cs); | |
330 | } else if (*cs & OPT) { | |
331 | ||
332 | /* the match is optional */ | |
333 | cs = SNEXT(cs); | |
334 | matched = 1; /* indicate a successful match */ | |
335 | } else { | |
336 | ||
337 | /* no match, error return */ | |
338 | return (NIL); | |
339 | } | |
340 | break; | |
341 | ||
342 | /* an operator, do something fancy */ | |
343 | case OPER: | |
344 | switch (OSYM(cs)) { | |
345 | ||
346 | /* this is an alternation */ | |
347 | case '|': | |
348 | if (matched) | |
349 | ||
350 | /* last thing in the alternation was a match, skip ahead */ | |
351 | cs = OPTR(cs); | |
352 | else | |
353 | ||
354 | /* no match, keep trying */ | |
355 | cs = ONEXT(cs); | |
356 | break; | |
357 | ||
358 | /* this is a grouping, recurse */ | |
359 | case '(': | |
360 | ptr = expmatch (s, ONEXT(cs), mstring); | |
361 | if (ptr != NIL) { | |
362 | ||
363 | /* the subexpression matched */ | |
364 | matched = 1; | |
365 | s = ptr; | |
366 | } else if (*cs & ALT) { | |
367 | ||
368 | /* alternation, skip to next expression */ | |
369 | matched = 0; | |
370 | } else if (*cs & OPT) { | |
371 | ||
372 | /* the match is optional */ | |
373 | matched = 1; /* indicate a successful match */ | |
374 | } else { | |
375 | ||
376 | /* no match, error return */ | |
377 | return (NIL); | |
378 | } | |
379 | cs = OPTR(cs); | |
380 | break; | |
381 | } | |
382 | break; | |
383 | ||
384 | /* try to match a metasymbol */ | |
385 | case META: | |
386 | switch (MSYM(cs)) { | |
387 | ||
388 | /* try to match anything and remember what was matched */ | |
389 | case 'p': | |
390 | /* | |
391 | * This is really the same as trying the match the | |
392 | * remaining parts of the expression to any subset | |
393 | * of the string. | |
394 | */ | |
395 | s1 = s; | |
396 | do { | |
397 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
398 | if (ptr != NIL && s1 != s) { | |
399 | ||
400 | /* we have a match, remember the match */ | |
401 | strncpy (mstring, s, s1 - s); | |
402 | mstring[s1 - s] = '\0'; | |
403 | return (ptr); | |
404 | } else if (ptr != NIL && (*cs & OPT)) { | |
405 | ||
406 | /* it was aoptional so no match is ok */ | |
407 | return (ptr); | |
408 | } else if (ptr != NIL) { | |
409 | ||
410 | /* not optional and we still matched */ | |
411 | return (NIL); | |
412 | } | |
413 | if (!isalnum(*s1) && *s1 != '_') | |
414 | return (NIL); | |
415 | if (*s1 == '\\') | |
416 | _escaped = _escaped ? FALSE : TRUE; | |
417 | else | |
418 | _escaped = FALSE; | |
419 | } while (*s1++); | |
420 | return (NIL); | |
421 | ||
422 | /* try to match anything */ | |
423 | case 'a': | |
424 | /* | |
425 | * This is really the same as trying the match the | |
426 | * remaining parts of the expression to any subset | |
427 | * of the string. | |
428 | */ | |
429 | s1 = s; | |
430 | do { | |
431 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
432 | if (ptr != NIL && s1 != s) { | |
433 | ||
434 | /* we have a match */ | |
435 | return (ptr); | |
436 | } else if (ptr != NIL && (*cs & OPT)) { | |
437 | ||
438 | /* it was aoptional so no match is ok */ | |
439 | return (ptr); | |
440 | } else if (ptr != NIL) { | |
441 | ||
442 | /* not optional and we still matched */ | |
443 | return (NIL); | |
444 | } | |
445 | if (*s1 == '\\') | |
446 | _escaped = _escaped ? FALSE : TRUE; | |
447 | else | |
448 | _escaped = FALSE; | |
449 | } while (*s1++); | |
450 | return (NIL); | |
451 | ||
452 | /* fail if we are currently _escaped */ | |
453 | case 'e': | |
454 | if (_escaped) | |
455 | return(NIL); | |
456 | cs = MNEXT(cs); | |
457 | break; | |
458 | ||
459 | /* match any number of tabs and spaces */ | |
460 | case 'd': | |
461 | ptr = s; | |
462 | while (*s == ' ' || *s == '\t') | |
463 | s++; | |
464 | if (s != ptr || s == _start) { | |
465 | ||
466 | /* match, be happy */ | |
467 | matched = 1; | |
468 | cs = MNEXT(cs); | |
469 | } else if (*s == '\n' || *s == '\0') { | |
470 | ||
471 | /* match, be happy */ | |
472 | matched = 1; | |
473 | cs = MNEXT(cs); | |
474 | } else if (*cs & ALT) { | |
475 | ||
476 | /* try the next part */ | |
477 | matched = 0; | |
478 | cs = MNEXT(cs); | |
479 | } else if (*cs & OPT) { | |
480 | ||
481 | /* doesn't matter */ | |
482 | matched = 1; | |
483 | cs = MNEXT(cs); | |
484 | } else | |
485 | ||
486 | /* no match, error return */ | |
487 | return (NIL); | |
488 | break; | |
489 | ||
490 | /* check for end of line */ | |
491 | case '$': | |
492 | if (*s == '\0' || *s == '\n') { | |
493 | ||
494 | /* match, be happy */ | |
495 | s++; | |
496 | matched = 1; | |
497 | cs = MNEXT(cs); | |
498 | } else if (*cs & ALT) { | |
499 | ||
500 | /* try the next part */ | |
501 | matched = 0; | |
502 | cs = MNEXT(cs); | |
503 | } else if (*cs & OPT) { | |
504 | ||
505 | /* doesn't matter */ | |
506 | matched = 1; | |
507 | cs = MNEXT(cs); | |
508 | } else | |
509 | ||
510 | /* no match, error return */ | |
511 | return (NIL); | |
512 | break; | |
513 | ||
514 | /* check for start of line */ | |
515 | case '^': | |
516 | if (s == _start) { | |
517 | ||
518 | /* match, be happy */ | |
519 | matched = 1; | |
520 | cs = MNEXT(cs); | |
521 | } else if (*cs & ALT) { | |
522 | ||
523 | /* try the next part */ | |
524 | matched = 0; | |
525 | cs = MNEXT(cs); | |
526 | } else if (*cs & OPT) { | |
527 | ||
528 | /* doesn't matter */ | |
529 | matched = 1; | |
530 | cs = MNEXT(cs); | |
531 | } else | |
532 | ||
533 | /* no match, error return */ | |
534 | return (NIL); | |
535 | break; | |
536 | ||
537 | /* end of a subexpression, return success */ | |
538 | case ')': | |
539 | return (s); | |
540 | } | |
541 | break; | |
542 | } | |
543 | } | |
544 | return (s); | |
545 | } |