Commit | Line | Data |
---|---|---|
1a8c1b81 | 1 | static char sccsid[] = "@(#)regexp.c 4.2 (Berkeley) %G%"; |
23e9541a KM |
2 | |
3 | #include <ctype.h> | |
4 | ||
5 | typedef int boolean; | |
6 | #define TRUE 1 | |
7 | #define FALSE 0 | |
8 | #define NIL 0 | |
9 | ||
10 | boolean l_onecase; /* true if upper and lower equivalent */ | |
11 | ||
12 | #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) | |
13 | ||
14 | /* STRNCMP - like strncmp except that we convert the | |
15 | * first string to lower case before comparing | |
16 | * if l_onecase is set. | |
17 | */ | |
18 | ||
19 | STRNCMP(s1, s2, len) | |
20 | register char *s1,*s2; | |
21 | register int len; | |
22 | { | |
23 | if (l_onecase) { | |
24 | do | |
25 | if (*s2 - makelower(*s1)) | |
26 | return (*s2 - makelower(*s1)); | |
27 | else { | |
28 | s2++; | |
29 | s1++; | |
30 | } | |
31 | while (--len); | |
32 | } else { | |
33 | do | |
34 | if (*s2 - *s1) | |
35 | return (*s2 - *s1); | |
36 | else { | |
37 | s2++; | |
38 | s1++; | |
39 | } | |
40 | while (--len); | |
41 | } | |
42 | return(0); | |
43 | } | |
44 | ||
45 | /* The following routine converts an irregular expression to | |
46 | * internal format. | |
47 | * | |
48 | * Either meta symbols (\a \d or \p) or character strings or | |
49 | * operations ( alternation or perenthesizing ) can be | |
50 | * specified. Each starts with a descriptor byte. The descriptor | |
51 | * byte has STR set for strings, META set for meta symbols | |
52 | * and OPER set for operations. | |
53 | * The descriptor byte can also have the OPT bit set if the object | |
54 | * defined is optional. Also ALT can be set to indicate an alternation. | |
55 | * | |
56 | * For metasymbols the byte following the descriptor byte identities | |
57 | * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For | |
58 | * strings the byte after the descriptor is a character count for | |
59 | * the string: | |
60 | * | |
61 | * meta symbols := descriptor | |
62 | * symbol | |
63 | * | |
64 | * strings := descriptor | |
65 | * character count | |
66 | * the string | |
67 | * | |
68 | * operatins := descriptor | |
69 | * symbol | |
70 | * character count | |
71 | */ | |
72 | ||
73 | /* | |
74 | * handy macros for accessing parts of match blocks | |
75 | */ | |
76 | #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ | |
77 | #define MNEXT(A) (A+2) /* character following a metasymbol block */ | |
78 | ||
79 | #define OSYM(A) (*(A+1)) /* symbol in an operation block */ | |
80 | #define OCNT(A) (*(A+2)) /* character count */ | |
81 | #define ONEXT(A) (A+3) /* next character after the operation */ | |
82 | #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ | |
83 | ||
84 | #define SCNT(A) (*(A+1)) /* byte count of a string */ | |
85 | #define SSTR(A) (A+2) /* address of the string */ | |
86 | #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ | |
87 | ||
88 | /* | |
89 | * bit flags in the descriptor | |
90 | */ | |
91 | #define OPT 1 | |
92 | #define STR 2 | |
93 | #define META 4 | |
94 | #define ALT 8 | |
95 | #define OPER 16 | |
96 | ||
97 | char *ure; /* pointer current position in unconverted exp */ | |
98 | char *ccre; /* pointer to current position in converted exp*/ | |
99 | char *malloc(); | |
100 | ||
101 | char * | |
102 | convexp(re) | |
103 | char *re; /* unconverted irregular expression */ | |
104 | { | |
105 | register char *cre; /* pointer to converted regular expression */ | |
106 | ||
107 | /* allocate room for the converted expression */ | |
108 | if (re == NIL) | |
109 | return (NIL); | |
110 | if (*re == '\0') | |
111 | return (NIL); | |
112 | cre = malloc (4 * strlen(re) + 3); | |
113 | ccre = cre; | |
114 | ure = re; | |
115 | ||
116 | /* start the conversion with a \a */ | |
117 | *cre = META | OPT; | |
118 | MSYM(cre) = 'a'; | |
119 | ccre = MNEXT(cre); | |
120 | ||
121 | /* start the conversion (its recursive) */ | |
122 | expconv (); | |
123 | *ccre = 0; | |
124 | return (cre); | |
125 | } | |
126 | ||
127 | expconv() | |
128 | { | |
129 | register char *cs; /* pointer to current symbol in converted exp */ | |
130 | register char c; /* character being processed */ | |
131 | register char *acs; /* pinter to last alternate */ | |
132 | register int temp; | |
133 | ||
134 | /* let the conversion begin */ | |
135 | acs = NIL; | |
1a8c1b81 | 136 | cs = NIL; |
23e9541a KM |
137 | while (*ure != NIL) { |
138 | switch (c = *ure++) { | |
139 | ||
140 | case '\\': | |
141 | switch (c = *ure++) { | |
142 | ||
143 | /* escaped characters are just characters */ | |
144 | default: | |
1a8c1b81 | 145 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
146 | cs = ccre; |
147 | *cs = STR; | |
148 | SCNT(cs) = 1; | |
149 | ccre += 2; | |
150 | } else | |
151 | SCNT(cs)++; | |
152 | *ccre++ = c; | |
153 | break; | |
154 | ||
155 | /* normal(?) metacharacters */ | |
156 | case 'a': | |
157 | case 'd': | |
158 | case 'e': | |
159 | case 'p': | |
160 | if (acs != NIL && acs != cs) { | |
161 | do { | |
162 | temp = OCNT(acs); | |
163 | OCNT(acs) = ccre - acs; | |
164 | acs -= temp; | |
165 | } while (temp != 0); | |
166 | acs = NIL; | |
167 | } | |
168 | cs = ccre; | |
169 | *cs = META; | |
170 | MSYM(cs) = c; | |
171 | ccre = MNEXT(cs); | |
172 | break; | |
173 | } | |
174 | break; | |
175 | ||
176 | /* just put the symbol in */ | |
177 | case '^': | |
178 | case '$': | |
179 | if (acs != NIL && acs != cs) { | |
180 | do { | |
181 | temp = OCNT(acs); | |
182 | OCNT(acs) = ccre - acs; | |
183 | acs -= temp; | |
184 | } while (temp != 0); | |
185 | acs = NIL; | |
186 | } | |
187 | cs = ccre; | |
188 | *cs = META; | |
189 | MSYM(cs) = c; | |
190 | ccre = MNEXT(cs); | |
191 | break; | |
192 | ||
193 | /* mark the last match sequence as optional */ | |
194 | case '?': | |
1a8c1b81 RC |
195 | if (cs) |
196 | *cs = *cs | OPT; | |
23e9541a KM |
197 | break; |
198 | ||
199 | /* recurse and define a subexpression */ | |
200 | case '(': | |
201 | if (acs != NIL && acs != cs) { | |
202 | do { | |
203 | temp = OCNT(acs); | |
204 | OCNT(acs) = ccre - acs; | |
205 | acs -= temp; | |
206 | } while (temp != 0); | |
207 | acs = NIL; | |
208 | } | |
209 | cs = ccre; | |
210 | *cs = OPER; | |
211 | OSYM(cs) = '('; | |
212 | ccre = ONEXT(cs); | |
213 | expconv (); | |
214 | OCNT(cs) = ccre - cs; /* offset to next symbol */ | |
215 | break; | |
216 | ||
217 | /* return from a recursion */ | |
218 | case ')': | |
219 | if (acs != NIL) { | |
220 | do { | |
221 | temp = OCNT(acs); | |
222 | OCNT(acs) = ccre - acs; | |
223 | acs -= temp; | |
224 | } while (temp != 0); | |
225 | acs = NIL; | |
226 | } | |
227 | cs = ccre; | |
228 | *cs = META; | |
229 | MSYM(cs) = c; | |
230 | ccre = MNEXT(cs); | |
231 | return; | |
232 | ||
233 | /* mark the last match sequence as having an alternate */ | |
234 | /* the third byte will contain an offset to jump over the */ | |
235 | /* alternate match in case the first did not fail */ | |
236 | case '|': | |
237 | if (acs != NIL && acs != cs) | |
238 | OCNT(ccre) = ccre - acs; /* make a back pointer */ | |
239 | else | |
240 | OCNT(ccre) = 0; | |
241 | *cs |= ALT; | |
242 | cs = ccre; | |
243 | *cs = OPER; | |
244 | OSYM(cs) = '|'; | |
245 | ccre = ONEXT(cs); | |
246 | acs = cs; /* remember that the pointer is to be filles */ | |
247 | break; | |
248 | ||
249 | /* if its not a metasymbol just build a scharacter string */ | |
250 | default: | |
1a8c1b81 | 251 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
252 | cs = ccre; |
253 | *cs = STR; | |
254 | SCNT(cs) = 1; | |
255 | ccre = SSTR(cs); | |
256 | } else | |
257 | SCNT(cs)++; | |
258 | *ccre++ = c; | |
259 | break; | |
260 | } | |
261 | } | |
262 | if (acs != NIL) { | |
263 | do { | |
264 | temp = OCNT(acs); | |
265 | OCNT(acs) = ccre - acs; | |
266 | acs -= temp; | |
267 | } while (temp != 0); | |
268 | acs = NIL; | |
269 | } | |
270 | return; | |
271 | } | |
272 | /* end of convertre */ | |
273 | ||
274 | ||
275 | /* | |
276 | * The following routine recognises an irregular expresion | |
277 | * with the following special characters: | |
278 | * | |
279 | * \? - means last match was optional | |
280 | * \a - matches any number of characters | |
281 | * \d - matches any number of spaces and tabs | |
282 | * \p - matches any number of alphanumeric | |
283 | * characters. The | |
284 | * characters matched will be copied into | |
285 | * the area pointed to by 'name'. | |
286 | * \| - alternation | |
287 | * \( \) - grouping used mostly for alternation and | |
288 | * optionality | |
289 | * | |
290 | * The irregular expression must be translated to internal form | |
291 | * prior to calling this routine | |
292 | * | |
293 | * The value returned is the pointer to the first non \a | |
294 | * character matched. | |
295 | */ | |
296 | ||
297 | boolean _escaped; /* true if we are currently _escaped */ | |
298 | char *_start; /* start of string */ | |
299 | ||
300 | char * | |
301 | expmatch (s, re, mstring) | |
302 | register char *s; /* string to check for a match in */ | |
303 | register char *re; /* a converted irregular expression */ | |
304 | register char *mstring; /* where to put whatever matches a \p */ | |
305 | { | |
306 | register char *cs; /* the current symbol */ | |
307 | register char *ptr,*s1; /* temporary pointer */ | |
308 | boolean matched; /* a temporary boolean */ | |
309 | ||
310 | /* initial conditions */ | |
311 | if (re == NIL) | |
312 | return (NIL); | |
313 | cs = re; | |
314 | matched = FALSE; | |
315 | ||
316 | /* loop till expression string is exhausted (or at least pretty tired) */ | |
317 | while (*cs) { | |
318 | switch (*cs & (OPER | STR | META)) { | |
319 | ||
320 | /* try to match a string */ | |
321 | case STR: | |
322 | matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); | |
323 | if (matched) { | |
324 | ||
325 | /* hoorah it matches */ | |
326 | s += SCNT(cs); | |
327 | cs = SNEXT(cs); | |
328 | } else if (*cs & ALT) { | |
329 | ||
330 | /* alternation, skip to next expression */ | |
331 | cs = SNEXT(cs); | |
332 | } else if (*cs & OPT) { | |
333 | ||
334 | /* the match is optional */ | |
335 | cs = SNEXT(cs); | |
336 | matched = 1; /* indicate a successful match */ | |
337 | } else { | |
338 | ||
339 | /* no match, error return */ | |
340 | return (NIL); | |
341 | } | |
342 | break; | |
343 | ||
344 | /* an operator, do something fancy */ | |
345 | case OPER: | |
346 | switch (OSYM(cs)) { | |
347 | ||
348 | /* this is an alternation */ | |
349 | case '|': | |
350 | if (matched) | |
351 | ||
352 | /* last thing in the alternation was a match, skip ahead */ | |
353 | cs = OPTR(cs); | |
354 | else | |
355 | ||
356 | /* no match, keep trying */ | |
357 | cs = ONEXT(cs); | |
358 | break; | |
359 | ||
360 | /* this is a grouping, recurse */ | |
361 | case '(': | |
362 | ptr = expmatch (s, ONEXT(cs), mstring); | |
363 | if (ptr != NIL) { | |
364 | ||
365 | /* the subexpression matched */ | |
366 | matched = 1; | |
367 | s = ptr; | |
368 | } else if (*cs & ALT) { | |
369 | ||
370 | /* alternation, skip to next expression */ | |
371 | matched = 0; | |
372 | } else if (*cs & OPT) { | |
373 | ||
374 | /* the match is optional */ | |
375 | matched = 1; /* indicate a successful match */ | |
376 | } else { | |
377 | ||
378 | /* no match, error return */ | |
379 | return (NIL); | |
380 | } | |
381 | cs = OPTR(cs); | |
382 | break; | |
383 | } | |
384 | break; | |
385 | ||
386 | /* try to match a metasymbol */ | |
387 | case META: | |
388 | switch (MSYM(cs)) { | |
389 | ||
390 | /* try to match anything and remember what was matched */ | |
391 | case 'p': | |
392 | /* | |
393 | * This is really the same as trying the match the | |
394 | * remaining parts of the expression to any subset | |
395 | * of the string. | |
396 | */ | |
397 | s1 = s; | |
398 | do { | |
399 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
400 | if (ptr != NIL && s1 != s) { | |
401 | ||
402 | /* we have a match, remember the match */ | |
403 | strncpy (mstring, s, s1 - s); | |
404 | mstring[s1 - s] = '\0'; | |
405 | return (ptr); | |
406 | } else if (ptr != NIL && (*cs & OPT)) { | |
407 | ||
408 | /* it was aoptional so no match is ok */ | |
409 | return (ptr); | |
410 | } else if (ptr != NIL) { | |
411 | ||
412 | /* not optional and we still matched */ | |
413 | return (NIL); | |
414 | } | |
415 | if (!isalnum(*s1) && *s1 != '_') | |
416 | return (NIL); | |
417 | if (*s1 == '\\') | |
418 | _escaped = _escaped ? FALSE : TRUE; | |
419 | else | |
420 | _escaped = FALSE; | |
421 | } while (*s1++); | |
422 | return (NIL); | |
423 | ||
424 | /* try to match anything */ | |
425 | case 'a': | |
426 | /* | |
427 | * This is really the same as trying the match the | |
428 | * remaining parts of the expression to any subset | |
429 | * of the string. | |
430 | */ | |
431 | s1 = s; | |
432 | do { | |
433 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
434 | if (ptr != NIL && s1 != s) { | |
435 | ||
436 | /* we have a match */ | |
437 | return (ptr); | |
438 | } else if (ptr != NIL && (*cs & OPT)) { | |
439 | ||
440 | /* it was aoptional so no match is ok */ | |
441 | return (ptr); | |
442 | } else if (ptr != NIL) { | |
443 | ||
444 | /* not optional and we still matched */ | |
445 | return (NIL); | |
446 | } | |
447 | if (*s1 == '\\') | |
448 | _escaped = _escaped ? FALSE : TRUE; | |
449 | else | |
450 | _escaped = FALSE; | |
451 | } while (*s1++); | |
452 | return (NIL); | |
453 | ||
454 | /* fail if we are currently _escaped */ | |
455 | case 'e': | |
456 | if (_escaped) | |
457 | return(NIL); | |
458 | cs = MNEXT(cs); | |
459 | break; | |
460 | ||
461 | /* match any number of tabs and spaces */ | |
462 | case 'd': | |
463 | ptr = s; | |
464 | while (*s == ' ' || *s == '\t') | |
465 | s++; | |
466 | if (s != ptr || s == _start) { | |
467 | ||
468 | /* match, be happy */ | |
469 | matched = 1; | |
470 | cs = MNEXT(cs); | |
471 | } else if (*s == '\n' || *s == '\0') { | |
472 | ||
473 | /* match, be happy */ | |
474 | matched = 1; | |
475 | cs = MNEXT(cs); | |
476 | } else if (*cs & ALT) { | |
477 | ||
478 | /* try the next part */ | |
479 | matched = 0; | |
480 | cs = MNEXT(cs); | |
481 | } else if (*cs & OPT) { | |
482 | ||
483 | /* doesn't matter */ | |
484 | matched = 1; | |
485 | cs = MNEXT(cs); | |
486 | } else | |
487 | ||
488 | /* no match, error return */ | |
489 | return (NIL); | |
490 | break; | |
491 | ||
492 | /* check for end of line */ | |
493 | case '$': | |
494 | if (*s == '\0' || *s == '\n') { | |
495 | ||
496 | /* match, be happy */ | |
497 | s++; | |
498 | matched = 1; | |
499 | cs = MNEXT(cs); | |
500 | } else if (*cs & ALT) { | |
501 | ||
502 | /* try the next part */ | |
503 | matched = 0; | |
504 | cs = MNEXT(cs); | |
505 | } else if (*cs & OPT) { | |
506 | ||
507 | /* doesn't matter */ | |
508 | matched = 1; | |
509 | cs = MNEXT(cs); | |
510 | } else | |
511 | ||
512 | /* no match, error return */ | |
513 | return (NIL); | |
514 | break; | |
515 | ||
516 | /* check for start of line */ | |
517 | case '^': | |
518 | if (s == _start) { | |
519 | ||
520 | /* match, be happy */ | |
521 | matched = 1; | |
522 | cs = MNEXT(cs); | |
523 | } else if (*cs & ALT) { | |
524 | ||
525 | /* try the next part */ | |
526 | matched = 0; | |
527 | cs = MNEXT(cs); | |
528 | } else if (*cs & OPT) { | |
529 | ||
530 | /* doesn't matter */ | |
531 | matched = 1; | |
532 | cs = MNEXT(cs); | |
533 | } else | |
534 | ||
535 | /* no match, error return */ | |
536 | return (NIL); | |
537 | break; | |
538 | ||
539 | /* end of a subexpression, return success */ | |
540 | case ')': | |
541 | return (s); | |
542 | } | |
543 | break; | |
544 | } | |
545 | } | |
546 | return (s); | |
547 | } |