Commit | Line | Data |
---|---|---|
76797561 | 1 | /* |
616af90a KB |
2 | * Copyright (c) 1980 The Regents of the University of California. |
3 | * All rights reserved. | |
4 | * | |
af359dea C |
5 | * Redistribution and use in source and binary forms, with or without |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * 3. All advertising materials mentioning features or use of this software | |
14 | * must display the following acknowledgement: | |
15 | * This product includes software developed by the University of | |
16 | * California, Berkeley and its contributors. | |
17 | * 4. Neither the name of the University nor the names of its contributors | |
18 | * may be used to endorse or promote products derived from this software | |
19 | * without specific prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
22 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
23 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
24 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
25 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
26 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
27 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
28 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
29 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
30 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
31 | * SUCH DAMAGE. | |
76797561 DF |
32 | */ |
33 | ||
34 | #ifndef lint | |
1c15e888 | 35 | static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90"; |
616af90a | 36 | #endif /* not lint */ |
23e9541a KM |
37 | |
38 | #include <ctype.h> | |
39 | ||
40 | typedef int boolean; | |
41 | #define TRUE 1 | |
42 | #define FALSE 0 | |
43 | #define NIL 0 | |
44 | ||
45 | boolean l_onecase; /* true if upper and lower equivalent */ | |
46 | ||
47 | #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) | |
48 | ||
49 | /* STRNCMP - like strncmp except that we convert the | |
50 | * first string to lower case before comparing | |
51 | * if l_onecase is set. | |
52 | */ | |
53 | ||
54 | STRNCMP(s1, s2, len) | |
55 | register char *s1,*s2; | |
56 | register int len; | |
57 | { | |
58 | if (l_onecase) { | |
59 | do | |
60 | if (*s2 - makelower(*s1)) | |
61 | return (*s2 - makelower(*s1)); | |
62 | else { | |
63 | s2++; | |
64 | s1++; | |
65 | } | |
66 | while (--len); | |
67 | } else { | |
68 | do | |
69 | if (*s2 - *s1) | |
70 | return (*s2 - *s1); | |
71 | else { | |
72 | s2++; | |
73 | s1++; | |
74 | } | |
75 | while (--len); | |
76 | } | |
77 | return(0); | |
78 | } | |
79 | ||
80 | /* The following routine converts an irregular expression to | |
81 | * internal format. | |
82 | * | |
83 | * Either meta symbols (\a \d or \p) or character strings or | |
84 | * operations ( alternation or perenthesizing ) can be | |
85 | * specified. Each starts with a descriptor byte. The descriptor | |
86 | * byte has STR set for strings, META set for meta symbols | |
87 | * and OPER set for operations. | |
88 | * The descriptor byte can also have the OPT bit set if the object | |
89 | * defined is optional. Also ALT can be set to indicate an alternation. | |
90 | * | |
91 | * For metasymbols the byte following the descriptor byte identities | |
92 | * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For | |
93 | * strings the byte after the descriptor is a character count for | |
94 | * the string: | |
95 | * | |
96 | * meta symbols := descriptor | |
97 | * symbol | |
98 | * | |
99 | * strings := descriptor | |
100 | * character count | |
101 | * the string | |
102 | * | |
103 | * operatins := descriptor | |
104 | * symbol | |
105 | * character count | |
106 | */ | |
107 | ||
108 | /* | |
109 | * handy macros for accessing parts of match blocks | |
110 | */ | |
111 | #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ | |
112 | #define MNEXT(A) (A+2) /* character following a metasymbol block */ | |
113 | ||
114 | #define OSYM(A) (*(A+1)) /* symbol in an operation block */ | |
115 | #define OCNT(A) (*(A+2)) /* character count */ | |
116 | #define ONEXT(A) (A+3) /* next character after the operation */ | |
117 | #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ | |
118 | ||
119 | #define SCNT(A) (*(A+1)) /* byte count of a string */ | |
120 | #define SSTR(A) (A+2) /* address of the string */ | |
121 | #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ | |
122 | ||
123 | /* | |
124 | * bit flags in the descriptor | |
125 | */ | |
126 | #define OPT 1 | |
127 | #define STR 2 | |
128 | #define META 4 | |
129 | #define ALT 8 | |
130 | #define OPER 16 | |
131 | ||
132 | char *ure; /* pointer current position in unconverted exp */ | |
133 | char *ccre; /* pointer to current position in converted exp*/ | |
134 | char *malloc(); | |
135 | ||
136 | char * | |
137 | convexp(re) | |
138 | char *re; /* unconverted irregular expression */ | |
139 | { | |
140 | register char *cre; /* pointer to converted regular expression */ | |
141 | ||
142 | /* allocate room for the converted expression */ | |
143 | if (re == NIL) | |
144 | return (NIL); | |
145 | if (*re == '\0') | |
146 | return (NIL); | |
147 | cre = malloc (4 * strlen(re) + 3); | |
148 | ccre = cre; | |
149 | ure = re; | |
150 | ||
151 | /* start the conversion with a \a */ | |
152 | *cre = META | OPT; | |
153 | MSYM(cre) = 'a'; | |
154 | ccre = MNEXT(cre); | |
155 | ||
156 | /* start the conversion (its recursive) */ | |
157 | expconv (); | |
158 | *ccre = 0; | |
159 | return (cre); | |
160 | } | |
161 | ||
162 | expconv() | |
163 | { | |
164 | register char *cs; /* pointer to current symbol in converted exp */ | |
165 | register char c; /* character being processed */ | |
166 | register char *acs; /* pinter to last alternate */ | |
167 | register int temp; | |
168 | ||
169 | /* let the conversion begin */ | |
170 | acs = NIL; | |
1a8c1b81 | 171 | cs = NIL; |
23e9541a KM |
172 | while (*ure != NIL) { |
173 | switch (c = *ure++) { | |
174 | ||
175 | case '\\': | |
176 | switch (c = *ure++) { | |
177 | ||
178 | /* escaped characters are just characters */ | |
179 | default: | |
1a8c1b81 | 180 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
181 | cs = ccre; |
182 | *cs = STR; | |
183 | SCNT(cs) = 1; | |
184 | ccre += 2; | |
185 | } else | |
186 | SCNT(cs)++; | |
187 | *ccre++ = c; | |
188 | break; | |
189 | ||
190 | /* normal(?) metacharacters */ | |
191 | case 'a': | |
192 | case 'd': | |
193 | case 'e': | |
194 | case 'p': | |
195 | if (acs != NIL && acs != cs) { | |
196 | do { | |
197 | temp = OCNT(acs); | |
198 | OCNT(acs) = ccre - acs; | |
199 | acs -= temp; | |
200 | } while (temp != 0); | |
201 | acs = NIL; | |
202 | } | |
203 | cs = ccre; | |
204 | *cs = META; | |
205 | MSYM(cs) = c; | |
206 | ccre = MNEXT(cs); | |
207 | break; | |
208 | } | |
209 | break; | |
210 | ||
211 | /* just put the symbol in */ | |
212 | case '^': | |
213 | case '$': | |
214 | if (acs != NIL && acs != cs) { | |
215 | do { | |
216 | temp = OCNT(acs); | |
217 | OCNT(acs) = ccre - acs; | |
218 | acs -= temp; | |
219 | } while (temp != 0); | |
220 | acs = NIL; | |
221 | } | |
222 | cs = ccre; | |
223 | *cs = META; | |
224 | MSYM(cs) = c; | |
225 | ccre = MNEXT(cs); | |
226 | break; | |
227 | ||
228 | /* mark the last match sequence as optional */ | |
229 | case '?': | |
1a8c1b81 RC |
230 | if (cs) |
231 | *cs = *cs | OPT; | |
23e9541a KM |
232 | break; |
233 | ||
234 | /* recurse and define a subexpression */ | |
235 | case '(': | |
236 | if (acs != NIL && acs != cs) { | |
237 | do { | |
238 | temp = OCNT(acs); | |
239 | OCNT(acs) = ccre - acs; | |
240 | acs -= temp; | |
241 | } while (temp != 0); | |
242 | acs = NIL; | |
243 | } | |
244 | cs = ccre; | |
245 | *cs = OPER; | |
246 | OSYM(cs) = '('; | |
247 | ccre = ONEXT(cs); | |
248 | expconv (); | |
249 | OCNT(cs) = ccre - cs; /* offset to next symbol */ | |
250 | break; | |
251 | ||
252 | /* return from a recursion */ | |
253 | case ')': | |
254 | if (acs != NIL) { | |
255 | do { | |
256 | temp = OCNT(acs); | |
257 | OCNT(acs) = ccre - acs; | |
258 | acs -= temp; | |
259 | } while (temp != 0); | |
260 | acs = NIL; | |
261 | } | |
262 | cs = ccre; | |
263 | *cs = META; | |
264 | MSYM(cs) = c; | |
265 | ccre = MNEXT(cs); | |
266 | return; | |
267 | ||
268 | /* mark the last match sequence as having an alternate */ | |
269 | /* the third byte will contain an offset to jump over the */ | |
270 | /* alternate match in case the first did not fail */ | |
271 | case '|': | |
272 | if (acs != NIL && acs != cs) | |
273 | OCNT(ccre) = ccre - acs; /* make a back pointer */ | |
274 | else | |
275 | OCNT(ccre) = 0; | |
276 | *cs |= ALT; | |
277 | cs = ccre; | |
278 | *cs = OPER; | |
279 | OSYM(cs) = '|'; | |
280 | ccre = ONEXT(cs); | |
281 | acs = cs; /* remember that the pointer is to be filles */ | |
282 | break; | |
283 | ||
284 | /* if its not a metasymbol just build a scharacter string */ | |
285 | default: | |
1a8c1b81 | 286 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
287 | cs = ccre; |
288 | *cs = STR; | |
289 | SCNT(cs) = 1; | |
290 | ccre = SSTR(cs); | |
291 | } else | |
292 | SCNT(cs)++; | |
293 | *ccre++ = c; | |
294 | break; | |
295 | } | |
296 | } | |
297 | if (acs != NIL) { | |
298 | do { | |
299 | temp = OCNT(acs); | |
300 | OCNT(acs) = ccre - acs; | |
301 | acs -= temp; | |
302 | } while (temp != 0); | |
303 | acs = NIL; | |
304 | } | |
305 | return; | |
306 | } | |
307 | /* end of convertre */ | |
308 | ||
309 | ||
310 | /* | |
311 | * The following routine recognises an irregular expresion | |
312 | * with the following special characters: | |
313 | * | |
314 | * \? - means last match was optional | |
315 | * \a - matches any number of characters | |
316 | * \d - matches any number of spaces and tabs | |
317 | * \p - matches any number of alphanumeric | |
318 | * characters. The | |
319 | * characters matched will be copied into | |
320 | * the area pointed to by 'name'. | |
321 | * \| - alternation | |
322 | * \( \) - grouping used mostly for alternation and | |
323 | * optionality | |
324 | * | |
325 | * The irregular expression must be translated to internal form | |
326 | * prior to calling this routine | |
327 | * | |
328 | * The value returned is the pointer to the first non \a | |
329 | * character matched. | |
330 | */ | |
331 | ||
332 | boolean _escaped; /* true if we are currently _escaped */ | |
333 | char *_start; /* start of string */ | |
334 | ||
335 | char * | |
336 | expmatch (s, re, mstring) | |
337 | register char *s; /* string to check for a match in */ | |
338 | register char *re; /* a converted irregular expression */ | |
339 | register char *mstring; /* where to put whatever matches a \p */ | |
340 | { | |
341 | register char *cs; /* the current symbol */ | |
342 | register char *ptr,*s1; /* temporary pointer */ | |
343 | boolean matched; /* a temporary boolean */ | |
344 | ||
345 | /* initial conditions */ | |
346 | if (re == NIL) | |
347 | return (NIL); | |
348 | cs = re; | |
349 | matched = FALSE; | |
350 | ||
351 | /* loop till expression string is exhausted (or at least pretty tired) */ | |
352 | while (*cs) { | |
353 | switch (*cs & (OPER | STR | META)) { | |
354 | ||
355 | /* try to match a string */ | |
356 | case STR: | |
357 | matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); | |
358 | if (matched) { | |
359 | ||
360 | /* hoorah it matches */ | |
361 | s += SCNT(cs); | |
362 | cs = SNEXT(cs); | |
363 | } else if (*cs & ALT) { | |
364 | ||
365 | /* alternation, skip to next expression */ | |
366 | cs = SNEXT(cs); | |
367 | } else if (*cs & OPT) { | |
368 | ||
369 | /* the match is optional */ | |
370 | cs = SNEXT(cs); | |
371 | matched = 1; /* indicate a successful match */ | |
372 | } else { | |
373 | ||
374 | /* no match, error return */ | |
375 | return (NIL); | |
376 | } | |
377 | break; | |
378 | ||
379 | /* an operator, do something fancy */ | |
380 | case OPER: | |
381 | switch (OSYM(cs)) { | |
382 | ||
383 | /* this is an alternation */ | |
384 | case '|': | |
385 | if (matched) | |
386 | ||
387 | /* last thing in the alternation was a match, skip ahead */ | |
388 | cs = OPTR(cs); | |
389 | else | |
390 | ||
391 | /* no match, keep trying */ | |
392 | cs = ONEXT(cs); | |
393 | break; | |
394 | ||
395 | /* this is a grouping, recurse */ | |
396 | case '(': | |
397 | ptr = expmatch (s, ONEXT(cs), mstring); | |
398 | if (ptr != NIL) { | |
399 | ||
400 | /* the subexpression matched */ | |
401 | matched = 1; | |
402 | s = ptr; | |
403 | } else if (*cs & ALT) { | |
404 | ||
405 | /* alternation, skip to next expression */ | |
406 | matched = 0; | |
407 | } else if (*cs & OPT) { | |
408 | ||
409 | /* the match is optional */ | |
410 | matched = 1; /* indicate a successful match */ | |
411 | } else { | |
412 | ||
413 | /* no match, error return */ | |
414 | return (NIL); | |
415 | } | |
416 | cs = OPTR(cs); | |
417 | break; | |
418 | } | |
419 | break; | |
420 | ||
421 | /* try to match a metasymbol */ | |
422 | case META: | |
423 | switch (MSYM(cs)) { | |
424 | ||
425 | /* try to match anything and remember what was matched */ | |
426 | case 'p': | |
427 | /* | |
428 | * This is really the same as trying the match the | |
429 | * remaining parts of the expression to any subset | |
430 | * of the string. | |
431 | */ | |
432 | s1 = s; | |
433 | do { | |
434 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
435 | if (ptr != NIL && s1 != s) { | |
436 | ||
437 | /* we have a match, remember the match */ | |
438 | strncpy (mstring, s, s1 - s); | |
439 | mstring[s1 - s] = '\0'; | |
440 | return (ptr); | |
441 | } else if (ptr != NIL && (*cs & OPT)) { | |
442 | ||
443 | /* it was aoptional so no match is ok */ | |
444 | return (ptr); | |
445 | } else if (ptr != NIL) { | |
446 | ||
447 | /* not optional and we still matched */ | |
448 | return (NIL); | |
449 | } | |
450 | if (!isalnum(*s1) && *s1 != '_') | |
451 | return (NIL); | |
452 | if (*s1 == '\\') | |
453 | _escaped = _escaped ? FALSE : TRUE; | |
454 | else | |
455 | _escaped = FALSE; | |
456 | } while (*s1++); | |
457 | return (NIL); | |
458 | ||
459 | /* try to match anything */ | |
460 | case 'a': | |
461 | /* | |
462 | * This is really the same as trying the match the | |
463 | * remaining parts of the expression to any subset | |
464 | * of the string. | |
465 | */ | |
466 | s1 = s; | |
467 | do { | |
468 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
469 | if (ptr != NIL && s1 != s) { | |
470 | ||
471 | /* we have a match */ | |
472 | return (ptr); | |
473 | } else if (ptr != NIL && (*cs & OPT)) { | |
474 | ||
475 | /* it was aoptional so no match is ok */ | |
476 | return (ptr); | |
477 | } else if (ptr != NIL) { | |
478 | ||
479 | /* not optional and we still matched */ | |
480 | return (NIL); | |
481 | } | |
482 | if (*s1 == '\\') | |
483 | _escaped = _escaped ? FALSE : TRUE; | |
484 | else | |
485 | _escaped = FALSE; | |
486 | } while (*s1++); | |
487 | return (NIL); | |
488 | ||
489 | /* fail if we are currently _escaped */ | |
490 | case 'e': | |
491 | if (_escaped) | |
492 | return(NIL); | |
493 | cs = MNEXT(cs); | |
494 | break; | |
495 | ||
496 | /* match any number of tabs and spaces */ | |
497 | case 'd': | |
498 | ptr = s; | |
499 | while (*s == ' ' || *s == '\t') | |
500 | s++; | |
501 | if (s != ptr || s == _start) { | |
502 | ||
503 | /* match, be happy */ | |
504 | matched = 1; | |
505 | cs = MNEXT(cs); | |
506 | } else if (*s == '\n' || *s == '\0') { | |
507 | ||
508 | /* match, be happy */ | |
509 | matched = 1; | |
510 | cs = MNEXT(cs); | |
511 | } else if (*cs & ALT) { | |
512 | ||
513 | /* try the next part */ | |
514 | matched = 0; | |
515 | cs = MNEXT(cs); | |
516 | } else if (*cs & OPT) { | |
517 | ||
518 | /* doesn't matter */ | |
519 | matched = 1; | |
520 | cs = MNEXT(cs); | |
521 | } else | |
522 | ||
523 | /* no match, error return */ | |
524 | return (NIL); | |
525 | break; | |
526 | ||
527 | /* check for end of line */ | |
528 | case '$': | |
529 | if (*s == '\0' || *s == '\n') { | |
530 | ||
531 | /* match, be happy */ | |
532 | s++; | |
533 | matched = 1; | |
534 | cs = MNEXT(cs); | |
535 | } else if (*cs & ALT) { | |
536 | ||
537 | /* try the next part */ | |
538 | matched = 0; | |
539 | cs = MNEXT(cs); | |
540 | } else if (*cs & OPT) { | |
541 | ||
542 | /* doesn't matter */ | |
543 | matched = 1; | |
544 | cs = MNEXT(cs); | |
545 | } else | |
546 | ||
547 | /* no match, error return */ | |
548 | return (NIL); | |
549 | break; | |
550 | ||
551 | /* check for start of line */ | |
552 | case '^': | |
553 | if (s == _start) { | |
554 | ||
555 | /* match, be happy */ | |
556 | matched = 1; | |
557 | cs = MNEXT(cs); | |
558 | } else if (*cs & ALT) { | |
559 | ||
560 | /* try the next part */ | |
561 | matched = 0; | |
562 | cs = MNEXT(cs); | |
563 | } else if (*cs & OPT) { | |
564 | ||
565 | /* doesn't matter */ | |
566 | matched = 1; | |
567 | cs = MNEXT(cs); | |
568 | } else | |
569 | ||
570 | /* no match, error return */ | |
571 | return (NIL); | |
572 | break; | |
573 | ||
574 | /* end of a subexpression, return success */ | |
575 | case ')': | |
576 | return (s); | |
577 | } | |
578 | break; | |
579 | } | |
580 | } | |
581 | return (s); | |
582 | } |