Commit | Line | Data |
---|---|---|
76797561 DF |
1 | /* |
2 | * Copyright (c) 1980 Regents of the University of California. | |
3 | * All rights reserved. The Berkeley software License Agreement | |
4 | * specifies the terms and conditions for redistribution. | |
5 | */ | |
6 | ||
7 | #ifndef lint | |
8 | static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) %G%"; | |
9 | #endif not lint | |
23e9541a KM |
10 | |
11 | #include <ctype.h> | |
12 | ||
13 | typedef int boolean; | |
14 | #define TRUE 1 | |
15 | #define FALSE 0 | |
16 | #define NIL 0 | |
17 | ||
18 | boolean l_onecase; /* true if upper and lower equivalent */ | |
19 | ||
20 | #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) | |
21 | ||
22 | /* STRNCMP - like strncmp except that we convert the | |
23 | * first string to lower case before comparing | |
24 | * if l_onecase is set. | |
25 | */ | |
26 | ||
27 | STRNCMP(s1, s2, len) | |
28 | register char *s1,*s2; | |
29 | register int len; | |
30 | { | |
31 | if (l_onecase) { | |
32 | do | |
33 | if (*s2 - makelower(*s1)) | |
34 | return (*s2 - makelower(*s1)); | |
35 | else { | |
36 | s2++; | |
37 | s1++; | |
38 | } | |
39 | while (--len); | |
40 | } else { | |
41 | do | |
42 | if (*s2 - *s1) | |
43 | return (*s2 - *s1); | |
44 | else { | |
45 | s2++; | |
46 | s1++; | |
47 | } | |
48 | while (--len); | |
49 | } | |
50 | return(0); | |
51 | } | |
52 | ||
53 | /* The following routine converts an irregular expression to | |
54 | * internal format. | |
55 | * | |
56 | * Either meta symbols (\a \d or \p) or character strings or | |
57 | * operations ( alternation or perenthesizing ) can be | |
58 | * specified. Each starts with a descriptor byte. The descriptor | |
59 | * byte has STR set for strings, META set for meta symbols | |
60 | * and OPER set for operations. | |
61 | * The descriptor byte can also have the OPT bit set if the object | |
62 | * defined is optional. Also ALT can be set to indicate an alternation. | |
63 | * | |
64 | * For metasymbols the byte following the descriptor byte identities | |
65 | * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For | |
66 | * strings the byte after the descriptor is a character count for | |
67 | * the string: | |
68 | * | |
69 | * meta symbols := descriptor | |
70 | * symbol | |
71 | * | |
72 | * strings := descriptor | |
73 | * character count | |
74 | * the string | |
75 | * | |
76 | * operatins := descriptor | |
77 | * symbol | |
78 | * character count | |
79 | */ | |
80 | ||
81 | /* | |
82 | * handy macros for accessing parts of match blocks | |
83 | */ | |
84 | #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ | |
85 | #define MNEXT(A) (A+2) /* character following a metasymbol block */ | |
86 | ||
87 | #define OSYM(A) (*(A+1)) /* symbol in an operation block */ | |
88 | #define OCNT(A) (*(A+2)) /* character count */ | |
89 | #define ONEXT(A) (A+3) /* next character after the operation */ | |
90 | #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ | |
91 | ||
92 | #define SCNT(A) (*(A+1)) /* byte count of a string */ | |
93 | #define SSTR(A) (A+2) /* address of the string */ | |
94 | #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ | |
95 | ||
96 | /* | |
97 | * bit flags in the descriptor | |
98 | */ | |
99 | #define OPT 1 | |
100 | #define STR 2 | |
101 | #define META 4 | |
102 | #define ALT 8 | |
103 | #define OPER 16 | |
104 | ||
105 | char *ure; /* pointer current position in unconverted exp */ | |
106 | char *ccre; /* pointer to current position in converted exp*/ | |
107 | char *malloc(); | |
108 | ||
109 | char * | |
110 | convexp(re) | |
111 | char *re; /* unconverted irregular expression */ | |
112 | { | |
113 | register char *cre; /* pointer to converted regular expression */ | |
114 | ||
115 | /* allocate room for the converted expression */ | |
116 | if (re == NIL) | |
117 | return (NIL); | |
118 | if (*re == '\0') | |
119 | return (NIL); | |
120 | cre = malloc (4 * strlen(re) + 3); | |
121 | ccre = cre; | |
122 | ure = re; | |
123 | ||
124 | /* start the conversion with a \a */ | |
125 | *cre = META | OPT; | |
126 | MSYM(cre) = 'a'; | |
127 | ccre = MNEXT(cre); | |
128 | ||
129 | /* start the conversion (its recursive) */ | |
130 | expconv (); | |
131 | *ccre = 0; | |
132 | return (cre); | |
133 | } | |
134 | ||
135 | expconv() | |
136 | { | |
137 | register char *cs; /* pointer to current symbol in converted exp */ | |
138 | register char c; /* character being processed */ | |
139 | register char *acs; /* pinter to last alternate */ | |
140 | register int temp; | |
141 | ||
142 | /* let the conversion begin */ | |
143 | acs = NIL; | |
1a8c1b81 | 144 | cs = NIL; |
23e9541a KM |
145 | while (*ure != NIL) { |
146 | switch (c = *ure++) { | |
147 | ||
148 | case '\\': | |
149 | switch (c = *ure++) { | |
150 | ||
151 | /* escaped characters are just characters */ | |
152 | default: | |
1a8c1b81 | 153 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
154 | cs = ccre; |
155 | *cs = STR; | |
156 | SCNT(cs) = 1; | |
157 | ccre += 2; | |
158 | } else | |
159 | SCNT(cs)++; | |
160 | *ccre++ = c; | |
161 | break; | |
162 | ||
163 | /* normal(?) metacharacters */ | |
164 | case 'a': | |
165 | case 'd': | |
166 | case 'e': | |
167 | case 'p': | |
168 | if (acs != NIL && acs != cs) { | |
169 | do { | |
170 | temp = OCNT(acs); | |
171 | OCNT(acs) = ccre - acs; | |
172 | acs -= temp; | |
173 | } while (temp != 0); | |
174 | acs = NIL; | |
175 | } | |
176 | cs = ccre; | |
177 | *cs = META; | |
178 | MSYM(cs) = c; | |
179 | ccre = MNEXT(cs); | |
180 | break; | |
181 | } | |
182 | break; | |
183 | ||
184 | /* just put the symbol in */ | |
185 | case '^': | |
186 | case '$': | |
187 | if (acs != NIL && acs != cs) { | |
188 | do { | |
189 | temp = OCNT(acs); | |
190 | OCNT(acs) = ccre - acs; | |
191 | acs -= temp; | |
192 | } while (temp != 0); | |
193 | acs = NIL; | |
194 | } | |
195 | cs = ccre; | |
196 | *cs = META; | |
197 | MSYM(cs) = c; | |
198 | ccre = MNEXT(cs); | |
199 | break; | |
200 | ||
201 | /* mark the last match sequence as optional */ | |
202 | case '?': | |
1a8c1b81 RC |
203 | if (cs) |
204 | *cs = *cs | OPT; | |
23e9541a KM |
205 | break; |
206 | ||
207 | /* recurse and define a subexpression */ | |
208 | case '(': | |
209 | if (acs != NIL && acs != cs) { | |
210 | do { | |
211 | temp = OCNT(acs); | |
212 | OCNT(acs) = ccre - acs; | |
213 | acs -= temp; | |
214 | } while (temp != 0); | |
215 | acs = NIL; | |
216 | } | |
217 | cs = ccre; | |
218 | *cs = OPER; | |
219 | OSYM(cs) = '('; | |
220 | ccre = ONEXT(cs); | |
221 | expconv (); | |
222 | OCNT(cs) = ccre - cs; /* offset to next symbol */ | |
223 | break; | |
224 | ||
225 | /* return from a recursion */ | |
226 | case ')': | |
227 | if (acs != NIL) { | |
228 | do { | |
229 | temp = OCNT(acs); | |
230 | OCNT(acs) = ccre - acs; | |
231 | acs -= temp; | |
232 | } while (temp != 0); | |
233 | acs = NIL; | |
234 | } | |
235 | cs = ccre; | |
236 | *cs = META; | |
237 | MSYM(cs) = c; | |
238 | ccre = MNEXT(cs); | |
239 | return; | |
240 | ||
241 | /* mark the last match sequence as having an alternate */ | |
242 | /* the third byte will contain an offset to jump over the */ | |
243 | /* alternate match in case the first did not fail */ | |
244 | case '|': | |
245 | if (acs != NIL && acs != cs) | |
246 | OCNT(ccre) = ccre - acs; /* make a back pointer */ | |
247 | else | |
248 | OCNT(ccre) = 0; | |
249 | *cs |= ALT; | |
250 | cs = ccre; | |
251 | *cs = OPER; | |
252 | OSYM(cs) = '|'; | |
253 | ccre = ONEXT(cs); | |
254 | acs = cs; /* remember that the pointer is to be filles */ | |
255 | break; | |
256 | ||
257 | /* if its not a metasymbol just build a scharacter string */ | |
258 | default: | |
1a8c1b81 | 259 | if (cs == NIL || (*cs & STR) == 0) { |
23e9541a KM |
260 | cs = ccre; |
261 | *cs = STR; | |
262 | SCNT(cs) = 1; | |
263 | ccre = SSTR(cs); | |
264 | } else | |
265 | SCNT(cs)++; | |
266 | *ccre++ = c; | |
267 | break; | |
268 | } | |
269 | } | |
270 | if (acs != NIL) { | |
271 | do { | |
272 | temp = OCNT(acs); | |
273 | OCNT(acs) = ccre - acs; | |
274 | acs -= temp; | |
275 | } while (temp != 0); | |
276 | acs = NIL; | |
277 | } | |
278 | return; | |
279 | } | |
280 | /* end of convertre */ | |
281 | ||
282 | ||
283 | /* | |
284 | * The following routine recognises an irregular expresion | |
285 | * with the following special characters: | |
286 | * | |
287 | * \? - means last match was optional | |
288 | * \a - matches any number of characters | |
289 | * \d - matches any number of spaces and tabs | |
290 | * \p - matches any number of alphanumeric | |
291 | * characters. The | |
292 | * characters matched will be copied into | |
293 | * the area pointed to by 'name'. | |
294 | * \| - alternation | |
295 | * \( \) - grouping used mostly for alternation and | |
296 | * optionality | |
297 | * | |
298 | * The irregular expression must be translated to internal form | |
299 | * prior to calling this routine | |
300 | * | |
301 | * The value returned is the pointer to the first non \a | |
302 | * character matched. | |
303 | */ | |
304 | ||
305 | boolean _escaped; /* true if we are currently _escaped */ | |
306 | char *_start; /* start of string */ | |
307 | ||
308 | char * | |
309 | expmatch (s, re, mstring) | |
310 | register char *s; /* string to check for a match in */ | |
311 | register char *re; /* a converted irregular expression */ | |
312 | register char *mstring; /* where to put whatever matches a \p */ | |
313 | { | |
314 | register char *cs; /* the current symbol */ | |
315 | register char *ptr,*s1; /* temporary pointer */ | |
316 | boolean matched; /* a temporary boolean */ | |
317 | ||
318 | /* initial conditions */ | |
319 | if (re == NIL) | |
320 | return (NIL); | |
321 | cs = re; | |
322 | matched = FALSE; | |
323 | ||
324 | /* loop till expression string is exhausted (or at least pretty tired) */ | |
325 | while (*cs) { | |
326 | switch (*cs & (OPER | STR | META)) { | |
327 | ||
328 | /* try to match a string */ | |
329 | case STR: | |
330 | matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); | |
331 | if (matched) { | |
332 | ||
333 | /* hoorah it matches */ | |
334 | s += SCNT(cs); | |
335 | cs = SNEXT(cs); | |
336 | } else if (*cs & ALT) { | |
337 | ||
338 | /* alternation, skip to next expression */ | |
339 | cs = SNEXT(cs); | |
340 | } else if (*cs & OPT) { | |
341 | ||
342 | /* the match is optional */ | |
343 | cs = SNEXT(cs); | |
344 | matched = 1; /* indicate a successful match */ | |
345 | } else { | |
346 | ||
347 | /* no match, error return */ | |
348 | return (NIL); | |
349 | } | |
350 | break; | |
351 | ||
352 | /* an operator, do something fancy */ | |
353 | case OPER: | |
354 | switch (OSYM(cs)) { | |
355 | ||
356 | /* this is an alternation */ | |
357 | case '|': | |
358 | if (matched) | |
359 | ||
360 | /* last thing in the alternation was a match, skip ahead */ | |
361 | cs = OPTR(cs); | |
362 | else | |
363 | ||
364 | /* no match, keep trying */ | |
365 | cs = ONEXT(cs); | |
366 | break; | |
367 | ||
368 | /* this is a grouping, recurse */ | |
369 | case '(': | |
370 | ptr = expmatch (s, ONEXT(cs), mstring); | |
371 | if (ptr != NIL) { | |
372 | ||
373 | /* the subexpression matched */ | |
374 | matched = 1; | |
375 | s = ptr; | |
376 | } else if (*cs & ALT) { | |
377 | ||
378 | /* alternation, skip to next expression */ | |
379 | matched = 0; | |
380 | } else if (*cs & OPT) { | |
381 | ||
382 | /* the match is optional */ | |
383 | matched = 1; /* indicate a successful match */ | |
384 | } else { | |
385 | ||
386 | /* no match, error return */ | |
387 | return (NIL); | |
388 | } | |
389 | cs = OPTR(cs); | |
390 | break; | |
391 | } | |
392 | break; | |
393 | ||
394 | /* try to match a metasymbol */ | |
395 | case META: | |
396 | switch (MSYM(cs)) { | |
397 | ||
398 | /* try to match anything and remember what was matched */ | |
399 | case 'p': | |
400 | /* | |
401 | * This is really the same as trying the match the | |
402 | * remaining parts of the expression to any subset | |
403 | * of the string. | |
404 | */ | |
405 | s1 = s; | |
406 | do { | |
407 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
408 | if (ptr != NIL && s1 != s) { | |
409 | ||
410 | /* we have a match, remember the match */ | |
411 | strncpy (mstring, s, s1 - s); | |
412 | mstring[s1 - s] = '\0'; | |
413 | return (ptr); | |
414 | } else if (ptr != NIL && (*cs & OPT)) { | |
415 | ||
416 | /* it was aoptional so no match is ok */ | |
417 | return (ptr); | |
418 | } else if (ptr != NIL) { | |
419 | ||
420 | /* not optional and we still matched */ | |
421 | return (NIL); | |
422 | } | |
423 | if (!isalnum(*s1) && *s1 != '_') | |
424 | return (NIL); | |
425 | if (*s1 == '\\') | |
426 | _escaped = _escaped ? FALSE : TRUE; | |
427 | else | |
428 | _escaped = FALSE; | |
429 | } while (*s1++); | |
430 | return (NIL); | |
431 | ||
432 | /* try to match anything */ | |
433 | case 'a': | |
434 | /* | |
435 | * This is really the same as trying the match the | |
436 | * remaining parts of the expression to any subset | |
437 | * of the string. | |
438 | */ | |
439 | s1 = s; | |
440 | do { | |
441 | ptr = expmatch (s1, MNEXT(cs), mstring); | |
442 | if (ptr != NIL && s1 != s) { | |
443 | ||
444 | /* we have a match */ | |
445 | return (ptr); | |
446 | } else if (ptr != NIL && (*cs & OPT)) { | |
447 | ||
448 | /* it was aoptional so no match is ok */ | |
449 | return (ptr); | |
450 | } else if (ptr != NIL) { | |
451 | ||
452 | /* not optional and we still matched */ | |
453 | return (NIL); | |
454 | } | |
455 | if (*s1 == '\\') | |
456 | _escaped = _escaped ? FALSE : TRUE; | |
457 | else | |
458 | _escaped = FALSE; | |
459 | } while (*s1++); | |
460 | return (NIL); | |
461 | ||
462 | /* fail if we are currently _escaped */ | |
463 | case 'e': | |
464 | if (_escaped) | |
465 | return(NIL); | |
466 | cs = MNEXT(cs); | |
467 | break; | |
468 | ||
469 | /* match any number of tabs and spaces */ | |
470 | case 'd': | |
471 | ptr = s; | |
472 | while (*s == ' ' || *s == '\t') | |
473 | s++; | |
474 | if (s != ptr || s == _start) { | |
475 | ||
476 | /* match, be happy */ | |
477 | matched = 1; | |
478 | cs = MNEXT(cs); | |
479 | } else if (*s == '\n' || *s == '\0') { | |
480 | ||
481 | /* match, be happy */ | |
482 | matched = 1; | |
483 | cs = MNEXT(cs); | |
484 | } else if (*cs & ALT) { | |
485 | ||
486 | /* try the next part */ | |
487 | matched = 0; | |
488 | cs = MNEXT(cs); | |
489 | } else if (*cs & OPT) { | |
490 | ||
491 | /* doesn't matter */ | |
492 | matched = 1; | |
493 | cs = MNEXT(cs); | |
494 | } else | |
495 | ||
496 | /* no match, error return */ | |
497 | return (NIL); | |
498 | break; | |
499 | ||
500 | /* check for end of line */ | |
501 | case '$': | |
502 | if (*s == '\0' || *s == '\n') { | |
503 | ||
504 | /* match, be happy */ | |
505 | s++; | |
506 | matched = 1; | |
507 | cs = MNEXT(cs); | |
508 | } else if (*cs & ALT) { | |
509 | ||
510 | /* try the next part */ | |
511 | matched = 0; | |
512 | cs = MNEXT(cs); | |
513 | } else if (*cs & OPT) { | |
514 | ||
515 | /* doesn't matter */ | |
516 | matched = 1; | |
517 | cs = MNEXT(cs); | |
518 | } else | |
519 | ||
520 | /* no match, error return */ | |
521 | return (NIL); | |
522 | break; | |
523 | ||
524 | /* check for start of line */ | |
525 | case '^': | |
526 | if (s == _start) { | |
527 | ||
528 | /* match, be happy */ | |
529 | matched = 1; | |
530 | cs = MNEXT(cs); | |
531 | } else if (*cs & ALT) { | |
532 | ||
533 | /* try the next part */ | |
534 | matched = 0; | |
535 | cs = MNEXT(cs); | |
536 | } else if (*cs & OPT) { | |
537 | ||
538 | /* doesn't matter */ | |
539 | matched = 1; | |
540 | cs = MNEXT(cs); | |
541 | } else | |
542 | ||
543 | /* no match, error return */ | |
544 | return (NIL); | |
545 | break; | |
546 | ||
547 | /* end of a subexpression, return success */ | |
548 | case ')': | |
549 | return (s); | |
550 | } | |
551 | break; | |
552 | } | |
553 | } | |
554 | return (s); | |
555 | } |