Commit | Line | Data |
---|---|---|
f5bcab4b WJ |
1 | /* regexp.c */ |
2 | ||
3 | /* This file contains the code that compiles regular expressions and executes | |
4 | * them. It supports the same syntax and features as vi's regular expression | |
5 | * code. Specifically, the meta characters are: | |
6 | * ^ matches the beginning of a line | |
7 | * $ matches the end of a line | |
8 | * \< matches the beginning of a word | |
9 | * \> matches the end of a word | |
10 | * . matches any single character | |
11 | * [] matches any character in a character class | |
12 | * \( delimits the start of a subexpression | |
13 | * \) delimits the end of a subexpression | |
14 | * * repeats the preceding 0 or more times | |
15 | * NOTE: You cannot follow a \) with a *. | |
16 | * | |
17 | * The physical structure of a compiled RE is as follows: | |
18 | * - First, there is a one-byte value that says how many character classes | |
19 | * are used in this regular expression | |
20 | * - Next, each character class is stored as a bitmap that is 256 bits | |
21 | * (32 bytes) long. | |
22 | * - A mixture of literal characters and compiled meta characters follows. | |
23 | * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars | |
24 | * are stored as a \n followed by a one-byte code, so they take up two | |
25 | * bytes apiece. Literal characters take up one byte apiece. \n can't | |
26 | * be used as a literal character. | |
27 | * | |
28 | * If NO_MAGIC is defined, then a different set of functions is used instead. | |
29 | * That right, this file contains TWO versions of the code. | |
30 | */ | |
31 | ||
32 | #include <setjmp.h> | |
33 | #include "config.h" | |
34 | #include "ctype.h" | |
35 | #include "vi.h" | |
36 | #include "regexp.h" | |
37 | ||
38 | ||
39 | ||
40 | static char *previous; /* the previous regexp, used when null regexp is given */ | |
41 | ||
42 | ||
43 | #ifndef NO_MAGIC | |
44 | /* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */ | |
45 | ||
46 | /* These are used to classify or recognize meta-characters */ | |
47 | #define META '\0' | |
48 | #define BASE_META(m) ((m) - 256) | |
49 | #define INT_META(c) ((c) + 256) | |
50 | #define IS_META(m) ((m) >= 256) | |
51 | #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9)) | |
52 | #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9)) | |
53 | #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9)) | |
54 | #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE) | |
55 | #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m)) | |
56 | #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s) | |
57 | ||
58 | /* These are the internal codes used for each type of meta-character */ | |
59 | #define M_BEGLINE 256 /* internal code for ^ */ | |
60 | #define M_ENDLINE 257 /* internal code for $ */ | |
61 | #define M_BEGWORD 258 /* internal code for \< */ | |
62 | #define M_ENDWORD 259 /* internal code for \> */ | |
63 | #define M_ANY 260 /* internal code for . */ | |
64 | #define M_SPLAT 261 /* internal code for * */ | |
65 | #define M_PLUS 262 /* internal code for \+ */ | |
66 | #define M_QMARK 263 /* internal code for \? */ | |
67 | #define M_RANGE 264 /* internal code for \{ */ | |
68 | #define M_CLASS(n) (265+(n)) /* internal code for [] */ | |
69 | #define M_START(n) (275+(n)) /* internal code for \( */ | |
70 | #define M_END(n) (285+(n)) /* internal code for \) */ | |
71 | ||
72 | /* These are used during compilation */ | |
73 | static int class_cnt; /* used to assign class IDs */ | |
74 | static int start_cnt; /* used to assign start IDs */ | |
75 | static int end_stk[NSUBEXP];/* used to assign end IDs */ | |
76 | static int end_sp; | |
77 | static char *retext; /* points to the text being compiled */ | |
78 | ||
79 | /* error-handling stuff */ | |
80 | jmp_buf errorhandler; | |
81 | #define FAIL(why) regerror(why); longjmp(errorhandler, 1) | |
82 | ||
83 | ||
84 | ||
85 | ||
86 | ||
87 | /* This function builds a bitmap for a particular class */ | |
88 | static char *makeclass(text, bmap) | |
89 | REG char *text; /* start of the class */ | |
90 | REG char *bmap; /* the bitmap */ | |
91 | { | |
92 | REG int i; | |
93 | int complement = 0; | |
94 | ||
95 | ||
96 | /* zero the bitmap */ | |
97 | for (i = 0; bmap && i < 32; i++) | |
98 | { | |
99 | bmap[i] = 0; | |
100 | } | |
101 | ||
102 | /* see if we're going to complement this class */ | |
103 | if (*text == '^') | |
104 | { | |
105 | text++; | |
106 | complement = 1; | |
107 | } | |
108 | ||
109 | /* add in the characters */ | |
110 | while (*text && *text != ']') | |
111 | { | |
112 | /* is this a span of characters? */ | |
113 | if (text[1] == '-' && text[2]) | |
114 | { | |
115 | /* spans can't be backwards */ | |
116 | if (text[0] > text[2]) | |
117 | { | |
118 | FAIL("Backwards span in []"); | |
119 | } | |
120 | ||
121 | /* add each character in the span to the bitmap */ | |
122 | for (i = text[0]; bmap && i <= text[2]; i++) | |
123 | { | |
124 | bmap[i >> 3] |= (1 << (i & 7)); | |
125 | } | |
126 | ||
127 | /* move past this span */ | |
128 | text += 3; | |
129 | } | |
130 | else | |
131 | { | |
132 | /* add this single character to the span */ | |
133 | i = *text++; | |
134 | if (bmap) | |
135 | { | |
136 | bmap[i >> 3] |= (1 << (i & 7)); | |
137 | } | |
138 | } | |
139 | } | |
140 | ||
141 | /* make sure the closing ] is missing */ | |
142 | if (*text++ != ']') | |
143 | { | |
144 | FAIL("] missing"); | |
145 | } | |
146 | ||
147 | /* if we're supposed to complement this class, then do so */ | |
148 | if (complement && bmap) | |
149 | { | |
150 | for (i = 0; i < 32; i++) | |
151 | { | |
152 | bmap[i] = ~bmap[i]; | |
153 | } | |
154 | } | |
155 | ||
156 | return text; | |
157 | } | |
158 | ||
159 | ||
160 | ||
161 | ||
162 | /* This function gets the next character or meta character from a string. | |
163 | * The pointer is incremented by 1, or by 2 for \-quoted characters. For [], | |
164 | * a bitmap is generated via makeclass() (if re is given), and the | |
165 | * character-class text is skipped. | |
166 | */ | |
167 | static int gettoken(sptr, re) | |
168 | char **sptr; | |
169 | regexp *re; | |
170 | { | |
171 | int c; | |
172 | ||
173 | c = **sptr; | |
174 | ++*sptr; | |
175 | if (c == '\\') | |
176 | { | |
177 | c = **sptr; | |
178 | ++*sptr; | |
179 | switch (c) | |
180 | { | |
181 | case '<': | |
182 | return M_BEGWORD; | |
183 | ||
184 | case '>': | |
185 | return M_ENDWORD; | |
186 | ||
187 | case '(': | |
188 | if (start_cnt >= NSUBEXP) | |
189 | { | |
190 | FAIL("Too many \\(s"); | |
191 | } | |
192 | end_stk[end_sp++] = start_cnt; | |
193 | return M_START(start_cnt++); | |
194 | ||
195 | case ')': | |
196 | if (end_sp <= 0) | |
197 | { | |
198 | FAIL("Mismatched \\)"); | |
199 | } | |
200 | return M_END(end_stk[--end_sp]); | |
201 | ||
202 | case '*': | |
203 | return (*o_magic ? c : M_SPLAT); | |
204 | ||
205 | case '.': | |
206 | return (*o_magic ? c : M_ANY); | |
207 | ||
208 | case '+': | |
209 | return M_PLUS; | |
210 | ||
211 | case '?': | |
212 | return M_QMARK; | |
213 | #ifndef CRUNCH | |
214 | case '{': | |
215 | return M_RANGE; | |
216 | #endif | |
217 | default: | |
218 | return c; | |
219 | } | |
220 | } | |
221 | else if (*o_magic) | |
222 | { | |
223 | switch (c) | |
224 | { | |
225 | case '^': | |
226 | if (*sptr == retext + 1) | |
227 | { | |
228 | return M_BEGLINE; | |
229 | } | |
230 | return c; | |
231 | ||
232 | case '$': | |
233 | if (!**sptr) | |
234 | { | |
235 | return M_ENDLINE; | |
236 | } | |
237 | return c; | |
238 | ||
239 | case '.': | |
240 | return M_ANY; | |
241 | ||
242 | case '*': | |
243 | return M_SPLAT; | |
244 | ||
245 | case '[': | |
246 | /* make sure we don't have too many classes */ | |
247 | if (class_cnt >= 10) | |
248 | { | |
249 | FAIL("Too many []s"); | |
250 | } | |
251 | ||
252 | /* process the character list for this class */ | |
253 | if (re) | |
254 | { | |
255 | /* generate the bitmap for this class */ | |
256 | *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt); | |
257 | } | |
258 | else | |
259 | { | |
260 | /* skip to end of the class */ | |
261 | *sptr = makeclass(*sptr, (char *)0); | |
262 | } | |
263 | return M_CLASS(class_cnt++); | |
264 | ||
265 | default: | |
266 | return c; | |
267 | } | |
268 | } | |
269 | else /* unquoted nomagic */ | |
270 | { | |
271 | switch (c) | |
272 | { | |
273 | case '^': | |
274 | if (*sptr == retext + 1) | |
275 | { | |
276 | return M_BEGLINE; | |
277 | } | |
278 | return c; | |
279 | ||
280 | case '$': | |
281 | if (!**sptr) | |
282 | { | |
283 | return M_ENDLINE; | |
284 | } | |
285 | return c; | |
286 | ||
287 | default: | |
288 | return c; | |
289 | } | |
290 | } | |
291 | /*NOTREACHED*/ | |
292 | } | |
293 | ||
294 | ||
295 | ||
296 | ||
297 | /* This function calculates the number of bytes that will be needed for a | |
298 | * compiled RE. Its argument is the uncompiled version. It is not clever | |
299 | * about catching syntax errors; that is done in a later pass. | |
300 | */ | |
301 | static unsigned calcsize(text) | |
302 | char *text; | |
303 | { | |
304 | unsigned size; | |
305 | int token; | |
306 | ||
307 | retext = text; | |
308 | class_cnt = 0; | |
309 | start_cnt = 1; | |
310 | end_sp = 0; | |
311 | size = 5; | |
312 | while ((token = gettoken(&text, (regexp *)0)) != 0) | |
313 | { | |
314 | if (IS_CLASS(token)) | |
315 | { | |
316 | size += 34; | |
317 | } | |
318 | #ifndef CRUNCH | |
319 | else if (token == M_RANGE) | |
320 | { | |
321 | size += 4; | |
322 | while ((token = gettoken(&text, (regexp *)0)) != 0 | |
323 | && token != '}') | |
324 | { | |
325 | } | |
326 | if (!token) | |
327 | { | |
328 | return size; | |
329 | } | |
330 | } | |
331 | #endif | |
332 | else if (IS_META(token)) | |
333 | { | |
334 | size += 2; | |
335 | } | |
336 | else | |
337 | { | |
338 | size++; | |
339 | } | |
340 | } | |
341 | ||
342 | return size; | |
343 | } | |
344 | ||
345 | ||
346 | ||
347 | /* This function compiles a regexp. */ | |
348 | regexp *regcomp(exp) | |
349 | char *exp; | |
350 | { | |
351 | int needfirst; | |
352 | unsigned size; | |
353 | int token; | |
354 | int peek; | |
355 | char *build; | |
356 | regexp *re; | |
357 | #ifndef CRUNCH | |
358 | int from; | |
359 | int to; | |
360 | int digit; | |
361 | #endif | |
362 | ||
363 | ||
364 | /* prepare for error handling */ | |
365 | re = (regexp *)0; | |
366 | if (setjmp(errorhandler)) | |
367 | { | |
368 | if (re) | |
369 | { | |
370 | free(re); | |
371 | } | |
372 | return (regexp *)0; | |
373 | } | |
374 | ||
375 | /* if an empty regexp string was given, use the previous one */ | |
376 | if (*exp == 0) | |
377 | { | |
378 | if (!previous) | |
379 | { | |
380 | FAIL("No previous RE"); | |
381 | } | |
382 | exp = previous; | |
383 | } | |
384 | else /* non-empty regexp given, so remember it */ | |
385 | { | |
386 | if (previous) | |
387 | free(previous); | |
388 | previous = (char *)malloc((unsigned)(strlen(exp) + 1)); | |
389 | if (previous) | |
390 | strcpy(previous, exp); | |
391 | } | |
392 | ||
393 | /* allocate memory */ | |
394 | class_cnt = 0; | |
395 | start_cnt = 1; | |
396 | end_sp = 0; | |
397 | retext = exp; | |
398 | size = calcsize(exp) + sizeof(regexp) + 10; /* !!! 10 bytes for slop */ | |
399 | #ifdef lint | |
400 | re = ((regexp *)0) + size; | |
401 | #else | |
402 | re = (regexp *)malloc((unsigned)size); | |
403 | #endif | |
404 | if (!re) | |
405 | { | |
406 | FAIL("Not enough memory for this RE"); | |
407 | } | |
408 | ||
409 | /* compile it */ | |
410 | build = &re->program[1 + 32 * class_cnt]; | |
411 | re->program[0] = class_cnt; | |
412 | for (token = 0; token < NSUBEXP; token++) | |
413 | { | |
414 | re->startp[token] = re->endp[token] = (char *)0; | |
415 | } | |
416 | re->first = 0; | |
417 | re->bol = 0; | |
418 | re->minlen = 0; | |
419 | needfirst = 1; | |
420 | class_cnt = 0; | |
421 | start_cnt = 1; | |
422 | end_sp = 0; | |
423 | retext = exp; | |
424 | for (token = M_START(0), peek = gettoken(&exp, re); | |
425 | token; | |
426 | token = peek, peek = gettoken(&exp, re)) | |
427 | { | |
428 | /* special processing for the closure operator */ | |
429 | if (IS_CLOSURE(peek)) | |
430 | { | |
431 | /* detect misuse of closure operator */ | |
432 | if (IS_START(token)) | |
433 | { | |
434 | FAIL("Closure operator follows nothing"); | |
435 | } | |
436 | else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) | |
437 | { | |
438 | FAIL("Closure operators can only follow a normal character or . or []"); | |
439 | } | |
440 | ||
441 | #ifndef CRUNCH | |
442 | /* if \{ \} then read the range */ | |
443 | if (peek == M_RANGE) | |
444 | { | |
445 | from = 0; | |
446 | for (digit = gettoken(&exp, re); | |
447 | !IS_META(digit) && isdigit(digit); | |
448 | digit = gettoken(&exp, re)) | |
449 | { | |
450 | from = from * 10 + digit - '0'; | |
451 | } | |
452 | if (digit == '}') | |
453 | { | |
454 | to = from; | |
455 | } | |
456 | else if (digit == ',') | |
457 | { | |
458 | to = 0; | |
459 | for (digit = gettoken(&exp, re); | |
460 | !IS_META(digit) && isdigit(digit); | |
461 | digit = gettoken(&exp, re)) | |
462 | { | |
463 | to = to * 10 + digit - '0'; | |
464 | } | |
465 | if (to == 0) | |
466 | { | |
467 | to = 255; | |
468 | } | |
469 | } | |
470 | if (digit != '}') | |
471 | { | |
472 | FAIL("Bad characters after \\{"); | |
473 | } | |
474 | else if (to < from || to == 0 || from >= 255) | |
475 | { | |
476 | FAIL("Invalid range for \\{ \\}"); | |
477 | } | |
478 | re->minlen += from; | |
479 | } | |
480 | else | |
481 | #endif | |
482 | if (peek != M_SPLAT) | |
483 | { | |
484 | re->minlen++; | |
485 | } | |
486 | ||
487 | /* it is okay -- make it prefix instead of postfix */ | |
488 | ADD_META(build, peek); | |
489 | #ifndef CRUNCH | |
490 | if (peek == M_RANGE) | |
491 | { | |
492 | *build++ = from; | |
493 | *build++ = (to < 255 ? to : 255); | |
494 | } | |
495 | #endif | |
496 | ||
497 | ||
498 | /* take care of "needfirst" - is this the first char? */ | |
499 | if (needfirst && peek == M_PLUS && !IS_META(token)) | |
500 | { | |
501 | re->first = token; | |
502 | } | |
503 | needfirst = 0; | |
504 | ||
505 | /* we used "peek" -- need to refill it */ | |
506 | peek = gettoken(&exp, re); | |
507 | if (IS_CLOSURE(peek)) | |
508 | { | |
509 | FAIL("* or \\+ or \\? doubled up"); | |
510 | } | |
511 | } | |
512 | else if (!IS_META(token)) | |
513 | { | |
514 | /* normal char is NOT argument of closure */ | |
515 | if (needfirst) | |
516 | { | |
517 | re->first = token; | |
518 | needfirst = 0; | |
519 | } | |
520 | re->minlen++; | |
521 | } | |
522 | else if (token == M_ANY || IS_CLASS(token)) | |
523 | { | |
524 | /* . or [] is NOT argument of closure */ | |
525 | needfirst = 0; | |
526 | re->minlen++; | |
527 | } | |
528 | ||
529 | /* the "token" character is not closure -- process it normally */ | |
530 | if (token == M_BEGLINE) | |
531 | { | |
532 | /* set the BOL flag instead of storing M_BEGLINE */ | |
533 | re->bol = 1; | |
534 | } | |
535 | else if (IS_META(token)) | |
536 | { | |
537 | ADD_META(build, token); | |
538 | } | |
539 | else | |
540 | { | |
541 | *build++ = token; | |
542 | } | |
543 | } | |
544 | ||
545 | /* end it with a \) which MUST MATCH the opening \( */ | |
546 | ADD_META(build, M_END(0)); | |
547 | if (end_sp > 0) | |
548 | { | |
549 | FAIL("Not enough \\)s"); | |
550 | } | |
551 | ||
552 | return re; | |
553 | } | |
554 | ||
555 | ||
556 | ||
557 | /*---------------------------------------------------------------------------*/ | |
558 | ||
559 | ||
560 | /* This function checks for a match between a character and a token which is | |
561 | * known to represent a single character. It returns 0 if they match, or | |
562 | * 1 if they don't. | |
563 | */ | |
564 | int match1(re, ch, token) | |
565 | regexp *re; | |
566 | REG char ch; | |
567 | REG int token; | |
568 | { | |
569 | if (!ch) | |
570 | { | |
571 | /* the end of a line can't match any RE of width 1 */ | |
572 | return 1; | |
573 | } | |
574 | if (token == M_ANY) | |
575 | { | |
576 | return 0; | |
577 | } | |
578 | else if (IS_CLASS(token)) | |
579 | { | |
580 | if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7))) | |
581 | return 0; | |
582 | } | |
583 | else if (ch == token || *o_ignorecase && tolower(ch) == tolower(token)) | |
584 | { | |
585 | return 0; | |
586 | } | |
587 | return 1; | |
588 | } | |
589 | ||
590 | ||
591 | ||
592 | /* This function checks characters up to and including the next closure, at | |
593 | * which point it does a recursive call to check the rest of it. This function | |
594 | * returns 0 if everything matches, or 1 if something doesn't match. | |
595 | */ | |
596 | int match(re, str, prog, here) | |
597 | regexp *re; /* the regular expression */ | |
598 | char *str; /* the string */ | |
599 | REG char *prog; /* a portion of re->program, an compiled RE */ | |
600 | REG char *here; /* a portion of str, the string to compare it to */ | |
601 | { | |
602 | REG int token; /* the roken pointed to by prog */ | |
603 | REG int nmatched;/* counter, used during closure matching */ | |
604 | REG int closure;/* the token denoting the type of closure */ | |
605 | int from; /* minimum number of matches in closure */ | |
606 | int to; /* maximum number of matches in closure */ | |
607 | ||
608 | for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog)) | |
609 | { | |
610 | switch (token) | |
611 | { | |
612 | /*case M_BEGLINE: can't happen; re->bol is used instead */ | |
613 | case M_ENDLINE: | |
614 | if (*here) | |
615 | return 1; | |
616 | break; | |
617 | ||
618 | case M_BEGWORD: | |
619 | if (here != str && | |
620 | (here[-1] == '_' || isalnum(here[-1]))) | |
621 | return 1; | |
622 | break; | |
623 | ||
624 | case M_ENDWORD: | |
625 | if (here[0] == '_' || isalnum(here[0])) | |
626 | return 1; | |
627 | break; | |
628 | ||
629 | case M_START(0): | |
630 | case M_START(1): | |
631 | case M_START(2): | |
632 | case M_START(3): | |
633 | case M_START(4): | |
634 | case M_START(5): | |
635 | case M_START(6): | |
636 | case M_START(7): | |
637 | case M_START(8): | |
638 | case M_START(9): | |
639 | re->startp[token - M_START(0)] = (char *)here; | |
640 | break; | |
641 | ||
642 | case M_END(0): | |
643 | case M_END(1): | |
644 | case M_END(2): | |
645 | case M_END(3): | |
646 | case M_END(4): | |
647 | case M_END(5): | |
648 | case M_END(6): | |
649 | case M_END(7): | |
650 | case M_END(8): | |
651 | case M_END(9): | |
652 | re->endp[token - M_END(0)] = (char *)here; | |
653 | if (token == M_END(0)) | |
654 | { | |
655 | return 0; | |
656 | } | |
657 | break; | |
658 | ||
659 | default: /* literal, M_CLASS(n), or M_ANY */ | |
660 | if (match1(re, *here, token) != 0) | |
661 | return 1; | |
662 | here++; | |
663 | } | |
664 | } | |
665 | ||
666 | /* C L O S U R E */ | |
667 | ||
668 | /* step 1: see what we have to match against, and move "prog" to point | |
669 | * to the remainder of the compiled RE. | |
670 | */ | |
671 | closure = token; | |
672 | prog++; | |
673 | switch (closure) | |
674 | { | |
675 | case M_SPLAT: | |
676 | from = 0; | |
677 | to = strlen(str); /* infinity */ | |
678 | break; | |
679 | ||
680 | case M_PLUS: | |
681 | from = 1; | |
682 | to = strlen(str); /* infinity */ | |
683 | break; | |
684 | ||
685 | case M_QMARK: | |
686 | from = 0; | |
687 | to = 1; | |
688 | break; | |
689 | ||
690 | #ifndef CRUNCH | |
691 | case M_RANGE: | |
692 | from = UCHAR(*prog++); | |
693 | to = UCHAR(*prog++); | |
694 | if (to == 255) | |
695 | { | |
696 | to = strlen(str); /* infinity */ | |
697 | } | |
698 | break; | |
699 | #endif | |
700 | } | |
701 | token = GET_META(prog); | |
702 | prog++; | |
703 | ||
704 | /* step 2: see how many times we can match that token against the string */ | |
705 | for (nmatched = 0; | |
706 | nmatched < to && *here && match1(re, *here, token) == 0; | |
707 | nmatched++, here++) | |
708 | { | |
709 | } | |
710 | ||
711 | /* step 3: try to match the remainder, and back off if it doesn't */ | |
712 | while (nmatched >= from && match(re, str, prog, here) != 0) | |
713 | { | |
714 | nmatched--; | |
715 | here--; | |
716 | } | |
717 | ||
718 | /* so how did it work out? */ | |
719 | if (nmatched >= from) | |
720 | return 0; | |
721 | return 1; | |
722 | } | |
723 | ||
724 | ||
725 | ||
726 | /* This function searches through a string for text that matches an RE. */ | |
727 | int regexec(re, str, bol) | |
728 | regexp *re; /* the compiled regexp to search for */ | |
729 | char *str; /* the string to search through */ | |
730 | int bol; /* boolean: does str start at the beginning of a line? */ | |
731 | { | |
732 | char *prog; /* the entry point of re->program */ | |
733 | int len; /* length of the string */ | |
734 | REG char *here; | |
735 | ||
736 | /* if must start at the beginning of a line, and this isn't, then fail */ | |
737 | if (re->bol && !bol) | |
738 | { | |
739 | return 0; | |
740 | } | |
741 | ||
742 | len = strlen(str); | |
743 | prog = re->program + 1 + 32 * re->program[0]; | |
744 | ||
745 | /* search for the RE in the string */ | |
746 | if (re->bol) | |
747 | { | |
748 | /* must occur at BOL */ | |
749 | if ((re->first | |
750 | && match1(re, *(char *)str, re->first))/* wrong first letter? */ | |
751 | || len < re->minlen /* not long enough? */ | |
752 | || match(re, (char *)str, prog, str)) /* doesn't match? */ | |
753 | return 0; /* THEN FAIL! */ | |
754 | } | |
755 | #ifndef CRUNCH | |
756 | else if (!*o_ignorecase) | |
757 | { | |
758 | /* can occur anywhere in the line, noignorecase */ | |
759 | for (here = (char *)str; | |
760 | (re->first && re->first != *here) | |
761 | || match(re, (char *)str, prog, here); | |
762 | here++, len--) | |
763 | { | |
764 | if (len < re->minlen) | |
765 | return 0; | |
766 | } | |
767 | } | |
768 | #endif | |
769 | else | |
770 | { | |
771 | /* can occur anywhere in the line, ignorecase */ | |
772 | for (here = (char *)str; | |
773 | (re->first && match1(re, *here, (int)re->first)) | |
774 | || match(re, (char *)str, prog, here); | |
775 | here++, len--) | |
776 | { | |
777 | if (len < re->minlen) | |
778 | return 0; | |
779 | } | |
780 | } | |
781 | ||
782 | /* if we didn't fail, then we must have succeeded */ | |
783 | return 1; | |
784 | } | |
785 | ||
786 | /*============================================================================*/ | |
787 | #else /* NO_MAGIC */ | |
788 | ||
789 | regexp *regcomp(exp) | |
790 | char *exp; | |
791 | { | |
792 | char *src; | |
793 | char *dest; | |
794 | regexp *re; | |
795 | int i; | |
796 | ||
797 | /* allocate a big enough regexp structure */ | |
798 | #ifdef lint | |
799 | re = (regexp *)0; | |
800 | #else | |
801 | re = (regexp *)malloc((unsigned)(strlen(exp) + 1 + sizeof(struct regexp))); | |
802 | #endif | |
803 | if (!re) | |
804 | { | |
805 | regerror("Could not malloc a regexp structure"); | |
806 | return (regexp *)0; | |
807 | } | |
808 | ||
809 | /* initialize all fields of the structure */ | |
810 | for (i = 0; i < NSUBEXP; i++) | |
811 | { | |
812 | re->startp[i] = re->endp[i] = (char *)0; | |
813 | } | |
814 | re->minlen = 0; | |
815 | re->first = 0; | |
816 | re->bol = 0; | |
817 | ||
818 | /* copy the string into it, translating ^ and $ as needed */ | |
819 | for (src = exp, dest = re->program + 1; *src; src++) | |
820 | { | |
821 | switch (*src) | |
822 | { | |
823 | case '^': | |
824 | if (src == exp) | |
825 | { | |
826 | re->bol += 1; | |
827 | } | |
828 | else | |
829 | { | |
830 | *dest++ = '^'; | |
831 | re->minlen++; | |
832 | } | |
833 | break; | |
834 | ||
835 | case '$': | |
836 | if (!src[1]) | |
837 | { | |
838 | re->bol += 2; | |
839 | } | |
840 | else | |
841 | { | |
842 | *dest++ = '$'; | |
843 | re->minlen++; | |
844 | } | |
845 | break; | |
846 | ||
847 | case '\\': | |
848 | if (src[1]) | |
849 | { | |
850 | *dest++ = *++src; | |
851 | re->minlen++; | |
852 | } | |
853 | else | |
854 | { | |
855 | regerror("extra \\ at end of regular expression"); | |
856 | } | |
857 | break; | |
858 | ||
859 | default: | |
860 | *dest++ = *src; | |
861 | re->minlen++; | |
862 | } | |
863 | } | |
864 | *dest = '\0'; | |
865 | ||
866 | return re; | |
867 | } | |
868 | ||
869 | ||
870 | /* This "helper" function checks for a match at a given location. It returns | |
871 | * 1 if it matches, 0 if it doesn't match here but might match later on in the | |
872 | * string, or -1 if it could not possibly match | |
873 | */ | |
874 | static int reghelp(prog, string, bolflag) | |
875 | struct regexp *prog; | |
876 | char *string; | |
877 | int bolflag; | |
878 | { | |
879 | char *scan; | |
880 | char *str; | |
881 | ||
882 | /* if ^, then require bolflag */ | |
883 | if ((prog->bol & 1) && !bolflag) | |
884 | { | |
885 | return -1; | |
886 | } | |
887 | ||
888 | /* if it matches, then it will start here */ | |
889 | prog->startp[0] = string; | |
890 | ||
891 | /* compare, possibly ignoring case */ | |
892 | if (*o_ignorecase) | |
893 | { | |
894 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
895 | if (tolower(*scan) != tolower(*string)) | |
896 | return *string ? 0 : -1; | |
897 | } | |
898 | else | |
899 | { | |
900 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
901 | if (*scan != *string) | |
902 | return *string ? 0 : -1; | |
903 | } | |
904 | ||
905 | /* if $, then require string to end here, too */ | |
906 | if ((prog->bol & 2) && *string) | |
907 | { | |
908 | return 0; | |
909 | } | |
910 | ||
911 | /* if we get to here, it matches */ | |
912 | prog->endp[0] = string; | |
913 | return 1; | |
914 | } | |
915 | ||
916 | ||
917 | ||
918 | int regexec(prog, string, bolflag) | |
919 | struct regexp *prog; | |
920 | char *string; | |
921 | int bolflag; | |
922 | { | |
923 | int rc; | |
924 | ||
925 | /* keep trying to match it */ | |
926 | for (rc = reghelp(prog, string, bolflag); rc == 0; rc = reghelp(prog, string, 0)) | |
927 | { | |
928 | string++; | |
929 | } | |
930 | ||
931 | /* did we match? */ | |
932 | return rc == 1; | |
933 | } | |
934 | #endif |