Commit | Line | Data |
---|---|---|
15637ed4 RG |
1 | /* regexp.c */ |
2 | ||
3 | /* This file contains the code that compiles regular expressions and executes | |
4 | * them. It supports the same syntax and features as vi's regular expression | |
5 | * code. Specifically, the meta characters are: | |
6 | * ^ matches the beginning of a line | |
7 | * $ matches the end of a line | |
8 | * \< matches the beginning of a word | |
9 | * \> matches the end of a word | |
10 | * . matches any single character | |
11 | * [] matches any character in a character class | |
12 | * \( delimits the start of a subexpression | |
13 | * \) delimits the end of a subexpression | |
14 | * * repeats the preceding 0 or more times | |
15 | * NOTE: You cannot follow a \) with a *. | |
16 | * | |
17 | * The physical structure of a compiled RE is as follows: | |
18 | * - First, there is a one-byte value that says how many character classes | |
19 | * are used in this regular expression | |
20 | * - Next, each character class is stored as a bitmap that is 256 bits | |
21 | * (32 bytes) long. | |
22 | * - A mixture of literal characters and compiled meta characters follows. | |
23 | * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars | |
24 | * are stored as a \n followed by a one-byte code, so they take up two | |
25 | * bytes apiece. Literal characters take up one byte apiece. \n can't | |
26 | * be used as a literal character. | |
27 | * | |
28 | * If NO_MAGIC is defined, then a different set of functions is used instead. | |
29 | * That right, this file contains TWO versions of the code. | |
30 | */ | |
31 | ||
32 | #include <setjmp.h> | |
33 | #include "config.h" | |
34 | #include "ctype.h" | |
35 | #include "vi.h" | |
6e657cf2 AM |
36 | #ifdef REGEX |
37 | # include <regex.h> | |
38 | #else | |
39 | # include "regexp.h" | |
40 | #endif | |
41 | ||
15637ed4 | 42 | |
6e657cf2 AM |
43 | #ifdef REGEX |
44 | extern int patlock; /* from cmd_substitute() module */ | |
15637ed4 | 45 | |
6e657cf2 | 46 | static regex_t *previous = NULL; /* the previous regexp, used when null regexp is given */ |
15637ed4 | 47 | |
6e657cf2 AM |
48 | regex_t * |
49 | optpat(s) | |
50 | char *s; | |
51 | { | |
52 | int n; | |
53 | if (*s == '\0') { | |
54 | if (!previous) regerr("RE error: no previous pattern"); | |
55 | return previous; | |
56 | } else if (previous && !patlock) | |
57 | regfree(previous); | |
58 | else if ((previous = (regex_t *) malloc(sizeof(regex_t))) == NULL) { | |
59 | regerr("RE error: out of memory"); | |
60 | return previous; | |
61 | } | |
62 | patlock = 0; | |
63 | if (n = regcomp(previous, s, 0)) { | |
64 | regerr("RE error: %d", n); | |
65 | free(previous); | |
66 | return previous = NULL; | |
67 | } | |
68 | return previous; | |
69 | } | |
70 | #else | |
15637ed4 RG |
71 | static char *previous; /* the previous regexp, used when null regexp is given */ |
72 | ||
73 | ||
74 | #ifndef NO_MAGIC | |
75 | /* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */ | |
76 | ||
77 | /* These are used to classify or recognize meta-characters */ | |
78 | #define META '\0' | |
79 | #define BASE_META(m) ((m) - 256) | |
80 | #define INT_META(c) ((c) + 256) | |
81 | #define IS_META(m) ((m) >= 256) | |
82 | #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9)) | |
83 | #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9)) | |
84 | #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9)) | |
85 | #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE) | |
86 | #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m)) | |
87 | #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s) | |
88 | ||
89 | /* These are the internal codes used for each type of meta-character */ | |
90 | #define M_BEGLINE 256 /* internal code for ^ */ | |
91 | #define M_ENDLINE 257 /* internal code for $ */ | |
92 | #define M_BEGWORD 258 /* internal code for \< */ | |
93 | #define M_ENDWORD 259 /* internal code for \> */ | |
94 | #define M_ANY 260 /* internal code for . */ | |
95 | #define M_SPLAT 261 /* internal code for * */ | |
96 | #define M_PLUS 262 /* internal code for \+ */ | |
97 | #define M_QMARK 263 /* internal code for \? */ | |
98 | #define M_RANGE 264 /* internal code for \{ */ | |
99 | #define M_CLASS(n) (265+(n)) /* internal code for [] */ | |
100 | #define M_START(n) (275+(n)) /* internal code for \( */ | |
101 | #define M_END(n) (285+(n)) /* internal code for \) */ | |
102 | ||
103 | /* These are used during compilation */ | |
104 | static int class_cnt; /* used to assign class IDs */ | |
105 | static int start_cnt; /* used to assign start IDs */ | |
106 | static int end_stk[NSUBEXP];/* used to assign end IDs */ | |
107 | static int end_sp; | |
108 | static char *retext; /* points to the text being compiled */ | |
109 | ||
110 | /* error-handling stuff */ | |
111 | jmp_buf errorhandler; | |
6e657cf2 | 112 | #define FAIL(why) regerr(why); longjmp(errorhandler, 1) |
15637ed4 RG |
113 | |
114 | ||
115 | ||
116 | ||
117 | ||
118 | /* This function builds a bitmap for a particular class */ | |
119 | static char *makeclass(text, bmap) | |
120 | REG char *text; /* start of the class */ | |
121 | REG char *bmap; /* the bitmap */ | |
122 | { | |
123 | REG int i; | |
124 | int complement = 0; | |
125 | ||
126 | ||
08746e8b AM |
127 | checkmem(); |
128 | ||
15637ed4 RG |
129 | /* zero the bitmap */ |
130 | for (i = 0; bmap && i < 32; i++) | |
131 | { | |
132 | bmap[i] = 0; | |
133 | } | |
134 | ||
135 | /* see if we're going to complement this class */ | |
136 | if (*text == '^') | |
137 | { | |
138 | text++; | |
139 | complement = 1; | |
140 | } | |
141 | ||
142 | /* add in the characters */ | |
143 | while (*text && *text != ']') | |
144 | { | |
145 | /* is this a span of characters? */ | |
146 | if (text[1] == '-' && text[2]) | |
147 | { | |
148 | /* spans can't be backwards */ | |
149 | if (text[0] > text[2]) | |
150 | { | |
151 | FAIL("Backwards span in []"); | |
152 | } | |
153 | ||
154 | /* add each character in the span to the bitmap */ | |
08746e8b | 155 | for (i = UCHAR(text[0]); bmap && (unsigned)i <= UCHAR(text[2]); i++) |
15637ed4 RG |
156 | { |
157 | bmap[i >> 3] |= (1 << (i & 7)); | |
158 | } | |
159 | ||
160 | /* move past this span */ | |
161 | text += 3; | |
162 | } | |
163 | else | |
164 | { | |
165 | /* add this single character to the span */ | |
166 | i = *text++; | |
167 | if (bmap) | |
168 | { | |
08746e8b | 169 | bmap[UCHAR(i) >> 3] |= (1 << (UCHAR(i) & 7)); |
15637ed4 RG |
170 | } |
171 | } | |
172 | } | |
173 | ||
174 | /* make sure the closing ] is missing */ | |
175 | if (*text++ != ']') | |
176 | { | |
177 | FAIL("] missing"); | |
178 | } | |
179 | ||
180 | /* if we're supposed to complement this class, then do so */ | |
181 | if (complement && bmap) | |
182 | { | |
183 | for (i = 0; i < 32; i++) | |
184 | { | |
185 | bmap[i] = ~bmap[i]; | |
186 | } | |
187 | } | |
188 | ||
08746e8b AM |
189 | checkmem(); |
190 | ||
15637ed4 RG |
191 | return text; |
192 | } | |
193 | ||
194 | ||
195 | ||
196 | ||
197 | /* This function gets the next character or meta character from a string. | |
198 | * The pointer is incremented by 1, or by 2 for \-quoted characters. For [], | |
199 | * a bitmap is generated via makeclass() (if re is given), and the | |
200 | * character-class text is skipped. | |
201 | */ | |
202 | static int gettoken(sptr, re) | |
203 | char **sptr; | |
204 | regexp *re; | |
205 | { | |
206 | int c; | |
207 | ||
208 | c = **sptr; | |
08746e8b AM |
209 | if (!c) |
210 | { | |
211 | return c; | |
212 | } | |
15637ed4 RG |
213 | ++*sptr; |
214 | if (c == '\\') | |
215 | { | |
216 | c = **sptr; | |
217 | ++*sptr; | |
218 | switch (c) | |
219 | { | |
220 | case '<': | |
221 | return M_BEGWORD; | |
222 | ||
223 | case '>': | |
224 | return M_ENDWORD; | |
225 | ||
226 | case '(': | |
227 | if (start_cnt >= NSUBEXP) | |
228 | { | |
229 | FAIL("Too many \\(s"); | |
230 | } | |
231 | end_stk[end_sp++] = start_cnt; | |
232 | return M_START(start_cnt++); | |
233 | ||
234 | case ')': | |
235 | if (end_sp <= 0) | |
236 | { | |
237 | FAIL("Mismatched \\)"); | |
238 | } | |
239 | return M_END(end_stk[--end_sp]); | |
240 | ||
241 | case '*': | |
242 | return (*o_magic ? c : M_SPLAT); | |
243 | ||
244 | case '.': | |
245 | return (*o_magic ? c : M_ANY); | |
246 | ||
247 | case '+': | |
248 | return M_PLUS; | |
249 | ||
250 | case '?': | |
251 | return M_QMARK; | |
252 | #ifndef CRUNCH | |
253 | case '{': | |
254 | return M_RANGE; | |
255 | #endif | |
256 | default: | |
257 | return c; | |
258 | } | |
259 | } | |
260 | else if (*o_magic) | |
261 | { | |
262 | switch (c) | |
263 | { | |
264 | case '^': | |
265 | if (*sptr == retext + 1) | |
266 | { | |
267 | return M_BEGLINE; | |
268 | } | |
269 | return c; | |
270 | ||
271 | case '$': | |
272 | if (!**sptr) | |
273 | { | |
274 | return M_ENDLINE; | |
275 | } | |
276 | return c; | |
277 | ||
278 | case '.': | |
279 | return M_ANY; | |
280 | ||
281 | case '*': | |
282 | return M_SPLAT; | |
283 | ||
284 | case '[': | |
285 | /* make sure we don't have too many classes */ | |
286 | if (class_cnt >= 10) | |
287 | { | |
288 | FAIL("Too many []s"); | |
289 | } | |
290 | ||
291 | /* process the character list for this class */ | |
292 | if (re) | |
293 | { | |
294 | /* generate the bitmap for this class */ | |
295 | *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt); | |
296 | } | |
297 | else | |
298 | { | |
299 | /* skip to end of the class */ | |
300 | *sptr = makeclass(*sptr, (char *)0); | |
301 | } | |
302 | return M_CLASS(class_cnt++); | |
303 | ||
304 | default: | |
305 | return c; | |
306 | } | |
307 | } | |
308 | else /* unquoted nomagic */ | |
309 | { | |
310 | switch (c) | |
311 | { | |
312 | case '^': | |
313 | if (*sptr == retext + 1) | |
314 | { | |
315 | return M_BEGLINE; | |
316 | } | |
317 | return c; | |
318 | ||
319 | case '$': | |
320 | if (!**sptr) | |
321 | { | |
322 | return M_ENDLINE; | |
323 | } | |
324 | return c; | |
325 | ||
326 | default: | |
327 | return c; | |
328 | } | |
329 | } | |
330 | /*NOTREACHED*/ | |
331 | } | |
332 | ||
333 | ||
334 | ||
335 | ||
336 | /* This function calculates the number of bytes that will be needed for a | |
337 | * compiled RE. Its argument is the uncompiled version. It is not clever | |
338 | * about catching syntax errors; that is done in a later pass. | |
339 | */ | |
340 | static unsigned calcsize(text) | |
341 | char *text; | |
342 | { | |
343 | unsigned size; | |
344 | int token; | |
345 | ||
346 | retext = text; | |
347 | class_cnt = 0; | |
348 | start_cnt = 1; | |
349 | end_sp = 0; | |
350 | size = 5; | |
351 | while ((token = gettoken(&text, (regexp *)0)) != 0) | |
352 | { | |
353 | if (IS_CLASS(token)) | |
354 | { | |
355 | size += 34; | |
356 | } | |
357 | #ifndef CRUNCH | |
358 | else if (token == M_RANGE) | |
359 | { | |
360 | size += 4; | |
361 | while ((token = gettoken(&text, (regexp *)0)) != 0 | |
362 | && token != '}') | |
363 | { | |
364 | } | |
365 | if (!token) | |
366 | { | |
367 | return size; | |
368 | } | |
369 | } | |
370 | #endif | |
371 | else if (IS_META(token)) | |
372 | { | |
373 | size += 2; | |
374 | } | |
375 | else | |
376 | { | |
377 | size++; | |
378 | } | |
379 | } | |
380 | ||
381 | return size; | |
382 | } | |
383 | ||
384 | ||
385 | ||
386 | /* This function compiles a regexp. */ | |
387 | regexp *regcomp(exp) | |
388 | char *exp; | |
389 | { | |
390 | int needfirst; | |
391 | unsigned size; | |
392 | int token; | |
393 | int peek; | |
394 | char *build; | |
08746e8b AM |
395 | #if __STDC__ |
396 | volatile | |
397 | #endif | |
15637ed4 RG |
398 | regexp *re; |
399 | #ifndef CRUNCH | |
400 | int from; | |
401 | int to; | |
402 | int digit; | |
403 | #endif | |
08746e8b AM |
404 | #ifdef DEBUG |
405 | int calced; | |
406 | #endif | |
407 | ||
15637ed4 | 408 | |
08746e8b | 409 | checkmem(); |
15637ed4 RG |
410 | |
411 | /* prepare for error handling */ | |
412 | re = (regexp *)0; | |
413 | if (setjmp(errorhandler)) | |
414 | { | |
08746e8b | 415 | checkmem(); |
15637ed4 RG |
416 | if (re) |
417 | { | |
08746e8b | 418 | _free_(re); |
15637ed4 RG |
419 | } |
420 | return (regexp *)0; | |
421 | } | |
422 | ||
423 | /* if an empty regexp string was given, use the previous one */ | |
424 | if (*exp == 0) | |
425 | { | |
426 | if (!previous) | |
427 | { | |
428 | FAIL("No previous RE"); | |
429 | } | |
430 | exp = previous; | |
431 | } | |
432 | else /* non-empty regexp given, so remember it */ | |
433 | { | |
434 | if (previous) | |
08746e8b | 435 | _free_(previous); |
15637ed4 RG |
436 | previous = (char *)malloc((unsigned)(strlen(exp) + 1)); |
437 | if (previous) | |
438 | strcpy(previous, exp); | |
439 | } | |
440 | ||
441 | /* allocate memory */ | |
08746e8b | 442 | checkmem(); |
15637ed4 RG |
443 | class_cnt = 0; |
444 | start_cnt = 1; | |
445 | end_sp = 0; | |
446 | retext = exp; | |
08746e8b AM |
447 | #ifdef DEBUG |
448 | calced = calcsize(exp); | |
449 | size = calced + sizeof(regexp); | |
450 | #else | |
15637ed4 | 451 | size = calcsize(exp) + sizeof(regexp) + 10; /* !!! 10 bytes for slop */ |
08746e8b | 452 | #endif |
15637ed4 | 453 | #ifdef lint |
08746e8b | 454 | re = (regexp *)0; |
15637ed4 RG |
455 | #else |
456 | re = (regexp *)malloc((unsigned)size); | |
457 | #endif | |
458 | if (!re) | |
459 | { | |
460 | FAIL("Not enough memory for this RE"); | |
461 | } | |
08746e8b | 462 | checkmem(); |
15637ed4 RG |
463 | |
464 | /* compile it */ | |
465 | build = &re->program[1 + 32 * class_cnt]; | |
466 | re->program[0] = class_cnt; | |
467 | for (token = 0; token < NSUBEXP; token++) | |
468 | { | |
469 | re->startp[token] = re->endp[token] = (char *)0; | |
470 | } | |
471 | re->first = 0; | |
472 | re->bol = 0; | |
473 | re->minlen = 0; | |
474 | needfirst = 1; | |
475 | class_cnt = 0; | |
476 | start_cnt = 1; | |
477 | end_sp = 0; | |
478 | retext = exp; | |
479 | for (token = M_START(0), peek = gettoken(&exp, re); | |
480 | token; | |
481 | token = peek, peek = gettoken(&exp, re)) | |
482 | { | |
483 | /* special processing for the closure operator */ | |
484 | if (IS_CLOSURE(peek)) | |
485 | { | |
486 | /* detect misuse of closure operator */ | |
487 | if (IS_START(token)) | |
488 | { | |
489 | FAIL("Closure operator follows nothing"); | |
490 | } | |
491 | else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) | |
492 | { | |
493 | FAIL("Closure operators can only follow a normal character or . or []"); | |
494 | } | |
495 | ||
496 | #ifndef CRUNCH | |
497 | /* if \{ \} then read the range */ | |
498 | if (peek == M_RANGE) | |
499 | { | |
500 | from = 0; | |
501 | for (digit = gettoken(&exp, re); | |
502 | !IS_META(digit) && isdigit(digit); | |
503 | digit = gettoken(&exp, re)) | |
504 | { | |
505 | from = from * 10 + digit - '0'; | |
506 | } | |
507 | if (digit == '}') | |
508 | { | |
509 | to = from; | |
510 | } | |
511 | else if (digit == ',') | |
512 | { | |
513 | to = 0; | |
514 | for (digit = gettoken(&exp, re); | |
515 | !IS_META(digit) && isdigit(digit); | |
516 | digit = gettoken(&exp, re)) | |
517 | { | |
518 | to = to * 10 + digit - '0'; | |
519 | } | |
520 | if (to == 0) | |
521 | { | |
522 | to = 255; | |
523 | } | |
524 | } | |
525 | if (digit != '}') | |
526 | { | |
527 | FAIL("Bad characters after \\{"); | |
528 | } | |
529 | else if (to < from || to == 0 || from >= 255) | |
530 | { | |
531 | FAIL("Invalid range for \\{ \\}"); | |
532 | } | |
533 | re->minlen += from; | |
534 | } | |
535 | else | |
536 | #endif | |
537 | if (peek != M_SPLAT) | |
538 | { | |
539 | re->minlen++; | |
540 | } | |
541 | ||
542 | /* it is okay -- make it prefix instead of postfix */ | |
543 | ADD_META(build, peek); | |
544 | #ifndef CRUNCH | |
545 | if (peek == M_RANGE) | |
546 | { | |
547 | *build++ = from; | |
548 | *build++ = (to < 255 ? to : 255); | |
549 | } | |
550 | #endif | |
551 | ||
552 | ||
553 | /* take care of "needfirst" - is this the first char? */ | |
554 | if (needfirst && peek == M_PLUS && !IS_META(token)) | |
555 | { | |
556 | re->first = token; | |
557 | } | |
558 | needfirst = 0; | |
559 | ||
560 | /* we used "peek" -- need to refill it */ | |
561 | peek = gettoken(&exp, re); | |
562 | if (IS_CLOSURE(peek)) | |
563 | { | |
564 | FAIL("* or \\+ or \\? doubled up"); | |
565 | } | |
566 | } | |
567 | else if (!IS_META(token)) | |
568 | { | |
569 | /* normal char is NOT argument of closure */ | |
570 | if (needfirst) | |
571 | { | |
572 | re->first = token; | |
573 | needfirst = 0; | |
574 | } | |
575 | re->minlen++; | |
576 | } | |
577 | else if (token == M_ANY || IS_CLASS(token)) | |
578 | { | |
579 | /* . or [] is NOT argument of closure */ | |
580 | needfirst = 0; | |
581 | re->minlen++; | |
582 | } | |
583 | ||
584 | /* the "token" character is not closure -- process it normally */ | |
585 | if (token == M_BEGLINE) | |
586 | { | |
587 | /* set the BOL flag instead of storing M_BEGLINE */ | |
588 | re->bol = 1; | |
589 | } | |
590 | else if (IS_META(token)) | |
591 | { | |
592 | ADD_META(build, token); | |
593 | } | |
594 | else | |
595 | { | |
596 | *build++ = token; | |
597 | } | |
598 | } | |
08746e8b | 599 | checkmem(); |
15637ed4 RG |
600 | |
601 | /* end it with a \) which MUST MATCH the opening \( */ | |
602 | ADD_META(build, M_END(0)); | |
603 | if (end_sp > 0) | |
604 | { | |
605 | FAIL("Not enough \\)s"); | |
606 | } | |
607 | ||
08746e8b AM |
608 | #ifdef DEBUG |
609 | if ((int)(build - re->program) != calced) | |
610 | { | |
611 | msg("regcomp error: calced=%d, actual=%d", calced, (int)(build - re->program)); | |
612 | getkey(0); | |
613 | } | |
614 | #endif | |
615 | ||
616 | checkmem(); | |
15637ed4 RG |
617 | return re; |
618 | } | |
619 | ||
620 | ||
621 | ||
622 | /*---------------------------------------------------------------------------*/ | |
623 | ||
624 | ||
625 | /* This function checks for a match between a character and a token which is | |
626 | * known to represent a single character. It returns 0 if they match, or | |
627 | * 1 if they don't. | |
628 | */ | |
629 | int match1(re, ch, token) | |
630 | regexp *re; | |
631 | REG char ch; | |
632 | REG int token; | |
633 | { | |
634 | if (!ch) | |
635 | { | |
636 | /* the end of a line can't match any RE of width 1 */ | |
637 | return 1; | |
638 | } | |
639 | if (token == M_ANY) | |
640 | { | |
641 | return 0; | |
642 | } | |
643 | else if (IS_CLASS(token)) | |
644 | { | |
08746e8b | 645 | if (re->program[1 + 32 * (token - M_CLASS(0)) + (UCHAR(ch) >> 3)] & (1 << (UCHAR(ch) & 7))) |
15637ed4 RG |
646 | return 0; |
647 | } | |
648 | else if (ch == token || *o_ignorecase && tolower(ch) == tolower(token)) | |
649 | { | |
650 | return 0; | |
651 | } | |
652 | return 1; | |
653 | } | |
654 | ||
655 | ||
656 | ||
657 | /* This function checks characters up to and including the next closure, at | |
658 | * which point it does a recursive call to check the rest of it. This function | |
659 | * returns 0 if everything matches, or 1 if something doesn't match. | |
660 | */ | |
661 | int match(re, str, prog, here) | |
662 | regexp *re; /* the regular expression */ | |
663 | char *str; /* the string */ | |
664 | REG char *prog; /* a portion of re->program, an compiled RE */ | |
665 | REG char *here; /* a portion of str, the string to compare it to */ | |
666 | { | |
667 | REG int token; /* the roken pointed to by prog */ | |
668 | REG int nmatched;/* counter, used during closure matching */ | |
669 | REG int closure;/* the token denoting the type of closure */ | |
670 | int from; /* minimum number of matches in closure */ | |
671 | int to; /* maximum number of matches in closure */ | |
672 | ||
673 | for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog)) | |
674 | { | |
675 | switch (token) | |
676 | { | |
677 | /*case M_BEGLINE: can't happen; re->bol is used instead */ | |
678 | case M_ENDLINE: | |
679 | if (*here) | |
680 | return 1; | |
681 | break; | |
682 | ||
683 | case M_BEGWORD: | |
684 | if (here != str && | |
685 | (here[-1] == '_' || isalnum(here[-1]))) | |
686 | return 1; | |
687 | break; | |
688 | ||
689 | case M_ENDWORD: | |
690 | if (here[0] == '_' || isalnum(here[0])) | |
691 | return 1; | |
692 | break; | |
693 | ||
694 | case M_START(0): | |
695 | case M_START(1): | |
696 | case M_START(2): | |
697 | case M_START(3): | |
698 | case M_START(4): | |
699 | case M_START(5): | |
700 | case M_START(6): | |
701 | case M_START(7): | |
702 | case M_START(8): | |
703 | case M_START(9): | |
704 | re->startp[token - M_START(0)] = (char *)here; | |
705 | break; | |
706 | ||
707 | case M_END(0): | |
708 | case M_END(1): | |
709 | case M_END(2): | |
710 | case M_END(3): | |
711 | case M_END(4): | |
712 | case M_END(5): | |
713 | case M_END(6): | |
714 | case M_END(7): | |
715 | case M_END(8): | |
716 | case M_END(9): | |
717 | re->endp[token - M_END(0)] = (char *)here; | |
718 | if (token == M_END(0)) | |
719 | { | |
720 | return 0; | |
721 | } | |
722 | break; | |
723 | ||
724 | default: /* literal, M_CLASS(n), or M_ANY */ | |
725 | if (match1(re, *here, token) != 0) | |
726 | return 1; | |
727 | here++; | |
728 | } | |
729 | } | |
730 | ||
731 | /* C L O S U R E */ | |
732 | ||
733 | /* step 1: see what we have to match against, and move "prog" to point | |
734 | * to the remainder of the compiled RE. | |
735 | */ | |
736 | closure = token; | |
737 | prog++; | |
738 | switch (closure) | |
739 | { | |
740 | case M_SPLAT: | |
741 | from = 0; | |
742 | to = strlen(str); /* infinity */ | |
743 | break; | |
744 | ||
745 | case M_PLUS: | |
746 | from = 1; | |
747 | to = strlen(str); /* infinity */ | |
748 | break; | |
749 | ||
750 | case M_QMARK: | |
751 | from = 0; | |
752 | to = 1; | |
753 | break; | |
754 | ||
755 | #ifndef CRUNCH | |
756 | case M_RANGE: | |
757 | from = UCHAR(*prog++); | |
758 | to = UCHAR(*prog++); | |
759 | if (to == 255) | |
760 | { | |
761 | to = strlen(str); /* infinity */ | |
762 | } | |
763 | break; | |
764 | #endif | |
765 | } | |
766 | token = GET_META(prog); | |
767 | prog++; | |
768 | ||
769 | /* step 2: see how many times we can match that token against the string */ | |
770 | for (nmatched = 0; | |
771 | nmatched < to && *here && match1(re, *here, token) == 0; | |
772 | nmatched++, here++) | |
773 | { | |
774 | } | |
775 | ||
776 | /* step 3: try to match the remainder, and back off if it doesn't */ | |
777 | while (nmatched >= from && match(re, str, prog, here) != 0) | |
778 | { | |
779 | nmatched--; | |
780 | here--; | |
781 | } | |
782 | ||
783 | /* so how did it work out? */ | |
784 | if (nmatched >= from) | |
785 | return 0; | |
786 | return 1; | |
787 | } | |
788 | ||
789 | ||
790 | ||
791 | /* This function searches through a string for text that matches an RE. */ | |
792 | int regexec(re, str, bol) | |
793 | regexp *re; /* the compiled regexp to search for */ | |
794 | char *str; /* the string to search through */ | |
795 | int bol; /* boolean: does str start at the beginning of a line? */ | |
796 | { | |
797 | char *prog; /* the entry point of re->program */ | |
798 | int len; /* length of the string */ | |
799 | REG char *here; | |
800 | ||
08746e8b AM |
801 | checkmem(); |
802 | ||
15637ed4 RG |
803 | /* if must start at the beginning of a line, and this isn't, then fail */ |
804 | if (re->bol && !bol) | |
805 | { | |
806 | return 0; | |
807 | } | |
808 | ||
809 | len = strlen(str); | |
810 | prog = re->program + 1 + 32 * re->program[0]; | |
811 | ||
812 | /* search for the RE in the string */ | |
813 | if (re->bol) | |
814 | { | |
815 | /* must occur at BOL */ | |
816 | if ((re->first | |
817 | && match1(re, *(char *)str, re->first))/* wrong first letter? */ | |
818 | || len < re->minlen /* not long enough? */ | |
819 | || match(re, (char *)str, prog, str)) /* doesn't match? */ | |
820 | return 0; /* THEN FAIL! */ | |
821 | } | |
822 | #ifndef CRUNCH | |
823 | else if (!*o_ignorecase) | |
824 | { | |
825 | /* can occur anywhere in the line, noignorecase */ | |
826 | for (here = (char *)str; | |
827 | (re->first && re->first != *here) | |
828 | || match(re, (char *)str, prog, here); | |
829 | here++, len--) | |
830 | { | |
831 | if (len < re->minlen) | |
832 | return 0; | |
833 | } | |
834 | } | |
835 | #endif | |
836 | else | |
837 | { | |
838 | /* can occur anywhere in the line, ignorecase */ | |
839 | for (here = (char *)str; | |
840 | (re->first && match1(re, *here, (int)re->first)) | |
841 | || match(re, (char *)str, prog, here); | |
842 | here++, len--) | |
843 | { | |
844 | if (len < re->minlen) | |
845 | return 0; | |
846 | } | |
847 | } | |
848 | ||
849 | /* if we didn't fail, then we must have succeeded */ | |
08746e8b | 850 | checkmem(); |
15637ed4 RG |
851 | return 1; |
852 | } | |
853 | ||
854 | /*============================================================================*/ | |
855 | #else /* NO_MAGIC */ | |
856 | ||
857 | regexp *regcomp(exp) | |
858 | char *exp; | |
859 | { | |
860 | char *src; | |
861 | char *dest; | |
862 | regexp *re; | |
863 | int i; | |
864 | ||
865 | /* allocate a big enough regexp structure */ | |
866 | #ifdef lint | |
867 | re = (regexp *)0; | |
868 | #else | |
869 | re = (regexp *)malloc((unsigned)(strlen(exp) + 1 + sizeof(struct regexp))); | |
870 | #endif | |
871 | if (!re) | |
872 | { | |
6e657cf2 | 873 | regerr("Could not malloc a regexp structure"); |
15637ed4 RG |
874 | return (regexp *)0; |
875 | } | |
876 | ||
877 | /* initialize all fields of the structure */ | |
878 | for (i = 0; i < NSUBEXP; i++) | |
879 | { | |
880 | re->startp[i] = re->endp[i] = (char *)0; | |
881 | } | |
882 | re->minlen = 0; | |
883 | re->first = 0; | |
884 | re->bol = 0; | |
885 | ||
886 | /* copy the string into it, translating ^ and $ as needed */ | |
887 | for (src = exp, dest = re->program + 1; *src; src++) | |
888 | { | |
889 | switch (*src) | |
890 | { | |
891 | case '^': | |
892 | if (src == exp) | |
893 | { | |
894 | re->bol += 1; | |
895 | } | |
896 | else | |
897 | { | |
898 | *dest++ = '^'; | |
899 | re->minlen++; | |
900 | } | |
901 | break; | |
902 | ||
903 | case '$': | |
904 | if (!src[1]) | |
905 | { | |
906 | re->bol += 2; | |
907 | } | |
908 | else | |
909 | { | |
910 | *dest++ = '$'; | |
911 | re->minlen++; | |
912 | } | |
913 | break; | |
914 | ||
915 | case '\\': | |
916 | if (src[1]) | |
917 | { | |
918 | *dest++ = *++src; | |
919 | re->minlen++; | |
920 | } | |
921 | else | |
922 | { | |
6e657cf2 | 923 | regerr("extra \\ at end of regular expression"); |
15637ed4 RG |
924 | } |
925 | break; | |
926 | ||
927 | default: | |
928 | *dest++ = *src; | |
929 | re->minlen++; | |
930 | } | |
931 | } | |
932 | *dest = '\0'; | |
933 | ||
934 | return re; | |
935 | } | |
936 | ||
937 | ||
938 | /* This "helper" function checks for a match at a given location. It returns | |
939 | * 1 if it matches, 0 if it doesn't match here but might match later on in the | |
940 | * string, or -1 if it could not possibly match | |
941 | */ | |
942 | static int reghelp(prog, string, bolflag) | |
943 | struct regexp *prog; | |
944 | char *string; | |
945 | int bolflag; | |
946 | { | |
947 | char *scan; | |
948 | char *str; | |
949 | ||
950 | /* if ^, then require bolflag */ | |
951 | if ((prog->bol & 1) && !bolflag) | |
952 | { | |
953 | return -1; | |
954 | } | |
955 | ||
956 | /* if it matches, then it will start here */ | |
957 | prog->startp[0] = string; | |
958 | ||
959 | /* compare, possibly ignoring case */ | |
960 | if (*o_ignorecase) | |
961 | { | |
962 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
963 | if (tolower(*scan) != tolower(*string)) | |
964 | return *string ? 0 : -1; | |
965 | } | |
966 | else | |
967 | { | |
968 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
969 | if (*scan != *string) | |
970 | return *string ? 0 : -1; | |
971 | } | |
972 | ||
973 | /* if $, then require string to end here, too */ | |
974 | if ((prog->bol & 2) && *string) | |
975 | { | |
976 | return 0; | |
977 | } | |
978 | ||
979 | /* if we get to here, it matches */ | |
980 | prog->endp[0] = string; | |
981 | return 1; | |
982 | } | |
983 | ||
984 | ||
985 | ||
986 | int regexec(prog, string, bolflag) | |
987 | struct regexp *prog; | |
988 | char *string; | |
989 | int bolflag; | |
990 | { | |
991 | int rc; | |
992 | ||
993 | /* keep trying to match it */ | |
994 | for (rc = reghelp(prog, string, bolflag); rc == 0; rc = reghelp(prog, string, 0)) | |
995 | { | |
996 | string++; | |
997 | } | |
998 | ||
999 | /* did we match? */ | |
1000 | return rc == 1; | |
1001 | } | |
1002 | #endif | |
6e657cf2 | 1003 | #endif /* !REGEX */ |