Commit | Line | Data |
---|---|---|
15637ed4 RG |
1 | /* regexp.c */ |
2 | ||
3 | /* This file contains the code that compiles regular expressions and executes | |
4 | * them. It supports the same syntax and features as vi's regular expression | |
5 | * code. Specifically, the meta characters are: | |
6 | * ^ matches the beginning of a line | |
7 | * $ matches the end of a line | |
8 | * \< matches the beginning of a word | |
9 | * \> matches the end of a word | |
10 | * . matches any single character | |
11 | * [] matches any character in a character class | |
12 | * \( delimits the start of a subexpression | |
13 | * \) delimits the end of a subexpression | |
14 | * * repeats the preceding 0 or more times | |
15 | * NOTE: You cannot follow a \) with a *. | |
16 | * | |
17 | * The physical structure of a compiled RE is as follows: | |
18 | * - First, there is a one-byte value that says how many character classes | |
19 | * are used in this regular expression | |
20 | * - Next, each character class is stored as a bitmap that is 256 bits | |
21 | * (32 bytes) long. | |
22 | * - A mixture of literal characters and compiled meta characters follows. | |
23 | * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars | |
24 | * are stored as a \n followed by a one-byte code, so they take up two | |
25 | * bytes apiece. Literal characters take up one byte apiece. \n can't | |
26 | * be used as a literal character. | |
27 | * | |
28 | * If NO_MAGIC is defined, then a different set of functions is used instead. | |
29 | * That right, this file contains TWO versions of the code. | |
30 | */ | |
31 | ||
32 | #include <setjmp.h> | |
33 | #include "config.h" | |
34 | #include "ctype.h" | |
35 | #include "vi.h" | |
6e657cf2 AM |
36 | #ifdef REGEX |
37 | # include <regex.h> | |
38 | #else | |
39 | # include "regexp.h" | |
40 | #endif | |
41 | ||
15637ed4 | 42 | |
6e657cf2 AM |
43 | #ifdef REGEX |
44 | extern int patlock; /* from cmd_substitute() module */ | |
15637ed4 | 45 | |
6e657cf2 | 46 | static regex_t *previous = NULL; /* the previous regexp, used when null regexp is given */ |
15637ed4 | 47 | |
6e657cf2 AM |
48 | regex_t * |
49 | optpat(s) | |
50 | char *s; | |
51 | { | |
99668b43 AM |
52 | char *neuter(); |
53 | char *expand_tilde(); | |
0b18c9d7 | 54 | |
6e657cf2 AM |
55 | int n; |
56 | if (*s == '\0') { | |
99668b43 | 57 | if (!previous) regerr("no previous pattern"); |
6e657cf2 AM |
58 | return previous; |
59 | } else if (previous && !patlock) | |
60 | regfree(previous); | |
61 | else if ((previous = (regex_t *) malloc(sizeof(regex_t))) == NULL) { | |
99668b43 | 62 | regerr("out of memory"); |
6e657cf2 AM |
63 | return previous; |
64 | } | |
65 | patlock = 0; | |
99668b43 AM |
66 | if ((s = *o_magic ? expand_tilde(s) : neuter(s)) == NULL) { |
67 | free(previous); | |
68 | return previous = NULL; | |
69 | } else if (n = regcomp(previous, s, *o_ignorecase ? REG_ICASE : 0)) { | |
70 | regerr("%d", n); | |
6e657cf2 AM |
71 | free(previous); |
72 | return previous = NULL; | |
73 | } | |
74 | return previous; | |
75 | } | |
0b18c9d7 | 76 | |
99668b43 AM |
77 | extern char *last_repl; /* replacement text from previous substitute */ |
78 | ||
79 | /* expand_tilde: expand ~'s in a BRE */ | |
80 | char * | |
81 | expand_tilde(s) | |
82 | char *s; | |
83 | { | |
84 | char *literalize(); | |
85 | static char *hd = NULL; | |
86 | ||
87 | char *t, *repl; | |
88 | int size; | |
89 | int offset; | |
90 | int m; | |
91 | ||
92 | free(hd); | |
93 | hd = t = malloc(size = strlen(s) + 1); | |
94 | while (*s) | |
95 | if (*s == '\\' && *(s + 1) == '~') { | |
96 | *t++ = *s++; | |
97 | *t++ = *s++; | |
98 | } else if (*s != '~') | |
99 | *t++ = *s++; | |
100 | else { | |
101 | if (!last_repl) { | |
102 | regerr("no previous replacement"); | |
103 | return NULL; | |
104 | } else if ((repl = literalize(last_repl)) == NULL) | |
105 | return NULL; | |
106 | m = strlen(repl); | |
107 | offset = t - hd; | |
108 | if ((hd = realloc(hd, size += m)) == NULL) { | |
109 | regerr("out of memory"); | |
110 | return NULL; | |
111 | } | |
112 | t = hd + offset; | |
113 | strcpy(t, repl); | |
114 | t += m; | |
115 | s++; | |
116 | } | |
117 | *t = '\0'; | |
118 | return hd; | |
119 | } | |
120 | ||
121 | ||
122 | /* escape BRE meta-characters and expand ~'s in a string */ | |
123 | char * | |
0b18c9d7 AM |
124 | neuter(s) |
125 | char *s; | |
126 | { | |
99668b43 | 127 | char *literalize(); |
0b18c9d7 AM |
128 | static char *hd = NULL; |
129 | ||
99668b43 AM |
130 | char *t, *repl; |
131 | int size; | |
132 | int offset; | |
133 | int m; | |
0b18c9d7 AM |
134 | |
135 | free(hd); | |
99668b43 AM |
136 | if ((hd = t = (char *) malloc(size = 2 * strlen(s) + 1)) == NULL) { |
137 | regerr("out of memory"); | |
0b18c9d7 | 138 | return NULL; |
99668b43 | 139 | } |
0b18c9d7 AM |
140 | if (*s == '^') |
141 | *t++ = *s++; | |
142 | while (*s) { | |
99668b43 AM |
143 | if (*s == '\\' && (*(s + 1) == '.' || *(s + 1) == '*' || |
144 | *(s + 1) == '[')) { | |
145 | s++; | |
146 | *t++ = *s++; | |
147 | continue; | |
148 | } else if (*s == '\\' && *(s + 1) == '~') { | |
149 | if (!last_repl) { | |
150 | regerr("no previous replacement"); | |
151 | return NULL; | |
152 | } else if ((repl = literalize(last_repl)) == NULL) | |
153 | return NULL; | |
154 | m = strlen(repl); | |
155 | offset = t - hd; | |
156 | if ((hd = realloc(hd, size += m)) == NULL) { | |
157 | regerr("out of memory"); | |
158 | return NULL; | |
159 | } | |
160 | t = hd + (offset); | |
161 | strcpy(t, repl); | |
162 | t += m; | |
163 | s += 2; | |
164 | continue; | |
165 | } else if (*s == '.' || *s == '\\' || *s == '[' || *s == '*') | |
0b18c9d7 AM |
166 | *t++ = '\\'; |
167 | *t++ = *s++; | |
168 | } | |
169 | *t = '\0'; | |
170 | return hd; | |
171 | } | |
172 | ||
99668b43 AM |
173 | |
174 | /* escape BRE meta-characters in a string */ | |
175 | char * | |
176 | literalize(s) | |
177 | char *s; | |
178 | { | |
179 | static char *hd = NULL; | |
180 | ||
181 | char *t; | |
182 | int size; | |
183 | int offset; | |
184 | int m; | |
185 | ||
186 | free(hd); | |
187 | if ((hd = t = (char *) malloc(size = 2 * strlen(s) + 1)) == NULL) { | |
188 | regerr("out of memory"); | |
189 | return NULL; | |
190 | } | |
191 | if (*o_magic) | |
192 | while (*s) { | |
193 | if (*s == '~' || *s == '&') { | |
194 | regerr("can't use ~ or & in pattern"); | |
195 | return NULL; | |
196 | } else if (*s == '\\') { | |
197 | s++; | |
198 | if (isdigit(*s)) { | |
199 | regerr("can't use \\d in pattern"); | |
200 | return NULL; | |
201 | } else if (*s == '&' || *s == '~') { | |
202 | *t++ = '\\'; | |
203 | *t++ = *s++; | |
204 | } | |
205 | } else if (*s == '^' || *s == '$' || *s == '.' || | |
206 | *s == '[' || *s == '*') { | |
207 | *t++ = '\\'; | |
208 | *t++ = *s++; | |
209 | } else | |
210 | *t++ = *s++; | |
211 | } | |
212 | else | |
213 | while (*s) { | |
214 | if (*s == '\\') { | |
215 | s++; | |
216 | if (*s == '&' || *s == '~') { | |
217 | regerr("can't use \\~ or \\& in pattern"); | |
218 | return NULL; | |
219 | } else if (isdigit(*s)) { | |
220 | regerr("can't use \\d in pattern"); | |
221 | return NULL; | |
222 | } | |
223 | } else | |
224 | *t++ = *s++; | |
225 | } | |
226 | *t = '\0'; | |
227 | return hd; | |
228 | } | |
229 | ||
6e657cf2 | 230 | #else |
15637ed4 RG |
231 | static char *previous; /* the previous regexp, used when null regexp is given */ |
232 | ||
233 | ||
234 | #ifndef NO_MAGIC | |
235 | /* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */ | |
236 | ||
237 | /* These are used to classify or recognize meta-characters */ | |
238 | #define META '\0' | |
239 | #define BASE_META(m) ((m) - 256) | |
240 | #define INT_META(c) ((c) + 256) | |
241 | #define IS_META(m) ((m) >= 256) | |
242 | #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9)) | |
243 | #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9)) | |
244 | #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9)) | |
245 | #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE) | |
246 | #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m)) | |
247 | #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s) | |
248 | ||
249 | /* These are the internal codes used for each type of meta-character */ | |
250 | #define M_BEGLINE 256 /* internal code for ^ */ | |
251 | #define M_ENDLINE 257 /* internal code for $ */ | |
252 | #define M_BEGWORD 258 /* internal code for \< */ | |
253 | #define M_ENDWORD 259 /* internal code for \> */ | |
254 | #define M_ANY 260 /* internal code for . */ | |
255 | #define M_SPLAT 261 /* internal code for * */ | |
256 | #define M_PLUS 262 /* internal code for \+ */ | |
257 | #define M_QMARK 263 /* internal code for \? */ | |
258 | #define M_RANGE 264 /* internal code for \{ */ | |
259 | #define M_CLASS(n) (265+(n)) /* internal code for [] */ | |
260 | #define M_START(n) (275+(n)) /* internal code for \( */ | |
261 | #define M_END(n) (285+(n)) /* internal code for \) */ | |
262 | ||
263 | /* These are used during compilation */ | |
264 | static int class_cnt; /* used to assign class IDs */ | |
265 | static int start_cnt; /* used to assign start IDs */ | |
266 | static int end_stk[NSUBEXP];/* used to assign end IDs */ | |
267 | static int end_sp; | |
268 | static char *retext; /* points to the text being compiled */ | |
269 | ||
270 | /* error-handling stuff */ | |
271 | jmp_buf errorhandler; | |
6e657cf2 | 272 | #define FAIL(why) regerr(why); longjmp(errorhandler, 1) |
15637ed4 RG |
273 | |
274 | ||
275 | ||
276 | ||
277 | ||
278 | /* This function builds a bitmap for a particular class */ | |
279 | static char *makeclass(text, bmap) | |
280 | REG char *text; /* start of the class */ | |
281 | REG char *bmap; /* the bitmap */ | |
282 | { | |
283 | REG int i; | |
284 | int complement = 0; | |
285 | ||
286 | ||
08746e8b AM |
287 | checkmem(); |
288 | ||
15637ed4 RG |
289 | /* zero the bitmap */ |
290 | for (i = 0; bmap && i < 32; i++) | |
291 | { | |
292 | bmap[i] = 0; | |
293 | } | |
294 | ||
295 | /* see if we're going to complement this class */ | |
296 | if (*text == '^') | |
297 | { | |
298 | text++; | |
299 | complement = 1; | |
300 | } | |
301 | ||
302 | /* add in the characters */ | |
303 | while (*text && *text != ']') | |
304 | { | |
305 | /* is this a span of characters? */ | |
306 | if (text[1] == '-' && text[2]) | |
307 | { | |
308 | /* spans can't be backwards */ | |
309 | if (text[0] > text[2]) | |
310 | { | |
311 | FAIL("Backwards span in []"); | |
312 | } | |
313 | ||
314 | /* add each character in the span to the bitmap */ | |
08746e8b | 315 | for (i = UCHAR(text[0]); bmap && (unsigned)i <= UCHAR(text[2]); i++) |
15637ed4 RG |
316 | { |
317 | bmap[i >> 3] |= (1 << (i & 7)); | |
318 | } | |
319 | ||
320 | /* move past this span */ | |
321 | text += 3; | |
322 | } | |
323 | else | |
324 | { | |
325 | /* add this single character to the span */ | |
326 | i = *text++; | |
327 | if (bmap) | |
328 | { | |
08746e8b | 329 | bmap[UCHAR(i) >> 3] |= (1 << (UCHAR(i) & 7)); |
15637ed4 RG |
330 | } |
331 | } | |
332 | } | |
333 | ||
334 | /* make sure the closing ] is missing */ | |
335 | if (*text++ != ']') | |
336 | { | |
337 | FAIL("] missing"); | |
338 | } | |
339 | ||
340 | /* if we're supposed to complement this class, then do so */ | |
341 | if (complement && bmap) | |
342 | { | |
343 | for (i = 0; i < 32; i++) | |
344 | { | |
345 | bmap[i] = ~bmap[i]; | |
346 | } | |
347 | } | |
348 | ||
08746e8b AM |
349 | checkmem(); |
350 | ||
15637ed4 RG |
351 | return text; |
352 | } | |
353 | ||
354 | ||
355 | ||
356 | ||
357 | /* This function gets the next character or meta character from a string. | |
358 | * The pointer is incremented by 1, or by 2 for \-quoted characters. For [], | |
359 | * a bitmap is generated via makeclass() (if re is given), and the | |
360 | * character-class text is skipped. | |
361 | */ | |
362 | static int gettoken(sptr, re) | |
363 | char **sptr; | |
364 | regexp *re; | |
365 | { | |
366 | int c; | |
367 | ||
368 | c = **sptr; | |
08746e8b AM |
369 | if (!c) |
370 | { | |
371 | return c; | |
372 | } | |
15637ed4 RG |
373 | ++*sptr; |
374 | if (c == '\\') | |
375 | { | |
376 | c = **sptr; | |
377 | ++*sptr; | |
378 | switch (c) | |
379 | { | |
380 | case '<': | |
381 | return M_BEGWORD; | |
382 | ||
383 | case '>': | |
384 | return M_ENDWORD; | |
385 | ||
386 | case '(': | |
387 | if (start_cnt >= NSUBEXP) | |
388 | { | |
389 | FAIL("Too many \\(s"); | |
390 | } | |
391 | end_stk[end_sp++] = start_cnt; | |
392 | return M_START(start_cnt++); | |
393 | ||
394 | case ')': | |
395 | if (end_sp <= 0) | |
396 | { | |
397 | FAIL("Mismatched \\)"); | |
398 | } | |
399 | return M_END(end_stk[--end_sp]); | |
400 | ||
401 | case '*': | |
402 | return (*o_magic ? c : M_SPLAT); | |
403 | ||
404 | case '.': | |
405 | return (*o_magic ? c : M_ANY); | |
406 | ||
407 | case '+': | |
408 | return M_PLUS; | |
409 | ||
410 | case '?': | |
411 | return M_QMARK; | |
412 | #ifndef CRUNCH | |
413 | case '{': | |
414 | return M_RANGE; | |
415 | #endif | |
416 | default: | |
417 | return c; | |
418 | } | |
419 | } | |
420 | else if (*o_magic) | |
421 | { | |
422 | switch (c) | |
423 | { | |
424 | case '^': | |
425 | if (*sptr == retext + 1) | |
426 | { | |
427 | return M_BEGLINE; | |
428 | } | |
429 | return c; | |
430 | ||
431 | case '$': | |
432 | if (!**sptr) | |
433 | { | |
434 | return M_ENDLINE; | |
435 | } | |
436 | return c; | |
437 | ||
438 | case '.': | |
439 | return M_ANY; | |
440 | ||
441 | case '*': | |
442 | return M_SPLAT; | |
443 | ||
444 | case '[': | |
445 | /* make sure we don't have too many classes */ | |
446 | if (class_cnt >= 10) | |
447 | { | |
448 | FAIL("Too many []s"); | |
449 | } | |
450 | ||
451 | /* process the character list for this class */ | |
452 | if (re) | |
453 | { | |
454 | /* generate the bitmap for this class */ | |
455 | *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt); | |
456 | } | |
457 | else | |
458 | { | |
459 | /* skip to end of the class */ | |
460 | *sptr = makeclass(*sptr, (char *)0); | |
461 | } | |
462 | return M_CLASS(class_cnt++); | |
463 | ||
464 | default: | |
465 | return c; | |
466 | } | |
467 | } | |
468 | else /* unquoted nomagic */ | |
469 | { | |
470 | switch (c) | |
471 | { | |
472 | case '^': | |
473 | if (*sptr == retext + 1) | |
474 | { | |
475 | return M_BEGLINE; | |
476 | } | |
477 | return c; | |
478 | ||
479 | case '$': | |
480 | if (!**sptr) | |
481 | { | |
482 | return M_ENDLINE; | |
483 | } | |
484 | return c; | |
485 | ||
486 | default: | |
487 | return c; | |
488 | } | |
489 | } | |
490 | /*NOTREACHED*/ | |
491 | } | |
492 | ||
493 | ||
494 | ||
495 | ||
496 | /* This function calculates the number of bytes that will be needed for a | |
497 | * compiled RE. Its argument is the uncompiled version. It is not clever | |
498 | * about catching syntax errors; that is done in a later pass. | |
499 | */ | |
500 | static unsigned calcsize(text) | |
501 | char *text; | |
502 | { | |
503 | unsigned size; | |
504 | int token; | |
505 | ||
506 | retext = text; | |
507 | class_cnt = 0; | |
508 | start_cnt = 1; | |
509 | end_sp = 0; | |
510 | size = 5; | |
511 | while ((token = gettoken(&text, (regexp *)0)) != 0) | |
512 | { | |
513 | if (IS_CLASS(token)) | |
514 | { | |
515 | size += 34; | |
516 | } | |
517 | #ifndef CRUNCH | |
518 | else if (token == M_RANGE) | |
519 | { | |
520 | size += 4; | |
521 | while ((token = gettoken(&text, (regexp *)0)) != 0 | |
522 | && token != '}') | |
523 | { | |
524 | } | |
525 | if (!token) | |
526 | { | |
527 | return size; | |
528 | } | |
529 | } | |
530 | #endif | |
531 | else if (IS_META(token)) | |
532 | { | |
533 | size += 2; | |
534 | } | |
535 | else | |
536 | { | |
537 | size++; | |
538 | } | |
539 | } | |
540 | ||
541 | return size; | |
542 | } | |
543 | ||
544 | ||
545 | ||
546 | /* This function compiles a regexp. */ | |
547 | regexp *regcomp(exp) | |
548 | char *exp; | |
549 | { | |
550 | int needfirst; | |
551 | unsigned size; | |
552 | int token; | |
553 | int peek; | |
554 | char *build; | |
08746e8b AM |
555 | #if __STDC__ |
556 | volatile | |
557 | #endif | |
15637ed4 RG |
558 | regexp *re; |
559 | #ifndef CRUNCH | |
560 | int from; | |
561 | int to; | |
562 | int digit; | |
563 | #endif | |
08746e8b AM |
564 | #ifdef DEBUG |
565 | int calced; | |
566 | #endif | |
567 | ||
15637ed4 | 568 | |
08746e8b | 569 | checkmem(); |
15637ed4 RG |
570 | |
571 | /* prepare for error handling */ | |
572 | re = (regexp *)0; | |
573 | if (setjmp(errorhandler)) | |
574 | { | |
08746e8b | 575 | checkmem(); |
15637ed4 RG |
576 | if (re) |
577 | { | |
08746e8b | 578 | _free_(re); |
15637ed4 RG |
579 | } |
580 | return (regexp *)0; | |
581 | } | |
582 | ||
583 | /* if an empty regexp string was given, use the previous one */ | |
584 | if (*exp == 0) | |
585 | { | |
586 | if (!previous) | |
587 | { | |
588 | FAIL("No previous RE"); | |
589 | } | |
590 | exp = previous; | |
591 | } | |
592 | else /* non-empty regexp given, so remember it */ | |
593 | { | |
594 | if (previous) | |
08746e8b | 595 | _free_(previous); |
15637ed4 RG |
596 | previous = (char *)malloc((unsigned)(strlen(exp) + 1)); |
597 | if (previous) | |
598 | strcpy(previous, exp); | |
599 | } | |
600 | ||
601 | /* allocate memory */ | |
08746e8b | 602 | checkmem(); |
15637ed4 RG |
603 | class_cnt = 0; |
604 | start_cnt = 1; | |
605 | end_sp = 0; | |
606 | retext = exp; | |
08746e8b AM |
607 | #ifdef DEBUG |
608 | calced = calcsize(exp); | |
609 | size = calced + sizeof(regexp); | |
610 | #else | |
15637ed4 | 611 | size = calcsize(exp) + sizeof(regexp) + 10; /* !!! 10 bytes for slop */ |
08746e8b | 612 | #endif |
15637ed4 | 613 | #ifdef lint |
08746e8b | 614 | re = (regexp *)0; |
15637ed4 RG |
615 | #else |
616 | re = (regexp *)malloc((unsigned)size); | |
617 | #endif | |
618 | if (!re) | |
619 | { | |
620 | FAIL("Not enough memory for this RE"); | |
621 | } | |
08746e8b | 622 | checkmem(); |
15637ed4 RG |
623 | |
624 | /* compile it */ | |
625 | build = &re->program[1 + 32 * class_cnt]; | |
626 | re->program[0] = class_cnt; | |
627 | for (token = 0; token < NSUBEXP; token++) | |
628 | { | |
629 | re->startp[token] = re->endp[token] = (char *)0; | |
630 | } | |
631 | re->first = 0; | |
632 | re->bol = 0; | |
633 | re->minlen = 0; | |
634 | needfirst = 1; | |
635 | class_cnt = 0; | |
636 | start_cnt = 1; | |
637 | end_sp = 0; | |
638 | retext = exp; | |
639 | for (token = M_START(0), peek = gettoken(&exp, re); | |
640 | token; | |
641 | token = peek, peek = gettoken(&exp, re)) | |
642 | { | |
643 | /* special processing for the closure operator */ | |
644 | if (IS_CLOSURE(peek)) | |
645 | { | |
646 | /* detect misuse of closure operator */ | |
647 | if (IS_START(token)) | |
648 | { | |
649 | FAIL("Closure operator follows nothing"); | |
650 | } | |
651 | else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) | |
652 | { | |
653 | FAIL("Closure operators can only follow a normal character or . or []"); | |
654 | } | |
655 | ||
656 | #ifndef CRUNCH | |
657 | /* if \{ \} then read the range */ | |
658 | if (peek == M_RANGE) | |
659 | { | |
660 | from = 0; | |
661 | for (digit = gettoken(&exp, re); | |
662 | !IS_META(digit) && isdigit(digit); | |
663 | digit = gettoken(&exp, re)) | |
664 | { | |
665 | from = from * 10 + digit - '0'; | |
666 | } | |
667 | if (digit == '}') | |
668 | { | |
669 | to = from; | |
670 | } | |
671 | else if (digit == ',') | |
672 | { | |
673 | to = 0; | |
674 | for (digit = gettoken(&exp, re); | |
675 | !IS_META(digit) && isdigit(digit); | |
676 | digit = gettoken(&exp, re)) | |
677 | { | |
678 | to = to * 10 + digit - '0'; | |
679 | } | |
680 | if (to == 0) | |
681 | { | |
682 | to = 255; | |
683 | } | |
684 | } | |
685 | if (digit != '}') | |
686 | { | |
687 | FAIL("Bad characters after \\{"); | |
688 | } | |
689 | else if (to < from || to == 0 || from >= 255) | |
690 | { | |
691 | FAIL("Invalid range for \\{ \\}"); | |
692 | } | |
693 | re->minlen += from; | |
694 | } | |
695 | else | |
696 | #endif | |
697 | if (peek != M_SPLAT) | |
698 | { | |
699 | re->minlen++; | |
700 | } | |
701 | ||
702 | /* it is okay -- make it prefix instead of postfix */ | |
703 | ADD_META(build, peek); | |
704 | #ifndef CRUNCH | |
705 | if (peek == M_RANGE) | |
706 | { | |
707 | *build++ = from; | |
708 | *build++ = (to < 255 ? to : 255); | |
709 | } | |
710 | #endif | |
711 | ||
712 | ||
713 | /* take care of "needfirst" - is this the first char? */ | |
714 | if (needfirst && peek == M_PLUS && !IS_META(token)) | |
715 | { | |
716 | re->first = token; | |
717 | } | |
718 | needfirst = 0; | |
719 | ||
720 | /* we used "peek" -- need to refill it */ | |
721 | peek = gettoken(&exp, re); | |
722 | if (IS_CLOSURE(peek)) | |
723 | { | |
724 | FAIL("* or \\+ or \\? doubled up"); | |
725 | } | |
726 | } | |
727 | else if (!IS_META(token)) | |
728 | { | |
729 | /* normal char is NOT argument of closure */ | |
730 | if (needfirst) | |
731 | { | |
732 | re->first = token; | |
733 | needfirst = 0; | |
734 | } | |
735 | re->minlen++; | |
736 | } | |
737 | else if (token == M_ANY || IS_CLASS(token)) | |
738 | { | |
739 | /* . or [] is NOT argument of closure */ | |
740 | needfirst = 0; | |
741 | re->minlen++; | |
742 | } | |
743 | ||
744 | /* the "token" character is not closure -- process it normally */ | |
745 | if (token == M_BEGLINE) | |
746 | { | |
747 | /* set the BOL flag instead of storing M_BEGLINE */ | |
748 | re->bol = 1; | |
749 | } | |
750 | else if (IS_META(token)) | |
751 | { | |
752 | ADD_META(build, token); | |
753 | } | |
754 | else | |
755 | { | |
756 | *build++ = token; | |
757 | } | |
758 | } | |
08746e8b | 759 | checkmem(); |
15637ed4 RG |
760 | |
761 | /* end it with a \) which MUST MATCH the opening \( */ | |
762 | ADD_META(build, M_END(0)); | |
763 | if (end_sp > 0) | |
764 | { | |
765 | FAIL("Not enough \\)s"); | |
766 | } | |
767 | ||
08746e8b AM |
768 | #ifdef DEBUG |
769 | if ((int)(build - re->program) != calced) | |
770 | { | |
771 | msg("regcomp error: calced=%d, actual=%d", calced, (int)(build - re->program)); | |
772 | getkey(0); | |
773 | } | |
774 | #endif | |
775 | ||
776 | checkmem(); | |
15637ed4 RG |
777 | return re; |
778 | } | |
779 | ||
780 | ||
781 | ||
782 | /*---------------------------------------------------------------------------*/ | |
783 | ||
784 | ||
785 | /* This function checks for a match between a character and a token which is | |
786 | * known to represent a single character. It returns 0 if they match, or | |
787 | * 1 if they don't. | |
788 | */ | |
789 | int match1(re, ch, token) | |
790 | regexp *re; | |
791 | REG char ch; | |
792 | REG int token; | |
793 | { | |
794 | if (!ch) | |
795 | { | |
796 | /* the end of a line can't match any RE of width 1 */ | |
797 | return 1; | |
798 | } | |
799 | if (token == M_ANY) | |
800 | { | |
801 | return 0; | |
802 | } | |
803 | else if (IS_CLASS(token)) | |
804 | { | |
08746e8b | 805 | if (re->program[1 + 32 * (token - M_CLASS(0)) + (UCHAR(ch) >> 3)] & (1 << (UCHAR(ch) & 7))) |
15637ed4 RG |
806 | return 0; |
807 | } | |
808 | else if (ch == token || *o_ignorecase && tolower(ch) == tolower(token)) | |
809 | { | |
810 | return 0; | |
811 | } | |
812 | return 1; | |
813 | } | |
814 | ||
815 | ||
816 | ||
817 | /* This function checks characters up to and including the next closure, at | |
818 | * which point it does a recursive call to check the rest of it. This function | |
819 | * returns 0 if everything matches, or 1 if something doesn't match. | |
820 | */ | |
821 | int match(re, str, prog, here) | |
822 | regexp *re; /* the regular expression */ | |
823 | char *str; /* the string */ | |
824 | REG char *prog; /* a portion of re->program, an compiled RE */ | |
825 | REG char *here; /* a portion of str, the string to compare it to */ | |
826 | { | |
827 | REG int token; /* the roken pointed to by prog */ | |
828 | REG int nmatched;/* counter, used during closure matching */ | |
829 | REG int closure;/* the token denoting the type of closure */ | |
830 | int from; /* minimum number of matches in closure */ | |
831 | int to; /* maximum number of matches in closure */ | |
832 | ||
833 | for (token = GET_META(prog); !IS_CLOSURE(token); prog++, token = GET_META(prog)) | |
834 | { | |
835 | switch (token) | |
836 | { | |
837 | /*case M_BEGLINE: can't happen; re->bol is used instead */ | |
838 | case M_ENDLINE: | |
839 | if (*here) | |
840 | return 1; | |
841 | break; | |
842 | ||
843 | case M_BEGWORD: | |
844 | if (here != str && | |
845 | (here[-1] == '_' || isalnum(here[-1]))) | |
846 | return 1; | |
847 | break; | |
848 | ||
849 | case M_ENDWORD: | |
850 | if (here[0] == '_' || isalnum(here[0])) | |
851 | return 1; | |
852 | break; | |
853 | ||
854 | case M_START(0): | |
855 | case M_START(1): | |
856 | case M_START(2): | |
857 | case M_START(3): | |
858 | case M_START(4): | |
859 | case M_START(5): | |
860 | case M_START(6): | |
861 | case M_START(7): | |
862 | case M_START(8): | |
863 | case M_START(9): | |
864 | re->startp[token - M_START(0)] = (char *)here; | |
865 | break; | |
866 | ||
867 | case M_END(0): | |
868 | case M_END(1): | |
869 | case M_END(2): | |
870 | case M_END(3): | |
871 | case M_END(4): | |
872 | case M_END(5): | |
873 | case M_END(6): | |
874 | case M_END(7): | |
875 | case M_END(8): | |
876 | case M_END(9): | |
877 | re->endp[token - M_END(0)] = (char *)here; | |
878 | if (token == M_END(0)) | |
879 | { | |
880 | return 0; | |
881 | } | |
882 | break; | |
883 | ||
884 | default: /* literal, M_CLASS(n), or M_ANY */ | |
885 | if (match1(re, *here, token) != 0) | |
886 | return 1; | |
887 | here++; | |
888 | } | |
889 | } | |
890 | ||
891 | /* C L O S U R E */ | |
892 | ||
893 | /* step 1: see what we have to match against, and move "prog" to point | |
894 | * to the remainder of the compiled RE. | |
895 | */ | |
896 | closure = token; | |
897 | prog++; | |
898 | switch (closure) | |
899 | { | |
900 | case M_SPLAT: | |
901 | from = 0; | |
902 | to = strlen(str); /* infinity */ | |
903 | break; | |
904 | ||
905 | case M_PLUS: | |
906 | from = 1; | |
907 | to = strlen(str); /* infinity */ | |
908 | break; | |
909 | ||
910 | case M_QMARK: | |
911 | from = 0; | |
912 | to = 1; | |
913 | break; | |
914 | ||
915 | #ifndef CRUNCH | |
916 | case M_RANGE: | |
917 | from = UCHAR(*prog++); | |
918 | to = UCHAR(*prog++); | |
919 | if (to == 255) | |
920 | { | |
921 | to = strlen(str); /* infinity */ | |
922 | } | |
923 | break; | |
924 | #endif | |
925 | } | |
926 | token = GET_META(prog); | |
927 | prog++; | |
928 | ||
929 | /* step 2: see how many times we can match that token against the string */ | |
930 | for (nmatched = 0; | |
931 | nmatched < to && *here && match1(re, *here, token) == 0; | |
932 | nmatched++, here++) | |
933 | { | |
934 | } | |
935 | ||
936 | /* step 3: try to match the remainder, and back off if it doesn't */ | |
937 | while (nmatched >= from && match(re, str, prog, here) != 0) | |
938 | { | |
939 | nmatched--; | |
940 | here--; | |
941 | } | |
942 | ||
943 | /* so how did it work out? */ | |
944 | if (nmatched >= from) | |
945 | return 0; | |
946 | return 1; | |
947 | } | |
948 | ||
949 | ||
950 | ||
951 | /* This function searches through a string for text that matches an RE. */ | |
952 | int regexec(re, str, bol) | |
953 | regexp *re; /* the compiled regexp to search for */ | |
954 | char *str; /* the string to search through */ | |
955 | int bol; /* boolean: does str start at the beginning of a line? */ | |
956 | { | |
957 | char *prog; /* the entry point of re->program */ | |
958 | int len; /* length of the string */ | |
959 | REG char *here; | |
960 | ||
08746e8b AM |
961 | checkmem(); |
962 | ||
15637ed4 RG |
963 | /* if must start at the beginning of a line, and this isn't, then fail */ |
964 | if (re->bol && !bol) | |
965 | { | |
966 | return 0; | |
967 | } | |
968 | ||
969 | len = strlen(str); | |
970 | prog = re->program + 1 + 32 * re->program[0]; | |
971 | ||
972 | /* search for the RE in the string */ | |
973 | if (re->bol) | |
974 | { | |
975 | /* must occur at BOL */ | |
976 | if ((re->first | |
977 | && match1(re, *(char *)str, re->first))/* wrong first letter? */ | |
978 | || len < re->minlen /* not long enough? */ | |
979 | || match(re, (char *)str, prog, str)) /* doesn't match? */ | |
980 | return 0; /* THEN FAIL! */ | |
981 | } | |
982 | #ifndef CRUNCH | |
983 | else if (!*o_ignorecase) | |
984 | { | |
985 | /* can occur anywhere in the line, noignorecase */ | |
986 | for (here = (char *)str; | |
987 | (re->first && re->first != *here) | |
988 | || match(re, (char *)str, prog, here); | |
989 | here++, len--) | |
990 | { | |
991 | if (len < re->minlen) | |
992 | return 0; | |
993 | } | |
994 | } | |
995 | #endif | |
996 | else | |
997 | { | |
998 | /* can occur anywhere in the line, ignorecase */ | |
999 | for (here = (char *)str; | |
1000 | (re->first && match1(re, *here, (int)re->first)) | |
1001 | || match(re, (char *)str, prog, here); | |
1002 | here++, len--) | |
1003 | { | |
1004 | if (len < re->minlen) | |
1005 | return 0; | |
1006 | } | |
1007 | } | |
1008 | ||
1009 | /* if we didn't fail, then we must have succeeded */ | |
08746e8b | 1010 | checkmem(); |
15637ed4 RG |
1011 | return 1; |
1012 | } | |
1013 | ||
1014 | /*============================================================================*/ | |
1015 | #else /* NO_MAGIC */ | |
1016 | ||
1017 | regexp *regcomp(exp) | |
1018 | char *exp; | |
1019 | { | |
1020 | char *src; | |
1021 | char *dest; | |
1022 | regexp *re; | |
1023 | int i; | |
1024 | ||
1025 | /* allocate a big enough regexp structure */ | |
1026 | #ifdef lint | |
1027 | re = (regexp *)0; | |
1028 | #else | |
1029 | re = (regexp *)malloc((unsigned)(strlen(exp) + 1 + sizeof(struct regexp))); | |
1030 | #endif | |
1031 | if (!re) | |
1032 | { | |
6e657cf2 | 1033 | regerr("Could not malloc a regexp structure"); |
15637ed4 RG |
1034 | return (regexp *)0; |
1035 | } | |
1036 | ||
1037 | /* initialize all fields of the structure */ | |
1038 | for (i = 0; i < NSUBEXP; i++) | |
1039 | { | |
1040 | re->startp[i] = re->endp[i] = (char *)0; | |
1041 | } | |
1042 | re->minlen = 0; | |
1043 | re->first = 0; | |
1044 | re->bol = 0; | |
1045 | ||
1046 | /* copy the string into it, translating ^ and $ as needed */ | |
1047 | for (src = exp, dest = re->program + 1; *src; src++) | |
1048 | { | |
1049 | switch (*src) | |
1050 | { | |
1051 | case '^': | |
1052 | if (src == exp) | |
1053 | { | |
1054 | re->bol += 1; | |
1055 | } | |
1056 | else | |
1057 | { | |
1058 | *dest++ = '^'; | |
1059 | re->minlen++; | |
1060 | } | |
1061 | break; | |
1062 | ||
1063 | case '$': | |
1064 | if (!src[1]) | |
1065 | { | |
1066 | re->bol += 2; | |
1067 | } | |
1068 | else | |
1069 | { | |
1070 | *dest++ = '$'; | |
1071 | re->minlen++; | |
1072 | } | |
1073 | break; | |
1074 | ||
1075 | case '\\': | |
1076 | if (src[1]) | |
1077 | { | |
1078 | *dest++ = *++src; | |
1079 | re->minlen++; | |
1080 | } | |
1081 | else | |
1082 | { | |
6e657cf2 | 1083 | regerr("extra \\ at end of regular expression"); |
15637ed4 RG |
1084 | } |
1085 | break; | |
1086 | ||
1087 | default: | |
1088 | *dest++ = *src; | |
1089 | re->minlen++; | |
1090 | } | |
1091 | } | |
1092 | *dest = '\0'; | |
1093 | ||
1094 | return re; | |
1095 | } | |
1096 | ||
1097 | ||
1098 | /* This "helper" function checks for a match at a given location. It returns | |
1099 | * 1 if it matches, 0 if it doesn't match here but might match later on in the | |
1100 | * string, or -1 if it could not possibly match | |
1101 | */ | |
1102 | static int reghelp(prog, string, bolflag) | |
1103 | struct regexp *prog; | |
1104 | char *string; | |
1105 | int bolflag; | |
1106 | { | |
1107 | char *scan; | |
1108 | char *str; | |
1109 | ||
1110 | /* if ^, then require bolflag */ | |
1111 | if ((prog->bol & 1) && !bolflag) | |
1112 | { | |
1113 | return -1; | |
1114 | } | |
1115 | ||
1116 | /* if it matches, then it will start here */ | |
1117 | prog->startp[0] = string; | |
1118 | ||
1119 | /* compare, possibly ignoring case */ | |
1120 | if (*o_ignorecase) | |
1121 | { | |
1122 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
1123 | if (tolower(*scan) != tolower(*string)) | |
1124 | return *string ? 0 : -1; | |
1125 | } | |
1126 | else | |
1127 | { | |
1128 | for (scan = &prog->program[1]; *scan; scan++, string++) | |
1129 | if (*scan != *string) | |
1130 | return *string ? 0 : -1; | |
1131 | } | |
1132 | ||
1133 | /* if $, then require string to end here, too */ | |
1134 | if ((prog->bol & 2) && *string) | |
1135 | { | |
1136 | return 0; | |
1137 | } | |
1138 | ||
1139 | /* if we get to here, it matches */ | |
1140 | prog->endp[0] = string; | |
1141 | return 1; | |
1142 | } | |
1143 | ||
1144 | ||
1145 | ||
1146 | int regexec(prog, string, bolflag) | |
1147 | struct regexp *prog; | |
1148 | char *string; | |
1149 | int bolflag; | |
1150 | { | |
1151 | int rc; | |
1152 | ||
1153 | /* keep trying to match it */ | |
1154 | for (rc = reghelp(prog, string, bolflag); rc == 0; rc = reghelp(prog, string, 0)) | |
1155 | { | |
1156 | string++; | |
1157 | } | |
1158 | ||
1159 | /* did we match? */ | |
1160 | return rc == 1; | |
1161 | } | |
1162 | #endif | |
6e657cf2 | 1163 | #endif /* !REGEX */ |