386BSD 0.1 development
[unix-history] / usr / src / usr.bin / sed / regex.c
CommitLineData
f87489ac
WJ
1/* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
17
18
19/* To test, compile with -Dtest. This Dtestable feature turns this into
20 a self-contained program which reads a pattern, describes how it
21 compiles, then reads a string and searches for it.
22
23 On the other hand, if you compile with both -Dtest and -Dcanned you
24 can run some tests we've already thought of. */
25
26/* AIX requires the alloca decl to be the first thing in the file. */
27#ifdef __GNUC__
28#define alloca __builtin_alloca
29#else
30#ifdef sparc
31#include <alloca.h>
32#else
33#ifdef _AIX
34#pragma alloca
35#else
36char *alloca ();
37#endif
38#endif
39#endif
40
41#ifdef emacs
42
43/* The `emacs' switch turns on certain special matching commands
44 that make sense only in emacs. */
45
46#include "config.h"
47#include "lisp.h"
48#include "buffer.h"
49#include "syntax.h"
50
51#else /* not emacs */
52
53#if defined (USG) || defined (STDC_HEADERS)
54#ifndef BSTRING
55#include <string.h>
56#define bcopy(s,d,n) memcpy((d),(s),(n))
57#define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
58#define bzero(s,n) memset((s),0,(n))
59#endif
60#endif
61
62#ifdef STDC_HEADERS
63#include <stdlib.h>
64#else
65char *malloc ();
66char *realloc ();
67#endif
68
69/* Define the syntax stuff, so we can do the \<, \>, etc. */
70
71/* This must be nonzero for the wordchar and notwordchar pattern
72 commands in re_match_2. */
73#ifndef Sword
74#define Sword 1
75#endif
76
77#define SYNTAX(c) re_syntax_table[c]
78
79
80#ifdef SYNTAX_TABLE
81
82char *re_syntax_table;
83
84#else /* not SYNTAX_TABLE */
85
86static char re_syntax_table[256];
87
88
89static void
90init_syntax_once ()
91{
92 register int c;
93 static int done = 0;
94
95 if (done)
96 return;
97
98 bzero (re_syntax_table, sizeof re_syntax_table);
99
100 for (c = 'a'; c <= 'z'; c++)
101 re_syntax_table[c] = Sword;
102
103 for (c = 'A'; c <= 'Z'; c++)
104 re_syntax_table[c] = Sword;
105
106 for (c = '0'; c <= '9'; c++)
107 re_syntax_table[c] = Sword;
108
109 done = 1;
110}
111
112#endif /* SYNTAX_TABLE */
113#endif /* emacs */
114
115/* We write fatal error messages on standard error. */
116#include <stdio.h>
117
118/* isalpha(3) etc. are used for the character classes. */
119#include <ctype.h>
120/* Sequents are missing isgraph. */
121#ifndef isgraph
122#define isgraph(c) (isprint((c)) && !isspace((c)))
123#endif
124
125/* Get the interface, including the syntax bits. */
126#include "regex.h"
127
128
129/* These are the command codes that appear in compiled regular
130 expressions, one per byte. Some command codes are followed by
131 argument bytes. A command code can specify any interpretation
132 whatsoever for its arguments. Zero-bytes may appear in the compiled
133 regular expression.
134
135 The value of `exactn' is needed in search.c (search_buffer) in emacs.
136 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
137 `exactn' we use here must also be 1. */
138
139enum regexpcode
140 {
141 unused=0,
142 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
143 begline, /* Fail unless at beginning of line. */
144 endline, /* Fail unless at end of line. */
145 jump, /* Followed by two bytes giving relative address to jump to. */
146 on_failure_jump, /* Followed by two bytes giving relative address of
147 place to resume at in case of failure. */
148 finalize_jump, /* Throw away latest failure point and then jump to
149 address. */
150 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
151 This is used to jump back to the beginning
152 of a repeat. If the command that follows
153 this jump is clearly incompatible with the
154 one at the beginning of the repeat, such that
155 we can be sure that there is no use backtracking
156 out of repetitions already completed,
157 then we finalize. */
158 dummy_failure_jump, /* Jump, and push a dummy failure point. This
159 failure point will be thrown away if an attempt
160 is made to use it for a failure. A + construct
161 makes this before the first repeat. Also
162 use it as an intermediary kind of jump when
163 compiling an or construct. */
164 succeed_n, /* Used like on_failure_jump except has to succeed n times;
165 then gets turned into an on_failure_jump. The relative
166 address following it is useless until then. The
167 address is followed by two bytes containing n. */
168 jump_n, /* Similar to jump, but jump n times only; also the relative
169 address following is in turn followed by yet two more bytes
170 containing n. */
171 set_number_at, /* Set the following relative location to the
172 subsequent number. */
173 anychar, /* Matches any (more or less) one character. */
174 charset, /* Matches any one char belonging to specified set.
175 First following byte is number of bitmap bytes.
176 Then come bytes for a bitmap saying which chars are in.
177 Bits in each byte are ordered low-bit-first.
178 A character is in the set if its bit is 1.
179 A character too large to have a bit in the map
180 is automatically not in the set. */
181 charset_not, /* Same parameters as charset, but match any character
182 that is not one of those specified. */
183 start_memory, /* Start remembering the text that is matched, for
184 storing in a memory register. Followed by one
185 byte containing the register number. Register numbers
186 must be in the range 0 through RE_NREGS. */
187 stop_memory, /* Stop remembering the text that is matched
188 and store it in a memory register. Followed by
189 one byte containing the register number. Register
190 numbers must be in the range 0 through RE_NREGS. */
191 duplicate, /* Match a duplicate of something remembered.
192 Followed by one byte containing the index of the memory
193 register. */
194 before_dot, /* Succeeds if before point. */
195 at_dot, /* Succeeds if at point. */
196 after_dot, /* Succeeds if after point. */
197 begbuf, /* Succeeds if at beginning of buffer. */
198 endbuf, /* Succeeds if at end of buffer. */
199 wordchar, /* Matches any word-constituent character. */
200 notwordchar, /* Matches any char that is not a word-constituent. */
201 wordbeg, /* Succeeds if at word beginning. */
202 wordend, /* Succeeds if at word end. */
203 wordbound, /* Succeeds if at a word boundary. */
204 notwordbound,/* Succeeds if not at a word boundary. */
205 syntaxspec, /* Matches any character whose syntax is specified.
206 followed by a byte which contains a syntax code,
207 e.g., Sword. */
208 notsyntaxspec /* Matches any character whose syntax differs from
209 that specified. */
210 };
211
212
213/* Number of failure points to allocate space for initially,
214 when matching. If this number is exceeded, more space is allocated,
215 so it is not a hard limit. */
216
217#ifndef NFAILURES
218#define NFAILURES 80
219#endif
220
221#ifdef CHAR_UNSIGNED
222#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
223#endif
224#ifndef SIGN_EXTEND_CHAR
225#define SIGN_EXTEND_CHAR(x) (x)
226#endif
227
228
229/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
230#define STORE_NUMBER(destination, number) \
231 { (destination)[0] = (number) & 0377; \
232 (destination)[1] = (number) >> 8; }
233
234/* Same as STORE_NUMBER, except increment the destination pointer to
235 the byte after where the number is stored. Watch out that values for
236 DESTINATION such as p + 1 won't work, whereas p will. */
237#define STORE_NUMBER_AND_INCR(destination, number) \
238 { STORE_NUMBER(destination, number); \
239 (destination) += 2; }
240
241
242/* Put into DESTINATION a number stored in two contingous bytes starting
243 at SOURCE. */
244#define EXTRACT_NUMBER(destination, source) \
245 { (destination) = *(source) & 0377; \
246 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
247
248/* Same as EXTRACT_NUMBER, except increment the pointer for source to
249 point to second byte of SOURCE. Note that SOURCE has to be a value
250 such as p, not, e.g., p + 1. */
251#define EXTRACT_NUMBER_AND_INCR(destination, source) \
252 { EXTRACT_NUMBER (destination, source); \
253 (source) += 2; }
254
255
256/* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
259
260 The argument SYNTAX is a bit-mask comprised of the various bits
261 defined in regex.h. */
262
263int
264re_set_syntax (syntax)
265 int syntax;
266{
267 int ret;
268
269 ret = obscure_syntax;
270 obscure_syntax = syntax;
271 return ret;
272}
273
274/* Set by re_set_syntax to the current regexp syntax to recognize. */
275int obscure_syntax = 0;
276
277
278\f
279/* Macros for re_compile_pattern, which is found below these definitions. */
280
281#define CHAR_CLASS_MAX_LENGTH 6
282
283/* Fetch the next character in the uncompiled pattern, translating it if
284 necessary. */
285#define PATFETCH(c) \
286 {if (p == pend) goto end_of_pattern; \
287 c = * (unsigned char *) p++; \
288 if (translate) c = translate[c]; }
289
290/* Fetch the next character in the uncompiled pattern, with no
291 translation. */
292#define PATFETCH_RAW(c) \
293 {if (p == pend) goto end_of_pattern; \
294 c = * (unsigned char *) p++; }
295
296#define PATUNFETCH p--
297
298
299/* If the buffer isn't allocated when it comes in, use this. */
300#define INIT_BUF_SIZE 28
301
302/* Make sure we have at least N more bytes of space in buffer. */
303#define GET_BUFFER_SPACE(n) \
304 { \
305 while (b - bufp->buffer + (n) >= bufp->allocated) \
306 EXTEND_BUFFER; \
307 }
308
309/* Make sure we have one more byte of buffer space and then add CH to it. */
310#define BUFPUSH(ch) \
311 { \
312 GET_BUFFER_SPACE (1); \
313 *b++ = (char) (ch); \
314 }
315
316/* Extend the buffer by twice its current size via reallociation and
317 reset the pointers that pointed into the old allocation to point to
318 the correct places in the new allocation. If extending the buffer
319 results in it being larger than 1 << 16, then flag memory exhausted. */
320#define EXTEND_BUFFER \
321 { char *old_buffer = bufp->buffer; \
322 if (bufp->allocated == (1L<<16)) goto too_big; \
323 bufp->allocated *= 2; \
324 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
325 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
326 if (bufp->buffer == 0) \
327 goto memory_exhausted; \
328 b = (b - old_buffer) + bufp->buffer; \
329 if (fixup_jump) \
330 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
331 if (laststart) \
332 laststart = (laststart - old_buffer) + bufp->buffer; \
333 begalt = (begalt - old_buffer) + bufp->buffer; \
334 if (pending_exact) \
335 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
336 }
337
338/* Set the bit for character C in a character set list. */
339#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
340
341/* Get the next unsigned number in the uncompiled pattern. */
342#define GET_UNSIGNED_NUMBER(num) \
343 { if (p != pend) \
344 { \
345 PATFETCH (c); \
346 while (isdigit (c)) \
347 { \
348 if (num < 0) \
349 num = 0; \
350 num = num * 10 + c - '0'; \
351 if (p == pend) \
352 break; \
353 PATFETCH (c); \
354 } \
355 } \
356 }
357
358/* Subroutines for re_compile_pattern. */
359static void store_jump (), insert_jump (), store_jump_n (),
360 insert_jump_n (), insert_op_2 ();
361
362
363/* re_compile_pattern takes a regular-expression string
364 and converts it into a buffer full of byte commands for matching.
365
366 PATTERN is the address of the pattern string
367 SIZE is the length of it.
368 BUFP is a struct re_pattern_buffer * which points to the info
369 on where to store the byte commands.
370 This structure contains a char * which points to the
371 actual space, which should have been obtained with malloc.
372 re_compile_pattern may use realloc to grow the buffer space.
373
374 The number of bytes of commands can be found out by looking in
375 the `struct re_pattern_buffer' that bufp pointed to, after
376 re_compile_pattern returns. */
377
378char *
379re_compile_pattern (pattern, size, bufp)
380 char *pattern;
381 int size;
382 struct re_pattern_buffer *bufp;
383{
384 register char *b = bufp->buffer;
385 register char *p = pattern;
386 char *pend = pattern + size;
387 register unsigned c, c1;
388 char *p1;
389 unsigned char *translate = (unsigned char *) bufp->translate;
390
391 /* Address of the count-byte of the most recently inserted `exactn'
392 command. This makes it possible to tell whether a new exact-match
393 character can be added to that command or requires a new `exactn'
394 command. */
395
396 char *pending_exact = 0;
397
398 /* Address of the place where a forward-jump should go to the end of
399 the containing expression. Each alternative of an `or', except the
400 last, ends with a forward-jump of this sort. */
401
402 char *fixup_jump = 0;
403
404 /* Address of start of the most recently finished expression.
405 This tells postfix * where to find the start of its operand. */
406
407 char *laststart = 0;
408
409 /* In processing a repeat, 1 means zero matches is allowed. */
410
411 char zero_times_ok;
412
413 /* In processing a repeat, 1 means many matches is allowed. */
414
415 char many_times_ok;
416
417 /* Address of beginning of regexp, or inside of last \(. */
418
419 char *begalt = b;
420
421 /* In processing an interval, at least this many matches must be made. */
422 int lower_bound;
423
424 /* In processing an interval, at most this many matches can be made. */
425 int upper_bound;
426
427 /* Place in pattern (i.e., the {) to which to go back if the interval
428 is invalid. */
429 char *beg_interval = 0;
430
431 /* Stack of information saved by \( and restored by \).
432 Four stack elements are pushed by each \(:
433 First, the value of b.
434 Second, the value of fixup_jump.
435 Third, the value of regnum.
436 Fourth, the value of begalt. */
437
438 int stackb[40];
439 int *stackp = stackb;
440 int *stacke = stackb + 40;
441 int *stackt;
442
443 /* Counts \('s as they are encountered. Remembered for the matching \),
444 where it becomes the register number to put in the stop_memory
445 command. */
446
447 int regnum = 1;
448
449 bufp->fastmap_accurate = 0;
450
451#ifndef emacs
452#ifndef SYNTAX_TABLE
453 /* Initialize the syntax table. */
454 init_syntax_once();
455#endif
456#endif
457
458 if (bufp->allocated == 0)
459 {
460 bufp->allocated = INIT_BUF_SIZE;
461 if (bufp->buffer)
462 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
463 bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
464 else
465 /* Caller did not allocate a buffer. Do it for them. */
466 bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
467 if (!bufp->buffer) goto memory_exhausted;
468 begalt = b = bufp->buffer;
469 }
470
471 while (p != pend)
472 {
473 PATFETCH (c);
474
475 switch (c)
476 {
477 case '$':
478 {
479 char *p1 = p;
480 /* When testing what follows the $,
481 look past the \-constructs that don't consume anything. */
482 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
483 while (p1 != pend)
484 {
485 if (*p1 == '\\' && p1 + 1 != pend
486 && (p1[1] == '<' || p1[1] == '>'
487 || p1[1] == '`' || p1[1] == '\''
488#ifdef emacs
489 || p1[1] == '='
490#endif
491 || p1[1] == 'b' || p1[1] == 'B'))
492 p1 += 2;
493 else
494 break;
495 }
496 if (obscure_syntax & RE_TIGHT_VBAR)
497 {
498 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
499 goto normal_char;
500 /* Make operand of last vbar end before this `$'. */
501 if (fixup_jump)
502 store_jump (fixup_jump, jump, b);
503 fixup_jump = 0;
504 BUFPUSH (endline);
505 break;
506 }
507 /* $ means succeed if at end of line, but only in special contexts.
508 If validly in the middle of a pattern, it is a normal character. */
509
510 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
511 goto invalid_pattern;
512 if (p1 == pend || *p1 == '\n'
513 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
514 || (obscure_syntax & RE_NO_BK_PARENS
515 ? *p1 == ')'
516 : *p1 == '\\' && p1[1] == ')')
517 || (obscure_syntax & RE_NO_BK_VBAR
518 ? *p1 == '|'
519 : *p1 == '\\' && p1[1] == '|'))
520 {
521 BUFPUSH (endline);
522 break;
523 }
524 goto normal_char;
525 }
526 case '^':
527 /* ^ means succeed if at beg of line, but only if no preceding
528 pattern. */
529
530 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
531 goto invalid_pattern;
532 if (laststart && p - 2 >= pattern && p[-2] != '\n'
533 && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
534 goto normal_char;
535 if (obscure_syntax & RE_TIGHT_VBAR)
536 {
537 if (p != pattern + 1
538 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
539 goto normal_char;
540 BUFPUSH (begline);
541 begalt = b;
542 }
543 else
544 BUFPUSH (begline);
545 break;
546
547 case '+':
548 case '?':
549 if ((obscure_syntax & RE_BK_PLUS_QM)
550 || (obscure_syntax & RE_LIMITED_OPS))
551 goto normal_char;
552 handle_plus:
553 case '*':
554 /* If there is no previous pattern, char not special. */
555 if (!laststart)
556 {
557 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
558 goto invalid_pattern;
559 else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
560 goto normal_char;
561 }
562 /* If there is a sequence of repetition chars,
563 collapse it down to just one. */
564 zero_times_ok = 0;
565 many_times_ok = 0;
566 while (1)
567 {
568 zero_times_ok |= c != '+';
569 many_times_ok |= c != '?';
570 if (p == pend)
571 break;
572 PATFETCH (c);
573 if (c == '*')
574 ;
575 else if (!(obscure_syntax & RE_BK_PLUS_QM)
576 && (c == '+' || c == '?'))
577 ;
578 else if ((obscure_syntax & RE_BK_PLUS_QM)
579 && c == '\\')
580 {
581 int c1;
582 PATFETCH (c1);
583 if (!(c1 == '+' || c1 == '?'))
584 {
585 PATUNFETCH;
586 PATUNFETCH;
587 break;
588 }
589 c = c1;
590 }
591 else
592 {
593 PATUNFETCH;
594 break;
595 }
596 }
597
598 /* Star, etc. applied to an empty pattern is equivalent
599 to an empty pattern. */
600 if (!laststart)
601 break;
602
603 /* Now we know whether or not zero matches is allowed
604 and also whether or not two or more matches is allowed. */
605 if (many_times_ok)
606 {
607 /* If more than one repetition is allowed, put in at the
608 end a backward relative jump from b to before the next
609 jump we're going to put in below (which jumps from
610 laststart to after this jump). */
611 GET_BUFFER_SPACE (3);
612 store_jump (b, maybe_finalize_jump, laststart - 3);
613 b += 3; /* Because store_jump put stuff here. */
614 }
615 /* On failure, jump from laststart to b + 3, which will be the
616 end of the buffer after this jump is inserted. */
617 GET_BUFFER_SPACE (3);
618 insert_jump (on_failure_jump, laststart, b + 3, b);
619 pending_exact = 0;
620 b += 3;
621 if (!zero_times_ok)
622 {
623 /* At least one repetition is required, so insert a
624 dummy-failure before the initial on-failure-jump
625 instruction of the loop. This effects a skip over that
626 instruction the first time we hit that loop. */
627 GET_BUFFER_SPACE (6);
628 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
629 b += 3;
630 }
631 break;
632
633 case '.':
634 laststart = b;
635 BUFPUSH (anychar);
636 break;
637
638 case '[':
639 if (p == pend)
640 goto invalid_pattern;
641 while (b - bufp->buffer
642 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
643 EXTEND_BUFFER;
644
645 laststart = b;
646 if (*p == '^')
647 {
648 BUFPUSH (charset_not);
649 p++;
650 }
651 else
652 BUFPUSH (charset);
653 p1 = p;
654
655 BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
656 /* Clear the whole map */
657 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
658
659 if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
660 SET_LIST_BIT ('\n');
661
662
663 /* Read in characters and ranges, setting map bits. */
664 while (1)
665 {
666 /* Don't translate while fetching, in case it's a range bound.
667 When we set the bit for the character, we translate it. */
668 PATFETCH_RAW (c);
669
670 /* If set, \ escapes characters when inside [...]. */
671 if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
672 {
673 PATFETCH(c1);
674 SET_LIST_BIT (c1);
675 continue;
676 }
677 if (c == ']')
678 {
679 if (p == p1 + 1)
680 {
681 /* If this is an empty bracket expression. */
682 if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
683 && p == pend)
684 goto invalid_pattern;
685 }
686 else
687 /* Stop if this isn't merely a ] inside a bracket
688 expression, but rather the end of a bracket
689 expression. */
690 break;
691 }
692 /* Get a range. */
693 if (p[0] == '-' && p[1] != ']')
694 {
695 PATFETCH (c1);
696 /* Don't translate the range bounds while fetching them. */
697 PATFETCH_RAW (c1);
698
699 if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
700 goto invalid_pattern;
701
702 if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
703 && c1 == '-' && *p != ']')
704 goto invalid_pattern;
705
706 while (c <= c1)
707 {
708 /* Translate each char that's in the range. */
709 if (translate)
710 SET_LIST_BIT (translate[c]);
711 else
712 SET_LIST_BIT (c);
713 c++;
714 }
715 }
716 else if ((obscure_syntax & RE_CHAR_CLASSES)
717 && c == '[' && p[0] == ':')
718 {
719 /* Longest valid character class word has six characters. */
720 char str[CHAR_CLASS_MAX_LENGTH];
721 PATFETCH (c);
722 c1 = 0;
723 /* If no ] at end. */
724 if (p == pend)
725 goto invalid_pattern;
726 while (1)
727 {
728 /* Don't translate the ``character class'' characters. */
729 PATFETCH_RAW (c);
730 if (c == ':' || c == ']' || p == pend
731 || c1 == CHAR_CLASS_MAX_LENGTH)
732 break;
733 str[c1++] = c;
734 }
735 str[c1] = '\0';
736 if (p == pend
737 || c == ']' /* End of the bracket expression. */
738 || p[0] != ']'
739 || p + 1 == pend
740 || (strcmp (str, "alpha") != 0
741 && strcmp (str, "upper") != 0
742 && strcmp (str, "lower") != 0
743 && strcmp (str, "digit") != 0
744 && strcmp (str, "alnum") != 0
745 && strcmp (str, "xdigit") != 0
746 && strcmp (str, "space") != 0
747 && strcmp (str, "print") != 0
748 && strcmp (str, "punct") != 0
749 && strcmp (str, "graph") != 0
750 && strcmp (str, "cntrl") != 0))
751 {
752 /* Undo the ending character, the letters, and leave
753 the leading : and [ (but set bits for them). */
754 c1++;
755 while (c1--)
756 PATUNFETCH;
757 SET_LIST_BIT ('[');
758 SET_LIST_BIT (':');
759 }
760 else
761 {
762 /* The ] at the end of the character class. */
763 PATFETCH (c);
764 if (c != ']')
765 goto invalid_pattern;
766 for (c = 0; c < (1 << BYTEWIDTH); c++)
767 {
768 if ((strcmp (str, "alpha") == 0 && isalpha (c))
769 || (strcmp (str, "upper") == 0 && isupper (c))
770 || (strcmp (str, "lower") == 0 && islower (c))
771 || (strcmp (str, "digit") == 0 && isdigit (c))
772 || (strcmp (str, "alnum") == 0 && isalnum (c))
773 || (strcmp (str, "xdigit") == 0 && isxdigit (c))
774 || (strcmp (str, "space") == 0 && isspace (c))
775 || (strcmp (str, "print") == 0 && isprint (c))
776 || (strcmp (str, "punct") == 0 && ispunct (c))
777 || (strcmp (str, "graph") == 0 && isgraph (c))
778 || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
779 SET_LIST_BIT (c);
780 }
781 }
782 }
783 else if (translate)
784 SET_LIST_BIT (translate[c]);
785 else
786 SET_LIST_BIT (c);
787 }
788
789 /* Discard any character set/class bitmap bytes that are all
790 0 at the end of the map. Decrement the map-length byte too. */
791 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
792 b[-1]--;
793 b += b[-1];
794 break;
795
796 case '(':
797 if (! (obscure_syntax & RE_NO_BK_PARENS))
798 goto normal_char;
799 else
800 goto handle_open;
801
802 case ')':
803 if (! (obscure_syntax & RE_NO_BK_PARENS))
804 goto normal_char;
805 else
806 goto handle_close;
807
808 case '\n':
809 if (! (obscure_syntax & RE_NEWLINE_OR))
810 goto normal_char;
811 else
812 goto handle_bar;
813
814 case '|':
815 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
816 && (! laststart || p == pend))
817 goto invalid_pattern;
818 else if (! (obscure_syntax & RE_NO_BK_VBAR))
819 goto normal_char;
820 else
821 goto handle_bar;
822
823 case '{':
824 if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
825 && (obscure_syntax & RE_INTERVALS)))
826 goto normal_char;
827 else
828 goto handle_interval;
829
830 case '\\':
831 if (p == pend) goto invalid_pattern;
832 PATFETCH_RAW (c);
833 switch (c)
834 {
835 case '(':
836 if (obscure_syntax & RE_NO_BK_PARENS)
837 goto normal_backsl;
838 handle_open:
839 if (stackp == stacke) goto nesting_too_deep;
840
841 /* Laststart should point to the start_memory that we are about
842 to push (unless the pattern has RE_NREGS or more ('s). */
843 *stackp++ = b - bufp->buffer;
844 if (regnum < RE_NREGS)
845 {
846 BUFPUSH (start_memory);
847 BUFPUSH (regnum);
848 }
849 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
850 *stackp++ = regnum++;
851 *stackp++ = begalt - bufp->buffer;
852 fixup_jump = 0;
853 laststart = 0;
854 begalt = b;
855 break;
856
857 case ')':
858 if (obscure_syntax & RE_NO_BK_PARENS)
859 goto normal_backsl;
860 handle_close:
861 if (stackp == stackb) goto unmatched_close;
862 begalt = *--stackp + bufp->buffer;
863 if (fixup_jump)
864 store_jump (fixup_jump, jump, b);
865 if (stackp[-1] < RE_NREGS)
866 {
867 BUFPUSH (stop_memory);
868 BUFPUSH (stackp[-1]);
869 }
870 stackp -= 2;
871 fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
872 laststart = *--stackp + bufp->buffer;
873 break;
874
875 case '|':
876 if ((obscure_syntax & RE_LIMITED_OPS)
877 || (obscure_syntax & RE_NO_BK_VBAR))
878 goto normal_backsl;
879 handle_bar:
880 if (obscure_syntax & RE_LIMITED_OPS)
881 goto normal_char;
882 /* Insert before the previous alternative a jump which
883 jumps to this alternative if the former fails. */
884 GET_BUFFER_SPACE (6);
885 insert_jump (on_failure_jump, begalt, b + 6, b);
886 pending_exact = 0;
887 b += 3;
888 /* The alternative before the previous alternative has a
889 jump after it which gets executed if it gets matched.
890 Adjust that jump so it will jump to the previous
891 alternative's analogous jump (put in below, which in
892 turn will jump to the next (if any) alternative's such
893 jump, etc.). The last such jump jumps to the correct
894 final destination. */
895 if (fixup_jump)
896 store_jump (fixup_jump, jump, b);
897
898 /* Leave space for a jump after previous alternative---to be
899 filled in later. */
900 fixup_jump = b;
901 b += 3;
902
903 laststart = 0;
904 begalt = b;
905 break;
906
907 case '{':
908 if (! (obscure_syntax & RE_INTERVALS)
909 /* Let \{ be a literal. */
910 || ((obscure_syntax & RE_INTERVALS)
911 && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
912 /* If it's the string "\{". */
913 || (p - 2 == pattern && p == pend))
914 goto normal_backsl;
915 handle_interval:
916 beg_interval = p - 1; /* The {. */
917 /* If there is no previous pattern, this isn't an interval. */
918 if (!laststart)
919 {
920 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
921 goto invalid_pattern;
922 else
923 goto normal_backsl;
924 }
925 /* It also isn't an interval if not preceded by an re
926 matching a single character or subexpression, or if
927 the current type of intervals can't handle back
928 references and the previous thing is a back reference. */
929 if (! (*laststart == anychar
930 || *laststart == charset
931 || *laststart == charset_not
932 || *laststart == start_memory
933 || (*laststart == exactn && laststart[1] == 1)
934 || (! (obscure_syntax & RE_NO_BK_REFS)
935 && *laststart == duplicate)))
936 {
937 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
938 goto normal_char;
939
940 /* Posix extended syntax is handled in previous
941 statement; this is for Posix basic syntax. */
942 if (obscure_syntax & RE_INTERVALS)
943 goto invalid_pattern;
944
945 goto normal_backsl;
946 }
947 lower_bound = -1; /* So can see if are set. */
948 upper_bound = -1;
949 GET_UNSIGNED_NUMBER (lower_bound);
950 if (c == ',')
951 {
952 GET_UNSIGNED_NUMBER (upper_bound);
953 if (upper_bound < 0)
954 upper_bound = RE_DUP_MAX;
955 }
956 if (upper_bound < 0)
957 upper_bound = lower_bound;
958 if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
959 {
960 if (c != '\\')
961 goto invalid_pattern;
962 PATFETCH (c);
963 }
964 if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
965 || lower_bound > upper_bound
966 || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
967 && p != pend && *p == '{'))
968 {
969 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
970 goto unfetch_interval;
971 else
972 goto invalid_pattern;
973 }
974
975 /* If upper_bound is zero, don't want to succeed at all;
976 jump from laststart to b + 3, which will be the end of
977 the buffer after this jump is inserted. */
978
979 if (upper_bound == 0)
980 {
981 GET_BUFFER_SPACE (3);
982 insert_jump (jump, laststart, b + 3, b);
983 b += 3;
984 }
985
986 /* Otherwise, after lower_bound number of succeeds, jump
987 to after the jump_n which will be inserted at the end
988 of the buffer, and insert that jump_n. */
989 else
990 { /* Set to 5 if only one repetition is allowed and
991 hence no jump_n is inserted at the current end of
992 the buffer; then only space for the succeed_n is
993 needed. Otherwise, need space for both the
994 succeed_n and the jump_n. */
995
996 unsigned slots_needed = upper_bound == 1 ? 5 : 10;
997
998 GET_BUFFER_SPACE (slots_needed);
999 /* Initialize the succeed_n to n, even though it will
1000 be set by its attendant set_number_at, because
1001 re_compile_fastmap will need to know it. Jump to
1002 what the end of buffer will be after inserting
1003 this succeed_n and possibly appending a jump_n. */
1004 insert_jump_n (succeed_n, laststart, b + slots_needed,
1005 b, lower_bound);
1006 b += 5; /* Just increment for the succeed_n here. */
1007
1008 /* More than one repetition is allowed, so put in at
1009 the end of the buffer a backward jump from b to the
1010 succeed_n we put in above. By the time we've gotten
1011 to this jump when matching, we'll have matched once
1012 already, so jump back only upper_bound - 1 times. */
1013
1014 if (upper_bound > 1)
1015 {
1016 store_jump_n (b, jump_n, laststart, upper_bound - 1);
1017 b += 5;
1018 /* When hit this when matching, reset the
1019 preceding jump_n's n to upper_bound - 1. */
1020 BUFPUSH (set_number_at);
1021 GET_BUFFER_SPACE (2);
1022 STORE_NUMBER_AND_INCR (b, -5);
1023 STORE_NUMBER_AND_INCR (b, upper_bound - 1);
1024 }
1025 /* When hit this when matching, set the succeed_n's n. */
1026 GET_BUFFER_SPACE (5);
1027 insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
1028 b += 5;
1029 }
1030 pending_exact = 0;
1031 beg_interval = 0;
1032 break;
1033
1034
1035 unfetch_interval:
1036 /* If an invalid interval, match the characters as literals. */
1037 if (beg_interval)
1038 p = beg_interval;
1039 else
1040 {
1041 fprintf (stderr,
1042 "regex: no interval beginning to which to backtrack.\n");
1043 exit (1);
1044 }
1045
1046 beg_interval = 0;
1047 PATFETCH (c); /* normal_char expects char in `c'. */
1048 goto normal_char;
1049 break;
1050
1051#ifdef emacs
1052 case '=':
1053 BUFPUSH (at_dot);
1054 break;
1055
1056 case 's':
1057 laststart = b;
1058 BUFPUSH (syntaxspec);
1059 PATFETCH (c);
1060 BUFPUSH (syntax_spec_code[c]);
1061 break;
1062
1063 case 'S':
1064 laststart = b;
1065 BUFPUSH (notsyntaxspec);
1066 PATFETCH (c);
1067 BUFPUSH (syntax_spec_code[c]);
1068 break;
1069#endif /* emacs */
1070
1071 case 'w':
1072 laststart = b;
1073 BUFPUSH (wordchar);
1074 break;
1075
1076 case 'W':
1077 laststart = b;
1078 BUFPUSH (notwordchar);
1079 break;
1080
1081 case '<':
1082 BUFPUSH (wordbeg);
1083 break;
1084
1085 case '>':
1086 BUFPUSH (wordend);
1087 break;
1088
1089 case 'b':
1090 BUFPUSH (wordbound);
1091 break;
1092
1093 case 'B':
1094 BUFPUSH (notwordbound);
1095 break;
1096
1097 case '`':
1098 BUFPUSH (begbuf);
1099 break;
1100
1101 case '\'':
1102 BUFPUSH (endbuf);
1103 break;
1104
1105 case '1':
1106 case '2':
1107 case '3':
1108 case '4':
1109 case '5':
1110 case '6':
1111 case '7':
1112 case '8':
1113 case '9':
1114 if (obscure_syntax & RE_NO_BK_REFS)
1115 goto normal_char;
1116 c1 = c - '0';
1117 if (c1 >= regnum)
1118 {
1119 if (obscure_syntax & RE_NO_EMPTY_BK_REF)
1120 goto invalid_pattern;
1121 else
1122 goto normal_char;
1123 }
1124 /* Can't back reference to a subexpression if inside of it. */
1125 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
1126 if (*stackt == c1)
1127 goto normal_char;
1128 laststart = b;
1129 BUFPUSH (duplicate);
1130 BUFPUSH (c1);
1131 break;
1132
1133 case '+':
1134 case '?':
1135 if (obscure_syntax & RE_BK_PLUS_QM)
1136 goto handle_plus;
1137 else
1138 goto normal_backsl;
1139 break;
1140
1141 default:
1142 normal_backsl:
1143 /* You might think it would be useful for \ to mean
1144 not to translate; but if we don't translate it
1145 it will never match anything. */
1146 if (translate) c = translate[c];
1147 goto normal_char;
1148 }
1149 break;
1150
1151 default:
1152 normal_char: /* Expects the character in `c'. */
1153 if (!pending_exact || pending_exact + *pending_exact + 1 != b
1154 || *pending_exact == 0177 || *p == '*' || *p == '^'
1155 || ((obscure_syntax & RE_BK_PLUS_QM)
1156 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
1157 : (*p == '+' || *p == '?'))
1158 || ((obscure_syntax & RE_INTERVALS)
1159 && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
1160 ? *p == '{'
1161 : (p[0] == '\\' && p[1] == '{'))))
1162 {
1163 laststart = b;
1164 BUFPUSH (exactn);
1165 pending_exact = b;
1166 BUFPUSH (0);
1167 }
1168 BUFPUSH (c);
1169 (*pending_exact)++;
1170 }
1171 }
1172
1173 if (fixup_jump)
1174 store_jump (fixup_jump, jump, b);
1175
1176 if (stackp != stackb) goto unmatched_open;
1177
1178 bufp->used = b - bufp->buffer;
1179 return 0;
1180
1181 invalid_pattern:
1182 return "Invalid regular expression";
1183
1184 unmatched_open:
1185 return "Unmatched \\(";
1186
1187 unmatched_close:
1188 return "Unmatched \\)";
1189
1190 end_of_pattern:
1191 return "Premature end of regular expression";
1192
1193 nesting_too_deep:
1194 return "Nesting too deep";
1195
1196 too_big:
1197 return "Regular expression too big";
1198
1199 memory_exhausted:
1200 return "Memory exhausted";
1201}
1202
1203
1204/* Store a jump of the form <OPCODE> <relative address>.
1205 Store in the location FROM a jump operation to jump to relative
1206 address FROM - TO. OPCODE is the opcode to store. */
1207
1208static void
1209store_jump (from, opcode, to)
1210 char *from, *to;
1211 char opcode;
1212{
1213 from[0] = opcode;
1214 STORE_NUMBER(from + 1, to - (from + 3));
1215}
1216
1217
1218/* Open up space before char FROM, and insert there a jump to TO.
1219 CURRENT_END gives the end of the storage not in use, so we know
1220 how much data to copy up. OP is the opcode of the jump to insert.
1221
1222 If you call this function, you must zero out pending_exact. */
1223
1224static void
1225insert_jump (op, from, to, current_end)
1226 char op;
1227 char *from, *to, *current_end;
1228{
1229 register char *pfrom = current_end; /* Copy from here... */
1230 register char *pto = current_end + 3; /* ...to here. */
1231
1232 while (pfrom != from)
1233 *--pto = *--pfrom;
1234 store_jump (from, op, to);
1235}
1236
1237
1238/* Store a jump of the form <opcode> <relative address> <n> .
1239
1240 Store in the location FROM a jump operation to jump to relative
1241 address FROM - TO. OPCODE is the opcode to store, N is a number the
1242 jump uses, say, to decide how many times to jump.
1243
1244 If you call this function, you must zero out pending_exact. */
1245
1246static void
1247store_jump_n (from, opcode, to, n)
1248 char *from, *to;
1249 char opcode;
1250 unsigned n;
1251{
1252 from[0] = opcode;
1253 STORE_NUMBER (from + 1, to - (from + 3));
1254 STORE_NUMBER (from + 3, n);
1255}
1256
1257
1258/* Similar to insert_jump, but handles a jump which needs an extra
1259 number to handle minimum and maximum cases. Open up space at
1260 location FROM, and insert there a jump to TO. CURRENT_END gives the
1261 end of the storage in use, so we know how much data to copy up. OP is
1262 the opcode of the jump to insert.
1263
1264 If you call this function, you must zero out pending_exact. */
1265
1266static void
1267insert_jump_n (op, from, to, current_end, n)
1268 char op;
1269 char *from, *to, *current_end;
1270 unsigned n;
1271{
1272 register char *pfrom = current_end; /* Copy from here... */
1273 register char *pto = current_end + 5; /* ...to here. */
1274
1275 while (pfrom != from)
1276 *--pto = *--pfrom;
1277 store_jump_n (from, op, to, n);
1278}
1279
1280
1281/* Open up space at location THERE, and insert operation OP followed by
1282 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
1283 we know how much data to copy up.
1284
1285 If you call this function, you must zero out pending_exact. */
1286
1287static void
1288insert_op_2 (op, there, current_end, num_1, num_2)
1289 char op;
1290 char *there, *current_end;
1291 int num_1, num_2;
1292{
1293 register char *pfrom = current_end; /* Copy from here... */
1294 register char *pto = current_end + 5; /* ...to here. */
1295
1296 while (pfrom != there)
1297 *--pto = *--pfrom;
1298
1299 there[0] = op;
1300 STORE_NUMBER (there + 1, num_1);
1301 STORE_NUMBER (there + 3, num_2);
1302}
1303
1304
1305\f
1306/* Given a pattern, compute a fastmap from it. The fastmap records
1307 which of the (1 << BYTEWIDTH) possible characters can start a string
1308 that matches the pattern. This fastmap is used by re_search to skip
1309 quickly over totally implausible text.
1310
1311 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
1312 area as bufp->fastmap.
1313 The other components of bufp describe the pattern to be used. */
1314
1315void
1316re_compile_fastmap (bufp)
1317 struct re_pattern_buffer *bufp;
1318{
1319 unsigned char *pattern = (unsigned char *) bufp->buffer;
1320 int size = bufp->used;
1321 register char *fastmap = bufp->fastmap;
1322 register unsigned char *p = pattern;
1323 register unsigned char *pend = pattern + size;
1324 register int j, k;
1325 unsigned char *translate = (unsigned char *) bufp->translate;
1326
1327 unsigned char *stackb[NFAILURES];
1328 unsigned char **stackp = stackb;
1329
1330 unsigned is_a_succeed_n;
1331
1332 bzero (fastmap, (1 << BYTEWIDTH));
1333 bufp->fastmap_accurate = 1;
1334 bufp->can_be_null = 0;
1335
1336 while (p)
1337 {
1338 is_a_succeed_n = 0;
1339 if (p == pend)
1340 {
1341 bufp->can_be_null = 1;
1342 break;
1343 }
1344#ifdef SWITCH_ENUM_BUG
1345 switch ((int) ((enum regexpcode) *p++))
1346#else
1347 switch ((enum regexpcode) *p++)
1348#endif
1349 {
1350 case exactn:
1351 if (translate)
1352 fastmap[translate[p[1]]] = 1;
1353 else
1354 fastmap[p[1]] = 1;
1355 break;
1356
1357 case begline:
1358 case before_dot:
1359 case at_dot:
1360 case after_dot:
1361 case begbuf:
1362 case endbuf:
1363 case wordbound:
1364 case notwordbound:
1365 case wordbeg:
1366 case wordend:
1367 continue;
1368
1369 case endline:
1370 if (translate)
1371 fastmap[translate['\n']] = 1;
1372 else
1373 fastmap['\n'] = 1;
1374
1375 if (bufp->can_be_null != 1)
1376 bufp->can_be_null = 2;
1377 break;
1378
1379 case jump_n:
1380 case finalize_jump:
1381 case maybe_finalize_jump:
1382 case jump:
1383 case dummy_failure_jump:
1384 EXTRACT_NUMBER_AND_INCR (j, p);
1385 p += j;
1386 if (j > 0)
1387 continue;
1388 /* Jump backward reached implies we just went through
1389 the body of a loop and matched nothing.
1390 Opcode jumped to should be an on_failure_jump.
1391 Just treat it like an ordinary jump.
1392 For a * loop, it has pushed its failure point already;
1393 If so, discard that as redundant. */
1394
1395 if ((enum regexpcode) *p != on_failure_jump
1396 && (enum regexpcode) *p != succeed_n)
1397 continue;
1398 p++;
1399 EXTRACT_NUMBER_AND_INCR (j, p);
1400 p += j;
1401 if (stackp != stackb && *stackp == p)
1402 stackp--;
1403 continue;
1404
1405 case on_failure_jump:
1406 handle_on_failure_jump:
1407 EXTRACT_NUMBER_AND_INCR (j, p);
1408 *++stackp = p + j;
1409 if (is_a_succeed_n)
1410 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
1411 continue;
1412
1413 case succeed_n:
1414 is_a_succeed_n = 1;
1415 /* Get to the number of times to succeed. */
1416 p += 2;
1417 /* Increment p past the n for when k != 0. */
1418 EXTRACT_NUMBER_AND_INCR (k, p);
1419 if (k == 0)
1420 {
1421 p -= 4;
1422 goto handle_on_failure_jump;
1423 }
1424 continue;
1425
1426 case set_number_at:
1427 p += 4;
1428 continue;
1429
1430 case start_memory:
1431 case stop_memory:
1432 p++;
1433 continue;
1434
1435 case duplicate:
1436 bufp->can_be_null = 1;
1437 fastmap['\n'] = 1;
1438 case anychar:
1439 for (j = 0; j < (1 << BYTEWIDTH); j++)
1440 if (j != '\n')
1441 fastmap[j] = 1;
1442 if (bufp->can_be_null)
1443 return;
1444 /* Don't return; check the alternative paths
1445 so we can set can_be_null if appropriate. */
1446 break;
1447
1448 case wordchar:
1449 for (j = 0; j < (1 << BYTEWIDTH); j++)
1450 if (SYNTAX (j) == Sword)
1451 fastmap[j] = 1;
1452 break;
1453
1454 case notwordchar:
1455 for (j = 0; j < (1 << BYTEWIDTH); j++)
1456 if (SYNTAX (j) != Sword)
1457 fastmap[j] = 1;
1458 break;
1459
1460#ifdef emacs
1461 case syntaxspec:
1462 k = *p++;
1463 for (j = 0; j < (1 << BYTEWIDTH); j++)
1464 if (SYNTAX (j) == (enum syntaxcode) k)
1465 fastmap[j] = 1;
1466 break;
1467
1468 case notsyntaxspec:
1469 k = *p++;
1470 for (j = 0; j < (1 << BYTEWIDTH); j++)
1471 if (SYNTAX (j) != (enum syntaxcode) k)
1472 fastmap[j] = 1;
1473 break;
1474#endif /* not emacs */
1475
1476 case charset:
1477 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1478 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
1479 {
1480 if (translate)
1481 fastmap[translate[j]] = 1;
1482 else
1483 fastmap[j] = 1;
1484 }
1485 break;
1486
1487 case charset_not:
1488 /* Chars beyond end of map must be allowed */
1489 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
1490 if (translate)
1491 fastmap[translate[j]] = 1;
1492 else
1493 fastmap[j] = 1;
1494
1495 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1496 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
1497 {
1498 if (translate)
1499 fastmap[translate[j]] = 1;
1500 else
1501 fastmap[j] = 1;
1502 }
1503 break;
1504 }
1505
1506 /* Get here means we have successfully found the possible starting
1507 characters of one path of the pattern. We need not follow this
1508 path any farther. Instead, look at the next alternative
1509 remembered in the stack. */
1510 if (stackp != stackb)
1511 p = *stackp--;
1512 else
1513 break;
1514 }
1515}
1516
1517
1518\f
1519/* Like re_search_2, below, but only one string is specified, and
1520 doesn't let you say where to stop matching. */
1521
1522int
1523re_search (pbufp, string, size, startpos, range, regs)
1524 struct re_pattern_buffer *pbufp;
1525 char *string;
1526 int size, startpos, range;
1527 struct re_registers *regs;
1528{
1529 return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
1530 regs, size);
1531}
1532
1533
1534/* Using the compiled pattern in PBUFP->buffer, first tries to match the
1535 virtual concatenation of STRING1 and STRING2, starting first at index
1536 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
1537 places to try before giving up. If RANGE is negative, it searches
1538 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
1539 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
1540 In REGS, return the indices of the virtual concatenation of STRING1
1541 and STRING2 that matched the entire PBUFP->buffer and its contained
1542 subexpressions. Do not consider matching one past the index MSTOP in
1543 the virtual concatenation of STRING1 and STRING2.
1544
1545 The value returned is the position in the strings at which the match
1546 was found, or -1 if no match was found, or -2 if error (such as
1547 failure stack overflow). */
1548
1549int
1550re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
1551 regs, mstop)
1552 struct re_pattern_buffer *pbufp;
1553 char *string1, *string2;
1554 int size1, size2;
1555 int startpos;
1556 register int range;
1557 struct re_registers *regs;
1558 int mstop;
1559{
1560 register char *fastmap = pbufp->fastmap;
1561 register unsigned char *translate = (unsigned char *) pbufp->translate;
1562 int total_size = size1 + size2;
1563 int endpos = startpos + range;
1564 int val;
1565
1566 /* Check for out-of-range starting position. */
1567 if (startpos < 0 || startpos > total_size)
1568 return -1;
1569
1570 /* Fix up range if it would eventually take startpos outside of the
1571 virtual concatenation of string1 and string2. */
1572 if (endpos < -1)
1573 range = -1 - startpos;
1574 else if (endpos > total_size)
1575 range = total_size - startpos;
1576
1577 /* Update the fastmap now if not correct already. */
1578 if (fastmap && !pbufp->fastmap_accurate)
1579 re_compile_fastmap (pbufp);
1580
1581 /* If the search isn't to be a backwards one, don't waste time in a
1582 long search for a pattern that says it is anchored. */
1583 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
1584 && range > 0)
1585 {
1586 if (startpos > 0)
1587 return -1;
1588 else
1589 range = 1;
1590 }
1591
1592 while (1)
1593 {
1594 /* If a fastmap is supplied, skip quickly over characters that
1595 cannot possibly be the start of a match. Note, however, that
1596 if the pattern can possibly match the null string, we must
1597 test it at each starting point so that we take the first null
1598 string we get. */
1599
1600 if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
1601 {
1602 if (range > 0) /* Searching forwards. */
1603 {
1604 register int lim = 0;
1605 register unsigned char *p;
1606 int irange = range;
1607 if (startpos < size1 && startpos + range >= size1)
1608 lim = range - (size1 - startpos);
1609
1610 p = ((unsigned char *)
1611 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
1612
1613 while (range > lim && !fastmap[translate
1614 ? translate[*p++]
1615 : *p++])
1616 range--;
1617 startpos += irange - range;
1618 }
1619 else /* Searching backwards. */
1620 {
1621 register unsigned char c;
1622
1623 if (string1 == 0 || startpos >= size1)
1624 c = string2[startpos - size1];
1625 else
1626 c = string1[startpos];
1627
1628 c &= 0xff;
1629 if (translate ? !fastmap[translate[c]] : !fastmap[c])
1630 goto advance;
1631 }
1632 }
1633
1634 if (range >= 0 && startpos == total_size
1635 && fastmap && pbufp->can_be_null == 0)
1636 return -1;
1637
1638 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
1639 regs, mstop);
1640 if (val >= 0)
1641 return startpos;
1642 if (val == -2)
1643 return -2;
1644
1645#ifdef C_ALLOCA
1646 alloca (0);
1647#endif /* C_ALLOCA */
1648
1649 advance:
1650 if (!range)
1651 break;
1652 else if (range > 0)
1653 {
1654 range--;
1655 startpos++;
1656 }
1657 else
1658 {
1659 range++;
1660 startpos--;
1661 }
1662 }
1663 return -1;
1664}
1665
1666
1667\f
1668#ifndef emacs /* emacs never uses this. */
1669int
1670re_match (pbufp, string, size, pos, regs)
1671 struct re_pattern_buffer *pbufp;
1672 char *string;
1673 int size, pos;
1674 struct re_registers *regs;
1675{
1676 return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
1677}
1678#endif /* not emacs */
1679
1680
1681/* The following are used for re_match_2, defined below: */
1682
1683/* Roughly the maximum number of failure points on the stack. Would be
1684 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
1685
1686int re_max_failures = 2000;
1687
1688/* Routine used by re_match_2. */
1689static int bcmp_translate ();
1690
1691
1692/* Structure and accessing macros used in re_match_2: */
1693
1694struct register_info
1695{
1696 unsigned is_active : 1;
1697 unsigned matched_something : 1;
1698};
1699
1700#define IS_ACTIVE(R) ((R).is_active)
1701#define MATCHED_SOMETHING(R) ((R).matched_something)
1702
1703
1704/* Macros used by re_match_2: */
1705
1706
1707/* I.e., regstart, regend, and reg_info. */
1708
1709#define NUM_REG_ITEMS 3
1710
1711/* We push at most this many things on the stack whenever we
1712 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
1713 arguments to the PUSH_FAILURE_POINT macro. */
1714
1715#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
1716
1717
1718/* We push this many things on the stack whenever we fail. */
1719
1720#define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
1721
1722
1723/* This pushes most of the information about the current state we will want
1724 if we ever fail back to it. */
1725
1726#define PUSH_FAILURE_POINT(pattern_place, string_place) \
1727 { \
1728 short last_used_reg, this_reg; \
1729 \
1730 /* Find out how many registers are active or have been matched. \
1731 (Aside from register zero, which is only set at the end.) */ \
1732 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
1733 if (regstart[last_used_reg] != (unsigned char *) -1) \
1734 break; \
1735 \
1736 if (stacke - stackp < NUM_FAILURE_ITEMS) \
1737 { \
1738 unsigned char **stackx; \
1739 unsigned int len = stacke - stackb; \
1740 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
1741 return -2; \
1742 \
1743 /* Roughly double the size of the stack. */ \
1744 stackx = (unsigned char **) alloca (2 * len \
1745 * sizeof (unsigned char *));\
1746 /* Only copy what is in use. */ \
1747 bcopy (stackb, stackx, len * sizeof (char *)); \
1748 stackp = stackx + (stackp - stackb); \
1749 stackb = stackx; \
1750 stacke = stackb + 2 * len; \
1751 } \
1752 \
1753 /* Now push the info for each of those registers. */ \
1754 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
1755 { \
1756 *stackp++ = regstart[this_reg]; \
1757 *stackp++ = regend[this_reg]; \
1758 *stackp++ = (unsigned char *) &reg_info[this_reg]; \
1759 } \
1760 \
1761 /* Push how many registers we saved. */ \
1762 *stackp++ = (unsigned char *) last_used_reg; \
1763 \
1764 *stackp++ = pattern_place; \
1765 *stackp++ = string_place; \
1766 }
1767
1768
1769/* This pops what PUSH_FAILURE_POINT pushes. */
1770
1771#define POP_FAILURE_POINT() \
1772 { \
1773 int temp; \
1774 stackp -= 2; /* Remove failure points. */ \
1775 temp = (int) *--stackp; /* How many regs pushed. */ \
1776 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
1777 stackp -= temp; /* Remove the register info. */ \
1778 }
1779
1780
1781#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
1782
1783/* Is true if there is a first string and if PTR is pointing anywhere
1784 inside it or just past the end. */
1785
1786#define IS_IN_FIRST_STRING(ptr) \
1787 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
1788
1789/* Call before fetching a character with *d. This switches over to
1790 string2 if necessary. */
1791
1792#define PREFETCH \
1793 while (d == dend) \
1794 { \
1795 /* end of string2 => fail. */ \
1796 if (dend == end_match_2) \
1797 goto fail; \
1798 /* end of string1 => advance to string2. */ \
1799 d = string2; \
1800 dend = end_match_2; \
1801 }
1802
1803
1804/* Call this when have matched something; it sets `matched' flags for the
1805 registers corresponding to the subexpressions of which we currently
1806 are inside. */
1807#define SET_REGS_MATCHED \
1808 { unsigned this_reg; \
1809 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
1810 { \
1811 if (IS_ACTIVE(reg_info[this_reg])) \
1812 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
1813 else \
1814 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
1815 } \
1816 }
1817
1818/* Test if at very beginning or at very end of the virtual concatenation
1819 of string1 and string2. If there is only one string, we've put it in
1820 string2. */
1821
1822#define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
1823#define AT_STRINGS_END (d == end2)
1824
1825#define AT_WORD_BOUNDARY \
1826 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
1827
1828/* We have two special cases to check for:
1829 1) if we're past the end of string1, we have to look at the first
1830 character in string2;
1831 2) if we're before the beginning of string2, we have to look at the
1832 last character in string1; we assume there is a string1, so use
1833 this in conjunction with AT_STRINGS_BEG. */
1834#define IS_A_LETTER(d) \
1835 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
1836 == Sword)
1837
1838
1839/* Match the pattern described by PBUFP against the virtual
1840 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
1841 respectively. Start the match at index POS in the virtual
1842 concatenation of STRING1 and STRING2. In REGS, return the indices of
1843 the virtual concatenation of STRING1 and STRING2 that matched the
1844 entire PBUFP->buffer and its contained subexpressions. Do not
1845 consider matching one past the index MSTOP in the virtual
1846 concatenation of STRING1 and STRING2.
1847
1848 If pbufp->fastmap is nonzero, then it had better be up to date.
1849
1850 The reason that the data to match are specified as two components
1851 which are to be regarded as concatenated is so this function can be
1852 used directly on the contents of an Emacs buffer.
1853
1854 -1 is returned if there is no match. -2 is returned if there is an
1855 error (such as match stack overflow). Otherwise the value is the
1856 length of the substring which was matched. */
1857
1858int
1859re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
1860 struct re_pattern_buffer *pbufp;
1861 char *string1_arg, *string2_arg;
1862 int size1, size2;
1863 int pos;
1864 struct re_registers *regs;
1865 int mstop;
1866{
1867 register unsigned char *p = (unsigned char *) pbufp->buffer;
1868
1869 /* Pointer to beyond end of buffer. */
1870 register unsigned char *pend = p + pbufp->used;
1871
1872 unsigned char *string1 = (unsigned char *) string1_arg;
1873 unsigned char *string2 = (unsigned char *) string2_arg;
1874 unsigned char *end1; /* Just past end of first string. */
1875 unsigned char *end2; /* Just past end of second string. */
1876
1877 /* Pointers into string1 and string2, just past the last characters in
1878 each to consider matching. */
1879 unsigned char *end_match_1, *end_match_2;
1880
1881 register unsigned char *d, *dend;
1882 register int mcnt; /* Multipurpose. */
1883 unsigned char *translate = (unsigned char *) pbufp->translate;
1884 unsigned is_a_jump_n = 0;
1885
1886 /* Failure point stack. Each place that can handle a failure further
1887 down the line pushes a failure point on this stack. It consists of
1888 restart, regend, and reg_info for all registers corresponding to the
1889 subexpressions we're currently inside, plus the number of such
1890 registers, and, finally, two char *'s. The first char * is where to
1891 resume scanning the pattern; the second one is where to resume
1892 scanning the strings. If the latter is zero, the failure point is a
1893 ``dummy''; if a failure happens and the failure point is a dummy, it
1894 gets discarded and the next next one is tried. */
1895
1896 unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1897 unsigned char **stackb = initial_stack;
1898 unsigned char **stackp = stackb;
1899 unsigned char **stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1900
1901
1902 /* Information on the contents of registers. These are pointers into
1903 the input strings; they record just what was matched (on this
1904 attempt) by a subexpression part of the pattern, that is, the
1905 regnum-th regstart pointer points to where in the pattern we began
1906 matching and the regnum-th regend points to right after where we
1907 stopped matching the regnum-th subexpression. (The zeroth register
1908 keeps track of what the whole pattern matches.) */
1909
1910 unsigned char *regstart[RE_NREGS];
1911 unsigned char *regend[RE_NREGS];
1912
1913 /* The is_active field of reg_info helps us keep track of which (possibly
1914 nested) subexpressions we are currently in. The matched_something
1915 field of reg_info[reg_num] helps us tell whether or not we have
1916 matched any of the pattern so far this time through the reg_num-th
1917 subexpression. These two fields get reset each time through any
1918 loop their register is in. */
1919
1920 struct register_info reg_info[RE_NREGS];
1921
1922
1923 /* The following record the register info as found in the above
1924 variables when we find a match better than any we've seen before.
1925 This happens as we backtrack through the failure points, which in
1926 turn happens only if we have not yet matched the entire string. */
1927
1928 unsigned best_regs_set = 0;
1929 unsigned char *best_regstart[RE_NREGS];
1930 unsigned char *best_regend[RE_NREGS];
1931
1932 /* Initialize subexpression text positions to -1 to mark ones that no
1933 \( or ( and \) or ) has been seen for. Also set all registers to
1934 inactive and mark them as not having matched anything or ever
1935 failed. */
1936 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1937 {
1938 regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
1939 IS_ACTIVE (reg_info[mcnt]) = 0;
1940 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
1941 }
1942
1943 if (regs)
1944 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1945 regs->start[mcnt] = regs->end[mcnt] = -1;
1946
1947 /* Set up pointers to ends of strings.
1948 Don't allow the second string to be empty unless both are empty. */
1949 if (size2 == 0)
1950 {
1951 string2 = string1;
1952 size2 = size1;
1953 string1 = 0;
1954 size1 = 0;
1955 }
1956 end1 = string1 + size1;
1957 end2 = string2 + size2;
1958
1959 /* Compute where to stop matching, within the two strings. */
1960 if (mstop <= size1)
1961 {
1962 end_match_1 = string1 + mstop;
1963 end_match_2 = string2;
1964 }
1965 else
1966 {
1967 end_match_1 = end1;
1968 end_match_2 = string2 + mstop - size1;
1969 }
1970
1971 /* `p' scans through the pattern as `d' scans through the data. `dend'
1972 is the end of the input string that `d' points within. `d' is
1973 advanced into the following input string whenever necessary, but
1974 this happens before fetching; therefore, at the beginning of the
1975 loop, `d' can be pointing at the end of a string, but it cannot
1976 equal string2. */
1977
1978 if (size1 != 0 && pos <= size1)
1979 d = string1 + pos, dend = end_match_1;
1980 else
1981 d = string2 + pos - size1, dend = end_match_2;
1982
1983
1984 /* This loops over pattern commands. It exits by returning from the
1985 function if match is complete, or it drops through if match fails
1986 at this starting point in the input data. */
1987
1988 while (1)
1989 {
1990 is_a_jump_n = 0;
1991 /* End of pattern means we might have succeeded. */
1992 if (p == pend)
1993 {
1994 /* If not end of string, try backtracking. Otherwise done. */
1995 if (d != end_match_2)
1996 {
1997 if (stackp != stackb)
1998 {
1999 /* More failure points to try. */
2000
2001 unsigned in_same_string =
2002 IS_IN_FIRST_STRING (best_regend[0])
2003 == MATCHING_IN_FIRST_STRING;
2004
2005 /* If exceeds best match so far, save it. */
2006 if (! best_regs_set
2007 || (in_same_string && d > best_regend[0])
2008 || (! in_same_string && ! MATCHING_IN_FIRST_STRING))
2009 {
2010 best_regs_set = 1;
2011 best_regend[0] = d; /* Never use regstart[0]. */
2012
2013 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2014 {
2015 best_regstart[mcnt] = regstart[mcnt];
2016 best_regend[mcnt] = regend[mcnt];
2017 }
2018 }
2019 goto fail;
2020 }
2021 /* If no failure points, don't restore garbage. */
2022 else if (best_regs_set)
2023 {
2024 restore_best_regs:
2025 /* Restore best match. */
2026 d = best_regend[0];
2027
2028 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
2029 {
2030 regstart[mcnt] = best_regstart[mcnt];
2031 regend[mcnt] = best_regend[mcnt];
2032 }
2033 }
2034 }
2035
2036 /* If caller wants register contents data back, convert it
2037 to indices. */
2038 if (regs)
2039 {
2040 regs->start[0] = pos;
2041 if (MATCHING_IN_FIRST_STRING)
2042 regs->end[0] = d - string1;
2043 else
2044 regs->end[0] = d - string2 + size1;
2045 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2046 {
2047 if (regend[mcnt] == (unsigned char *) -1)
2048 {
2049 regs->start[mcnt] = -1;
2050 regs->end[mcnt] = -1;
2051 continue;
2052 }
2053 if (IS_IN_FIRST_STRING (regstart[mcnt]))
2054 regs->start[mcnt] = regstart[mcnt] - string1;
2055 else
2056 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
2057
2058 if (IS_IN_FIRST_STRING (regend[mcnt]))
2059 regs->end[mcnt] = regend[mcnt] - string1;
2060 else
2061 regs->end[mcnt] = regend[mcnt] - string2 + size1;
2062 }
2063 }
2064 return d - pos - (MATCHING_IN_FIRST_STRING
2065 ? string1
2066 : string2 - size1);
2067 }
2068
2069 /* Otherwise match next pattern command. */
2070#ifdef SWITCH_ENUM_BUG
2071 switch ((int) ((enum regexpcode) *p++))
2072#else
2073 switch ((enum regexpcode) *p++)
2074#endif
2075 {
2076
2077 /* \( [or `(', as appropriate] is represented by start_memory,
2078 \) by stop_memory. Both of those commands are followed by
2079 a register number in the next byte. The text matched
2080 within the \( and \) is recorded under that number. */
2081 case start_memory:
2082 regstart[*p] = d;
2083 IS_ACTIVE (reg_info[*p]) = 1;
2084 MATCHED_SOMETHING (reg_info[*p]) = 0;
2085 p++;
2086 break;
2087
2088 case stop_memory:
2089 regend[*p] = d;
2090 IS_ACTIVE (reg_info[*p]) = 0;
2091
2092 /* If just failed to match something this time around with a sub-
2093 expression that's in a loop, try to force exit from the loop. */
2094 if ((! MATCHED_SOMETHING (reg_info[*p])
2095 || (enum regexpcode) p[-3] == start_memory)
2096 && (p + 1) != pend)
2097 {
2098 register unsigned char *p2 = p + 1;
2099 mcnt = 0;
2100 switch (*p2++)
2101 {
2102 case jump_n:
2103 is_a_jump_n = 1;
2104 case finalize_jump:
2105 case maybe_finalize_jump:
2106 case jump:
2107 case dummy_failure_jump:
2108 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2109 if (is_a_jump_n)
2110 p2 += 2;
2111 break;
2112 }
2113 p2 += mcnt;
2114
2115 /* If the next operation is a jump backwards in the pattern
2116 to an on_failure_jump, exit from the loop by forcing a
2117 failure after pushing on the stack the on_failure_jump's
2118 jump in the pattern, and d. */
2119 if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
2120 {
2121 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2122 PUSH_FAILURE_POINT (p2 + mcnt, d);
2123 goto fail;
2124 }
2125 }
2126 p++;
2127 break;
2128
2129 /* \<digit> has been turned into a `duplicate' command which is
2130 followed by the numeric value of <digit> as the register number. */
2131 case duplicate:
2132 {
2133 int regno = *p++; /* Get which register to match against */
2134 register unsigned char *d2, *dend2;
2135
2136 /* Where in input to try to start matching. */
2137 d2 = regstart[regno];
2138
2139 /* Where to stop matching; if both the place to start and
2140 the place to stop matching are in the same string, then
2141 set to the place to stop, otherwise, for now have to use
2142 the end of the first string. */
2143
2144 dend2 = ((IS_IN_FIRST_STRING (regstart[regno])
2145 == IS_IN_FIRST_STRING (regend[regno]))
2146 ? regend[regno] : end_match_1);
2147 while (1)
2148 {
2149 /* If necessary, advance to next segment in register
2150 contents. */
2151 while (d2 == dend2)
2152 {
2153 if (dend2 == end_match_2) break;
2154 if (dend2 == regend[regno]) break;
2155 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
2156 }
2157 /* At end of register contents => success */
2158 if (d2 == dend2) break;
2159
2160 /* If necessary, advance to next segment in data. */
2161 PREFETCH;
2162
2163 /* How many characters left in this segment to match. */
2164 mcnt = dend - d;
2165
2166 /* Want how many consecutive characters we can match in
2167 one shot, so, if necessary, adjust the count. */
2168 if (mcnt > dend2 - d2)
2169 mcnt = dend2 - d2;
2170
2171 /* Compare that many; failure if mismatch, else move
2172 past them. */
2173 if (translate
2174 ? bcmp_translate (d, d2, mcnt, translate)
2175 : bcmp (d, d2, mcnt))
2176 goto fail;
2177 d += mcnt, d2 += mcnt;
2178 }
2179 }
2180 break;
2181
2182 case anychar:
2183 PREFETCH; /* Fetch a data character. */
2184 /* Match anything but a newline, maybe even a null. */
2185 if ((translate ? translate[*d] : *d) == '\n'
2186 || ((obscure_syntax & RE_DOT_NOT_NULL)
2187 && (translate ? translate[*d] : *d) == '\000'))
2188 goto fail;
2189 SET_REGS_MATCHED;
2190 d++;
2191 break;
2192
2193 case charset:
2194 case charset_not:
2195 {
2196 int not = 0; /* Nonzero for charset_not. */
2197 register int c;
2198 if (*(p - 1) == (unsigned char) charset_not)
2199 not = 1;
2200
2201 PREFETCH; /* Fetch a data character. */
2202
2203 if (translate)
2204 c = translate[*d];
2205 else
2206 c = *d;
2207
2208 if (c < *p * BYTEWIDTH
2209 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2210 not = !not;
2211
2212 p += 1 + *p;
2213
2214 if (!not) goto fail;
2215 SET_REGS_MATCHED;
2216 d++;
2217 break;
2218 }
2219
2220 case begline:
2221 if ((size1 != 0 && d == string1)
2222 || (size1 == 0 && size2 != 0 && d == string2)
2223 || (d && d[-1] == '\n')
2224 || (size1 == 0 && size2 == 0))
2225 break;
2226 else
2227 goto fail;
2228
2229 case endline:
2230 if (d == end2
2231 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
2232 break;
2233 goto fail;
2234
2235 /* `or' constructs are handled by starting each alternative with
2236 an on_failure_jump that points to the start of the next
2237 alternative. Each alternative except the last ends with a
2238 jump to the joining point. (Actually, each jump except for
2239 the last one really jumps to the following jump, because
2240 tensioning the jumps is a hassle.) */
2241
2242 /* The start of a stupid repeat has an on_failure_jump that points
2243 past the end of the repeat text. This makes a failure point so
2244 that on failure to match a repetition, matching restarts past
2245 as many repetitions have been found with no way to fail and
2246 look for another one. */
2247
2248 /* A smart repeat is similar but loops back to the on_failure_jump
2249 so that each repetition makes another failure point. */
2250
2251 case on_failure_jump:
2252 on_failure:
2253 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2254 PUSH_FAILURE_POINT (p + mcnt, d);
2255 break;
2256
2257 /* The end of a smart repeat has a maybe_finalize_jump back.
2258 Change it either to a finalize_jump or an ordinary jump. */
2259 case maybe_finalize_jump:
2260 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2261 {
2262 register unsigned char *p2 = p;
2263 /* Compare what follows with the beginning of the repeat.
2264 If we can establish that there is nothing that they would
2265 both match, we can change to finalize_jump. */
2266 while (p2 + 1 != pend
2267 && (*p2 == (unsigned char) stop_memory
2268 || *p2 == (unsigned char) start_memory))
2269 p2 += 2; /* Skip over reg number. */
2270 if (p2 == pend)
2271 p[-3] = (unsigned char) finalize_jump;
2272 else if (*p2 == (unsigned char) exactn
2273 || *p2 == (unsigned char) endline)
2274 {
2275 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
2276 register unsigned char *p1 = p + mcnt;
2277 /* p1[0] ... p1[2] are an on_failure_jump.
2278 Examine what follows that. */
2279 if (p1[3] == (unsigned char) exactn && p1[5] != c)
2280 p[-3] = (unsigned char) finalize_jump;
2281 else if (p1[3] == (unsigned char) charset
2282 || p1[3] == (unsigned char) charset_not)
2283 {
2284 int not = p1[3] == (unsigned char) charset_not;
2285 if (c < p1[4] * BYTEWIDTH
2286 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2287 not = !not;
2288 /* `not' is 1 if c would match. */
2289 /* That means it is not safe to finalize. */
2290 if (!not)
2291 p[-3] = (unsigned char) finalize_jump;
2292 }
2293 }
2294 }
2295 p -= 2; /* Point at relative address again. */
2296 if (p[-1] != (unsigned char) finalize_jump)
2297 {
2298 p[-1] = (unsigned char) jump;
2299 goto nofinalize;
2300 }
2301 /* Note fall through. */
2302
2303 /* The end of a stupid repeat has a finalize_jump back to the
2304 start, where another failure point will be made which will
2305 point to after all the repetitions found so far. */
2306
2307 /* Take off failure points put on by matching on_failure_jump
2308 because didn't fail. Also remove the register information
2309 put on by the on_failure_jump. */
2310 case finalize_jump:
2311 POP_FAILURE_POINT ();
2312 /* Note fall through. */
2313
2314 /* Jump without taking off any failure points. */
2315 case jump:
2316 nofinalize:
2317 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2318 p += mcnt;
2319 break;
2320
2321 case dummy_failure_jump:
2322 /* Normally, the on_failure_jump pushes a failure point, which
2323 then gets popped at finalize_jump. We will end up at
2324 finalize_jump, also, and with a pattern of, say, `a+', we
2325 are skipping over the on_failure_jump, so we have to push
2326 something meaningless for finalize_jump to pop. */
2327 PUSH_FAILURE_POINT (0, 0);
2328 goto nofinalize;
2329
2330
2331 /* Have to succeed matching what follows at least n times. Then
2332 just handle like an on_failure_jump. */
2333 case succeed_n:
2334 EXTRACT_NUMBER (mcnt, p + 2);
2335 /* Originally, this is how many times we HAVE to succeed. */
2336 if (mcnt)
2337 {
2338 mcnt--;
2339 p += 2;
2340 STORE_NUMBER_AND_INCR (p, mcnt);
2341 }
2342 else if (mcnt == 0)
2343 {
2344 p[2] = unused;
2345 p[3] = unused;
2346 goto on_failure;
2347 }
2348 else
2349 {
2350 fprintf (stderr, "regex: the succeed_n's n is not set.\n");
2351 exit (1);
2352 }
2353 break;
2354
2355 case jump_n:
2356 EXTRACT_NUMBER (mcnt, p + 2);
2357 /* Originally, this is how many times we CAN jump. */
2358 if (mcnt)
2359 {
2360 mcnt--;
2361 STORE_NUMBER(p + 2, mcnt);
2362 goto nofinalize; /* Do the jump without taking off
2363 any failure points. */
2364 }
2365 /* If don't have to jump any more, skip over the rest of command. */
2366 else
2367 p += 4;
2368 break;
2369
2370 case set_number_at:
2371 {
2372 register unsigned char *p1;
2373
2374 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2375 p1 = p + mcnt;
2376 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2377 STORE_NUMBER (p1, mcnt);
2378 break;
2379 }
2380
2381 /* Ignore these. Used to ignore the n of succeed_n's which
2382 currently have n == 0. */
2383 case unused:
2384 break;
2385
2386 case wordbound:
2387 if (AT_WORD_BOUNDARY)
2388 break;
2389 goto fail;
2390
2391 case notwordbound:
2392 if (AT_WORD_BOUNDARY)
2393 goto fail;
2394 break;
2395
2396 case wordbeg:
2397 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2398 if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1)))
2399 break;
2400 goto fail;
2401
2402 case wordend:
2403 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2404 if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
2405 && (!IS_A_LETTER (d) || AT_STRINGS_END))
2406 break;
2407 goto fail;
2408
2409#ifdef emacs
2410 case before_dot:
2411 if (PTR_CHAR_POS (d) >= point)
2412 goto fail;
2413 break;
2414
2415 case at_dot:
2416 if (PTR_CHAR_POS (d) != point)
2417 goto fail;
2418 break;
2419
2420 case after_dot:
2421 if (PTR_CHAR_POS (d) <= point)
2422 goto fail;
2423 break;
2424
2425 case wordchar:
2426 mcnt = (int) Sword;
2427 goto matchsyntax;
2428
2429 case syntaxspec:
2430 mcnt = *p++;
2431 matchsyntax:
2432 PREFETCH;
2433 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
2434 SET_REGS_MATCHED;
2435 break;
2436
2437 case notwordchar:
2438 mcnt = (int) Sword;
2439 goto matchnotsyntax;
2440
2441 case notsyntaxspec:
2442 mcnt = *p++;
2443 matchnotsyntax:
2444 PREFETCH;
2445 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
2446 SET_REGS_MATCHED;
2447 break;
2448
2449#else /* not emacs */
2450
2451 case wordchar:
2452 PREFETCH;
2453 if (!IS_A_LETTER (d))
2454 goto fail;
2455 SET_REGS_MATCHED;
2456 break;
2457
2458 case notwordchar:
2459 PREFETCH;
2460 if (IS_A_LETTER (d))
2461 goto fail;
2462 SET_REGS_MATCHED;
2463 break;
2464
2465#endif /* not emacs */
2466
2467 case begbuf:
2468 if (AT_STRINGS_BEG)
2469 break;
2470 goto fail;
2471
2472 case endbuf:
2473 if (AT_STRINGS_END)
2474 break;
2475 goto fail;
2476
2477 case exactn:
2478 /* Match the next few pattern characters exactly.
2479 mcnt is how many characters to match. */
2480 mcnt = *p++;
2481 /* This is written out as an if-else so we don't waste time
2482 testing `translate' inside the loop. */
2483 if (translate)
2484 {
2485 do
2486 {
2487 PREFETCH;
2488 if (translate[*d++] != *p++) goto fail;
2489 }
2490 while (--mcnt);
2491 }
2492 else
2493 {
2494 do
2495 {
2496 PREFETCH;
2497 if (*d++ != *p++) goto fail;
2498 }
2499 while (--mcnt);
2500 }
2501 SET_REGS_MATCHED;
2502 break;
2503 }
2504 continue; /* Successfully executed one pattern command; keep going. */
2505
2506 /* Jump here if any matching operation fails. */
2507 fail:
2508 if (stackp != stackb)
2509 /* A restart point is known. Restart there and pop it. */
2510 {
2511 short last_used_reg, this_reg;
2512
2513 /* If this failure point is from a dummy_failure_point, just
2514 skip it. */
2515 if (!stackp[-2])
2516 {
2517 POP_FAILURE_POINT ();
2518 goto fail;
2519 }
2520
2521 d = *--stackp;
2522 p = *--stackp;
2523 if (d >= string1 && d <= end1)
2524 dend = end_match_1;
2525 /* Restore register info. */
2526 last_used_reg = (short) *--stackp;
2527
2528 /* Make the ones that weren't saved -1 or 0 again. */
2529 for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
2530 {
2531 regend[this_reg] = (unsigned char *) -1;
2532 regstart[this_reg] = (unsigned char *) -1;
2533 IS_ACTIVE (reg_info[this_reg]) = 0;
2534 MATCHED_SOMETHING (reg_info[this_reg]) = 0;
2535 }
2536
2537 /* And restore the rest from the stack. */
2538 for ( ; this_reg > 0; this_reg--)
2539 {
2540 reg_info[this_reg] = *(struct register_info *) *--stackp;
2541 regend[this_reg] = *--stackp;
2542 regstart[this_reg] = *--stackp;
2543 }
2544 }
2545 else
2546 break; /* Matching at this starting point really fails. */
2547 }
2548
2549 if (best_regs_set)
2550 goto restore_best_regs;
2551 return -1; /* Failure to match. */
2552}
2553
2554
2555static int
2556bcmp_translate (s1, s2, len, translate)
2557 unsigned char *s1, *s2;
2558 register int len;
2559 unsigned char *translate;
2560{
2561 register unsigned char *p1 = s1, *p2 = s2;
2562 while (len)
2563 {
2564 if (translate [*p1++] != translate [*p2++]) return 1;
2565 len--;
2566 }
2567 return 0;
2568}
2569
2570
2571\f
2572/* Entry points compatible with 4.2 BSD regex library. */
2573
2574#ifndef emacs
2575
2576static struct re_pattern_buffer re_comp_buf;
2577
2578char *
2579re_comp (s)
2580 char *s;
2581{
2582 if (!s)
2583 {
2584 if (!re_comp_buf.buffer)
2585 return "No previous regular expression";
2586 return 0;
2587 }
2588
2589 if (!re_comp_buf.buffer)
2590 {
2591 if (!(re_comp_buf.buffer = (char *) malloc (200)))
2592 return "Memory exhausted";
2593 re_comp_buf.allocated = 200;
2594 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
2595 return "Memory exhausted";
2596 }
2597 return re_compile_pattern (s, strlen (s), &re_comp_buf);
2598}
2599
2600int
2601re_exec (s)
2602 char *s;
2603{
2604 int len = strlen (s);
2605 return 0 <= re_search (&re_comp_buf, s, len, 0, len,
2606 (struct re_registers *) 0);
2607}
2608#endif /* not emacs */
2609
2610
2611\f
2612#ifdef test
2613
2614#include <stdio.h>
2615
2616/* Indexed by a character, gives the upper case equivalent of the
2617 character. */
2618
2619char upcase[0400] =
2620 { 000, 001, 002, 003, 004, 005, 006, 007,
2621 010, 011, 012, 013, 014, 015, 016, 017,
2622 020, 021, 022, 023, 024, 025, 026, 027,
2623 030, 031, 032, 033, 034, 035, 036, 037,
2624 040, 041, 042, 043, 044, 045, 046, 047,
2625 050, 051, 052, 053, 054, 055, 056, 057,
2626 060, 061, 062, 063, 064, 065, 066, 067,
2627 070, 071, 072, 073, 074, 075, 076, 077,
2628 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2629 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2630 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2631 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
2632 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2633 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2634 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2635 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
2636 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
2637 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
2638 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
2639 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
2640 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
2641 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
2642 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
2643 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
2644 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
2645 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
2646 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
2647 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
2648 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
2649 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
2650 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
2651 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
2652 };
2653
2654#ifdef canned
2655
2656#include "tests.h"
2657
2658typedef enum { extended_test, basic_test } test_type;
2659
2660/* Use this to run the tests we've thought of. */
2661
2662void
2663main ()
2664{
2665 test_type t = extended_test;
2666
2667 if (t == basic_test)
2668 {
2669 printf ("Running basic tests:\n\n");
2670 test_posix_basic ();
2671 }
2672 else if (t == extended_test)
2673 {
2674 printf ("Running extended tests:\n\n");
2675 test_posix_extended ();
2676 }
2677}
2678
2679#else /* not canned */
2680
2681/* Use this to run interactive tests. */
2682
2683void
2684main (argc, argv)
2685 int argc;
2686 char **argv;
2687{
2688 char pat[80];
2689 struct re_pattern_buffer buf;
2690 int i;
2691 char c;
2692 char fastmap[(1 << BYTEWIDTH)];
2693
2694 /* Allow a command argument to specify the style of syntax. */
2695 if (argc > 1)
2696 obscure_syntax = atoi (argv[1]);
2697
2698 buf.allocated = 40;
2699 buf.buffer = (char *) malloc (buf.allocated);
2700 buf.fastmap = fastmap;
2701 buf.translate = upcase;
2702
2703 while (1)
2704 {
2705 gets (pat);
2706
2707 if (*pat)
2708 {
2709 re_compile_pattern (pat, strlen(pat), &buf);
2710
2711 for (i = 0; i < buf.used; i++)
2712 printchar (buf.buffer[i]);
2713
2714 putchar ('\n');
2715
2716 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
2717
2718 re_compile_fastmap (&buf);
2719 printf ("Allowed by fastmap: ");
2720 for (i = 0; i < (1 << BYTEWIDTH); i++)
2721 if (fastmap[i]) printchar (i);
2722 putchar ('\n');
2723 }
2724
2725 gets (pat); /* Now read the string to match against */
2726
2727 i = re_match (&buf, pat, strlen (pat), 0, 0);
2728 printf ("Match value %d.\n", i);
2729 }
2730}
2731
2732#endif
2733
2734
2735#ifdef NOTDEF
2736print_buf (bufp)
2737 struct re_pattern_buffer *bufp;
2738{
2739 int i;
2740
2741 printf ("buf is :\n----------------\n");
2742 for (i = 0; i < bufp->used; i++)
2743 printchar (bufp->buffer[i]);
2744
2745 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
2746
2747 printf ("Allowed by fastmap: ");
2748 for (i = 0; i < (1 << BYTEWIDTH); i++)
2749 if (bufp->fastmap[i])
2750 printchar (i);
2751 printf ("\nAllowed by translate: ");
2752 if (bufp->translate)
2753 for (i = 0; i < (1 << BYTEWIDTH); i++)
2754 if (bufp->translate[i])
2755 printchar (i);
2756 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
2757 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
2758}
2759#endif /* NOTDEF */
2760
2761printchar (c)
2762 char c;
2763{
2764 if (c < 040 || c >= 0177)
2765 {
2766 putchar ('\\');
2767 putchar (((c >> 6) & 3) + '0');
2768 putchar (((c >> 3) & 7) + '0');
2769 putchar ((c & 7) + '0');
2770 }
2771 else
2772 putchar (c);
2773}
2774
2775error (string)
2776 char *string;
2777{
2778 puts (string);
2779 exit (1);
2780}
2781#endif /* test */