BSD 4_4_Lite2 development
[unix-history] / usr / src / contrib / gawk-2.15.2 / regex.c
CommitLineData
166701a0
C
1/* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 1, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
17
18
19/* To test, compile with -Dtest. This Dtestable feature turns this into
20 a self-contained program which reads a pattern, describes how it
21 compiles, then reads a string and searches for it.
22
23 On the other hand, if you compile with both -Dtest and -Dcanned you
24 can run some tests we've already thought of. */
25
26
27#ifdef emacs
28
29/* The `emacs' switch turns on certain special matching commands
30 that make sense only in emacs. */
31
32#include "lisp.h"
33#include "buffer.h"
34#include "syntax.h"
35
36/* We write fatal error messages on standard error. */
37#include <stdio.h>
38
39/* isalpha(3) etc. are used for the character classes. */
40#include <ctype.h>
41
42#else /* not emacs */
43
44#include "awk.h"
45
46#define NO_ALLOCA /* try it out for now */
47#ifndef NO_ALLOCA
48/* Make alloca work the best possible way. */
49#ifdef __GNUC__
50#ifndef atarist
51#ifndef alloca
52#define alloca __builtin_alloca
53#endif
54#endif /* atarist */
55#else
56#if defined(sparc) && !defined(__GNUC__)
57#include <alloca.h>
58#else
59char *alloca ();
60#endif
61#endif /* __GNUC__ */
62
63#define FREE_AND_RETURN_VOID(stackb) return
64#define FREE_AND_RETURN(stackb,val) return(val)
65#define DOUBLE_STACK(stackx,stackb,len) \
66 (stackx = (unsigned char **) alloca (2 * len \
67 * sizeof (unsigned char *)),\
68 /* Only copy what is in use. */ \
69 (unsigned char **) memcpy (stackx, stackb, len * sizeof (char *)))
70#else /* NO_ALLOCA defined */
71#define FREE_AND_RETURN_VOID(stackb) free(stackb);return
72#define FREE_AND_RETURN(stackb,val) free(stackb);return(val)
73#define DOUBLE_STACK(stackx,stackb,len) \
74 (unsigned char **) realloc (stackb, 2 * len * sizeof (unsigned char *))
75#endif /* NO_ALLOCA */
76
77static void store_jump P((char *, int, char *));
78static void insert_jump P((int, char *, char *, char *));
79static void store_jump_n P((char *, int, char *, unsigned));
80static void insert_jump_n P((int, char *, char *, char *, unsigned));
81static void insert_op_2 P((int, char *, char *, int, int ));
82static int memcmp_translate P((unsigned char *, unsigned char *,
83 int, unsigned char *));
84long re_set_syntax P((long));
85
86/* Define the syntax stuff, so we can do the \<, \>, etc. */
87
88/* This must be nonzero for the wordchar and notwordchar pattern
89 commands in re_match_2. */
90#ifndef Sword
91#define Sword 1
92#endif
93
94#define SYNTAX(c) re_syntax_table[c]
95
96
97#ifdef SYNTAX_TABLE
98
99char *re_syntax_table;
100
101#else /* not SYNTAX_TABLE */
102
103static char re_syntax_table[256];
104static void init_syntax_once P((void));
105
106
107static void
108init_syntax_once ()
109{
110 register int c;
111 static int done = 0;
112
113 if (done)
114 return;
115
116 memset (re_syntax_table, 0, sizeof re_syntax_table);
117
118 for (c = 'a'; c <= 'z'; c++)
119 re_syntax_table[c] = Sword;
120
121 for (c = 'A'; c <= 'Z'; c++)
122 re_syntax_table[c] = Sword;
123
124 for (c = '0'; c <= '9'; c++)
125 re_syntax_table[c] = Sword;
126
127 /* Add specific syntax for ISO Latin-1. */
128 for (c = 0300; c <= 0377; c++)
129 re_syntax_table[c] = Sword;
130 re_syntax_table[0327] = 0;
131 re_syntax_table[0367] = 0;
132
133 done = 1;
134}
135
136#endif /* SYNTAX_TABLE */
137#undef P
138#endif /* emacs */
139
140
141/* Sequents are missing isgraph. */
142#ifndef isgraph
143#define isgraph(c) (isprint((c)) && !isspace((c)))
144#endif
145
146/* Get the interface, including the syntax bits. */
147#include "regex.h"
148
149
150/* These are the command codes that appear in compiled regular
151 expressions, one per byte. Some command codes are followed by
152 argument bytes. A command code can specify any interpretation
153 whatsoever for its arguments. Zero-bytes may appear in the compiled
154 regular expression.
155
156 The value of `exactn' is needed in search.c (search_buffer) in emacs.
157 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
158 `exactn' we use here must also be 1. */
159
160enum regexpcode
161 {
162 unused=0,
163 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
164 begline, /* Fail unless at beginning of line. */
165 endline, /* Fail unless at end of line. */
166 jump, /* Followed by two bytes giving relative address to jump to. */
167 on_failure_jump, /* Followed by two bytes giving relative address of
168 place to resume at in case of failure. */
169 finalize_jump, /* Throw away latest failure point and then jump to
170 address. */
171 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
172 This is used to jump back to the beginning
173 of a repeat. If the command that follows
174 this jump is clearly incompatible with the
175 one at the beginning of the repeat, such that
176 we can be sure that there is no use backtracking
177 out of repetitions already completed,
178 then we finalize. */
179 dummy_failure_jump, /* Jump, and push a dummy failure point. This
180 failure point will be thrown away if an attempt
181 is made to use it for a failure. A + construct
182 makes this before the first repeat. Also
183 use it as an intermediary kind of jump when
184 compiling an or construct. */
185 succeed_n, /* Used like on_failure_jump except has to succeed n times;
186 then gets turned into an on_failure_jump. The relative
187 address following it is useless until then. The
188 address is followed by two bytes containing n. */
189 jump_n, /* Similar to jump, but jump n times only; also the relative
190 address following is in turn followed by yet two more bytes
191 containing n. */
192 set_number_at, /* Set the following relative location to the
193 subsequent number. */
194 anychar, /* Matches any (more or less) one character. */
195 charset, /* Matches any one char belonging to specified set.
196 First following byte is number of bitmap bytes.
197 Then come bytes for a bitmap saying which chars are in.
198 Bits in each byte are ordered low-bit-first.
199 A character is in the set if its bit is 1.
200 A character too large to have a bit in the map
201 is automatically not in the set. */
202 charset_not, /* Same parameters as charset, but match any character
203 that is not one of those specified. */
204 start_memory, /* Start remembering the text that is matched, for
205 storing in a memory register. Followed by one
206 byte containing the register number. Register numbers
207 must be in the range 0 through RE_NREGS. */
208 stop_memory, /* Stop remembering the text that is matched
209 and store it in a memory register. Followed by
210 one byte containing the register number. Register
211 numbers must be in the range 0 through RE_NREGS. */
212 duplicate, /* Match a duplicate of something remembered.
213 Followed by one byte containing the index of the memory
214 register. */
215 before_dot, /* Succeeds if before point. */
216 at_dot, /* Succeeds if at point. */
217 after_dot, /* Succeeds if after point. */
218 begbuf, /* Succeeds if at beginning of buffer. */
219 endbuf, /* Succeeds if at end of buffer. */
220 wordchar, /* Matches any word-constituent character. */
221 notwordchar, /* Matches any char that is not a word-constituent. */
222 wordbeg, /* Succeeds if at word beginning. */
223 wordend, /* Succeeds if at word end. */
224 wordbound, /* Succeeds if at a word boundary. */
225 notwordbound,/* Succeeds if not at a word boundary. */
226 syntaxspec, /* Matches any character whose syntax is specified.
227 followed by a byte which contains a syntax code,
228 e.g., Sword. */
229 notsyntaxspec /* Matches any character whose syntax differs from
230 that specified. */
231 };
232
233
234/* Number of failure points to allocate space for initially,
235 when matching. If this number is exceeded, more space is allocated,
236 so it is not a hard limit. */
237
238#ifndef NFAILURES
239#define NFAILURES 80
240#endif
241
242#ifdef CHAR_UNSIGNED
243#define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
244#endif
245#ifndef SIGN_EXTEND_CHAR
246#define SIGN_EXTEND_CHAR(x) (x)
247#endif
248
249
250/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
251#define STORE_NUMBER(destination, number) \
252 { (destination)[0] = (number) & 0377; \
253 (destination)[1] = (number) >> 8; }
254
255/* Same as STORE_NUMBER, except increment the destination pointer to
256 the byte after where the number is stored. Watch out that values for
257 DESTINATION such as p + 1 won't work, whereas p will. */
258#define STORE_NUMBER_AND_INCR(destination, number) \
259 { STORE_NUMBER(destination, number); \
260 (destination) += 2; }
261
262
263/* Put into DESTINATION a number stored in two contingous bytes starting
264 at SOURCE. */
265#define EXTRACT_NUMBER(destination, source) \
266 { (destination) = *(source) & 0377; \
267 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
268
269/* Same as EXTRACT_NUMBER, except increment the pointer for source to
270 point to second byte of SOURCE. Note that SOURCE has to be a value
271 such as p, not, e.g., p + 1. */
272#define EXTRACT_NUMBER_AND_INCR(destination, source) \
273 { EXTRACT_NUMBER (destination, source); \
274 (source) += 2; }
275
276
277/* Specify the precise syntax of regexps for compilation. This provides
278 for compatibility for various utilities which historically have
279 different, incompatible syntaxes.
280
281 The argument SYNTAX is a bit-mask comprised of the various bits
282 defined in regex.h. */
283
284long
285re_set_syntax (syntax)
286 long syntax;
287{
288 long ret;
289
290 ret = obscure_syntax;
291 obscure_syntax = syntax;
292 return ret;
293}
294
295/* Set by re_set_syntax to the current regexp syntax to recognize. */
296long obscure_syntax = 0;
297
298
299\f
300/* Macros for re_compile_pattern, which is found below these definitions. */
301
302#define CHAR_CLASS_MAX_LENGTH 6
303
304/* Fetch the next character in the uncompiled pattern, translating it if
305 necessary. */
306#define PATFETCH(c) \
307 {if (p == pend) goto end_of_pattern; \
308 c = * (unsigned char *) p++; \
309 if (translate) c = translate[c]; }
310
311/* Fetch the next character in the uncompiled pattern, with no
312 translation. */
313#define PATFETCH_RAW(c) \
314 {if (p == pend) goto end_of_pattern; \
315 c = * (unsigned char *) p++; }
316
317#define PATUNFETCH p--
318
319
320/* If the buffer isn't allocated when it comes in, use this. */
321#define INIT_BUF_SIZE 28
322
323/* Make sure we have at least N more bytes of space in buffer. */
324#define GET_BUFFER_SPACE(n) \
325 { \
326 while (b - bufp->buffer + (n) >= bufp->allocated) \
327 EXTEND_BUFFER; \
328 }
329
330/* Make sure we have one more byte of buffer space and then add CH to it. */
331#define BUFPUSH(ch) \
332 { \
333 GET_BUFFER_SPACE (1); \
334 *b++ = (char) (ch); \
335 }
336
337/* Extend the buffer by twice its current size via reallociation and
338 reset the pointers that pointed into the old allocation to point to
339 the correct places in the new allocation. If extending the buffer
340 results in it being larger than 1 << 16, then flag memory exhausted. */
341#define EXTEND_BUFFER \
342 { char *old_buffer = bufp->buffer; \
343 if (bufp->allocated == (1L<<16)) goto too_big; \
344 bufp->allocated *= 2; \
345 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
346 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
347 if (bufp->buffer == 0) \
348 goto memory_exhausted; \
349 b = (b - old_buffer) + bufp->buffer; \
350 if (fixup_jump) \
351 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
352 if (laststart) \
353 laststart = (laststart - old_buffer) + bufp->buffer; \
354 begalt = (begalt - old_buffer) + bufp->buffer; \
355 if (pending_exact) \
356 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
357 }
358
359/* Set the bit for character C in a character set list. */
360#define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
361
362/* Get the next unsigned number in the uncompiled pattern. */
363#define GET_UNSIGNED_NUMBER(num) \
364 { if (p != pend) \
365 { \
366 PATFETCH (c); \
367 while (isdigit (c)) \
368 { \
369 if (num < 0) \
370 num = 0; \
371 num = num * 10 + c - '0'; \
372 if (p == pend) \
373 break; \
374 PATFETCH (c); \
375 } \
376 } \
377 }
378
379/* Subroutines for re_compile_pattern. */
380/* static void store_jump (), insert_jump (), store_jump_n (),
381 insert_jump_n (), insert_op_2 (); */
382
383
384/* re_compile_pattern takes a regular-expression string
385 and converts it into a buffer full of byte commands for matching.
386
387 PATTERN is the address of the pattern string
388 SIZE is the length of it.
389 BUFP is a struct re_pattern_buffer * which points to the info
390 on where to store the byte commands.
391 This structure contains a char * which points to the
392 actual space, which should have been obtained with malloc.
393 re_compile_pattern may use realloc to grow the buffer space.
394
395 The number of bytes of commands can be found out by looking in
396 the `struct re_pattern_buffer' that bufp pointed to, after
397 re_compile_pattern returns. */
398
399char *
400re_compile_pattern (pattern, size, bufp)
401 char *pattern;
402 size_t size;
403 struct re_pattern_buffer *bufp;
404{
405 register char *b = bufp->buffer;
406 register char *p = pattern;
407 char *pend = pattern + size;
408 register unsigned c, c1;
409 char *p0;
410 unsigned char *translate = (unsigned char *) bufp->translate;
411
412 /* Address of the count-byte of the most recently inserted `exactn'
413 command. This makes it possible to tell whether a new exact-match
414 character can be added to that command or requires a new `exactn'
415 command. */
416
417 char *pending_exact = 0;
418
419 /* Address of the place where a forward-jump should go to the end of
420 the containing expression. Each alternative of an `or', except the
421 last, ends with a forward-jump of this sort. */
422
423 char *fixup_jump = 0;
424
425 /* Address of start of the most recently finished expression.
426 This tells postfix * where to find the start of its operand. */
427
428 char *laststart = 0;
429
430 /* In processing a repeat, 1 means zero matches is allowed. */
431
432 char zero_times_ok;
433
434 /* In processing a repeat, 1 means many matches is allowed. */
435
436 char many_times_ok;
437
438 /* Address of beginning of regexp, or inside of last \(. */
439
440 char *begalt = b;
441
442 /* In processing an interval, at least this many matches must be made. */
443 int lower_bound;
444
445 /* In processing an interval, at most this many matches can be made. */
446 int upper_bound;
447
448 /* Place in pattern (i.e., the {) to which to go back if the interval
449 is invalid. */
450 char *beg_interval = 0;
451
452 /* Stack of information saved by \( and restored by \).
453 Four stack elements are pushed by each \(:
454 First, the value of b.
455 Second, the value of fixup_jump.
456 Third, the value of regnum.
457 Fourth, the value of begalt. */
458
459 int stackb[40];
460 int *stackp = stackb;
461 int *stacke = stackb + 40;
462 int *stackt;
463
464 /* Counts \('s as they are encountered. Remembered for the matching \),
465 where it becomes the register number to put in the stop_memory
466 command. */
467
468 int regnum = 1;
469
470 bufp->fastmap_accurate = 0;
471
472#ifndef emacs
473#ifndef SYNTAX_TABLE
474 /* Initialize the syntax table. */
475 init_syntax_once();
476#endif
477#endif
478
479 if (bufp->allocated == 0)
480 {
481 bufp->allocated = INIT_BUF_SIZE;
482 if (bufp->buffer)
483 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
484 bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
485 else
486 /* Caller did not allocate a buffer. Do it for them. */
487 bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
488 if (!bufp->buffer) goto memory_exhausted;
489 begalt = b = bufp->buffer;
490 }
491
492 while (p != pend)
493 {
494 PATFETCH (c);
495
496 switch (c)
497 {
498 case '$':
499 {
500 char *p1 = p;
501 /* When testing what follows the $,
502 look past the \-constructs that don't consume anything. */
503 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
504 while (p1 != pend)
505 {
506 if (*p1 == '\\' && p1 + 1 != pend
507 && (p1[1] == '<' || p1[1] == '>'
508 || p1[1] == '`' || p1[1] == '\''
509#ifdef emacs
510 || p1[1] == '='
511#endif
512 || p1[1] == 'b' || p1[1] == 'B'))
513 p1 += 2;
514 else
515 break;
516 }
517 if (obscure_syntax & RE_TIGHT_VBAR)
518 {
519 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
520 goto normal_char;
521 /* Make operand of last vbar end before this `$'. */
522 if (fixup_jump)
523 store_jump (fixup_jump, jump, b);
524 fixup_jump = 0;
525 BUFPUSH (endline);
526 break;
527 }
528 /* $ means succeed if at end of line, but only in special contexts.
529 If validly in the middle of a pattern, it is a normal character. */
530
531 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
532 goto invalid_pattern;
533 if (p1 == pend || *p1 == '\n'
534 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
535 || (obscure_syntax & RE_NO_BK_PARENS
536 ? *p1 == ')'
537 : *p1 == '\\' && p1[1] == ')')
538 || (obscure_syntax & RE_NO_BK_VBAR
539 ? *p1 == '|'
540 : *p1 == '\\' && p1[1] == '|'))
541 {
542 BUFPUSH (endline);
543 break;
544 }
545 goto normal_char;
546 }
547 case '^':
548 /* ^ means succeed if at beg of line, but only if no preceding
549 pattern. */
550
551 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
552 goto invalid_pattern;
553 if (laststart && p - 2 >= pattern && p[-2] != '\n'
554 && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
555 goto normal_char;
556 if (obscure_syntax & RE_TIGHT_VBAR)
557 {
558 if (p != pattern + 1
559 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
560 goto normal_char;
561 BUFPUSH (begline);
562 begalt = b;
563 }
564 else
565 BUFPUSH (begline);
566 break;
567
568 case '+':
569 case '?':
570 if ((obscure_syntax & RE_BK_PLUS_QM)
571 || (obscure_syntax & RE_LIMITED_OPS))
572 goto normal_char;
573 handle_plus:
574 case '*':
575 /* If there is no previous pattern, char not special. */
576 if (!laststart)
577 {
578 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
579 goto invalid_pattern;
580 else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
581 goto normal_char;
582 }
583 /* If there is a sequence of repetition chars,
584 collapse it down to just one. */
585 zero_times_ok = 0;
586 many_times_ok = 0;
587 while (1)
588 {
589 zero_times_ok |= c != '+';
590 many_times_ok |= c != '?';
591 if (p == pend)
592 break;
593 PATFETCH (c);
594 if (c == '*')
595 ;
596 else if (!(obscure_syntax & RE_BK_PLUS_QM)
597 && (c == '+' || c == '?'))
598 ;
599 else if ((obscure_syntax & RE_BK_PLUS_QM)
600 && c == '\\')
601 {
602 /* int c1; */
603 PATFETCH (c1);
604 if (!(c1 == '+' || c1 == '?'))
605 {
606 PATUNFETCH;
607 PATUNFETCH;
608 break;
609 }
610 c = c1;
611 }
612 else
613 {
614 PATUNFETCH;
615 break;
616 }
617 }
618
619 /* Star, etc. applied to an empty pattern is equivalent
620 to an empty pattern. */
621 if (!laststart)
622 break;
623
624 /* Now we know whether or not zero matches is allowed
625 and also whether or not two or more matches is allowed. */
626 if (many_times_ok)
627 {
628 /* If more than one repetition is allowed, put in at the
629 end a backward relative jump from b to before the next
630 jump we're going to put in below (which jumps from
631 laststart to after this jump). */
632 GET_BUFFER_SPACE (3);
633 store_jump (b, maybe_finalize_jump, laststart - 3);
634 b += 3; /* Because store_jump put stuff here. */
635 }
636 /* On failure, jump from laststart to b + 3, which will be the
637 end of the buffer after this jump is inserted. */
638 GET_BUFFER_SPACE (3);
639 insert_jump (on_failure_jump, laststart, b + 3, b);
640 pending_exact = 0;
641 b += 3;
642 if (!zero_times_ok)
643 {
644 /* At least one repetition is required, so insert a
645 dummy-failure before the initial on-failure-jump
646 instruction of the loop. This effects a skip over that
647 instruction the first time we hit that loop. */
648 GET_BUFFER_SPACE (6);
649 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
650 b += 3;
651 }
652 break;
653
654 case '.':
655 laststart = b;
656 BUFPUSH (anychar);
657 break;
658
659 case '[':
660 if (p == pend)
661 goto invalid_pattern;
662 while (b - bufp->buffer
663 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
664 EXTEND_BUFFER;
665
666 laststart = b;
667 if (*p == '^')
668 {
669 BUFPUSH (charset_not);
670 p++;
671 }
672 else
673 BUFPUSH (charset);
674 p0 = p;
675
676 BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
677 /* Clear the whole map */
678 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
679
680 if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
681 SET_LIST_BIT ('\n');
682
683
684 /* Read in characters and ranges, setting map bits. */
685 while (1)
686 {
687 /* Don't translate while fetching, in case it's a range bound.
688 When we set the bit for the character, we translate it. */
689 PATFETCH_RAW (c);
690
691 /* If set, \ escapes characters when inside [...]. */
692 if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
693 {
694 PATFETCH(c1);
695 SET_LIST_BIT (c1);
696 continue;
697 }
698 if (c == ']')
699 {
700 if (p == p0 + 1)
701 {
702 /* If this is an empty bracket expression. */
703 if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
704 && p == pend)
705 goto invalid_pattern;
706 }
707 else
708 /* Stop if this isn't merely a ] inside a bracket
709 expression, but rather the end of a bracket
710 expression. */
711 break;
712 }
713 /* Get a range. */
714 if (p[0] == '-' && p[1] != ']')
715 {
716 PATFETCH (c1);
717 /* Don't translate the range bounds while fetching them. */
718 PATFETCH_RAW (c1);
719
720 if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
721 goto invalid_pattern;
722
723 if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
724 && c1 == '-' && *p != ']')
725 goto invalid_pattern;
726
727 while (c <= c1)
728 {
729 /* Translate each char that's in the range. */
730 if (translate)
731 SET_LIST_BIT (translate[c]);
732 else
733 SET_LIST_BIT (c);
734 c++;
735 }
736 }
737 else if ((obscure_syntax & RE_CHAR_CLASSES)
738 && c == '[' && p[0] == ':')
739 {
740 /* Longest valid character class word has six characters. */
741 char str[CHAR_CLASS_MAX_LENGTH];
742 PATFETCH (c);
743 c1 = 0;
744 /* If no ] at end. */
745 if (p == pend)
746 goto invalid_pattern;
747 while (1)
748 {
749 /* Don't translate the ``character class'' characters. */
750 PATFETCH_RAW (c);
751 if (c == ':' || c == ']' || p == pend
752 || c1 == CHAR_CLASS_MAX_LENGTH)
753 break;
754 str[c1++] = c;
755 }
756 str[c1] = '\0';
757 if (p == pend
758 || c == ']' /* End of the bracket expression. */
759 || p[0] != ']'
760 || p + 1 == pend
761 || (strcmp (str, "alpha") != 0
762 && strcmp (str, "upper") != 0
763 && strcmp (str, "lower") != 0
764 && strcmp (str, "digit") != 0
765 && strcmp (str, "alnum") != 0
766 && strcmp (str, "xdigit") != 0
767 && strcmp (str, "space") != 0
768 && strcmp (str, "print") != 0
769 && strcmp (str, "punct") != 0
770 && strcmp (str, "graph") != 0
771 && strcmp (str, "cntrl") != 0))
772 {
773 /* Undo the ending character, the letters, and leave
774 the leading : and [ (but set bits for them). */
775 c1++;
776 while (c1--)
777 PATUNFETCH;
778 SET_LIST_BIT ('[');
779 SET_LIST_BIT (':');
780 }
781 else
782 {
783 /* The ] at the end of the character class. */
784 PATFETCH (c);
785 if (c != ']')
786 goto invalid_pattern;
787 for (c = 0; c < (1 << BYTEWIDTH); c++)
788 {
789 if ((strcmp (str, "alpha") == 0 && isalpha (c))
790 || (strcmp (str, "upper") == 0 && isupper (c))
791 || (strcmp (str, "lower") == 0 && islower (c))
792 || (strcmp (str, "digit") == 0 && isdigit (c))
793 || (strcmp (str, "alnum") == 0 && isalnum (c))
794 || (strcmp (str, "xdigit") == 0 && isxdigit (c))
795 || (strcmp (str, "space") == 0 && isspace (c))
796 || (strcmp (str, "print") == 0 && isprint (c))
797 || (strcmp (str, "punct") == 0 && ispunct (c))
798 || (strcmp (str, "graph") == 0 && isgraph (c))
799 || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
800 SET_LIST_BIT (c);
801 }
802 }
803 }
804 else if (translate)
805 SET_LIST_BIT (translate[c]);
806 else
807 SET_LIST_BIT (c);
808 }
809
810 /* Discard any character set/class bitmap bytes that are all
811 0 at the end of the map. Decrement the map-length byte too. */
812 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
813 b[-1]--;
814 b += b[-1];
815 break;
816
817 case '(':
818 if (! (obscure_syntax & RE_NO_BK_PARENS))
819 goto normal_char;
820 else
821 goto handle_open;
822
823 case ')':
824 if (! (obscure_syntax & RE_NO_BK_PARENS))
825 goto normal_char;
826 else
827 goto handle_close;
828
829 case '\n':
830 if (! (obscure_syntax & RE_NEWLINE_OR))
831 goto normal_char;
832 else
833 goto handle_bar;
834
835 case '|':
836 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
837 && (! laststart || p == pend))
838 goto invalid_pattern;
839 else if (! (obscure_syntax & RE_NO_BK_VBAR))
840 goto normal_char;
841 else
842 goto handle_bar;
843
844 case '{':
845 if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
846 && (obscure_syntax & RE_INTERVALS)))
847 goto normal_char;
848 else
849 goto handle_interval;
850
851 case '\\':
852 if (p == pend) goto invalid_pattern;
853 PATFETCH_RAW (c);
854 switch (c)
855 {
856 case '(':
857 if (obscure_syntax & RE_NO_BK_PARENS)
858 goto normal_backsl;
859 handle_open:
860 if (stackp == stacke) goto nesting_too_deep;
861
862 /* Laststart should point to the start_memory that we are about
863 to push (unless the pattern has RE_NREGS or more ('s). */
864 *stackp++ = b - bufp->buffer;
865 if (regnum < RE_NREGS)
866 {
867 BUFPUSH (start_memory);
868 BUFPUSH (regnum);
869 }
870 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
871 *stackp++ = regnum++;
872 *stackp++ = begalt - bufp->buffer;
873 fixup_jump = 0;
874 laststart = 0;
875 begalt = b;
876 break;
877
878 case ')':
879 if (obscure_syntax & RE_NO_BK_PARENS)
880 goto normal_backsl;
881 handle_close:
882 if (stackp == stackb) goto unmatched_close;
883 begalt = *--stackp + bufp->buffer;
884 if (fixup_jump)
885 store_jump (fixup_jump, jump, b);
886 if (stackp[-1] < RE_NREGS)
887 {
888 BUFPUSH (stop_memory);
889 BUFPUSH (stackp[-1]);
890 }
891 stackp -= 2;
892 fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
893 laststart = *--stackp + bufp->buffer;
894 break;
895
896 case '|':
897 if ((obscure_syntax & RE_LIMITED_OPS)
898 || (obscure_syntax & RE_NO_BK_VBAR))
899 goto normal_backsl;
900 handle_bar:
901 if (obscure_syntax & RE_LIMITED_OPS)
902 goto normal_char;
903 /* Insert before the previous alternative a jump which
904 jumps to this alternative if the former fails. */
905 GET_BUFFER_SPACE (6);
906 insert_jump (on_failure_jump, begalt, b + 6, b);
907 pending_exact = 0;
908 b += 3;
909 /* The alternative before the previous alternative has a
910 jump after it which gets executed if it gets matched.
911 Adjust that jump so it will jump to the previous
912 alternative's analogous jump (put in below, which in
913 turn will jump to the next (if any) alternative's such
914 jump, etc.). The last such jump jumps to the correct
915 final destination. */
916 if (fixup_jump)
917 store_jump (fixup_jump, jump, b);
918
919 /* Leave space for a jump after previous alternative---to be
920 filled in later. */
921 fixup_jump = b;
922 b += 3;
923
924 laststart = 0;
925 begalt = b;
926 break;
927
928 case '{':
929 if (! (obscure_syntax & RE_INTERVALS)
930 /* Let \{ be a literal. */
931 || ((obscure_syntax & RE_INTERVALS)
932 && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
933 /* If it's the string "\{". */
934 || (p - 2 == pattern && p == pend))
935 goto normal_backsl;
936 handle_interval:
937 beg_interval = p - 1; /* The {. */
938 /* If there is no previous pattern, this isn't an interval. */
939 if (!laststart)
940 {
941 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
942 goto invalid_pattern;
943 else
944 goto normal_backsl;
945 }
946 /* It also isn't an interval if not preceded by an re
947 matching a single character or subexpression, or if
948 the current type of intervals can't handle back
949 references and the previous thing is a back reference. */
950 if (! (*laststart == anychar
951 || *laststart == charset
952 || *laststart == charset_not
953 || *laststart == start_memory
954 || (*laststart == exactn && laststart[1] == 1)
955 || (! (obscure_syntax & RE_NO_BK_REFS)
956 && *laststart == duplicate)))
957 {
958 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
959 goto normal_char;
960
961 /* Posix extended syntax is handled in previous
962 statement; this is for Posix basic syntax. */
963 if (obscure_syntax & RE_INTERVALS)
964 goto invalid_pattern;
965
966 goto normal_backsl;
967 }
968 lower_bound = -1; /* So can see if are set. */
969 upper_bound = -1;
970 GET_UNSIGNED_NUMBER (lower_bound);
971 if (c == ',')
972 {
973 GET_UNSIGNED_NUMBER (upper_bound);
974 if (upper_bound < 0)
975 upper_bound = RE_DUP_MAX;
976 }
977 if (upper_bound < 0)
978 upper_bound = lower_bound;
979 if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
980 {
981 if (c != '\\')
982 goto invalid_pattern;
983 PATFETCH (c);
984 }
985 if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
986 || lower_bound > upper_bound
987 || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
988 && p != pend && *p == '{'))
989 {
990 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
991 goto unfetch_interval;
992 else
993 goto invalid_pattern;
994 }
995
996 /* If upper_bound is zero, don't want to succeed at all;
997 jump from laststart to b + 3, which will be the end of
998 the buffer after this jump is inserted. */
999
1000 if (upper_bound == 0)
1001 {
1002 GET_BUFFER_SPACE (3);
1003 insert_jump (jump, laststart, b + 3, b);
1004 b += 3;
1005 }
1006
1007 /* Otherwise, after lower_bound number of succeeds, jump
1008 to after the jump_n which will be inserted at the end
1009 of the buffer, and insert that jump_n. */
1010 else
1011 { /* Set to 5 if only one repetition is allowed and
1012 hence no jump_n is inserted at the current end of
1013 the buffer; then only space for the succeed_n is
1014 needed. Otherwise, need space for both the
1015 succeed_n and the jump_n. */
1016
1017 unsigned slots_needed = upper_bound == 1 ? 5 : 10;
1018
1019 GET_BUFFER_SPACE (slots_needed);
1020 /* Initialize the succeed_n to n, even though it will
1021 be set by its attendant set_number_at, because
1022 re_compile_fastmap will need to know it. Jump to
1023 what the end of buffer will be after inserting
1024 this succeed_n and possibly appending a jump_n. */
1025 insert_jump_n (succeed_n, laststart, b + slots_needed,
1026 b, lower_bound);
1027 b += 5; /* Just increment for the succeed_n here. */
1028
1029 /* More than one repetition is allowed, so put in at
1030 the end of the buffer a backward jump from b to the
1031 succeed_n we put in above. By the time we've gotten
1032 to this jump when matching, we'll have matched once
1033 already, so jump back only upper_bound - 1 times. */
1034
1035 if (upper_bound > 1)
1036 {
1037 store_jump_n (b, jump_n, laststart, upper_bound - 1);
1038 b += 5;
1039 /* When hit this when matching, reset the
1040 preceding jump_n's n to upper_bound - 1. */
1041 BUFPUSH (set_number_at);
1042 GET_BUFFER_SPACE (2);
1043 STORE_NUMBER_AND_INCR (b, -5);
1044 STORE_NUMBER_AND_INCR (b, upper_bound - 1);
1045 }
1046 /* When hit this when matching, set the succeed_n's n. */
1047 GET_BUFFER_SPACE (5);
1048 insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
1049 b += 5;
1050 }
1051 pending_exact = 0;
1052 beg_interval = 0;
1053 break;
1054
1055
1056 unfetch_interval:
1057 /* If an invalid interval, match the characters as literals. */
1058 if (beg_interval)
1059 p = beg_interval;
1060 else
1061 {
1062 fprintf (stderr,
1063 "regex: no interval beginning to which to backtrack.\n");
1064 exit (1);
1065 }
1066
1067 beg_interval = 0;
1068 PATFETCH (c); /* normal_char expects char in `c'. */
1069 goto normal_char;
1070 break;
1071
1072#ifdef emacs
1073 case '=':
1074 BUFPUSH (at_dot);
1075 break;
1076
1077 case 's':
1078 laststart = b;
1079 BUFPUSH (syntaxspec);
1080 PATFETCH (c);
1081 BUFPUSH (syntax_spec_code[c]);
1082 break;
1083
1084 case 'S':
1085 laststart = b;
1086 BUFPUSH (notsyntaxspec);
1087 PATFETCH (c);
1088 BUFPUSH (syntax_spec_code[c]);
1089 break;
1090#endif /* emacs */
1091
1092 case 'w':
1093 laststart = b;
1094 BUFPUSH (wordchar);
1095 break;
1096
1097 case 'W':
1098 laststart = b;
1099 BUFPUSH (notwordchar);
1100 break;
1101
1102 case '<':
1103 BUFPUSH (wordbeg);
1104 break;
1105
1106 case '>':
1107 BUFPUSH (wordend);
1108 break;
1109
1110 case 'b':
1111 BUFPUSH (wordbound);
1112 break;
1113
1114 case 'B':
1115 BUFPUSH (notwordbound);
1116 break;
1117
1118 case '`':
1119 BUFPUSH (begbuf);
1120 break;
1121
1122 case '\'':
1123 BUFPUSH (endbuf);
1124 break;
1125
1126 case '1':
1127 case '2':
1128 case '3':
1129 case '4':
1130 case '5':
1131 case '6':
1132 case '7':
1133 case '8':
1134 case '9':
1135 if (obscure_syntax & RE_NO_BK_REFS)
1136 goto normal_char;
1137 c1 = c - '0';
1138 if (c1 >= regnum)
1139 {
1140 if (obscure_syntax & RE_NO_EMPTY_BK_REF)
1141 goto invalid_pattern;
1142 else
1143 goto normal_char;
1144 }
1145 /* Can't back reference to a subexpression if inside of it. */
1146 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
1147 if (*stackt == c1)
1148 goto normal_char;
1149 laststart = b;
1150 BUFPUSH (duplicate);
1151 BUFPUSH (c1);
1152 break;
1153
1154 case '+':
1155 case '?':
1156 if (obscure_syntax & RE_BK_PLUS_QM)
1157 goto handle_plus;
1158 else
1159 goto normal_backsl;
1160 break;
1161
1162 default:
1163 normal_backsl:
1164 /* You might think it would be useful for \ to mean
1165 not to translate; but if we don't translate it
1166 it will never match anything. */
1167 if (translate) c = translate[c];
1168 goto normal_char;
1169 }
1170 break;
1171
1172 default:
1173 normal_char: /* Expects the character in `c'. */
1174 if (!pending_exact || pending_exact + *pending_exact + 1 != b
1175 || *pending_exact == 0177 || *p == '*' || *p == '^'
1176 || ((obscure_syntax & RE_BK_PLUS_QM)
1177 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
1178 : (*p == '+' || *p == '?'))
1179 || ((obscure_syntax & RE_INTERVALS)
1180 && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
1181 ? *p == '{'
1182 : (p[0] == '\\' && p[1] == '{'))))
1183 {
1184 laststart = b;
1185 BUFPUSH (exactn);
1186 pending_exact = b;
1187 BUFPUSH (0);
1188 }
1189 BUFPUSH (c);
1190 (*pending_exact)++;
1191 }
1192 }
1193
1194 if (fixup_jump)
1195 store_jump (fixup_jump, jump, b);
1196
1197 if (stackp != stackb) goto unmatched_open;
1198
1199 bufp->used = b - bufp->buffer;
1200 return 0;
1201
1202 invalid_pattern:
1203 return "Invalid regular expression";
1204
1205 unmatched_open:
1206 return "Unmatched \\(";
1207
1208 unmatched_close:
1209 return "Unmatched \\)";
1210
1211 end_of_pattern:
1212 return "Premature end of regular expression";
1213
1214 nesting_too_deep:
1215 return "Nesting too deep";
1216
1217 too_big:
1218 return "Regular expression too big";
1219
1220 memory_exhausted:
1221 return "Memory exhausted";
1222}
1223
1224
1225/* Store a jump of the form <OPCODE> <relative address>.
1226 Store in the location FROM a jump operation to jump to relative
1227 address FROM - TO. OPCODE is the opcode to store. */
1228
1229static void
1230store_jump (from, opcode, to)
1231 char *from, *to;
1232 int opcode;
1233{
1234 from[0] = (char)opcode;
1235 STORE_NUMBER(from + 1, to - (from + 3));
1236}
1237
1238
1239/* Open up space before char FROM, and insert there a jump to TO.
1240 CURRENT_END gives the end of the storage not in use, so we know
1241 how much data to copy up. OP is the opcode of the jump to insert.
1242
1243 If you call this function, you must zero out pending_exact. */
1244
1245static void
1246insert_jump (op, from, to, current_end)
1247 int op;
1248 char *from, *to, *current_end;
1249{
1250 register char *pfrom = current_end; /* Copy from here... */
1251 register char *pto = current_end + 3; /* ...to here. */
1252
1253 while (pfrom != from)
1254 *--pto = *--pfrom;
1255 store_jump (from, op, to);
1256}
1257
1258
1259/* Store a jump of the form <opcode> <relative address> <n> .
1260
1261 Store in the location FROM a jump operation to jump to relative
1262 address FROM - TO. OPCODE is the opcode to store, N is a number the
1263 jump uses, say, to decide how many times to jump.
1264
1265 If you call this function, you must zero out pending_exact. */
1266
1267static void
1268store_jump_n (from, opcode, to, n)
1269 char *from, *to;
1270 int opcode;
1271 unsigned n;
1272{
1273 from[0] = (char)opcode;
1274 STORE_NUMBER (from + 1, to - (from + 3));
1275 STORE_NUMBER (from + 3, n);
1276}
1277
1278
1279/* Similar to insert_jump, but handles a jump which needs an extra
1280 number to handle minimum and maximum cases. Open up space at
1281 location FROM, and insert there a jump to TO. CURRENT_END gives the
1282 end of the storage in use, so we know how much data to copy up. OP is
1283 the opcode of the jump to insert.
1284
1285 If you call this function, you must zero out pending_exact. */
1286
1287static void
1288insert_jump_n (op, from, to, current_end, n)
1289 int op;
1290 char *from, *to, *current_end;
1291 unsigned n;
1292{
1293 register char *pfrom = current_end; /* Copy from here... */
1294 register char *pto = current_end + 5; /* ...to here. */
1295
1296 while (pfrom != from)
1297 *--pto = *--pfrom;
1298 store_jump_n (from, op, to, n);
1299}
1300
1301
1302/* Open up space at location THERE, and insert operation OP followed by
1303 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
1304 we know how much data to copy up.
1305
1306 If you call this function, you must zero out pending_exact. */
1307
1308static void
1309insert_op_2 (op, there, current_end, num_1, num_2)
1310 int op;
1311 char *there, *current_end;
1312 int num_1, num_2;
1313{
1314 register char *pfrom = current_end; /* Copy from here... */
1315 register char *pto = current_end + 5; /* ...to here. */
1316
1317 while (pfrom != there)
1318 *--pto = *--pfrom;
1319
1320 there[0] = (char)op;
1321 STORE_NUMBER (there + 1, num_1);
1322 STORE_NUMBER (there + 3, num_2);
1323}
1324
1325
1326\f
1327/* Given a pattern, compute a fastmap from it. The fastmap records
1328 which of the (1 << BYTEWIDTH) possible characters can start a string
1329 that matches the pattern. This fastmap is used by re_search to skip
1330 quickly over totally implausible text.
1331
1332 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
1333 area as bufp->fastmap.
1334 The other components of bufp describe the pattern to be used. */
1335
1336void
1337re_compile_fastmap (bufp)
1338 struct re_pattern_buffer *bufp;
1339{
1340 unsigned char *pattern = (unsigned char *) bufp->buffer;
1341 int size = bufp->used;
1342 register char *fastmap = bufp->fastmap;
1343 register unsigned char *p = pattern;
1344 register unsigned char *pend = pattern + size;
1345 register int j, k;
1346 unsigned char *translate = (unsigned char *) bufp->translate;
1347 unsigned is_a_succeed_n;
1348
1349#ifndef NO_ALLOCA
1350 unsigned char *stackb[NFAILURES];
1351 unsigned char **stackp = stackb;
1352
1353#else
1354 unsigned char **stackb;
1355 unsigned char **stackp;
1356 stackb = (unsigned char **) malloc (NFAILURES * sizeof (unsigned char *));
1357 stackp = stackb;
1358
1359#endif /* NO_ALLOCA */
1360 memset (fastmap, 0, (1 << BYTEWIDTH));
1361 bufp->fastmap_accurate = 1;
1362 bufp->can_be_null = 0;
1363
1364 while (p)
1365 {
1366 is_a_succeed_n = 0;
1367 if (p == pend)
1368 {
1369 bufp->can_be_null = 1;
1370 break;
1371 }
1372#ifdef SWITCH_ENUM_BUG
1373 switch ((int) ((enum regexpcode) *p++))
1374#else
1375 switch ((enum regexpcode) *p++)
1376#endif
1377 {
1378 case exactn:
1379 if (translate)
1380 fastmap[translate[p[1]]] = 1;
1381 else
1382 fastmap[p[1]] = 1;
1383 break;
1384
1385 case begline:
1386 case before_dot:
1387 case at_dot:
1388 case after_dot:
1389 case begbuf:
1390 case endbuf:
1391 case wordbound:
1392 case notwordbound:
1393 case wordbeg:
1394 case wordend:
1395 continue;
1396
1397 case endline:
1398 if (translate)
1399 fastmap[translate['\n']] = 1;
1400 else
1401 fastmap['\n'] = 1;
1402
1403 if (bufp->can_be_null != 1)
1404 bufp->can_be_null = 2;
1405 break;
1406
1407 case jump_n:
1408 case finalize_jump:
1409 case maybe_finalize_jump:
1410 case jump:
1411 case dummy_failure_jump:
1412 EXTRACT_NUMBER_AND_INCR (j, p);
1413 p += j;
1414 if (j > 0)
1415 continue;
1416 /* Jump backward reached implies we just went through
1417 the body of a loop and matched nothing.
1418 Opcode jumped to should be an on_failure_jump.
1419 Just treat it like an ordinary jump.
1420 For a * loop, it has pushed its failure point already;
1421 If so, discard that as redundant. */
1422
1423 if ((enum regexpcode) *p != on_failure_jump
1424 && (enum regexpcode) *p != succeed_n)
1425 continue;
1426 p++;
1427 EXTRACT_NUMBER_AND_INCR (j, p);
1428 p += j;
1429 if (stackp != stackb && *stackp == p)
1430 stackp--;
1431 continue;
1432
1433 case on_failure_jump:
1434 handle_on_failure_jump:
1435 EXTRACT_NUMBER_AND_INCR (j, p);
1436 *++stackp = p + j;
1437 if (is_a_succeed_n)
1438 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
1439 continue;
1440
1441 case succeed_n:
1442 is_a_succeed_n = 1;
1443 /* Get to the number of times to succeed. */
1444 p += 2;
1445 /* Increment p past the n for when k != 0. */
1446 EXTRACT_NUMBER_AND_INCR (k, p);
1447 if (k == 0)
1448 {
1449 p -= 4;
1450 goto handle_on_failure_jump;
1451 }
1452 continue;
1453
1454 case set_number_at:
1455 p += 4;
1456 continue;
1457
1458 case start_memory:
1459 case stop_memory:
1460 p++;
1461 continue;
1462
1463 case duplicate:
1464 bufp->can_be_null = 1;
1465 fastmap['\n'] = 1;
1466 case anychar:
1467 for (j = 0; j < (1 << BYTEWIDTH); j++)
1468 if (j != '\n')
1469 fastmap[j] = 1;
1470 if (bufp->can_be_null)
1471 {
1472 FREE_AND_RETURN_VOID(stackb);
1473 }
1474 /* Don't return; check the alternative paths
1475 so we can set can_be_null if appropriate. */
1476 break;
1477
1478 case wordchar:
1479 for (j = 0; j < (1 << BYTEWIDTH); j++)
1480 if (SYNTAX (j) == Sword)
1481 fastmap[j] = 1;
1482 break;
1483
1484 case notwordchar:
1485 for (j = 0; j < (1 << BYTEWIDTH); j++)
1486 if (SYNTAX (j) != Sword)
1487 fastmap[j] = 1;
1488 break;
1489
1490#ifdef emacs
1491 case syntaxspec:
1492 k = *p++;
1493 for (j = 0; j < (1 << BYTEWIDTH); j++)
1494 if (SYNTAX (j) == (enum syntaxcode) k)
1495 fastmap[j] = 1;
1496 break;
1497
1498 case notsyntaxspec:
1499 k = *p++;
1500 for (j = 0; j < (1 << BYTEWIDTH); j++)
1501 if (SYNTAX (j) != (enum syntaxcode) k)
1502 fastmap[j] = 1;
1503 break;
1504
1505#else /* not emacs */
1506 case syntaxspec:
1507 case notsyntaxspec:
1508 break;
1509#endif /* not emacs */
1510
1511 case charset:
1512 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1513 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
1514 {
1515 if (translate)
1516 fastmap[translate[j]] = 1;
1517 else
1518 fastmap[j] = 1;
1519 }
1520 break;
1521
1522 case charset_not:
1523 /* Chars beyond end of map must be allowed */
1524 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
1525 if (translate)
1526 fastmap[translate[j]] = 1;
1527 else
1528 fastmap[j] = 1;
1529
1530 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1531 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
1532 {
1533 if (translate)
1534 fastmap[translate[j]] = 1;
1535 else
1536 fastmap[j] = 1;
1537 }
1538 break;
1539
1540 case unused: /* pacify gcc -Wall */
1541 break;
1542 }
1543
1544 /* Get here means we have successfully found the possible starting
1545 characters of one path of the pattern. We need not follow this
1546 path any farther. Instead, look at the next alternative
1547 remembered in the stack. */
1548 if (stackp != stackb)
1549 p = *stackp--;
1550 else
1551 break;
1552 }
1553 FREE_AND_RETURN_VOID(stackb);
1554}
1555
1556
1557\f
1558/* Like re_search_2, below, but only one string is specified, and
1559 doesn't let you say where to stop matching. */
1560
1561int
1562re_search (pbufp, string, size, startpos, range, regs)
1563 struct re_pattern_buffer *pbufp;
1564 char *string;
1565 int size, startpos, range;
1566 struct re_registers *regs;
1567{
1568 return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
1569 regs, size);
1570}
1571
1572
1573/* Using the compiled pattern in PBUFP->buffer, first tries to match the
1574 virtual concatenation of STRING1 and STRING2, starting first at index
1575 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
1576 places to try before giving up. If RANGE is negative, it searches
1577 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
1578 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
1579 In REGS, return the indices of the virtual concatenation of STRING1
1580 and STRING2 that matched the entire PBUFP->buffer and its contained
1581 subexpressions. Do not consider matching one past the index MSTOP in
1582 the virtual concatenation of STRING1 and STRING2.
1583
1584 The value returned is the position in the strings at which the match
1585 was found, or -1 if no match was found, or -2 if error (such as
1586 failure stack overflow). */
1587
1588int
1589re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
1590 regs, mstop)
1591 struct re_pattern_buffer *pbufp;
1592 char *string1, *string2;
1593 int size1, size2;
1594 int startpos;
1595 register int range;
1596 struct re_registers *regs;
1597 int mstop;
1598{
1599 register char *fastmap = pbufp->fastmap;
1600 register unsigned char *translate = (unsigned char *) pbufp->translate;
1601 int total_size = size1 + size2;
1602 int endpos = startpos + range;
1603 int val;
1604
1605 /* Check for out-of-range starting position. */
1606 if (startpos < 0 || startpos > total_size)
1607 return -1;
1608
1609 /* Fix up range if it would eventually take startpos outside of the
1610 virtual concatenation of string1 and string2. */
1611 if (endpos < -1)
1612 range = -1 - startpos;
1613 else if (endpos > total_size)
1614 range = total_size - startpos;
1615
1616 /* Update the fastmap now if not correct already. */
1617 if (fastmap && !pbufp->fastmap_accurate)
1618 re_compile_fastmap (pbufp);
1619
1620 /* If the search isn't to be a backwards one, don't waste time in a
1621 long search for a pattern that says it is anchored. */
1622 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
1623 && range > 0)
1624 {
1625 if (startpos > 0)
1626 return -1;
1627 else
1628 range = 1;
1629 }
1630
1631 while (1)
1632 {
1633 /* If a fastmap is supplied, skip quickly over characters that
1634 cannot possibly be the start of a match. Note, however, that
1635 if the pattern can possibly match the null string, we must
1636 test it at each starting point so that we take the first null
1637 string we get. */
1638
1639 if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
1640 {
1641 if (range > 0) /* Searching forwards. */
1642 {
1643 register int lim = 0;
1644 register unsigned char *p;
1645 int irange = range;
1646 if (startpos < size1 && startpos + range >= size1)
1647 lim = range - (size1 - startpos);
1648
1649 p = ((unsigned char *)
1650 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
1651
1652 while (range > lim && !fastmap[translate
1653 ? translate[*p++]
1654 : *p++])
1655 range--;
1656 startpos += irange - range;
1657 }
1658 else /* Searching backwards. */
1659 {
1660 register unsigned char c;
1661
1662 if (string1 == 0 || startpos >= size1)
1663 c = string2[startpos - size1];
1664 else
1665 c = string1[startpos];
1666
1667 c &= 0xff;
1668 if (translate ? !fastmap[translate[c]] : !fastmap[c])
1669 goto advance;
1670 }
1671 }
1672
1673 if (range >= 0 && startpos == total_size
1674 && fastmap && pbufp->can_be_null == 0)
1675 return -1;
1676
1677 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
1678 regs, mstop);
1679 if (val >= 0)
1680 return startpos;
1681 if (val == -2)
1682 return -2;
1683
1684#ifndef NO_ALLOCA
1685#ifdef C_ALLOCA
1686 alloca (0);
1687#endif /* C_ALLOCA */
1688
1689#endif /* NO_ALLOCA */
1690 advance:
1691 if (!range)
1692 break;
1693 else if (range > 0)
1694 {
1695 range--;
1696 startpos++;
1697 }
1698 else
1699 {
1700 range++;
1701 startpos--;
1702 }
1703 }
1704 return -1;
1705}
1706
1707
1708\f
1709#ifndef emacs /* emacs never uses this. */
1710int
1711re_match (pbufp, string, size, pos, regs)
1712 struct re_pattern_buffer *pbufp;
1713 char *string;
1714 int size, pos;
1715 struct re_registers *regs;
1716{
1717 return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
1718}
1719#endif /* not emacs */
1720
1721
1722/* The following are used for re_match_2, defined below: */
1723
1724/* Roughly the maximum number of failure points on the stack. Would be
1725 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
1726
1727int re_max_failures = 2000;
1728
1729/* Routine used by re_match_2. */
1730/* static int memcmp_translate (); *//* already declared */
1731
1732
1733/* Structure and accessing macros used in re_match_2: */
1734
1735struct register_info
1736{
1737 unsigned is_active : 1;
1738 unsigned matched_something : 1;
1739};
1740
1741#define IS_ACTIVE(R) ((R).is_active)
1742#define MATCHED_SOMETHING(R) ((R).matched_something)
1743
1744
1745/* Macros used by re_match_2: */
1746
1747
1748/* I.e., regstart, regend, and reg_info. */
1749
1750#define NUM_REG_ITEMS 3
1751
1752/* We push at most this many things on the stack whenever we
1753 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
1754 arguments to the PUSH_FAILURE_POINT macro. */
1755
1756#define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
1757
1758
1759/* We push this many things on the stack whenever we fail. */
1760
1761#define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
1762
1763
1764/* This pushes most of the information about the current state we will want
1765 if we ever fail back to it. */
1766
1767#define PUSH_FAILURE_POINT(pattern_place, string_place) \
1768 { \
1769 long last_used_reg, this_reg; \
1770 \
1771 /* Find out how many registers are active or have been matched. \
1772 (Aside from register zero, which is only set at the end.) */ \
1773 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
1774 if (regstart[last_used_reg] != (unsigned char *)(-1L)) \
1775 break; \
1776 \
1777 if (stacke - stackp < NUM_FAILURE_ITEMS) \
1778 { \
1779 unsigned char **stackx; \
1780 unsigned int len = stacke - stackb; \
1781 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
1782 { \
1783 FREE_AND_RETURN(stackb,(-2)); \
1784 } \
1785 \
1786 /* Roughly double the size of the stack. */ \
1787 stackx = DOUBLE_STACK(stackx,stackb,len); \
1788 /* Rearrange the pointers. */ \
1789 stackp = stackx + (stackp - stackb); \
1790 stackb = stackx; \
1791 stacke = stackb + 2 * len; \
1792 } \
1793 \
1794 /* Now push the info for each of those registers. */ \
1795 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
1796 { \
1797 *stackp++ = regstart[this_reg]; \
1798 *stackp++ = regend[this_reg]; \
1799 *stackp++ = (unsigned char *) &reg_info[this_reg]; \
1800 } \
1801 \
1802 /* Push how many registers we saved. */ \
1803 *stackp++ = (unsigned char *) last_used_reg; \
1804 \
1805 *stackp++ = pattern_place; \
1806 *stackp++ = string_place; \
1807 }
1808
1809
1810/* This pops what PUSH_FAILURE_POINT pushes. */
1811
1812#define POP_FAILURE_POINT() \
1813 { \
1814 int temp; \
1815 stackp -= 2; /* Remove failure points. */ \
1816 temp = (int) *--stackp; /* How many regs pushed. */ \
1817 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
1818 stackp -= temp; /* Remove the register info. */ \
1819 }
1820
1821
1822#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
1823
1824/* Is true if there is a first string and if PTR is pointing anywhere
1825 inside it or just past the end. */
1826
1827#define IS_IN_FIRST_STRING(ptr) \
1828 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
1829
1830/* Call before fetching a character with *d. This switches over to
1831 string2 if necessary. */
1832
1833#define PREFETCH \
1834 while (d == dend) \
1835 { \
1836 /* end of string2 => fail. */ \
1837 if (dend == end_match_2) \
1838 goto fail; \
1839 /* end of string1 => advance to string2. */ \
1840 d = string2; \
1841 dend = end_match_2; \
1842 }
1843
1844
1845/* Call this when have matched something; it sets `matched' flags for the
1846 registers corresponding to the subexpressions of which we currently
1847 are inside. */
1848#define SET_REGS_MATCHED \
1849 { unsigned this_reg; \
1850 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
1851 { \
1852 if (IS_ACTIVE(reg_info[this_reg])) \
1853 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
1854 else \
1855 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
1856 } \
1857 }
1858
1859/* Test if at very beginning or at very end of the virtual concatenation
1860 of string1 and string2. If there is only one string, we've put it in
1861 string2. */
1862
1863#define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
1864#define AT_STRINGS_END (d == end2)
1865
1866#define AT_WORD_BOUNDARY \
1867 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
1868
1869/* We have two special cases to check for:
1870 1) if we're past the end of string1, we have to look at the first
1871 character in string2;
1872 2) if we're before the beginning of string2, we have to look at the
1873 last character in string1; we assume there is a string1, so use
1874 this in conjunction with AT_STRINGS_BEG. */
1875#define IS_A_LETTER(d) \
1876 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
1877 == Sword)
1878
1879
1880/* Match the pattern described by PBUFP against the virtual
1881 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
1882 respectively. Start the match at index POS in the virtual
1883 concatenation of STRING1 and STRING2. In REGS, return the indices of
1884 the virtual concatenation of STRING1 and STRING2 that matched the
1885 entire PBUFP->buffer and its contained subexpressions. Do not
1886 consider matching one past the index MSTOP in the virtual
1887 concatenation of STRING1 and STRING2.
1888
1889 If pbufp->fastmap is nonzero, then it had better be up to date.
1890
1891 The reason that the data to match are specified as two components
1892 which are to be regarded as concatenated is so this function can be
1893 used directly on the contents of an Emacs buffer.
1894
1895 -1 is returned if there is no match. -2 is returned if there is an
1896 error (such as match stack overflow). Otherwise the value is the
1897 length of the substring which was matched. */
1898
1899int
1900re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
1901 struct re_pattern_buffer *pbufp;
1902 char *string1_arg, *string2_arg;
1903 int size1, size2;
1904 int pos;
1905 struct re_registers *regs;
1906 int mstop;
1907{
1908 register unsigned char *p = (unsigned char *) pbufp->buffer;
1909
1910 /* Pointer to beyond end of buffer. */
1911 register unsigned char *pend = p + pbufp->used;
1912
1913 unsigned char *string1 = (unsigned char *) string1_arg;
1914 unsigned char *string2 = (unsigned char *) string2_arg;
1915 unsigned char *end1; /* Just past end of first string. */
1916 unsigned char *end2; /* Just past end of second string. */
1917
1918 /* Pointers into string1 and string2, just past the last characters in
1919 each to consider matching. */
1920 unsigned char *end_match_1, *end_match_2;
1921
1922 register unsigned char *d, *dend;
1923 register int mcnt; /* Multipurpose. */
1924 unsigned char *translate = (unsigned char *) pbufp->translate;
1925 unsigned is_a_jump_n = 0;
1926
1927 /* Failure point stack. Each place that can handle a failure further
1928 down the line pushes a failure point on this stack. It consists of
1929 restart, regend, and reg_info for all registers corresponding to the
1930 subexpressions we're currently inside, plus the number of such
1931 registers, and, finally, two char *'s. The first char * is where to
1932 resume scanning the pattern; the second one is where to resume
1933 scanning the strings. If the latter is zero, the failure point is a
1934 ``dummy''; if a failure happens and the failure point is a dummy, it
1935 gets discarded and the next next one is tried. */
1936
1937#ifndef NO_ALLOCA
1938 unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1939#endif
1940 unsigned char **stackb;
1941 unsigned char **stackp;
1942 unsigned char **stacke;
1943
1944
1945 /* Information on the contents of registers. These are pointers into
1946 the input strings; they record just what was matched (on this
1947 attempt) by a subexpression part of the pattern, that is, the
1948 regnum-th regstart pointer points to where in the pattern we began
1949 matching and the regnum-th regend points to right after where we
1950 stopped matching the regnum-th subexpression. (The zeroth register
1951 keeps track of what the whole pattern matches.) */
1952
1953 unsigned char *regstart[RE_NREGS];
1954 unsigned char *regend[RE_NREGS];
1955
1956 /* The is_active field of reg_info helps us keep track of which (possibly
1957 nested) subexpressions we are currently in. The matched_something
1958 field of reg_info[reg_num] helps us tell whether or not we have
1959 matched any of the pattern so far this time through the reg_num-th
1960 subexpression. These two fields get reset each time through any
1961 loop their register is in. */
1962
1963 struct register_info reg_info[RE_NREGS];
1964
1965
1966 /* The following record the register info as found in the above
1967 variables when we find a match better than any we've seen before.
1968 This happens as we backtrack through the failure points, which in
1969 turn happens only if we have not yet matched the entire string. */
1970
1971 unsigned best_regs_set = 0;
1972 unsigned char *best_regstart[RE_NREGS];
1973 unsigned char *best_regend[RE_NREGS];
1974
1975 /* Initialize the stack. */
1976#ifdef NO_ALLOCA
1977 stackb = (unsigned char **) malloc (MAX_NUM_FAILURE_ITEMS * NFAILURES * sizeof (char *));
1978#else
1979 stackb = initial_stack;
1980#endif
1981 stackp = stackb;
1982 stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1983
1984#ifdef DEBUG_REGEX
1985 fprintf (stderr, "Entering re_match_2(%s%s)\n", string1_arg, string2_arg);
1986#endif
1987
1988 /* Initialize subexpression text positions to -1 to mark ones that no
1989 \( or ( and \) or ) has been seen for. Also set all registers to
1990 inactive and mark them as not having matched anything or ever
1991 failed. */
1992 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1993 {
1994 regstart[mcnt] = regend[mcnt] = (unsigned char *) (-1L);
1995 IS_ACTIVE (reg_info[mcnt]) = 0;
1996 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
1997 }
1998
1999 if (regs)
2000 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
2001 regs->start[mcnt] = regs->end[mcnt] = -1;
2002
2003 /* Set up pointers to ends of strings.
2004 Don't allow the second string to be empty unless both are empty. */
2005 if (size2 == 0)
2006 {
2007 string2 = string1;
2008 size2 = size1;
2009 string1 = 0;
2010 size1 = 0;
2011 }
2012 end1 = string1 + size1;
2013 end2 = string2 + size2;
2014
2015 /* Compute where to stop matching, within the two strings. */
2016 if (mstop <= size1)
2017 {
2018 end_match_1 = string1 + mstop;
2019 end_match_2 = string2;
2020 }
2021 else
2022 {
2023 end_match_1 = end1;
2024 end_match_2 = string2 + mstop - size1;
2025 }
2026
2027 /* `p' scans through the pattern as `d' scans through the data. `dend'
2028 is the end of the input string that `d' points within. `d' is
2029 advanced into the following input string whenever necessary, but
2030 this happens before fetching; therefore, at the beginning of the
2031 loop, `d' can be pointing at the end of a string, but it cannot
2032 equal string2. */
2033
2034 if (size1 != 0 && pos <= size1)
2035 d = string1 + pos, dend = end_match_1;
2036 else
2037 d = string2 + pos - size1, dend = end_match_2;
2038
2039
2040 /* This loops over pattern commands. It exits by returning from the
2041 function if match is complete, or it drops through if match fails
2042 at this starting point in the input data. */
2043
2044 while (1)
2045 {
2046#ifdef DEBUG_REGEX
2047 fprintf (stderr,
2048 "regex loop(%d): matching 0x%02d\n",
2049 p - (unsigned char *) pbufp->buffer,
2050 *p);
2051#endif
2052 is_a_jump_n = 0;
2053 /* End of pattern means we might have succeeded. */
2054 if (p == pend)
2055 {
2056 /* If not end of string, try backtracking. Otherwise done. */
2057 if (d != end_match_2)
2058 {
2059 if (stackp != stackb)
2060 {
2061 /* More failure points to try. */
2062
2063 unsigned in_same_string =
2064 IS_IN_FIRST_STRING (best_regend[0])
2065 == MATCHING_IN_FIRST_STRING;
2066
2067 /* If exceeds best match so far, save it. */
2068 if (! best_regs_set
2069 || (in_same_string && d > best_regend[0])
2070 || (! in_same_string && ! MATCHING_IN_FIRST_STRING))
2071 {
2072 best_regs_set = 1;
2073 best_regend[0] = d; /* Never use regstart[0]. */
2074
2075 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2076 {
2077 best_regstart[mcnt] = regstart[mcnt];
2078 best_regend[mcnt] = regend[mcnt];
2079 }
2080 }
2081 goto fail;
2082 }
2083 /* If no failure points, don't restore garbage. */
2084 else if (best_regs_set)
2085 {
2086 restore_best_regs:
2087 /* Restore best match. */
2088 d = best_regend[0];
2089
2090 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
2091 {
2092 regstart[mcnt] = best_regstart[mcnt];
2093 regend[mcnt] = best_regend[mcnt];
2094 }
2095 }
2096 }
2097
2098 /* If caller wants register contents data back, convert it
2099 to indices. */
2100 if (regs)
2101 {
2102 regs->start[0] = pos;
2103 if (MATCHING_IN_FIRST_STRING)
2104 regs->end[0] = d - string1;
2105 else
2106 regs->end[0] = d - string2 + size1;
2107 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2108 {
2109 if (regend[mcnt] == (unsigned char *)(-1L))
2110 {
2111 regs->start[mcnt] = -1;
2112 regs->end[mcnt] = -1;
2113 continue;
2114 }
2115 if (IS_IN_FIRST_STRING (regstart[mcnt]))
2116 regs->start[mcnt] = regstart[mcnt] - string1;
2117 else
2118 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
2119
2120 if (IS_IN_FIRST_STRING (regend[mcnt]))
2121 regs->end[mcnt] = regend[mcnt] - string1;
2122 else
2123 regs->end[mcnt] = regend[mcnt] - string2 + size1;
2124 }
2125 }
2126 FREE_AND_RETURN(stackb,
2127 (d - pos - (MATCHING_IN_FIRST_STRING ?
2128 string1 :
2129 string2 - size1)));
2130 }
2131
2132 /* Otherwise match next pattern command. */
2133#ifdef SWITCH_ENUM_BUG
2134 switch ((int) ((enum regexpcode) *p++))
2135#else
2136 switch ((enum regexpcode) *p++)
2137#endif
2138 {
2139
2140 /* \( [or `(', as appropriate] is represented by start_memory,
2141 \) by stop_memory. Both of those commands are followed by
2142 a register number in the next byte. The text matched
2143 within the \( and \) is recorded under that number. */
2144 case start_memory:
2145 regstart[*p] = d;
2146 IS_ACTIVE (reg_info[*p]) = 1;
2147 MATCHED_SOMETHING (reg_info[*p]) = 0;
2148 p++;
2149 break;
2150
2151 case stop_memory:
2152 regend[*p] = d;
2153 IS_ACTIVE (reg_info[*p]) = 0;
2154
2155 /* If just failed to match something this time around with a sub-
2156 expression that's in a loop, try to force exit from the loop. */
2157 if ((! MATCHED_SOMETHING (reg_info[*p])
2158 || (enum regexpcode) p[-3] == start_memory)
2159 && (p + 1) != pend)
2160 {
2161 register unsigned char *p2 = p + 1;
2162 mcnt = 0;
2163 switch (*p2++)
2164 {
2165 case jump_n:
2166 is_a_jump_n = 1;
2167 case finalize_jump:
2168 case maybe_finalize_jump:
2169 case jump:
2170 case dummy_failure_jump:
2171 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2172 if (is_a_jump_n)
2173 p2 += 2;
2174 break;
2175 }
2176 p2 += mcnt;
2177
2178 /* If the next operation is a jump backwards in the pattern
2179 to an on_failure_jump, exit from the loop by forcing a
2180 failure after pushing on the stack the on_failure_jump's
2181 jump in the pattern, and d. */
2182 if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
2183 {
2184 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2185 PUSH_FAILURE_POINT (p2 + mcnt, d);
2186 goto fail;
2187 }
2188 }
2189 p++;
2190 break;
2191
2192 /* \<digit> has been turned into a `duplicate' command which is
2193 followed by the numeric value of <digit> as the register number. */
2194 case duplicate:
2195 {
2196 int regno = *p++; /* Get which register to match against */
2197 register unsigned char *d2, *dend2;
2198
2199 /* Where in input to try to start matching. */
2200 d2 = regstart[regno];
2201
2202 /* Where to stop matching; if both the place to start and
2203 the place to stop matching are in the same string, then
2204 set to the place to stop, otherwise, for now have to use
2205 the end of the first string. */
2206
2207 dend2 = ((IS_IN_FIRST_STRING (regstart[regno])
2208 == IS_IN_FIRST_STRING (regend[regno]))
2209 ? regend[regno] : end_match_1);
2210 while (1)
2211 {
2212 /* If necessary, advance to next segment in register
2213 contents. */
2214 while (d2 == dend2)
2215 {
2216 if (dend2 == end_match_2) break;
2217 if (dend2 == regend[regno]) break;
2218 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
2219 }
2220 /* At end of register contents => success */
2221 if (d2 == dend2) break;
2222
2223 /* If necessary, advance to next segment in data. */
2224 PREFETCH;
2225
2226 /* How many characters left in this segment to match. */
2227 mcnt = dend - d;
2228
2229 /* Want how many consecutive characters we can match in
2230 one shot, so, if necessary, adjust the count. */
2231 if (mcnt > dend2 - d2)
2232 mcnt = dend2 - d2;
2233
2234 /* Compare that many; failure if mismatch, else move
2235 past them. */
2236 if (translate
2237 ? memcmp_translate (d, d2, mcnt, translate)
2238 : memcmp ((char *)d, (char *)d2, mcnt))
2239 goto fail;
2240 d += mcnt, d2 += mcnt;
2241 }
2242 }
2243 break;
2244
2245 case anychar:
2246 PREFETCH; /* Fetch a data character. */
2247 /* Match anything but a newline, maybe even a null. */
2248 if ((translate ? translate[*d] : *d) == '\n'
2249 || ((obscure_syntax & RE_DOT_NOT_NULL)
2250 && (translate ? translate[*d] : *d) == '\000'))
2251 goto fail;
2252 SET_REGS_MATCHED;
2253 d++;
2254 break;
2255
2256 case charset:
2257 case charset_not:
2258 {
2259 int not = 0; /* Nonzero for charset_not. */
2260 register int c;
2261 if (*(p - 1) == (unsigned char) charset_not)
2262 not = 1;
2263
2264 PREFETCH; /* Fetch a data character. */
2265
2266 if (translate)
2267 c = translate[*d];
2268 else
2269 c = *d;
2270
2271 if (c < *p * BYTEWIDTH
2272 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2273 not = !not;
2274
2275 p += 1 + *p;
2276
2277 if (!not) goto fail;
2278 SET_REGS_MATCHED;
2279 d++;
2280 break;
2281 }
2282
2283 case begline:
2284 if ((size1 != 0 && d == string1)
2285 || (size1 == 0 && size2 != 0 && d == string2)
2286 || (d && d[-1] == '\n')
2287 || (size1 == 0 && size2 == 0))
2288 break;
2289 else
2290 goto fail;
2291
2292 case endline:
2293 if (d == end2
2294 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
2295 break;
2296 goto fail;
2297
2298 /* `or' constructs are handled by starting each alternative with
2299 an on_failure_jump that points to the start of the next
2300 alternative. Each alternative except the last ends with a
2301 jump to the joining point. (Actually, each jump except for
2302 the last one really jumps to the following jump, because
2303 tensioning the jumps is a hassle.) */
2304
2305 /* The start of a stupid repeat has an on_failure_jump that points
2306 past the end of the repeat text. This makes a failure point so
2307 that on failure to match a repetition, matching restarts past
2308 as many repetitions have been found with no way to fail and
2309 look for another one. */
2310
2311 /* A smart repeat is similar but loops back to the on_failure_jump
2312 so that each repetition makes another failure point. */
2313
2314 case on_failure_jump:
2315 on_failure:
2316 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2317 PUSH_FAILURE_POINT (p + mcnt, d);
2318 break;
2319
2320 /* The end of a smart repeat has a maybe_finalize_jump back.
2321 Change it either to a finalize_jump or an ordinary jump. */
2322 case maybe_finalize_jump:
2323 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2324 {
2325 register unsigned char *p2 = p;
2326 /* Compare what follows with the beginning of the repeat.
2327 If we can establish that there is nothing that they would
2328 both match, we can change to finalize_jump. */
2329 while (p2 + 1 != pend
2330 && (*p2 == (unsigned char) stop_memory
2331 || *p2 == (unsigned char) start_memory))
2332 p2 += 2; /* Skip over reg number. */
2333 if (p2 == pend)
2334 p[-3] = (unsigned char) finalize_jump;
2335 else if (*p2 == (unsigned char) exactn
2336 || *p2 == (unsigned char) endline)
2337 {
2338 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
2339 register unsigned char *p1 = p + mcnt;
2340 /* p1[0] ... p1[2] are an on_failure_jump.
2341 Examine what follows that. */
2342 if (p1[3] == (unsigned char) exactn && p1[5] != c)
2343 p[-3] = (unsigned char) finalize_jump;
2344 else if (p1[3] == (unsigned char) charset
2345 || p1[3] == (unsigned char) charset_not)
2346 {
2347 int not = p1[3] == (unsigned char) charset_not;
2348 if (c < p1[4] * BYTEWIDTH
2349 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2350 not = !not;
2351 /* `not' is 1 if c would match. */
2352 /* That means it is not safe to finalize. */
2353 if (!not)
2354 p[-3] = (unsigned char) finalize_jump;
2355 }
2356 }
2357 }
2358 p -= 2; /* Point at relative address again. */
2359 if (p[-1] != (unsigned char) finalize_jump)
2360 {
2361 p[-1] = (unsigned char) jump;
2362 goto nofinalize;
2363 }
2364 /* Note fall through. */
2365
2366 /* The end of a stupid repeat has a finalize_jump back to the
2367 start, where another failure point will be made which will
2368 point to after all the repetitions found so far. */
2369
2370 /* Take off failure points put on by matching on_failure_jump
2371 because didn't fail. Also remove the register information
2372 put on by the on_failure_jump. */
2373 case finalize_jump:
2374 POP_FAILURE_POINT ();
2375 /* Note fall through. */
2376
2377 /* Jump without taking off any failure points. */
2378 case jump:
2379 nofinalize:
2380 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2381 p += mcnt;
2382 break;
2383
2384 case dummy_failure_jump:
2385 /* Normally, the on_failure_jump pushes a failure point, which
2386 then gets popped at finalize_jump. We will end up at
2387 finalize_jump, also, and with a pattern of, say, `a+', we
2388 are skipping over the on_failure_jump, so we have to push
2389 something meaningless for finalize_jump to pop. */
2390 PUSH_FAILURE_POINT (0, 0);
2391 goto nofinalize;
2392
2393
2394 /* Have to succeed matching what follows at least n times. Then
2395 just handle like an on_failure_jump. */
2396 case succeed_n:
2397 EXTRACT_NUMBER (mcnt, p + 2);
2398 /* Originally, this is how many times we HAVE to succeed. */
2399 if (mcnt)
2400 {
2401 mcnt--;
2402 p += 2;
2403 STORE_NUMBER_AND_INCR (p, mcnt);
2404 }
2405 else if (mcnt == 0)
2406 {
2407 p[2] = unused;
2408 p[3] = unused;
2409 goto on_failure;
2410 }
2411 else
2412 {
2413 fprintf (stderr, "regex: the succeed_n's n is not set.\n");
2414 exit (1);
2415 }
2416 break;
2417
2418 case jump_n:
2419 EXTRACT_NUMBER (mcnt, p + 2);
2420 /* Originally, this is how many times we CAN jump. */
2421 if (mcnt)
2422 {
2423 mcnt--;
2424 STORE_NUMBER(p + 2, mcnt);
2425 goto nofinalize; /* Do the jump without taking off
2426 any failure points. */
2427 }
2428 /* If don't have to jump any more, skip over the rest of command. */
2429 else
2430 p += 4;
2431 break;
2432
2433 case set_number_at:
2434 {
2435 register unsigned char *p1;
2436
2437 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2438 p1 = p + mcnt;
2439 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2440 STORE_NUMBER (p1, mcnt);
2441 break;
2442 }
2443
2444 /* Ignore these. Used to ignore the n of succeed_n's which
2445 currently have n == 0. */
2446 case unused:
2447 break;
2448
2449 case wordbound:
2450 if (AT_WORD_BOUNDARY)
2451 break;
2452 goto fail;
2453
2454 case notwordbound:
2455 if (AT_WORD_BOUNDARY)
2456 goto fail;
2457 break;
2458
2459 case wordbeg:
2460 if (IS_A_LETTER (d) && (!IS_A_LETTER (d - 1) || AT_STRINGS_BEG))
2461 break;
2462 goto fail;
2463
2464 case wordend:
2465 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2466 if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
2467 && (!IS_A_LETTER (d) || AT_STRINGS_END))
2468 break;
2469 goto fail;
2470
2471#ifdef emacs
2472 case before_dot:
2473 if (PTR_CHAR_POS (d) >= point)
2474 goto fail;
2475 break;
2476
2477 case at_dot:
2478 if (PTR_CHAR_POS (d) != point)
2479 goto fail;
2480 break;
2481
2482 case after_dot:
2483 if (PTR_CHAR_POS (d) <= point)
2484 goto fail;
2485 break;
2486
2487 case wordchar:
2488 mcnt = (int) Sword;
2489 goto matchsyntax;
2490
2491 case syntaxspec:
2492 mcnt = *p++;
2493 matchsyntax:
2494 PREFETCH;
2495 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
2496 SET_REGS_MATCHED;
2497 break;
2498
2499 case notwordchar:
2500 mcnt = (int) Sword;
2501 goto matchnotsyntax;
2502
2503 case notsyntaxspec:
2504 mcnt = *p++;
2505 matchnotsyntax:
2506 PREFETCH;
2507 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
2508 SET_REGS_MATCHED;
2509 break;
2510
2511#else /* not emacs */
2512
2513 case wordchar:
2514 PREFETCH;
2515 if (!IS_A_LETTER (d))
2516 goto fail;
2517 SET_REGS_MATCHED;
2518 break;
2519
2520 case notwordchar:
2521 PREFETCH;
2522 if (IS_A_LETTER (d))
2523 goto fail;
2524 SET_REGS_MATCHED;
2525 break;
2526
2527 case before_dot:
2528 case at_dot:
2529 case after_dot:
2530 case syntaxspec:
2531 case notsyntaxspec:
2532 break;
2533
2534#endif /* not emacs */
2535
2536 case begbuf:
2537 if (AT_STRINGS_BEG)
2538 break;
2539 goto fail;
2540
2541 case endbuf:
2542 if (AT_STRINGS_END)
2543 break;
2544 goto fail;
2545
2546 case exactn:
2547 /* Match the next few pattern characters exactly.
2548 mcnt is how many characters to match. */
2549 mcnt = *p++;
2550 /* This is written out as an if-else so we don't waste time
2551 testing `translate' inside the loop. */
2552 if (translate)
2553 {
2554 do
2555 {
2556 PREFETCH;
2557 if (translate[*d++] != *p++) goto fail;
2558 }
2559 while (--mcnt);
2560 }
2561 else
2562 {
2563 do
2564 {
2565 PREFETCH;
2566 if (*d++ != *p++) goto fail;
2567 }
2568 while (--mcnt);
2569 }
2570 SET_REGS_MATCHED;
2571 break;
2572 }
2573 continue; /* Successfully executed one pattern command; keep going. */
2574
2575 /* Jump here if any matching operation fails. */
2576 fail:
2577 if (stackp != stackb)
2578 /* A restart point is known. Restart there and pop it. */
2579 {
2580 short last_used_reg, this_reg;
2581
2582 /* If this failure point is from a dummy_failure_point, just
2583 skip it. */
2584 if (!stackp[-2])
2585 {
2586 POP_FAILURE_POINT ();
2587 goto fail;
2588 }
2589
2590 d = *--stackp;
2591 p = *--stackp;
2592 if (d >= string1 && d <= end1)
2593 dend = end_match_1;
2594 /* Restore register info. */
2595 last_used_reg = (long) *--stackp;
2596
2597 /* Make the ones that weren't saved -1 or 0 again. */
2598 for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
2599 {
2600 regend[this_reg] = (unsigned char *) (-1L);
2601 regstart[this_reg] = (unsigned char *) (-1L);
2602 IS_ACTIVE (reg_info[this_reg]) = 0;
2603 MATCHED_SOMETHING (reg_info[this_reg]) = 0;
2604 }
2605
2606 /* And restore the rest from the stack. */
2607 for ( ; this_reg > 0; this_reg--)
2608 {
2609 reg_info[this_reg] = *(struct register_info *) *--stackp;
2610 regend[this_reg] = *--stackp;
2611 regstart[this_reg] = *--stackp;
2612 }
2613 }
2614 else
2615 break; /* Matching at this starting point really fails. */
2616 }
2617
2618 if (best_regs_set)
2619 goto restore_best_regs;
2620
2621 FREE_AND_RETURN(stackb,(-1)); /* Failure to match. */
2622}
2623
2624
2625static int
2626memcmp_translate (s1, s2, len, translate)
2627 unsigned char *s1, *s2;
2628 register int len;
2629 unsigned char *translate;
2630{
2631 register unsigned char *p1 = s1, *p2 = s2;
2632 while (len)
2633 {
2634 if (translate [*p1++] != translate [*p2++]) return 1;
2635 len--;
2636 }
2637 return 0;
2638}
2639
2640
2641\f
2642/* Entry points compatible with 4.2 BSD regex library. */
2643
2644#if !defined(emacs) && !defined(GAWK)
2645
2646static struct re_pattern_buffer re_comp_buf;
2647
2648char *
2649re_comp (s)
2650 char *s;
2651{
2652 if (!s)
2653 {
2654 if (!re_comp_buf.buffer)
2655 return "No previous regular expression";
2656 return 0;
2657 }
2658
2659 if (!re_comp_buf.buffer)
2660 {
2661 if (!(re_comp_buf.buffer = (char *) malloc (200)))
2662 return "Memory exhausted";
2663 re_comp_buf.allocated = 200;
2664 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
2665 return "Memory exhausted";
2666 }
2667 return re_compile_pattern (s, strlen (s), &re_comp_buf);
2668}
2669
2670int
2671re_exec (s)
2672 char *s;
2673{
2674 int len = strlen (s);
2675 return 0 <= re_search (&re_comp_buf, s, len, 0, len,
2676 (struct re_registers *) 0);
2677}
2678#endif /* not emacs && not GAWK */
2679
2680
2681\f
2682#ifdef test
2683
2684#ifdef atarist
2685long _stksize = 2L; /* reserve memory for stack */
2686#endif
2687#include <stdio.h>
2688
2689/* Indexed by a character, gives the upper case equivalent of the
2690 character. */
2691
2692char upcase[0400] =
2693 { 000, 001, 002, 003, 004, 005, 006, 007,
2694 010, 011, 012, 013, 014, 015, 016, 017,
2695 020, 021, 022, 023, 024, 025, 026, 027,
2696 030, 031, 032, 033, 034, 035, 036, 037,
2697 040, 041, 042, 043, 044, 045, 046, 047,
2698 050, 051, 052, 053, 054, 055, 056, 057,
2699 060, 061, 062, 063, 064, 065, 066, 067,
2700 070, 071, 072, 073, 074, 075, 076, 077,
2701 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2702 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2703 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2704 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
2705 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2706 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2707 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2708 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
2709 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
2710 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
2711 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
2712 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
2713 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
2714 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
2715 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
2716 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
2717 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
2718 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
2719 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
2720 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
2721 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
2722 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
2723 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
2724 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
2725 };
2726
2727#ifdef canned
2728
2729#include "tests.h"
2730
2731typedef enum { extended_test, basic_test } test_type;
2732
2733/* Use this to run the tests we've thought of. */
2734
2735void
2736main ()
2737{
2738 test_type t = extended_test;
2739
2740 if (t == basic_test)
2741 {
2742 printf ("Running basic tests:\n\n");
2743 test_posix_basic ();
2744 }
2745 else if (t == extended_test)
2746 {
2747 printf ("Running extended tests:\n\n");
2748 test_posix_extended ();
2749 }
2750}
2751
2752#else /* not canned */
2753
2754/* Use this to run interactive tests. */
2755
2756void
2757main (argc, argv)
2758 int argc;
2759 char **argv;
2760{
2761 char pat[80];
2762 struct re_pattern_buffer buf;
2763 int i;
2764 char c;
2765 char fastmap[(1 << BYTEWIDTH)];
2766
2767 /* Allow a command argument to specify the style of syntax. */
2768 if (argc > 1)
2769 obscure_syntax = atol (argv[1]);
2770
2771 buf.allocated = 40;
2772 buf.buffer = (char *) malloc (buf.allocated);
2773 buf.fastmap = fastmap;
2774 buf.translate = upcase;
2775
2776 while (1)
2777 {
2778 gets (pat);
2779
2780 if (*pat)
2781 {
2782 re_compile_pattern (pat, strlen(pat), &buf);
2783
2784 for (i = 0; i < buf.used; i++)
2785 printchar (buf.buffer[i]);
2786
2787 putchar ('\n');
2788
2789 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
2790
2791 re_compile_fastmap (&buf);
2792 printf ("Allowed by fastmap: ");
2793 for (i = 0; i < (1 << BYTEWIDTH); i++)
2794 if (fastmap[i]) printchar (i);
2795 putchar ('\n');
2796 }
2797
2798 gets (pat); /* Now read the string to match against */
2799
2800 i = re_match (&buf, pat, strlen (pat), 0, 0);
2801 printf ("Match value %d.\n", i);
2802 }
2803}
2804
2805#endif
2806
2807
2808#ifdef NOTDEF
2809print_buf (bufp)
2810 struct re_pattern_buffer *bufp;
2811{
2812 int i;
2813
2814 printf ("buf is :\n----------------\n");
2815 for (i = 0; i < bufp->used; i++)
2816 printchar (bufp->buffer[i]);
2817
2818 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
2819
2820 printf ("Allowed by fastmap: ");
2821 for (i = 0; i < (1 << BYTEWIDTH); i++)
2822 if (bufp->fastmap[i])
2823 printchar (i);
2824 printf ("\nAllowed by translate: ");
2825 if (bufp->translate)
2826 for (i = 0; i < (1 << BYTEWIDTH); i++)
2827 if (bufp->translate[i])
2828 printchar (i);
2829 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
2830 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
2831}
2832#endif /* NOTDEF */
2833
2834printchar (c)
2835 char c;
2836{
2837 if (c < 040 || c >= 0177)
2838 {
2839 putchar ('\\');
2840 putchar (((c >> 6) & 3) + '0');
2841 putchar (((c >> 3) & 7) + '0');
2842 putchar ((c & 7) + '0');
2843 }
2844 else
2845 putchar (c);
2846}
2847
2848error (string)
2849 char *string;
2850{
2851 puts (string);
2852 exit (1);
2853}
2854#endif /* test */