386BSD 0.1 development
[unix-history] / usr / src / usr.bin / grep / dfa.c
CommitLineData
fd7e7ce4
WJ
1/* dfa.c - determinisitic extended regexp routines for GNU
2 Copyright (C) 1988 Free Software Foundation, Inc.
3 Written June, 1988 by Mike Haertel
4 Modified July, 1988 by Arthur David Olson
5 to assist BMG speedups
6
7 NO WARRANTY
8
9 BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
10NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
11WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
12RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
13WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
14BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
15FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
16AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
17DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
18CORRECTION.
19
20 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
21STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
22WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
23LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
24OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
25USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
26DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
27A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
28PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
29DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
30
31 GENERAL PUBLIC LICENSE TO COPY
32
33 1. You may copy and distribute verbatim copies of this source file
34as you receive it, in any medium, provided that you conspicuously and
35appropriately publish on each copy a valid copyright notice "Copyright
36 (C) 1988 Free Software Foundation, Inc."; and include following the
37copyright notice a verbatim copy of the above disclaimer of warranty
38and of this License. You may charge a distribution fee for the
39physical act of transferring a copy.
40
41 2. You may modify your copy or copies of this source file or
42any portion of it, and copy and distribute such modifications under
43the terms of Paragraph 1 above, provided that you also do the following:
44
45 a) cause the modified files to carry prominent notices stating
46 that you changed the files and the date of any change; and
47
48 b) cause the whole of any work that you distribute or publish,
49 that in whole or in part contains or is a derivative of this
50 program or any part thereof, to be licensed at no charge to all
51 third parties on terms identical to those contained in this
52 License Agreement (except that you may choose to grant more extensive
53 warranty protection to some or all third parties, at your option).
54
55 c) You may charge a distribution fee for the physical act of
56 transferring a copy, and you may at your option offer warranty
57 protection in exchange for a fee.
58
59Mere aggregation of another unrelated program with this program (or its
60derivative) on a volume of a storage or distribution medium does not bring
61the other program under the scope of these terms.
62
63 3. You may copy and distribute this program or any portion of it in
64compiled, executable or object code form under the terms of Paragraphs
651 and 2 above provided that you do the following:
66
67 a) accompany it with the complete corresponding machine-readable
68 source code, which must be distributed under the terms of
69 Paragraphs 1 and 2 above; or,
70
71 b) accompany it with a written offer, valid for at least three
72 years, to give any third party free (except for a nominal
73 shipping charge) a complete machine-readable copy of the
74 corresponding source code, to be distributed under the terms of
75 Paragraphs 1 and 2 above; or,
76
77 c) accompany it with the information you received as to where the
78 corresponding source code may be obtained. (This alternative is
79 allowed only for noncommercial distribution and only if you
80 received the program in object code or executable form alone.)
81
82For an executable file, complete source code means all the source code for
83all modules it contains; but, as a special exception, it need not include
84source code for modules which are standard libraries that accompany the
85operating system on which the executable file runs.
86
87 4. You may not copy, sublicense, distribute or transfer this program
88except as expressly provided under this License Agreement. Any attempt
89otherwise to copy, sublicense, distribute or transfer this program is void and
90your rights to use the program under this License agreement shall be
91automatically terminated. However, parties who have received computer
92software programs from you with this License Agreement will not have
93their licenses terminated so long as such parties remain in full compliance.
94
95 5. If you wish to incorporate parts of this program into other free
96programs whose distribution conditions are different, write to the Free
97Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
98worked out a simple rule that can be stated here, but we will often permit
99this. We will be guided by the two goals of preserving the free status of
100all derivatives our free software and of promoting the sharing and reuse of
101software.
102
103
104In other words, you are welcome to use, share and improve this program.
105You are forbidden to forbid anyone else to use, share and improve
106what you give them. Help stamp out software-hoarding! */
107\f
108#include <stdio.h>
109#include <assert.h>
110#include <ctype.h>
111#include "dfa.h"
112
113#ifdef __STDC__
114typedef void *ptr_t;
115#else
116typedef char *ptr_t;
117#endif
118
119static void regmust();
120
121static ptr_t
122xcalloc(n, s)
123 int n;
124 size_t s;
125{
126 ptr_t r = calloc(n, s);
127
128 if (r)
129 return r;
130 else
131 regerror("Memory exhausted");
132}
133
134static ptr_t
135xmalloc(n)
136 size_t n;
137{
138 ptr_t r = malloc(n);
139
140 assert(n != 0);
141 if (r)
142 return r;
143 else
144 regerror("Memory exhausted");
145}
146
147static ptr_t
148xrealloc(p, n)
149 ptr_t p;
150 size_t n;
151{
152 ptr_t r = realloc(p, n);
153
154 assert(n != 0);
155 if (r)
156 return r;
157 else
158 regerror("Memory exhausted");
159}
160
161#define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t)))
162#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
163#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
164
165/* Reallocate an array of type t if nalloc is too small for index. */
166#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
167 if ((index) >= (nalloc)) \
168 { \
169 while ((index) >= (nalloc)) \
170 (nalloc) *= 2; \
171 REALLOC(p, t, nalloc); \
172 }
173\f
174/* Stuff pertaining to charsets. */
175
176static
177tstbit(b, c)
178 int b;
179 _charset c;
180{
181 return c[b / INTBITS] & 1 << b % INTBITS;
182}
183
184static void
185setbit(b, c)
186 int b;
187 _charset c;
188{
189 c[b / INTBITS] |= 1 << b % INTBITS;
190}
191
192static void
193clrbit(b, c)
194 int b;
195 _charset c;
196{
197 c[b / INTBITS] &= ~(1 << b % INTBITS);
198}
199
200static void
201copyset(src, dst)
202 const _charset src;
203 _charset dst;
204{
205 int i;
206
207 for (i = 0; i < _CHARSET_INTS; ++i)
208 dst[i] = src[i];
209}
210
211static void
212zeroset(s)
213 _charset s;
214{
215 int i;
216
217 for (i = 0; i < _CHARSET_INTS; ++i)
218 s[i] = 0;
219}
220
221static void
222notset(s)
223 _charset s;
224{
225 int i;
226
227 for (i = 0; i < _CHARSET_INTS; ++i)
228 s[i] = ~s[i];
229}
230
231static
232equal(s1, s2)
233 const _charset s1;
234 const _charset s2;
235{
236 int i;
237
238 for (i = 0; i < _CHARSET_INTS; ++i)
239 if (s1[i] != s2[i])
240 return 0;
241 return 1;
242}
243\f
244/* A pointer to the current regexp is kept here during parsing. */
245static struct regexp *reg;
246
247/* Find the index of charset s in reg->charsets, or allocate a new charset. */
248static
249charset_index(s)
250 const _charset s;
251{
252 int i;
253
254 for (i = 0; i < reg->cindex; ++i)
255 if (equal(s, reg->charsets[i]))
256 return i;
257 REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex);
258 ++reg->cindex;
259 copyset(s, reg->charsets[i]);
260 return i;
261}
262
263/* Syntax bits controlling the behavior of the lexical analyzer. */
264static syntax_bits, syntax_bits_set;
265
266/* Flag for case-folding letters into sets. */
267static case_fold;
268
269/* Entry point to set syntax options. */
270void
271regsyntax(bits, fold)
272 int bits;
273 int fold;
274{
275 syntax_bits_set = 1;
276 syntax_bits = bits;
277 case_fold = fold;
278}
279
280/* Lexical analyzer. */
281static const char *lexstart; /* Pointer to beginning of input string. */
282static const char *lexptr; /* Pointer to next input character. */
283static lexleft; /* Number of characters remaining. */
284static caret_allowed; /* True if backward context allows ^
285 (meaningful only if RE_CONTEXT_INDEP_OPS
286 is turned off). */
287static closure_allowed; /* True if backward context allows closures
288 (meaningful only if RE_CONTEXT_INDEP_OPS
289 is turned off). */
290
291/* Note that characters become unsigned here. */
292#define FETCH(c, eoferr) \
293 { \
294 if (! lexleft) \
295 if (eoferr) \
296 regerror(eoferr); \
297 else \
298 return _END; \
299 (c) = (unsigned char) *lexptr++; \
300 --lexleft; \
301 }
302
303static _token
304lex()
305{
306 _token c, c2;
307 int invert;
308 _charset cset;
309
310 FETCH(c, (char *) 0);
311 switch (c)
312 {
313 case '^':
314 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS)
315 && (!caret_allowed ||
316 (syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart))
317 goto normal_char;
318 caret_allowed = 0;
319 return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE;
320
321 case '$':
322 if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft
323 || (! (syntax_bits & RE_TIGHT_VBAR)
324 && ((syntax_bits & RE_NO_BK_PARENS
325 ? lexleft > 0 && *lexptr == ')'
326 : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')')
327 || (syntax_bits & RE_NO_BK_VBAR
328 ? lexleft > 0 && *lexptr == '|'
329 : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|'))))
330 return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE;
331 goto normal_char;
332
333 case '\\':
334 FETCH(c, "Unfinished \\ quote");
335 switch (c)
336 {
337 case '1':
338 case '2':
339 case '3':
340 case '4':
341 case '5':
342 case '6':
343 case '7':
344 case '8':
345 case '9':
346 caret_allowed = 0;
347 closure_allowed = 1;
348 return _BACKREF;
349
350 case '<':
351 caret_allowed = 0;
352 return _BEGWORD;
353
354 case '>':
355 caret_allowed = 0;
356 return _ENDWORD;
357
358 case 'b':
359 caret_allowed = 0;
360 return _LIMWORD;
361
362 case 'B':
363 caret_allowed = 0;
364 return _NOTLIMWORD;
365
366 case 'w':
367 case 'W':
368 zeroset(cset);
369 for (c2 = 0; c2 < _NOTCHAR; ++c2)
370 if (ISALNUM(c2))
371 setbit(c2, cset);
372 if (c == 'W')
373 notset(cset);
374 caret_allowed = 0;
375 closure_allowed = 1;
376 return _SET + charset_index(cset);
377
378 case '?':
379 if (syntax_bits & RE_BK_PLUS_QM)
380 goto qmark;
381 goto normal_char;
382
383 case '+':
384 if (syntax_bits & RE_BK_PLUS_QM)
385 goto plus;
386 goto normal_char;
387
388 case '|':
389 if (! (syntax_bits & RE_NO_BK_VBAR))
390 goto or;
391 goto normal_char;
392
393 case '(':
394 if (! (syntax_bits & RE_NO_BK_PARENS))
395 goto lparen;
396 goto normal_char;
397
398 case ')':
399 if (! (syntax_bits & RE_NO_BK_PARENS))
400 goto rparen;
401 goto normal_char;
402
403 default:
404 goto normal_char;
405 }
406
407 case '?':
408 if (syntax_bits & RE_BK_PLUS_QM)
409 goto normal_char;
410 qmark:
411 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
412 goto normal_char;
413 return _QMARK;
414
415 case '*':
416 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
417 goto normal_char;
418 return _STAR;
419
420 case '+':
421 if (syntax_bits & RE_BK_PLUS_QM)
422 goto normal_char;
423 plus:
424 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
425 goto normal_char;
426 return _PLUS;
427
428 case '|':
429 if (! (syntax_bits & RE_NO_BK_VBAR))
430 goto normal_char;
431 or:
432 caret_allowed = 1;
433 closure_allowed = 0;
434 return _OR;
435
436 case '\n':
437 if (! (syntax_bits & RE_NEWLINE_OR))
438 goto normal_char;
439 goto or;
440
441 case '(':
442 if (! (syntax_bits & RE_NO_BK_PARENS))
443 goto normal_char;
444 lparen:
445 caret_allowed = 1;
446 closure_allowed = 0;
447 return _LPAREN;
448
449 case ')':
450 if (! (syntax_bits & RE_NO_BK_PARENS))
451 goto normal_char;
452 rparen:
453 caret_allowed = 0;
454 closure_allowed = 1;
455 return _RPAREN;
456
457 case '.':
458 zeroset(cset);
459 notset(cset);
460 clrbit('\n', cset);
461 caret_allowed = 0;
462 closure_allowed = 1;
463 return _SET + charset_index(cset);
464
465 case '[':
466 zeroset(cset);
467 FETCH(c, "Unbalanced [");
468 if (c == '^')
469 {
470 FETCH(c, "Unbalanced [");
471 invert = 1;
472 }
473 else
474 invert = 0;
475 do
476 {
477 FETCH(c2, "Unbalanced [");
478 if (c2 == '-')
479 {
480 FETCH(c2, "Unbalanced [");
481 while (c <= c2)
482 setbit(c++, cset);
483 FETCH(c, "Unbalanced [");
484 }
485 else
486 {
487 setbit(c, cset);
488 c = c2;
489 }
490 }
491 while (c != ']');
492 if (invert)
493 notset(cset);
494 caret_allowed = 0;
495 closure_allowed = 1;
496 return _SET + charset_index(cset);
497
498 default:
499 normal_char:
500 caret_allowed = 0;
501 closure_allowed = 1;
502 if (case_fold && ISALPHA(c))
503 {
504 zeroset(cset);
505 if (isupper(c))
506 c = tolower(c);
507 setbit(c, cset);
508 setbit(toupper(c), cset);
509 return _SET + charset_index(cset);
510 }
511 return c;
512 }
513}
514\f
515/* Recursive descent parser for regular expressions. */
516
517static _token tok; /* Lookahead token. */
518static depth; /* Current depth of a hypothetical stack
519 holding deferred productions. This is
520 used to determine the depth that will be
521 required of the real stack later on in
522 reganalyze(). */
523
524/* Add the given token to the parse tree, maintaining the depth count and
525 updating the maximum depth if necessary. */
526static void
527addtok(t)
528 _token t;
529{
530 REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex);
531 reg->tokens[reg->tindex++] = t;
532
533 switch (t)
534 {
535 case _QMARK:
536 case _STAR:
537 case _PLUS:
538 break;
539
540 case _CAT:
541 case _OR:
542 --depth;
543 break;
544
545 default:
546 ++reg->nleaves;
547 case _EMPTY:
548 ++depth;
549 break;
550 }
551 if (depth > reg->depth)
552 reg->depth = depth;
553}
554
555/* The grammar understood by the parser is as follows.
556
557 start:
558 regexp
559 _ALLBEGLINE regexp
560 regexp _ALLENDLINE
561 _ALLBEGLINE regexp _ALLENDLINE
562
563 regexp:
564 regexp _OR branch
565 branch
566
567 branch:
568 branch closure
569 closure
570
571 closure:
572 closure _QMARK
573 closure _STAR
574 closure _PLUS
575 atom
576
577 atom:
578 <normal character>
579 _SET
580 _BACKREF
581 _BEGLINE
582 _ENDLINE
583 _BEGWORD
584 _ENDWORD
585 _LIMWORD
586 _NOTLIMWORD
587 <empty>
588
589 The parser builds a parse tree in postfix form in an array of tokens. */
590
591#ifdef __STDC__
592static void regexp(void);
593#else
594static void regexp();
595#endif
596
597static void
598atom()
599{
600 if (tok >= 0 && tok < _NOTCHAR || tok >= _SET || tok == _BACKREF
601 || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD
602 || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD)
603 {
604 addtok(tok);
605 tok = lex();
606 }
607 else if (tok == _LPAREN)
608 {
609 tok = lex();
610 regexp();
611 if (tok != _RPAREN)
612 regerror("Unbalanced (");
613 tok = lex();
614 }
615 else
616 addtok(_EMPTY);
617}
618
619static void
620closure()
621{
622 atom();
623 while (tok == _QMARK || tok == _STAR || tok == _PLUS)
624 {
625 addtok(tok);
626 tok = lex();
627 }
628}
629
630static void
631branch()
632{
633 closure();
634 while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0)
635 {
636 closure();
637 addtok(_CAT);
638 }
639}
640
641static void
642regexp()
643{
644 branch();
645 while (tok == _OR)
646 {
647 tok = lex();
648 branch();
649 addtok(_OR);
650 }
651}
652
653/* Main entry point for the parser. S is a string to be parsed, len is the
654 length of the string, so s can include NUL characters. R is a pointer to
655 the struct regexp to parse into. */
656void
657regparse(s, len, r)
658 const char *s;
659 size_t len;
660 struct regexp *r;
661{
662 reg = r;
663 lexstart = lexptr = s;
664 lexleft = len;
665 caret_allowed = 1;
666 closure_allowed = 0;
667
668 if (! syntax_bits_set)
669 regerror("No syntax specified");
670
671 tok = lex();
672 depth = r->depth;
673
674 if (tok == _ALLBEGLINE)
675 {
676 addtok(_BEGLINE);
677 tok = lex();
678 regexp();
679 addtok(_CAT);
680 }
681 else
682 regexp();
683
684 if (tok == _ALLENDLINE)
685 {
686 addtok(_ENDLINE);
687 addtok(_CAT);
688 tok = lex();
689 }
690
691 if (tok != _END)
692 regerror("Unbalanced )");
693
694 addtok(_END - r->nregexps);
695 addtok(_CAT);
696
697 if (r->nregexps)
698 addtok(_OR);
699
700 ++r->nregexps;
701}
702\f
703/* Some primitives for operating on sets of positions. */
704
705/* Copy one set to another; the destination must be large enough. */
706static void
707copy(src, dst)
708 const _position_set *src;
709 _position_set *dst;
710{
711 int i;
712
713 for (i = 0; i < src->nelem; ++i)
714 dst->elems[i] = src->elems[i];
715 dst->nelem = src->nelem;
716}
717
718/* Insert a position in a set. Position sets are maintained in sorted
719 order according to index. If position already exists in the set with
720 the same index then their constraints are logically or'd together.
721 S->elems must point to an array large enough to hold the resulting set. */
722static void
723insert(p, s)
724 _position p;
725 _position_set *s;
726{
727 int i;
728 _position t1, t2;
729
730 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
731 ;
732 if (i < s->nelem && p.index == s->elems[i].index)
733 s->elems[i].constraint |= p.constraint;
734 else
735 {
736 t1 = p;
737 ++s->nelem;
738 while (i < s->nelem)
739 {
740 t2 = s->elems[i];
741 s->elems[i++] = t1;
742 t1 = t2;
743 }
744 }
745}
746
747/* Merge two sets of positions into a third. The result is exactly as if
748 the positions of both sets were inserted into an initially empty set. */
749static void
750merge(s1, s2, m)
751 _position_set *s1;
752 _position_set *s2;
753 _position_set *m;
754{
755 int i = 0, j = 0;
756
757 m->nelem = 0;
758 while (i < s1->nelem && j < s2->nelem)
759 if (s1->elems[i].index > s2->elems[j].index)
760 m->elems[m->nelem++] = s1->elems[i++];
761 else if (s1->elems[i].index < s2->elems[j].index)
762 m->elems[m->nelem++] = s2->elems[j++];
763 else
764 {
765 m->elems[m->nelem] = s1->elems[i++];
766 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
767 }
768 while (i < s1->nelem)
769 m->elems[m->nelem++] = s1->elems[i++];
770 while (j < s2->nelem)
771 m->elems[m->nelem++] = s2->elems[j++];
772}
773
774/* Delete a position from a set. */
775static void
776delete(p, s)
777 _position p;
778 _position_set *s;
779{
780 int i;
781
782 for (i = 0; i < s->nelem; ++i)
783 if (p.index == s->elems[i].index)
784 break;
785 if (i < s->nelem)
786 for (--s->nelem; i < s->nelem; ++i)
787 s->elems[i] = s->elems[i + 1];
788}
789\f
790/* Find the index of the state corresponding to the given position set with
791 the given preceding context, or create a new state if there is no such
792 state. Newline and letter tell whether we got here on a newline or
793 letter, respectively. */
794static
795state_index(r, s, newline, letter)
796 struct regexp *r;
797 _position_set *s;
798 int newline;
799 int letter;
800{
801 int hash = 0;
802 int constraint;
803 int i, j;
804
805 newline = newline ? 1 : 0;
806 letter = letter ? 1 : 0;
807
808 for (i = 0; i < s->nelem; ++i)
809 hash ^= s->elems[i].index + s->elems[i].constraint;
810
811 /* Try to find a state that exactly matches the proposed one. */
812 for (i = 0; i < r->sindex; ++i)
813 {
814 if (hash != r->states[i].hash || s->nelem != r->states[i].elems.nelem
815 || newline != r->states[i].newline || letter != r->states[i].letter)
816 continue;
817 for (j = 0; j < s->nelem; ++j)
818 if (s->elems[j].constraint
819 != r->states[i].elems.elems[j].constraint
820 || s->elems[j].index != r->states[i].elems.elems[j].index)
821 break;
822 if (j == s->nelem)
823 return i;
824 }
825
826 /* We'll have to create a new state. */
827 REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex);
828 r->states[i].hash = hash;
829 MALLOC(r->states[i].elems.elems, _position, s->nelem);
830 copy(s, &r->states[i].elems);
831 r->states[i].newline = newline;
832 r->states[i].letter = letter;
833 r->states[i].backref = 0;
834 r->states[i].constraint = 0;
835 r->states[i].first_end = 0;
836 for (j = 0; j < s->nelem; ++j)
837 if (r->tokens[s->elems[j].index] < 0)
838 {
839 constraint = s->elems[j].constraint;
840 if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
841 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
842 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
843 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
844 r->states[i].constraint |= constraint;
845 if (! r->states[i].first_end)
846 r->states[i].first_end = r->tokens[s->elems[j].index];
847 }
848 else if (r->tokens[s->elems[j].index] == _BACKREF)
849 {
850 r->states[i].constraint = _NO_CONSTRAINT;
851 r->states[i].backref = 1;
852 }
853
854 ++r->sindex;
855
856 return i;
857}
858\f
859/* Find the epsilon closure of a set of positions. If any position of the set
860 contains a symbol that matches the empty string in some context, replace
861 that position with the elements of its follow labeled with an appropriate
862 constraint. Repeat exhaustively until no funny positions are left.
863 S->elems must be large enough to hold the result. */
864epsclosure(s, r)
865 _position_set *s;
866 struct regexp *r;
867{
868 int i, j;
869 int *visited;
870 _position p, old;
871
872 MALLOC(visited, int, r->tindex);
873 for (i = 0; i < r->tindex; ++i)
874 visited[i] = 0;
875
876 for (i = 0; i < s->nelem; ++i)
877 if (r->tokens[s->elems[i].index] >= _NOTCHAR
878 && r->tokens[s->elems[i].index] != _BACKREF
879 && r->tokens[s->elems[i].index] < _SET)
880 {
881 old = s->elems[i];
882 p.constraint = old.constraint;
883 delete(s->elems[i], s);
884 if (visited[old.index])
885 {
886 --i;
887 continue;
888 }
889 visited[old.index] = 1;
890 switch (r->tokens[old.index])
891 {
892 case _BEGLINE:
893 p.constraint &= _BEGLINE_CONSTRAINT;
894 break;
895 case _ENDLINE:
896 p.constraint &= _ENDLINE_CONSTRAINT;
897 break;
898 case _BEGWORD:
899 p.constraint &= _BEGWORD_CONSTRAINT;
900 break;
901 case _ENDWORD:
902 p.constraint &= _ENDWORD_CONSTRAINT;
903 break;
904 case _LIMWORD:
905 p.constraint &= _ENDWORD_CONSTRAINT;
906 break;
907 case _NOTLIMWORD:
908 p.constraint &= _NOTLIMWORD_CONSTRAINT;
909 break;
910 }
911 for (j = 0; j < r->follows[old.index].nelem; ++j)
912 {
913 p.index = r->follows[old.index].elems[j].index;
914 insert(p, s);
915 }
916 /* Force rescan to start at the beginning. */
917 i = -1;
918 }
919
920 free(visited);
921}
922\f
923/* Perform bottom-up analysis on the parse tree, computing various functions.
924 Note that at this point, we're pretending constructs like \< are real
925 characters rather than constraints on what can follow them.
926
927 Nullable: A node is nullable if it is at the root of a regexp that can
928 match the empty string.
929 * _EMPTY leaves are nullable.
930 * No other leaf is nullable.
931 * A _QMARK or _STAR node is nullable.
932 * A _PLUS node is nullable if its argument is nullable.
933 * A _CAT node is nullable if both its arguments are nullable.
934 * An _OR node is nullable if either argument is nullable.
935
936 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
937 that could correspond to the first character of a string matching the
938 regexp rooted at the given node.
939 * _EMPTY leaves have empty firstpos.
940 * The firstpos of a nonempty leaf is that leaf itself.
941 * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its
942 argument.
943 * The firstpos of a _CAT node is the firstpos of the left argument, union
944 the firstpos of the right if the left argument is nullable.
945 * The firstpos of an _OR node is the union of firstpos of each argument.
946
947 Lastpos: The lastpos of a node is the set of positions that could
948 correspond to the last character of a string matching the regexp at
949 the given node.
950 * _EMPTY leaves have empty lastpos.
951 * The lastpos of a nonempty leaf is that leaf itself.
952 * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its
953 argument.
954 * The lastpos of a _CAT node is the lastpos of its right argument, union
955 the lastpos of the left if the right argument is nullable.
956 * The lastpos of an _OR node is the union of the lastpos of each argument.
957
958 Follow: The follow of a position is the set of positions that could
959 correspond to the character following a character matching the node in
960 a string matching the regexp. At this point we consider special symbols
961 that match the empty string in some context to be just normal characters.
962 Later, if we find that a special symbol is in a follow set, we will
963 replace it with the elements of its follow, labeled with an appropriate
964 constraint.
965 * Every node in the firstpos of the argument of a _STAR or _PLUS node is in
966 the follow of every node in the lastpos.
967 * Every node in the firstpos of the second argument of a _CAT node is in
968 the follow of every node in the lastpos of the first argument.
969
970 Because of the postfix representation of the parse tree, the depth-first
971 analysis is conveniently done by a linear scan with the aid of a stack.
972 Sets are stored as arrays of the elements, obeying a stack-like allocation
973 scheme; the number of elements in each set deeper in the stack can be
974 used to determine the address of a particular set's array. */
975void
976reganalyze(r, searchflag)
977 struct regexp *r;
978 int searchflag;
979{
980 int *nullable; /* Nullable stack. */
981 int *nfirstpos; /* Element count stack for firstpos sets. */
982 _position *firstpos; /* Array where firstpos elements are stored. */
983 int *nlastpos; /* Element count stack for lastpos sets. */
984 _position *lastpos; /* Array where lastpos elements are stored. */
985 int *nalloc; /* Sizes of arrays allocated to follow sets. */
986 _position_set tmp; /* Temporary set for merging sets. */
987 _position_set merged; /* Result of merging sets. */
988 int wants_newline; /* True if some position wants newline info. */
989 int *o_nullable;
990 int *o_nfirst, *o_nlast;
991 _position *o_firstpos, *o_lastpos;
992 int i, j;
993 _position *pos;
994
995 r->searchflag = searchflag;
996
997 MALLOC(nullable, int, r->depth);
998 o_nullable = nullable;
999 MALLOC(nfirstpos, int, r->depth);
1000 o_nfirst = nfirstpos;
1001 MALLOC(firstpos, _position, r->nleaves);
1002 o_firstpos = firstpos, firstpos += r->nleaves;
1003 MALLOC(nlastpos, int, r->depth);
1004 o_nlast = nlastpos;
1005 MALLOC(lastpos, _position, r->nleaves);
1006 o_lastpos = lastpos, lastpos += r->nleaves;
1007 MALLOC(nalloc, int, r->tindex);
1008 for (i = 0; i < r->tindex; ++i)
1009 nalloc[i] = 0;
1010 MALLOC(merged.elems, _position, r->nleaves);
1011
1012 CALLOC(r->follows, _position_set, r->tindex);
1013
1014 for (i = 0; i < r->tindex; ++i)
1015 switch (r->tokens[i])
1016 {
1017 case _EMPTY:
1018 /* The empty set is nullable. */
1019 *nullable++ = 1;
1020
1021 /* The firstpos and lastpos of the empty leaf are both empty. */
1022 *nfirstpos++ = *nlastpos++ = 0;
1023 break;
1024
1025 case _STAR:
1026 case _PLUS:
1027 /* Every element in the firstpos of the argument is in the follow
1028 of every element in the lastpos. */
1029 tmp.nelem = nfirstpos[-1];
1030 tmp.elems = firstpos;
1031 pos = lastpos;
1032 for (j = 0; j < nlastpos[-1]; ++j)
1033 {
1034 merge(&tmp, &r->follows[pos[j].index], &merged);
1035 REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,
1036 nalloc[pos[j].index], merged.nelem - 1);
1037 copy(&merged, &r->follows[pos[j].index]);
1038 }
1039
1040 case _QMARK:
1041 /* A _QMARK or _STAR node is automatically nullable. */
1042 if (r->tokens[i] != _PLUS)
1043 nullable[-1] = 1;
1044 break;
1045
1046 case _CAT:
1047 /* Every element in the firstpos of the second argument is in the
1048 follow of every element in the lastpos of the first argument. */
1049 tmp.nelem = nfirstpos[-1];
1050 tmp.elems = firstpos;
1051 pos = lastpos + nlastpos[-1];
1052 for (j = 0; j < nlastpos[-2]; ++j)
1053 {
1054 merge(&tmp, &r->follows[pos[j].index], &merged);
1055 REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,
1056 nalloc[pos[j].index], merged.nelem - 1);
1057 copy(&merged, &r->follows[pos[j].index]);
1058 }
1059
1060 /* The firstpos of a _CAT node is the firstpos of the first argument,
1061 union that of the second argument if the first is nullable. */
1062 if (nullable[-2])
1063 nfirstpos[-2] += nfirstpos[-1];
1064 else
1065 firstpos += nfirstpos[-1];
1066 --nfirstpos;
1067
1068 /* The lastpos of a _CAT node is the lastpos of the second argument,
1069 union that of the first argument if the second is nullable. */
1070 if (nullable[-1])
1071 nlastpos[-2] += nlastpos[-1];
1072 else
1073 {
1074 pos = lastpos + nlastpos[-2];
1075 for (j = nlastpos[-1] - 1; j >= 0; --j)
1076 pos[j] = lastpos[j];
1077 lastpos += nlastpos[-2];
1078 nlastpos[-2] = nlastpos[-1];
1079 }
1080 --nlastpos;
1081
1082 /* A _CAT node is nullable if both arguments are nullable. */
1083 nullable[-2] = nullable[-1] && nullable[-2];
1084 --nullable;
1085 break;
1086
1087 case _OR:
1088 /* The firstpos is the union of the firstpos of each argument. */
1089 nfirstpos[-2] += nfirstpos[-1];
1090 --nfirstpos;
1091
1092 /* The lastpos is the union of the lastpos of each argument. */
1093 nlastpos[-2] += nlastpos[-1];
1094 --nlastpos;
1095
1096 /* An _OR node is nullable if either argument is nullable. */
1097 nullable[-2] = nullable[-1] || nullable[-2];
1098 --nullable;
1099 break;
1100
1101 default:
1102 /* Anything else is a nonempty position. (Note that special
1103 constructs like \< are treated as nonempty strings here;
1104 an "epsilon closure" effectively makes them nullable later.
1105 Backreferences have to get a real position so we can detect
1106 transitions on them later. But they are nullable. */
1107 *nullable++ = r->tokens[i] == _BACKREF;
1108
1109 /* This position is in its own firstpos and lastpos. */
1110 *nfirstpos++ = *nlastpos++ = 1;
1111 --firstpos, --lastpos;
1112 firstpos->index = lastpos->index = i;
1113 firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT;
1114
1115 /* Allocate the follow set for this position. */
1116 nalloc[i] = 1;
1117 MALLOC(r->follows[i].elems, _position, nalloc[i]);
1118 break;
1119 }
1120
1121 /* For each follow set that is the follow set of a real position, replace
1122 it with its epsilon closure. */
1123 for (i = 0; i < r->tindex; ++i)
1124 if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF
1125 || r->tokens[i] >= _SET)
1126 {
1127 copy(&r->follows[i], &merged);
1128 epsclosure(&merged, r);
1129 if (r->follows[i].nelem < merged.nelem)
1130 REALLOC(r->follows[i].elems, _position, merged.nelem);
1131 copy(&merged, &r->follows[i]);
1132 }
1133
1134 /* Get the epsilon closure of the firstpos of the regexp. The result will
1135 be the set of positions of state 0. */
1136 merged.nelem = 0;
1137 for (i = 0; i < nfirstpos[-1]; ++i)
1138 insert(firstpos[i], &merged);
1139 epsclosure(&merged, r);
1140
1141 /* Check if any of the positions of state 0 will want newline context. */
1142 wants_newline = 0;
1143 for (i = 0; i < merged.nelem; ++i)
1144 if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1145 wants_newline = 1;
1146
1147 /* Build the initial state. */
1148 r->salloc = 1;
1149 r->sindex = 0;
1150 MALLOC(r->states, _dfa_state, r->salloc);
1151 state_index(r, &merged, wants_newline, 0);
1152
1153 free(o_nullable);
1154 free(o_nfirst);
1155 free(o_firstpos);
1156 free(o_nlast);
1157 free(o_lastpos);
1158 free(nalloc);
1159 free(merged.elems);
1160}
1161\f
1162/* Find, for each character, the transition out of state s of r, and store
1163 it in the appropriate slot of trans.
1164
1165 We divide the positions of s into groups (positions can appear in more
1166 than one group). Each group is labeled with a set of characters that
1167 every position in the group matches (taking into account, if necessary,
1168 preceding context information of s). For each group, find the union
1169 of the its elements' follows. This set is the set of positions of the
1170 new state. For each character in the group's label, set the transition
1171 on this character to be to a state corresponding to the set's positions,
1172 and its associated backward context information, if necessary.
1173
1174 If we are building a searching matcher, we include the positions of state
1175 0 in every state.
1176
1177 The collection of groups is constructed by building an equivalence-class
1178 partition of the positions of s.
1179
1180 For each position, find the set of characters C that it matches. Eliminate
1181 any characters from C that fail on grounds of backward context.
1182
1183 Search through the groups, looking for a group whose label L has nonempty
1184 intersection with C. If L - C is nonempty, create a new group labeled
1185 L - C and having the same positions as the current group, and set L to
1186 the intersection of L and C. Insert the position in this group, set
1187 C = C - L, and resume scanning.
1188
1189 If after comparing with every group there are characters remaining in C,
1190 create a new group labeled with the characters of C and insert this
1191 position in that group. */
1192void
1193regstate(s, r, trans)
1194 int s;
1195 struct regexp *r;
1196 int trans[];
1197{
1198 _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */
1199 _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */
1200 int ngrps = 0; /* Number of groups actually used. */
1201 _position pos; /* Current position being considered. */
1202 _charset matches; /* Set of matching characters. */
1203 int matchesf; /* True if matches is nonempty. */
1204 _charset intersect; /* Intersection with some label set. */
1205 int intersectf; /* True if intersect is nonempty. */
1206 _charset leftovers; /* Stuff in the label that didn't match. */
1207 int leftoversf; /* True if leftovers is nonempty. */
1208 static _charset letters; /* Set of characters considered letters. */
1209 static _charset newline; /* Set of characters that aren't newline. */
1210 _position_set follows; /* Union of the follows of some group. */
1211 _position_set tmp; /* Temporary space for merging sets. */
1212 int state; /* New state. */
1213 int wants_newline; /* New state wants to know newline context. */
1214 int state_newline; /* New state on a newline transition. */
1215 int wants_letter; /* New state wants to know letter context. */
1216 int state_letter; /* New state on a letter transition. */
1217 static initialized; /* Flag for static initialization. */
1218 int i, j, k;
1219
1220 /* Initialize the set of letters, if necessary. */
1221 if (! initialized)
1222 {
1223 initialized = 1;
1224 for (i = 0; i < _NOTCHAR; ++i)
1225 if (ISALNUM(i))
1226 setbit(i, letters);
1227 setbit('\n', newline);
1228 }
1229
1230 zeroset(matches);
1231
1232 for (i = 0; i < r->states[s].elems.nelem; ++i)
1233 {
1234 pos = r->states[s].elems.elems[i];
1235 if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR)
1236 setbit(r->tokens[pos.index], matches);
1237 else if (r->tokens[pos.index] >= _SET)
1238 copyset(r->charsets[r->tokens[pos.index] - _SET], matches);
1239 else
1240 continue;
1241
1242 /* Some characters may need to be climinated from matches because
1243 they fail in the current context. */
1244 if (pos.constraint != 0xff)
1245 {
1246 if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,
1247 r->states[s].newline, 1))
1248 clrbit('\n', matches);
1249 if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,
1250 r->states[s].newline, 0))
1251 for (j = 0; j < _CHARSET_INTS; ++j)
1252 matches[j] &= newline[j];
1253 if (! _MATCHES_LETTER_CONTEXT(pos.constraint,
1254 r->states[s].letter, 1))
1255 for (j = 0; j < _CHARSET_INTS; ++j)
1256 matches[j] &= ~letters[j];
1257 if (! _MATCHES_LETTER_CONTEXT(pos.constraint,
1258 r->states[s].letter, 0))
1259 for (j = 0; j < _CHARSET_INTS; ++j)
1260 matches[j] &= letters[j];
1261
1262 /* If there are no characters left, there's no point in going on. */
1263 for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j)
1264 ;
1265 if (j == _CHARSET_INTS)
1266 continue;
1267 }
1268
1269 for (j = 0; j < ngrps; ++j)
1270 {
1271 /* If matches contains a single character only, and the current
1272 group's label doesn't contain that character, go on to the
1273 next group. */
1274 if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR
1275 && !tstbit(r->tokens[pos.index], labels[j]))
1276 continue;
1277
1278 /* Check if this group's label has a nonempty intersection with
1279 matches. */
1280 intersectf = 0;
1281 for (k = 0; k < _CHARSET_INTS; ++k)
1282 (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0;
1283 if (! intersectf)
1284 continue;
1285
1286 /* It does; now find the set differences both ways. */
1287 leftoversf = matchesf = 0;
1288 for (k = 0; k < _CHARSET_INTS; ++k)
1289 {
1290 /* Even an optimizing compiler can't know this for sure. */
1291 int match = matches[k], label = labels[j][k];
1292
1293 (leftovers[k] = ~match & label) ? leftoversf = 1 : 0;
1294 (matches[k] = match & ~label) ? matchesf = 1 : 0;
1295 }
1296
1297 /* If there were leftovers, create a new group labeled with them. */
1298 if (leftoversf)
1299 {
1300 copyset(leftovers, labels[ngrps]);
1301 copyset(intersect, labels[j]);
1302 MALLOC(grps[ngrps].elems, _position, r->nleaves);
1303 copy(&grps[j], &grps[ngrps]);
1304 ++ngrps;
1305 }
1306
1307 /* Put the position in the current group. Note that there is no
1308 reason to call insert() here. */
1309 grps[j].elems[grps[j].nelem++] = pos;
1310
1311 /* If every character matching the current position has been
1312 accounted for, we're done. */
1313 if (! matchesf)
1314 break;
1315 }
1316
1317 /* If we've passed the last group, and there are still characters
1318 unaccounted for, then we'll have to create a new group. */
1319 if (j == ngrps)
1320 {
1321 copyset(matches, labels[ngrps]);
1322 zeroset(matches);
1323 MALLOC(grps[ngrps].elems, _position, r->nleaves);
1324 grps[ngrps].nelem = 1;
1325 grps[ngrps].elems[0] = pos;
1326 ++ngrps;
1327 }
1328 }
1329
1330 MALLOC(follows.elems, _position, r->nleaves);
1331 MALLOC(tmp.elems, _position, r->nleaves);
1332
1333 /* If we are a searching matcher, the default transition is to a state
1334 containing the positions of state 0, otherwise the default transition
1335 is to fail miserably. */
1336 if (r->searchflag)
1337 {
1338 wants_newline = 0;
1339 wants_letter = 0;
1340 for (i = 0; i < r->states[0].elems.nelem; ++i)
1341 {
1342 if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint))
1343 wants_newline = 1;
1344 if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint))
1345 wants_letter = 1;
1346 }
1347 copy(&r->states[0].elems, &follows);
1348 state = state_index(r, &follows, 0, 0);
1349 if (wants_newline)
1350 state_newline = state_index(r, &follows, 1, 0);
1351 else
1352 state_newline = state;
1353 if (wants_letter)
1354 state_letter = state_index(r, &follows, 0, 1);
1355 else
1356 state_letter = state;
1357 for (i = 0; i < _NOTCHAR; ++i)
1358 if (i == '\n')
1359 trans[i] = state_newline;
1360 else if (ISALNUM(i))
1361 trans[i] = state_letter;
1362 else
1363 trans[i] = state;
1364 }
1365 else
1366 for (i = 0; i < _NOTCHAR; ++i)
1367 trans[i] = -1;
1368
1369 for (i = 0; i < ngrps; ++i)
1370 {
1371 follows.nelem = 0;
1372
1373 /* Find the union of the follows of the positions of the group.
1374 This is a hideously inefficient loop. Fix it someday. */
1375 for (j = 0; j < grps[i].nelem; ++j)
1376 for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k)
1377 insert(r->follows[grps[i].elems[j].index].elems[k], &follows);
1378
1379 /* If we are building a searching matcher, throw in the positions
1380 of state 0 as well. */
1381 if (r->searchflag)
1382 for (j = 0; j < r->states[0].elems.nelem; ++j)
1383 insert(r->states[0].elems.elems[j], &follows);
1384
1385 /* Find out if the new state will want any context information. */
1386 wants_newline = 0;
1387 if (tstbit('\n', labels[i]))
1388 for (j = 0; j < follows.nelem; ++j)
1389 if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1390 wants_newline = 1;
1391
1392 wants_letter = 0;
1393 for (j = 0; j < _CHARSET_INTS; ++j)
1394 if (labels[i][j] & letters[j])
1395 break;
1396 if (j < _CHARSET_INTS)
1397 for (j = 0; j < follows.nelem; ++j)
1398 if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1399 wants_letter = 1;
1400
1401 /* Find the state(s) corresponding to the union of the follows. */
1402 state = state_index(r, &follows, 0, 0);
1403 if (wants_newline)
1404 state_newline = state_index(r, &follows, 1, 0);
1405 else
1406 state_newline = state;
1407 if (wants_letter)
1408 state_letter = state_index(r, &follows, 0, 1);
1409 else
1410 state_letter = state;
1411
1412 /* Set the transitions for each character in the current label. */
1413 for (j = 0; j < _CHARSET_INTS; ++j)
1414 for (k = 0; k < INTBITS; ++k)
1415 if (labels[i][j] & 1 << k)
1416 {
1417 int c = j * INTBITS + k;
1418
1419 if (c == '\n')
1420 trans[c] = state_newline;
1421 else if (ISALNUM(c))
1422 trans[c] = state_letter;
1423 else if (c < _NOTCHAR)
1424 trans[c] = state;
1425 }
1426 }
1427
1428 for (i = 0; i < ngrps; ++i)
1429 free(grps[i].elems);
1430 free(follows.elems);
1431 free(tmp.elems);
1432}
1433\f
1434/* Some routines for manipulating a compiled regexp's transition tables.
1435 Each state may or may not have a transition table; if it does, and it
1436 is a non-accepting state, then r->trans[state] points to its table.
1437 If it is an accepting state then r->fails[state] points to its table.
1438 If it has no table at all, then r->trans[state] is NULL.
1439 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1440
1441static void
1442build_state(s, r)
1443 int s;
1444 struct regexp *r;
1445{
1446 int *trans; /* The new transition table. */
1447 int i;
1448
1449 /* Set an upper limit on the number of transition tables that will ever
1450 exist at once. 1024 is arbitrary. The idea is that the frequently
1451 used transition tables will be quickly rebuilt, whereas the ones that
1452 were only needed once or twice will be cleared away. */
1453 if (r->trcount >= 1024)
1454 {
1455 for (i = 0; i < r->tralloc; ++i)
1456 if (r->trans[i])
1457 {
1458 free((ptr_t) r->trans[i]);
1459 r->trans[i] = NULL;
1460 }
1461 else if (r->fails[i])
1462 {
1463 free((ptr_t) r->fails[i]);
1464 r->fails[i] = NULL;
1465 }
1466 r->trcount = 0;
1467 }
1468
1469 ++r->trcount;
1470
1471 /* Set up the success bits for this state. */
1472 r->success[s] = 0;
1473 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0,
1474 s, *r))
1475 r->success[s] |= 4;
1476 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1,
1477 s, *r))
1478 r->success[s] |= 2;
1479 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0,
1480 s, *r))
1481 r->success[s] |= 1;
1482
1483 MALLOC(trans, int, _NOTCHAR);
1484 regstate(s, r, trans);
1485
1486 /* Now go through the new transition table, and make sure that the trans
1487 and fail arrays are allocated large enough to hold a pointer for the
1488 largest state mentioned in the table. */
1489 for (i = 0; i < _NOTCHAR; ++i)
1490 if (trans[i] >= r->tralloc)
1491 {
1492 int oldalloc = r->tralloc;
1493
1494 while (trans[i] >= r->tralloc)
1495 r->tralloc *= 2;
1496 REALLOC(r->realtrans, int *, r->tralloc + 1);
1497 r->trans = r->realtrans + 1;
1498 REALLOC(r->fails, int *, r->tralloc);
1499 REALLOC(r->success, int, r->tralloc);
1500 REALLOC(r->newlines, int, r->tralloc);
1501 while (oldalloc < r->tralloc)
1502 {
1503 r->trans[oldalloc] = NULL;
1504 r->fails[oldalloc++] = NULL;
1505 }
1506 }
1507
1508 /* Keep the newline transition in a special place so we can use it as
1509 a sentinel. */
1510 r->newlines[s] = trans['\n'];
1511 trans['\n'] = -1;
1512
1513 if (ACCEPTING(s, *r))
1514 r->fails[s] = trans;
1515 else
1516 r->trans[s] = trans;
1517}
1518
1519static void
1520build_state_zero(r)
1521 struct regexp *r;
1522{
1523 r->tralloc = 1;
1524 r->trcount = 0;
1525 CALLOC(r->realtrans, int *, r->tralloc + 1);
1526 r->trans = r->realtrans + 1;
1527 CALLOC(r->fails, int *, r->tralloc);
1528 MALLOC(r->success, int, r->tralloc);
1529 MALLOC(r->newlines, int, r->tralloc);
1530 build_state(0, r);
1531}
1532\f
1533/* Search through a buffer looking for a match to the given struct regexp.
1534 Find the first occurrence of a string matching the regexp in the buffer,
1535 and the shortest possible version thereof. Return a pointer to the first
1536 character after the match, or NULL if none is found. Begin points to
1537 the beginning of the buffer, and end points to the first character after
1538 its end. We store a newline in *end to act as a sentinel, so end had
1539 better point somewhere valid. Newline is a flag indicating whether to
1540 allow newlines to be in the matching string. If count is non-
1541 NULL it points to a place we're supposed to increment every time we
1542 see a newline. Finally, if backref is non-NULL it points to a place
1543 where we're supposed to store a 1 if backreferencing happened and the
1544 match needs to be verified by a backtracking matcher. Otherwise
1545 we store a 0 in *backref. */
1546char *
1547regexecute(r, begin, end, newline, count, backref)
1548 struct regexp *r;
1549 char *begin;
1550 char *end;
1551 int newline;
1552 int *count;
1553 int *backref;
1554{
1555 register s, s1, tmp; /* Current state. */
1556 register unsigned char *p; /* Current input character. */
1557 register **trans, *t; /* Copy of r->trans so it can be optimized
1558 into a register. */
1559 static sbit[_NOTCHAR]; /* Table for anding with r->success. */
1560 static sbit_init;
1561
1562 if (! sbit_init)
1563 {
1564 int i;
1565
1566 sbit_init = 1;
1567 for (i = 0; i < _NOTCHAR; ++i)
1568 if (i == '\n')
1569 sbit[i] = 4;
1570 else if (ISALNUM(i))
1571 sbit[i] = 2;
1572 else
1573 sbit[i] = 1;
1574 }
1575
1576 if (! r->tralloc)
1577 build_state_zero(r);
1578
1579 s = 0;
1580 p = (unsigned char *) begin;
1581 trans = r->trans;
1582 *end = '\n';
1583
1584 for (;;)
1585 {
1586 /* The dreaded inner loop. */
1587 if (t = trans[s])
1588 do
1589 {
1590 s1 = t[*p++];
1591 if (! (t = trans[s1]))
1592 goto last_was_s;
1593 s = t[*p++];
1594 }
1595 while (t = trans[s]);
1596 goto last_was_s1;
1597 last_was_s:
1598 tmp = s, s = s1, s1 = tmp;
1599 last_was_s1:
1600
1601 if (s >= 0 && p <= (unsigned char *) end && r->fails[s])
1602 {
1603 if (r->success[s] & sbit[*p])
1604 {
1605 if (backref)
1606 if (r->states[s].backref)
1607 *backref = 1;
1608 else
1609 *backref = 0;
1610 return (char *) p;
1611 }
1612
1613 s1 = s;
1614 s = r->fails[s][*p++];
1615 continue;
1616 }
1617
1618 /* If the previous character was a newline, count it. */
1619 if (count && (char *) p <= end && p[-1] == '\n')
1620 ++*count;
1621
1622 /* Check if we've run off the end of the buffer. */
1623 if ((char *) p >= end)
1624 return NULL;
1625
1626 if (s >= 0)
1627 {
1628 build_state(s, r);
1629 trans = r->trans;
1630 continue;
1631 }
1632
1633 if (p[-1] == '\n' && newline)
1634 {
1635 s = r->newlines[s1];
1636 continue;
1637 }
1638
1639 s = 0;
1640 }
1641}
1642\f
1643/* Initialize the components of a regexp that the other routines don't
1644 initialize for themselves. */
1645void
1646reginit(r)
1647 struct regexp *r;
1648{
1649 r->calloc = 1;
1650 MALLOC(r->charsets, _charset, r->calloc);
1651 r->cindex = 0;
1652
1653 r->talloc = 1;
1654 MALLOC(r->tokens, _token, r->talloc);
1655 r->tindex = r->depth = r->nleaves = r->nregexps = 0;
1656
1657 r->searchflag = 0;
1658 r->tralloc = 0;
1659}
1660
1661/* Parse and analyze a single string of the given length. */
1662void
1663regcompile(s, len, r, searchflag)
1664 const char *s;
1665 size_t len;
1666 struct regexp *r;
1667 int searchflag;
1668{
1669 if (case_fold) /* dummy folding in service of regmust() */
1670 {
1671 char *copy;
1672 int i;
1673
1674 copy = malloc(len);
1675 if (!copy)
1676 regerror("out of memory");
1677
1678 /* This is a complete kludge and could potentially break
1679 \<letter> escapes . . . */
1680 case_fold = 0;
1681 for (i = 0; i < len; ++i)
1682 if (ISUPPER(s[i]))
1683 copy[i] = tolower(s[i]);
1684 else
1685 copy[i] = s[i];
1686
1687 reginit(r);
1688 r->mustn = 0;
1689 r->must[0] = '\0';
1690 regparse(copy, len, r);
1691 free(copy);
1692 regmust(r);
1693 reganalyze(r, searchflag);
1694 case_fold = 1;
1695 reginit(r);
1696 regparse(s, len, r);
1697 reganalyze(r, searchflag);
1698 }
1699 else
1700 {
1701 reginit(r);
1702 regparse(s, len, r);
1703 regmust(r);
1704 reganalyze(r, searchflag);
1705 }
1706}
1707
1708/* Free the storage held by the components of a regexp. */
1709void
1710regfree(r)
1711 struct regexp *r;
1712{
1713 int i;
1714
1715 free((ptr_t) r->charsets);
1716 free((ptr_t) r->tokens);
1717 for (i = 0; i < r->sindex; ++i)
1718 free((ptr_t) r->states[i].elems.elems);
1719 free((ptr_t) r->states);
1720 for (i = 0; i < r->tindex; ++i)
1721 if (r->follows[i].elems)
1722 free((ptr_t) r->follows[i].elems);
1723 free((ptr_t) r->follows);
1724 for (i = 0; i < r->tralloc; ++i)
1725 if (r->trans[i])
1726 free((ptr_t) r->trans[i]);
1727 else if (r->fails[i])
1728 free((ptr_t) r->fails[i]);
1729 free((ptr_t) r->realtrans);
1730 free((ptr_t) r->fails);
1731 free((ptr_t) r->newlines);
1732}
1733
1734/*
1735Having found the postfix representation of the regular expression,
1736try to find a long sequence of characters that must appear in any line
1737containing the r.e.
1738Finding a "longest" sequence is beyond the scope here;
1739we take an easy way out and hope for the best.
1740(Take "(ab|a)b"--please.)
1741
1742We do a bottom-up calculation of sequences of characters that must appear
1743in matches of r.e.'s represented by trees rooted at the nodes of the postfix
1744representation:
1745 sequences that must appear at the left of the match ("left")
1746 sequences that must appear at the right of the match ("right")
1747 lists of sequences that must appear somewhere in the match ("in")
1748 sequences that must constitute the match ("is")
1749When we get to the root of the tree, we use one of the longest of its
1750calculated "in" sequences as our answer. The sequence we find is returned in
1751r->must (where "r" is the single argument passed to "regmust");
1752the length of the sequence is returned in r->mustn.
1753
1754The sequences calculated for the various types of node (in pseudo ANSI c)
1755are shown below. "p" is the operand of unary operators (and the left-hand
1756operand of binary operators); "q" is the right-hand operand of binary operators
1757.
1758"ZERO" means "a zero-length sequence" below.
1759
1760Type left right is in
1761---- ---- ----- -- --
1762char c # c # c # c # c
1763
1764SET ZERO ZERO ZERO ZERO
1765
1766STAR ZERO ZERO ZERO ZERO
1767
1768QMARK ZERO ZERO ZERO ZERO
1769
1770PLUS p->left p->right ZERO p->in
1771
1772CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
1773 p->left : q->right : q->is!=ZERO) ? q->in plus
1774 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
1775 ZERO
1776
1777OR longest common longest common (do p->is and substrings common to
1778 leading trailing q->is have same p->in and q->in
1779 (sub)sequence (sub)sequence length and
1780 of p->left of p->right content) ?
1781 and q->left and q->right p->is : NULL
1782
1783If there's anything else we recognize in the tree, all four sequences get set
1784to zero-length sequences. If there's something we don't recognize in the tree,
1785we just return a zero-length sequence.
1786
1787Break ties in favor of infrequent letters (choosing 'zzz' in preference to
1788'aaa')?
1789
1790And. . .is it here or someplace that we might ponder "optimizations" such as
1791 egrep 'psi|epsilon' -> egrep 'psi'
1792 egrep 'pepsi|epsilon' -> egrep 'epsi'
1793 (Yes, we now find "epsi" as a "string
1794 that must occur", but we might also
1795 simplify the *entire* r.e. being sought
1796)
1797 grep '[c]' -> grep 'c'
1798 grep '(ab|a)b' -> grep 'ab'
1799 grep 'ab*' -> grep 'a'
1800 grep 'a*b' -> grep 'b'
1801There are several issues:
1802 Is optimization easy (enough)?
1803
1804 Does optimization actually accomplish anything,
1805 or is the automaton you get from "psi|epsilon" (for example)
1806 the same as the one you get from "psi" (for example)?
1807
1808 Are optimizable r.e.'s likely to be used in real-life situations
1809 (something like 'ab*' is probably unlikely; something like is
1810 'psi|epsilon' is likelier)?
1811*/
1812
1813static char *
1814icatalloc(old, new)
1815char * old;
1816char * new;
1817{
1818 register char * result;
1819 register int oldsize, newsize;
1820
1821 newsize = (new == NULL) ? 0 : strlen(new);
1822 if (old == NULL)
1823 oldsize = 0;
1824 else if (newsize == 0)
1825 return old;
1826 else oldsize = strlen(old);
1827 if (old == NULL)
1828 result = (char *) malloc(newsize + 1);
1829 else result = (char *) realloc((void *) old, oldsize + newsize + 1);
1830 if (result != NULL && new != NULL)
1831 (void) strcpy(result + oldsize, new);
1832 return result;
1833}
1834
1835static char *
1836icpyalloc(string)
1837const char * string;
1838{
1839 return icatalloc((char *) NULL, string);
1840}
1841
1842static char *
1843istrstr(lookin, lookfor)
1844char * lookin;
1845register char * lookfor;
1846{
1847 register char * cp;
1848 register int len;
1849
1850 len = strlen(lookfor);
1851 for (cp = lookin; *cp != '\0'; ++cp)
1852 if (strncmp(cp, lookfor, len) == 0)
1853 return cp;
1854 return NULL;
1855}
1856
1857static void
1858ifree(cp)
1859char * cp;
1860{
1861 if (cp != NULL)
1862 free(cp);
1863}
1864
1865static void
1866freelist(cpp)
1867register char ** cpp;
1868{
1869 register int i;
1870
1871 if (cpp == NULL)
1872 return;
1873 for (i = 0; cpp[i] != NULL; ++i) {
1874 free(cpp[i]);
1875 cpp[i] = NULL;
1876 }
1877}
1878
1879static char **
1880enlist(cpp, new, len)
1881register char ** cpp;
1882register char * new;
1883{
1884 register int i, j;
1885
1886 if (cpp == NULL)
1887 return NULL;
1888 if ((new = icpyalloc(new)) == NULL) {
1889 freelist(cpp);
1890 return NULL;
1891 }
1892 new[len] = '\0';
1893 /*
1894 ** Is there already something in the list that's new (or longer)?
1895 */
1896 for (i = 0; cpp[i] != NULL; ++i)
1897 if (istrstr(cpp[i], new) != NULL) {
1898 free(new);
1899 return cpp;
1900 }
1901 /*
1902 ** Eliminate any obsoleted strings.
1903 */
1904 j = 0;
1905 while (cpp[j] != NULL)
1906 if (istrstr(new, cpp[j]) == NULL)
1907 ++j;
1908 else {
1909 free(cpp[j]);
1910 if (--i == j)
1911 break;
1912 cpp[j] = cpp[i];
1913 }
1914 /*
1915 ** Add the new string.
1916 */
1917 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
1918 if (cpp == NULL)
1919 return NULL;
1920 cpp[i] = new;
1921 cpp[i + 1] = NULL;
1922 return cpp;
1923}
1924
1925/*
1926** Given pointers to two strings,
1927** return a pointer to an allocated list of their distinct common substrings.
1928** Return NULL if something seems wild.
1929*/
1930
1931static char **
1932comsubs(left, right)
1933char * left;
1934char * right;
1935{
1936 register char ** cpp;
1937 register char * lcp;
1938 register char * rcp;
1939 register int i, len;
1940
1941 if (left == NULL || right == NULL)
1942 return NULL;
1943 cpp = (char **) malloc(sizeof *cpp);
1944 if (cpp == NULL)
1945 return NULL;
1946 cpp[0] = NULL;
1947 for (lcp = left; *lcp != '\0'; ++lcp) {
1948 len = 0;
1949 rcp = strchr(right, *lcp);
1950 while (rcp != NULL) {
1951 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
1952 ;
1953 if (i > len)
1954 len = i;
1955 rcp = strchr(rcp + 1, *lcp);
1956 }
1957 if (len == 0)
1958 continue;
1959 if ((cpp = enlist(cpp, lcp, len)) == NULL)
1960 break;
1961 }
1962 return cpp;
1963}
1964
1965static char **
1966addlists(old, new)
1967char ** old;
1968char ** new;
1969{
1970 register int i;
1971
1972 if (old == NULL || new == NULL)
1973 return NULL;
1974 for (i = 0; new[i] != NULL; ++i) {
1975 old = enlist(old, new[i], strlen(new[i]));
1976 if (old == NULL)
1977 break;
1978 }
1979 return old;
1980}
1981
1982/*
1983** Given two lists of substrings,
1984** return a new list giving substrings common to both.
1985*/
1986
1987static char **
1988inboth(left, right)
1989char ** left;
1990char ** right;
1991{
1992 register char ** both;
1993 register char ** temp;
1994 register int lnum, rnum;
1995
1996 if (left == NULL || right == NULL)
1997 return NULL;
1998 both = (char **) malloc(sizeof *both);
1999 if (both == NULL)
2000 return NULL;
2001 both[0] = NULL;
2002 for (lnum = 0; left[lnum] != NULL; ++lnum) {
2003 for (rnum = 0; right[rnum] != NULL; ++rnum) {
2004 temp = comsubs(left[lnum], right[rnum]);
2005 if (temp == NULL) {
2006 freelist(both);
2007 return NULL;
2008 }
2009 both = addlists(both, temp);
2010 freelist(temp);
2011 if (both == NULL)
2012 return NULL;
2013 }
2014 }
2015 return both;
2016}
2017
2018typedef struct {
2019 char ** in;
2020 char * left;
2021 char * right;
2022 char * is;
2023} must;
2024
2025static void
2026resetmust(mp)
2027register must * mp;
2028{
2029 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2030 freelist(mp->in);
2031}
2032
2033static void
2034regmust(r)
2035register struct regexp * r;
2036{
2037 register must * musts;
2038 register must * mp;
2039 register char * result;
2040 register int ri;
2041 register int i;
2042 register _token t;
2043 static must must0;
2044
2045 reg->mustn = 0;
2046 reg->must[0] = '\0';
2047 musts = (must *) malloc((reg->tindex + 1) * sizeof *musts);
2048 if (musts == NULL)
2049 return;
2050 mp = musts;
2051 for (i = 0; i <= reg->tindex; ++i)
2052 mp[i] = must0;
2053 for (i = 0; i <= reg->tindex; ++i) {
2054 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2055 mp[i].left = malloc(2);
2056 mp[i].right = malloc(2);
2057 mp[i].is = malloc(2);
2058 if (mp[i].in == NULL || mp[i].left == NULL ||
2059 mp[i].right == NULL || mp[i].is == NULL)
2060 goto done;
2061 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2062 mp[i].in[0] = NULL;
2063 }
2064 result = "";
2065 for (ri = 0; ri < reg->tindex; ++ri) {
2066 switch (t = reg->tokens[ri]) {
2067 case _ALLBEGLINE:
2068 case _ALLENDLINE:
2069 case _LPAREN:
2070 case _RPAREN:
2071 goto done; /* "cannot happen" */
2072 case _EMPTY:
2073 case _BEGLINE:
2074 case _ENDLINE:
2075 case _BEGWORD:
2076 case _ENDWORD:
2077 case _LIMWORD:
2078 case _NOTLIMWORD:
2079 case _BACKREF:
2080 resetmust(mp);
2081 break;
2082 case _STAR:
2083 case _QMARK:
2084 if (mp <= musts)
2085 goto done; /* "cannot happen" */
2086 --mp;
2087 resetmust(mp);
2088 break;
2089 case _OR:
2090 if (mp < &musts[2])
2091 goto done; /* "cannot happen" */
2092 {
2093 register char ** new;
2094 register must * lmp;
2095 register must * rmp;
2096 register int j, ln, rn, n;
2097
2098 rmp = --mp;
2099 lmp = --mp;
2100 /* Guaranteed to be. Unlikely, but. . . */
2101 if (strcmp(lmp->is, rmp->is) != 0)
2102 lmp->is[0] = '\0';
2103 /* Left side--easy */
2104 i = 0;
2105 while (lmp->left[i] != '\0' &&
2106 lmp->left[i] == rmp->left[i])
2107 ++i;
2108 lmp->left[i] = '\0';
2109 /* Right side */
2110 ln = strlen(lmp->right);
2111 rn = strlen(rmp->right);
2112 n = ln;
2113 if (n > rn)
2114 n = rn;
2115 for (i = 0; i < n; ++i)
2116 if (lmp->right[ln - i - 1] !=
2117 rmp->right[rn - i - 1])
2118 break;
2119 for (j = 0; j < i; ++j)
2120 lmp->right[j] =
2121 lmp->right[(ln - i) + j];
2122 lmp->right[j] = '\0';
2123 new = inboth(lmp->in, rmp->in);
2124 if (new == NULL)
2125 goto done;
2126 freelist(lmp->in);
2127 free((char *) lmp->in);
2128 lmp->in = new;
2129 }
2130 break;
2131 case _PLUS:
2132 if (mp <= musts)
2133 goto done; /* "cannot happen" */
2134 --mp;
2135 mp->is[0] = '\0';
2136 break;
2137 case _END:
2138 if (mp != &musts[1])
2139 goto done; /* "cannot happen" */
2140 for (i = 0; musts[0].in[i] != NULL; ++i)
2141 if (strlen(musts[0].in[i]) > strlen(result))
2142 result = musts[0].in[i];
2143 goto done;
2144 case _CAT:
2145 if (mp < &musts[2])
2146 goto done; /* "cannot happen" */
2147 {
2148 register must * lmp;
2149 register must * rmp;
2150
2151 rmp = --mp;
2152 lmp = --mp;
2153 /*
2154 ** In. Everything in left, plus everything in
2155 ** right, plus catenation of
2156 ** left's right and right's left.
2157 */
2158 lmp->in = addlists(lmp->in, rmp->in);
2159 if (lmp->in == NULL)
2160 goto done;
2161 if (lmp->right[0] != '\0' &&
2162 rmp->left[0] != '\0') {
2163 register char * tp;
2164
2165 tp = icpyalloc(lmp->right);
2166 if (tp == NULL)
2167 goto done;
2168 tp = icatalloc(tp, rmp->left);
2169 if (tp == NULL)
2170 goto done;
2171 lmp->in = enlist(lmp->in, tp,
2172 strlen(tp));
2173 free(tp);
2174 if (lmp->in == NULL)
2175 goto done;
2176 }
2177 /* Left-hand */
2178 if (lmp->is[0] != '\0') {
2179 lmp->left = icatalloc(lmp->left,
2180 rmp->left);
2181 if (lmp->left == NULL)
2182 goto done;
2183 }
2184 /* Right-hand */
2185 if (rmp->is[0] == '\0')
2186 lmp->right[0] = '\0';
2187 lmp->right = icatalloc(lmp->right, rmp->right);
2188 if (lmp->right == NULL)
2189 goto done;
2190 /* Guaranteed to be */
2191 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') {
2192 lmp->is = icatalloc(lmp->is, rmp->is);
2193 if (lmp->is == NULL)
2194 goto done;
2195 }
2196 }
2197 break;
2198 default:
2199 if (t < _END) {
2200 /* "cannot happen" */
2201 goto done;
2202 } else if (t == '\0') {
2203 /* not on *my* shift */
2204 goto done;
2205 } else if (t >= _SET) {
2206 /* easy enough */
2207 resetmust(mp);
2208 } else {
2209 /* plain character */
2210 resetmust(mp);
2211 mp->is[0] = mp->left[0] = mp->right[0] = t;
2212 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2213 mp->in = enlist(mp->in, mp->is, 1);
2214 if (mp->in == NULL)
2215 goto done;
2216 }
2217 break;
2218 }
2219 ++mp;
2220 }
2221done:
2222 (void) strncpy(reg->must, result, MUST_MAX - 1);
2223 reg->must[MUST_MAX - 1] = '\0';
2224 reg->mustn = strlen(reg->must);
2225 mp = musts;
2226 for (i = 0; i <= reg->tindex; ++i) {
2227 freelist(mp[i].in);
2228 ifree((char *) mp[i].in);
2229 ifree(mp[i].left);
2230 ifree(mp[i].right);
2231 ifree(mp[i].is);
2232 }
2233 free((char *) mp);
2234}