BSD 4_4_Lite2 development
[unix-history] / usr / src / contrib / gawk-2.15.2 / dfa.c
CommitLineData
9049701c
C
1/* dfa.c - determinisitic extended regexp routines for GNU
2 Copyright (C) 1988 Free Software Foundation, Inc.
3 Written June, 1988 by Mike Haertel
4 Modified July, 1988 by Arthur David Olson
5 to assist BMG speedups
6
7 NO WARRANTY
8
9 BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
10NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
11WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
12RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
13WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
14BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
15FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
16AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
17DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
18CORRECTION.
19
20 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
21STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
22WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
23LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
24OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
25USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
26DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
27A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
28PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
29DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
30
31 GENERAL PUBLIC LICENSE TO COPY
32
33 1. You may copy and distribute verbatim copies of this source file
34as you receive it, in any medium, provided that you conspicuously and
35appropriately publish on each copy a valid copyright notice "Copyright
36 (C) 1988 Free Software Foundation, Inc."; and include following the
37copyright notice a verbatim copy of the above disclaimer of warranty
38and of this License. You may charge a distribution fee for the
39physical act of transferring a copy.
40
41 2. You may modify your copy or copies of this source file or
42any portion of it, and copy and distribute such modifications under
43the terms of Paragraph 1 above, provided that you also do the following:
44
45 a) cause the modified files to carry prominent notices stating
46 that you changed the files and the date of any change; and
47
48 b) cause the whole of any work that you distribute or publish,
49 that in whole or in part contains or is a derivative of this
50 program or any part thereof, to be licensed at no charge to all
51 third parties on terms identical to those contained in this
52 License Agreement (except that you may choose to grant more extensive
53 warranty protection to some or all third parties, at your option).
54
55 c) You may charge a distribution fee for the physical act of
56 transferring a copy, and you may at your option offer warranty
57 protection in exchange for a fee.
58
59Mere aggregation of another unrelated program with this program (or its
60derivative) on a volume of a storage or distribution medium does not bring
61the other program under the scope of these terms.
62
63 3. You may copy and distribute this program or any portion of it in
64compiled, executable or object code form under the terms of Paragraphs
651 and 2 above provided that you do the following:
66
67 a) accompany it with the complete corresponding machine-readable
68 source code, which must be distributed under the terms of
69 Paragraphs 1 and 2 above; or,
70
71 b) accompany it with a written offer, valid for at least three
72 years, to give any third party free (except for a nominal
73 shipping charge) a complete machine-readable copy of the
74 corresponding source code, to be distributed under the terms of
75 Paragraphs 1 and 2 above; or,
76
77 c) accompany it with the information you received as to where the
78 corresponding source code may be obtained. (This alternative is
79 allowed only for noncommercial distribution and only if you
80 received the program in object code or executable form alone.)
81
82For an executable file, complete source code means all the source code for
83all modules it contains; but, as a special exception, it need not include
84source code for modules which are standard libraries that accompany the
85operating system on which the executable file runs.
86
87 4. You may not copy, sublicense, distribute or transfer this program
88except as expressly provided under this License Agreement. Any attempt
89otherwise to copy, sublicense, distribute or transfer this program is void and
90your rights to use the program under this License agreement shall be
91automatically terminated. However, parties who have received computer
92software programs from you with this License Agreement will not have
93their licenses terminated so long as such parties remain in full compliance.
94
95 5. If you wish to incorporate parts of this program into other free
96programs whose distribution conditions are different, write to the Free
97Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
98worked out a simple rule that can be stated here, but we will often permit
99this. We will be guided by the two goals of preserving the free status of
100all derivatives our free software and of promoting the sharing and reuse of
101software.
102
103
104In other words, you are welcome to use, share and improve this program.
105You are forbidden to forbid anyone else to use, share and improve
106what you give them. Help stamp out software-hoarding! */
107\f
108#include "awk.h"
109#include <assert.h>
110
111#ifdef setbit /* surprise - setbit and clrbit are macros on NeXT */
112#undef setbit
113#endif
114#ifdef clrbit
115#undef clrbit
116#endif
117
118#ifdef __STDC__
119typedef void *ptr_t;
120#else
121typedef char *ptr_t;
122#endif
123
124typedef struct {
125 char ** in;
126 char * left;
127 char * right;
128 char * is;
129} must;
130
131static ptr_t xcalloc P((int n, size_t s));
132static ptr_t xmalloc P((size_t n));
133static ptr_t xrealloc P((ptr_t p, size_t n));
134static int tstbit P((int b, _charset c));
135static void setbit P((int b, _charset c));
136static void clrbit P((int b, _charset c));
137static void copyset P((const _charset src, _charset dst));
138static void zeroset P((_charset s));
139static void notset P((_charset s));
140static int equal P((const _charset s1, const _charset s2));
141static int charset_index P((const _charset s));
142static _token lex P((void));
143static void addtok P((_token t));
144static void atom P((void));
145static void closure P((void));
146static void branch P((void));
147static void regexp P((void));
148static void copy P((const _position_set *src, _position_set *dst));
149static void insert P((_position p, _position_set *s));
150static void merge P((_position_set *s1, _position_set *s2, _position_set *m));
151static void delete P((_position p, _position_set *s));
152static int state_index P((struct regexp *r, _position_set *s,
153 int newline, int letter));
154static void epsclosure P((_position_set *s, struct regexp *r));
155static void build_state P((int s, struct regexp *r));
156static void build_state_zero P((struct regexp *r));
157static char *icatalloc P((char *old, const char *new));
158static char *icpyalloc P((const char *string));
159static char *istrstr P((char *lookin, char *lookfor));
160static void ifree P((char *cp));
161static void freelist P((char **cpp));
162static char **enlist P((char **cpp, char *new, size_t len));
163static char **comsubs P((char *left, char *right));
164static char **addlists P((char **old, char **new));
165static char **inboth P((char **left, char **right));
166static void resetmust P((must *mp));
167static void regmust P((struct regexp *r));
168
169#undef P
170
171static ptr_t
172xcalloc(n, s)
173 int n;
174 size_t s;
175{
176 ptr_t r = calloc(n, s);
177
178 if (NULL == r)
179 reg_error("Memory exhausted"); /* reg_error does not return */
180 return r;
181}
182
183static ptr_t
184xmalloc(n)
185 size_t n;
186{
187 ptr_t r = malloc(n);
188
189 assert(n != 0);
190 if (NULL == r)
191 reg_error("Memory exhausted");
192 return r;
193}
194
195static ptr_t
196xrealloc(p, n)
197 ptr_t p;
198 size_t n;
199{
200 ptr_t r = realloc(p, n);
201
202 assert(n != 0);
203 if (NULL == r)
204 reg_error("Memory exhausted");
205 return r;
206}
207
208#define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t)))
209#undef MALLOC
210#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
211#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
212
213/* Reallocate an array of type t if nalloc is too small for index. */
214#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
215 if ((index) >= (nalloc)) \
216 { \
217 while ((index) >= (nalloc)) \
218 (nalloc) *= 2; \
219 REALLOC(p, t, nalloc); \
220 }
221\f
222/* Stuff pertaining to charsets. */
223
224static int
225tstbit(b, c)
226 int b;
227 _charset c;
228{
229 return c[b / INTBITS] & 1 << b % INTBITS;
230}
231
232static void
233setbit(b, c)
234 int b;
235 _charset c;
236{
237 c[b / INTBITS] |= 1 << b % INTBITS;
238}
239
240static void
241clrbit(b, c)
242 int b;
243 _charset c;
244{
245 c[b / INTBITS] &= ~(1 << b % INTBITS);
246}
247
248static void
249copyset(src, dst)
250 const _charset src;
251 _charset dst;
252{
253 int i;
254
255 for (i = 0; i < _CHARSET_INTS; ++i)
256 dst[i] = src[i];
257}
258
259static void
260zeroset(s)
261 _charset s;
262{
263 int i;
264
265 for (i = 0; i < _CHARSET_INTS; ++i)
266 s[i] = 0;
267}
268
269static void
270notset(s)
271 _charset s;
272{
273 int i;
274
275 for (i = 0; i < _CHARSET_INTS; ++i)
276 s[i] = ~s[i];
277}
278
279static int
280equal(s1, s2)
281 const _charset s1;
282 const _charset s2;
283{
284 int i;
285
286 for (i = 0; i < _CHARSET_INTS; ++i)
287 if (s1[i] != s2[i])
288 return 0;
289 return 1;
290}
291\f
292/* A pointer to the current regexp is kept here during parsing. */
293static struct regexp *reg;
294
295/* Find the index of charset s in reg->charsets, or allocate a new charset. */
296static int
297charset_index(s)
298 const _charset s;
299{
300 int i;
301
302 for (i = 0; i < reg->cindex; ++i)
303 if (equal(s, reg->charsets[i]))
304 return i;
305 REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex);
306 ++reg->cindex;
307 copyset(s, reg->charsets[i]);
308 return i;
309}
310
311/* Syntax bits controlling the behavior of the lexical analyzer. */
312static syntax_bits, syntax_bits_set;
313
314/* Flag for case-folding letters into sets. */
315static case_fold;
316
317/* Entry point to set syntax options. */
318void
319regsyntax(bits, fold)
320 long bits;
321 int fold;
322{
323 syntax_bits_set = 1;
324 syntax_bits = bits;
325 case_fold = fold;
326}
327
328/* Lexical analyzer. */
329static const char *lexstart; /* Pointer to beginning of input string. */
330static const char *lexptr; /* Pointer to next input character. */
331static lexleft; /* Number of characters remaining. */
332static caret_allowed; /* True if backward context allows ^
333 (meaningful only if RE_CONTEXT_INDEP_OPS
334 is turned off). */
335static closure_allowed; /* True if backward context allows closures
336 (meaningful only if RE_CONTEXT_INDEP_OPS
337 is turned off). */
338
339/* Note that characters become unsigned here. */
340#define FETCH(c, eoferr) \
341 { \
342 if (! lexleft) \
343 if (eoferr != NULL) \
344 reg_error(eoferr); \
345 else \
346 return _END; \
347 (c) = (unsigned char) *lexptr++; \
348 --lexleft; \
349 }
350
351static _token
352lex()
353{
354 _token c, c2;
355 int invert;
356 _charset cset;
357
358 FETCH(c, (char *) 0);
359 switch (c)
360 {
361 case '^':
362 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS)
363 && (!caret_allowed ||
364 ((syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart)))
365 goto normal_char;
366 caret_allowed = 0;
367 return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE;
368
369 case '$':
370 if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft
371 || (! (syntax_bits & RE_TIGHT_VBAR)
372 && ((syntax_bits & RE_NO_BK_PARENS
373 ? lexleft > 0 && *lexptr == ')'
374 : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')')
375 || (syntax_bits & RE_NO_BK_VBAR
376 ? lexleft > 0 && *lexptr == '|'
377 : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|'))))
378 return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE;
379 goto normal_char;
380
381 case '\\':
382 FETCH(c, "Unfinished \\ quote");
383 switch (c)
384 {
385 case '1':
386 case '2':
387 case '3':
388 case '4':
389 case '5':
390 case '6':
391 case '7':
392 case '8':
393 case '9':
394 caret_allowed = 0;
395 closure_allowed = 1;
396 return _BACKREF;
397
398 case '<':
399 caret_allowed = 0;
400 return _BEGWORD;
401
402 case '>':
403 caret_allowed = 0;
404 return _ENDWORD;
405
406 case 'b':
407 caret_allowed = 0;
408 return _LIMWORD;
409
410 case 'B':
411 caret_allowed = 0;
412 return _NOTLIMWORD;
413
414 case 'w':
415 case 'W':
416 zeroset(cset);
417 for (c2 = 0; c2 < _NOTCHAR; ++c2)
418 if (ISALNUM(c2))
419 setbit(c2, cset);
420 if (c == 'W')
421 notset(cset);
422 caret_allowed = 0;
423 closure_allowed = 1;
424 return _SET + charset_index(cset);
425
426 case '?':
427 if (syntax_bits & RE_BK_PLUS_QM)
428 goto qmark;
429 goto normal_char;
430
431 case '+':
432 if (syntax_bits & RE_BK_PLUS_QM)
433 goto plus;
434 goto normal_char;
435
436 case '|':
437 if (! (syntax_bits & RE_NO_BK_VBAR))
438 goto or;
439 goto normal_char;
440
441 case '(':
442 if (! (syntax_bits & RE_NO_BK_PARENS))
443 goto lparen;
444 goto normal_char;
445
446 case ')':
447 if (! (syntax_bits & RE_NO_BK_PARENS))
448 goto rparen;
449 goto normal_char;
450
451 default:
452 goto normal_char;
453 }
454
455 case '?':
456 if (syntax_bits & RE_BK_PLUS_QM)
457 goto normal_char;
458 qmark:
459 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
460 goto normal_char;
461 return _QMARK;
462
463 case '*':
464 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
465 goto normal_char;
466 return _STAR;
467
468 case '+':
469 if (syntax_bits & RE_BK_PLUS_QM)
470 goto normal_char;
471 plus:
472 if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed)
473 goto normal_char;
474 return _PLUS;
475
476 case '|':
477 if (! (syntax_bits & RE_NO_BK_VBAR))
478 goto normal_char;
479 or:
480 caret_allowed = 1;
481 closure_allowed = 0;
482 return _OR;
483
484 case '\n':
485 if (! (syntax_bits & RE_NEWLINE_OR))
486 goto normal_char;
487 goto or;
488
489 case '(':
490 if (! (syntax_bits & RE_NO_BK_PARENS))
491 goto normal_char;
492 lparen:
493 caret_allowed = 1;
494 closure_allowed = 0;
495 return _LPAREN;
496
497 case ')':
498 if (! (syntax_bits & RE_NO_BK_PARENS))
499 goto normal_char;
500 rparen:
501 caret_allowed = 0;
502 closure_allowed = 1;
503 return _RPAREN;
504
505 case '.':
506 zeroset(cset);
507 notset(cset);
508 clrbit('\n', cset);
509 caret_allowed = 0;
510 closure_allowed = 1;
511 return _SET + charset_index(cset);
512
513 case '[':
514 zeroset(cset);
515 FETCH(c, "Unbalanced [");
516 if (c == '^')
517 {
518 FETCH(c, "Unbalanced [");
519 invert = 1;
520 }
521 else
522 invert = 0;
523 do
524 {
525 FETCH(c2, "Unbalanced [");
526 if ((syntax_bits & RE_AWK_CLASS_HACK) && c == '\\')
527 {
528 c = c2;
529 FETCH(c2, "Unbalanced [");
530 }
531 if (c2 == '-')
532 {
533 FETCH(c2, "Unbalanced [");
534 if (c2 == ']' && (syntax_bits & RE_AWK_CLASS_HACK))
535 {
536 setbit(c, cset);
537 setbit('-', cset);
538 break;
539 }
540 while (c <= c2)
541 setbit(c++, cset);
542 FETCH(c, "Unbalanced [");
543 }
544 else
545 {
546 setbit(c, cset);
547 c = c2;
548 }
549 }
550 while (c != ']');
551 if (invert)
552 notset(cset);
553 caret_allowed = 0;
554 closure_allowed = 1;
555 return _SET + charset_index(cset);
556
557 default:
558 normal_char:
559 caret_allowed = 0;
560 closure_allowed = 1;
561 if (case_fold && ISALPHA(c))
562 {
563 zeroset(cset);
564 if (isupper(c))
565 c = tolower(c);
566 setbit(c, cset);
567 setbit(toupper(c), cset);
568 return _SET + charset_index(cset);
569 }
570 return c;
571 }
572}
573\f
574/* Recursive descent parser for regular expressions. */
575
576static _token tok; /* Lookahead token. */
577static depth; /* Current depth of a hypothetical stack
578 holding deferred productions. This is
579 used to determine the depth that will be
580 required of the real stack later on in
581 reganalyze(). */
582
583/* Add the given token to the parse tree, maintaining the depth count and
584 updating the maximum depth if necessary. */
585static void
586addtok(t)
587 _token t;
588{
589 REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex);
590 reg->tokens[reg->tindex++] = t;
591
592 switch (t)
593 {
594 case _QMARK:
595 case _STAR:
596 case _PLUS:
597 break;
598
599 case _CAT:
600 case _OR:
601 --depth;
602 break;
603
604 default:
605 ++reg->nleaves;
606 case _EMPTY:
607 ++depth;
608 break;
609 }
610 if (depth > reg->depth)
611 reg->depth = depth;
612}
613
614/* The grammar understood by the parser is as follows.
615
616 start:
617 regexp
618 _ALLBEGLINE regexp
619 regexp _ALLENDLINE
620 _ALLBEGLINE regexp _ALLENDLINE
621
622 regexp:
623 regexp _OR branch
624 branch
625
626 branch:
627 branch closure
628 closure
629
630 closure:
631 closure _QMARK
632 closure _STAR
633 closure _PLUS
634 atom
635
636 atom:
637 <normal character>
638 _SET
639 _BACKREF
640 _BEGLINE
641 _ENDLINE
642 _BEGWORD
643 _ENDWORD
644 _LIMWORD
645 _NOTLIMWORD
646 <empty>
647
648 The parser builds a parse tree in postfix form in an array of tokens. */
649
650#ifdef __STDC__
651static void regexp(void);
652#else
653static void regexp();
654#endif
655
656static void
657atom()
658{
659 if (tok >= 0 && (tok < _NOTCHAR || tok >= _SET || tok == _BACKREF
660 || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD
661 || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD))
662 {
663 addtok(tok);
664 tok = lex();
665 }
666 else if (tok == _LPAREN)
667 {
668 tok = lex();
669 regexp();
670 if (tok != _RPAREN)
671 reg_error("Unbalanced (");
672 tok = lex();
673 }
674 else
675 addtok(_EMPTY);
676}
677
678static void
679closure()
680{
681 atom();
682 while (tok == _QMARK || tok == _STAR || tok == _PLUS)
683 {
684 addtok(tok);
685 tok = lex();
686 }
687}
688
689static void
690branch()
691{
692 closure();
693 while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0)
694 {
695 closure();
696 addtok(_CAT);
697 }
698}
699
700static void
701regexp()
702{
703 branch();
704 while (tok == _OR)
705 {
706 tok = lex();
707 branch();
708 addtok(_OR);
709 }
710}
711
712/* Main entry point for the parser. S is a string to be parsed, len is the
713 length of the string, so s can include NUL characters. R is a pointer to
714 the struct regexp to parse into. */
715void
716regparse(s, len, r)
717 const char *s;
718 size_t len;
719 struct regexp *r;
720{
721 reg = r;
722 lexstart = lexptr = s;
723 lexleft = len;
724 caret_allowed = 1;
725 closure_allowed = 0;
726
727 if (! syntax_bits_set)
728 reg_error("No syntax specified");
729
730 tok = lex();
731 depth = r->depth;
732
733 if (tok == _ALLBEGLINE)
734 {
735 addtok(_BEGLINE);
736 tok = lex();
737 regexp();
738 addtok(_CAT);
739 }
740 else
741 regexp();
742
743 if (tok == _ALLENDLINE)
744 {
745 addtok(_ENDLINE);
746 addtok(_CAT);
747 tok = lex();
748 }
749
750 if (tok != _END)
751 reg_error("Unbalanced )");
752
753 addtok(_END - r->nregexps);
754 addtok(_CAT);
755
756 if (r->nregexps)
757 addtok(_OR);
758
759 ++r->nregexps;
760}
761\f
762/* Some primitives for operating on sets of positions. */
763
764/* Copy one set to another; the destination must be large enough. */
765static void
766copy(src, dst)
767 const _position_set *src;
768 _position_set *dst;
769{
770 int i;
771
772 for (i = 0; i < src->nelem; ++i)
773 dst->elems[i] = src->elems[i];
774 dst->nelem = src->nelem;
775}
776
777/* Insert a position in a set. Position sets are maintained in sorted
778 order according to index. If position already exists in the set with
779 the same index then their constraints are logically or'd together.
780 S->elems must point to an array large enough to hold the resulting set. */
781static void
782insert(p, s)
783 _position p;
784 _position_set *s;
785{
786 int i;
787 _position t1, t2;
788
789 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
790 ;
791 if (i < s->nelem && p.index == s->elems[i].index)
792 s->elems[i].constraint |= p.constraint;
793 else
794 {
795 t1 = p;
796 ++s->nelem;
797 while (i < s->nelem)
798 {
799 t2 = s->elems[i];
800 s->elems[i++] = t1;
801 t1 = t2;
802 }
803 }
804}
805
806/* Merge two sets of positions into a third. The result is exactly as if
807 the positions of both sets were inserted into an initially empty set. */
808static void
809merge(s1, s2, m)
810 _position_set *s1;
811 _position_set *s2;
812 _position_set *m;
813{
814 int i = 0, j = 0;
815
816 m->nelem = 0;
817 while (i < s1->nelem && j < s2->nelem)
818 if (s1->elems[i].index > s2->elems[j].index)
819 m->elems[m->nelem++] = s1->elems[i++];
820 else if (s1->elems[i].index < s2->elems[j].index)
821 m->elems[m->nelem++] = s2->elems[j++];
822 else
823 {
824 m->elems[m->nelem] = s1->elems[i++];
825 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
826 }
827 while (i < s1->nelem)
828 m->elems[m->nelem++] = s1->elems[i++];
829 while (j < s2->nelem)
830 m->elems[m->nelem++] = s2->elems[j++];
831}
832
833/* Delete a position from a set. */
834static void
835delete(p, s)
836 _position p;
837 _position_set *s;
838{
839 int i;
840
841 for (i = 0; i < s->nelem; ++i)
842 if (p.index == s->elems[i].index)
843 break;
844 if (i < s->nelem)
845 for (--s->nelem; i < s->nelem; ++i)
846 s->elems[i] = s->elems[i + 1];
847}
848\f
849/* Find the index of the state corresponding to the given position set with
850 the given preceding context, or create a new state if there is no such
851 state. Newline and letter tell whether we got here on a newline or
852 letter, respectively. */
853static int
854state_index(r, s, newline, letter)
855 struct regexp *r;
856 _position_set *s;
857 int newline;
858 int letter;
859{
860 int lhash = 0;
861 int constraint;
862 int i, j;
863
864 newline = newline ? 1 : 0;
865 letter = letter ? 1 : 0;
866
867 for (i = 0; i < s->nelem; ++i)
868 lhash ^= s->elems[i].index + s->elems[i].constraint;
869
870 /* Try to find a state that exactly matches the proposed one. */
871 for (i = 0; i < r->sindex; ++i)
872 {
873 if (lhash != r->states[i].hash || s->nelem != r->states[i].elems.nelem
874 || newline != r->states[i].newline || letter != r->states[i].letter)
875 continue;
876 for (j = 0; j < s->nelem; ++j)
877 if (s->elems[j].constraint
878 != r->states[i].elems.elems[j].constraint
879 || s->elems[j].index != r->states[i].elems.elems[j].index)
880 break;
881 if (j == s->nelem)
882 return i;
883 }
884
885 /* We'll have to create a new state. */
886 REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex);
887 r->states[i].hash = lhash;
888 MALLOC(r->states[i].elems.elems, _position, s->nelem);
889 copy(s, &r->states[i].elems);
890 r->states[i].newline = newline;
891 r->states[i].letter = letter;
892 r->states[i].backref = 0;
893 r->states[i].constraint = 0;
894 r->states[i].first_end = 0;
895 for (j = 0; j < s->nelem; ++j)
896 if (r->tokens[s->elems[j].index] < 0)
897 {
898 constraint = s->elems[j].constraint;
899 if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
900 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
901 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
902 || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
903 r->states[i].constraint |= constraint;
904 if (! r->states[i].first_end)
905 r->states[i].first_end = r->tokens[s->elems[j].index];
906 }
907 else if (r->tokens[s->elems[j].index] == _BACKREF)
908 {
909 r->states[i].constraint = _NO_CONSTRAINT;
910 r->states[i].backref = 1;
911 }
912
913 ++r->sindex;
914
915 return i;
916}
917\f
918/* Find the epsilon closure of a set of positions. If any position of the set
919 contains a symbol that matches the empty string in some context, replace
920 that position with the elements of its follow labeled with an appropriate
921 constraint. Repeat exhaustively until no funny positions are left.
922 S->elems must be large enough to hold the result. */
923static void
924epsclosure(s, r)
925 _position_set *s;
926 struct regexp *r;
927{
928 int i, j;
929 int *visited;
930 _position p, old;
931
932 MALLOC(visited, int, r->tindex);
933 for (i = 0; i < r->tindex; ++i)
934 visited[i] = 0;
935
936 for (i = 0; i < s->nelem; ++i)
937 if (r->tokens[s->elems[i].index] >= _NOTCHAR
938 && r->tokens[s->elems[i].index] != _BACKREF
939 && r->tokens[s->elems[i].index] < _SET)
940 {
941 old = s->elems[i];
942 p.constraint = old.constraint;
943 delete(s->elems[i], s);
944 if (visited[old.index])
945 {
946 --i;
947 continue;
948 }
949 visited[old.index] = 1;
950 switch (r->tokens[old.index])
951 {
952 case _BEGLINE:
953 p.constraint &= _BEGLINE_CONSTRAINT;
954 break;
955 case _ENDLINE:
956 p.constraint &= _ENDLINE_CONSTRAINT;
957 break;
958 case _BEGWORD:
959 p.constraint &= _BEGWORD_CONSTRAINT;
960 break;
961 case _ENDWORD:
962 p.constraint &= _ENDWORD_CONSTRAINT;
963 break;
964 case _LIMWORD:
965 p.constraint &= _ENDWORD_CONSTRAINT;
966 break;
967 case _NOTLIMWORD:
968 p.constraint &= _NOTLIMWORD_CONSTRAINT;
969 break;
970 default:
971 break;
972 }
973 for (j = 0; j < r->follows[old.index].nelem; ++j)
974 {
975 p.index = r->follows[old.index].elems[j].index;
976 insert(p, s);
977 }
978 /* Force rescan to start at the beginning. */
979 i = -1;
980 }
981
982 free(visited);
983}
984\f
985/* Perform bottom-up analysis on the parse tree, computing various functions.
986 Note that at this point, we're pretending constructs like \< are real
987 characters rather than constraints on what can follow them.
988
989 Nullable: A node is nullable if it is at the root of a regexp that can
990 match the empty string.
991 * _EMPTY leaves are nullable.
992 * No other leaf is nullable.
993 * A _QMARK or _STAR node is nullable.
994 * A _PLUS node is nullable if its argument is nullable.
995 * A _CAT node is nullable if both its arguments are nullable.
996 * An _OR node is nullable if either argument is nullable.
997
998 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
999 that could correspond to the first character of a string matching the
1000 regexp rooted at the given node.
1001 * _EMPTY leaves have empty firstpos.
1002 * The firstpos of a nonempty leaf is that leaf itself.
1003 * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its
1004 argument.
1005 * The firstpos of a _CAT node is the firstpos of the left argument, union
1006 the firstpos of the right if the left argument is nullable.
1007 * The firstpos of an _OR node is the union of firstpos of each argument.
1008
1009 Lastpos: The lastpos of a node is the set of positions that could
1010 correspond to the last character of a string matching the regexp at
1011 the given node.
1012 * _EMPTY leaves have empty lastpos.
1013 * The lastpos of a nonempty leaf is that leaf itself.
1014 * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its
1015 argument.
1016 * The lastpos of a _CAT node is the lastpos of its right argument, union
1017 the lastpos of the left if the right argument is nullable.
1018 * The lastpos of an _OR node is the union of the lastpos of each argument.
1019
1020 Follow: The follow of a position is the set of positions that could
1021 correspond to the character following a character matching the node in
1022 a string matching the regexp. At this point we consider special symbols
1023 that match the empty string in some context to be just normal characters.
1024 Later, if we find that a special symbol is in a follow set, we will
1025 replace it with the elements of its follow, labeled with an appropriate
1026 constraint.
1027 * Every node in the firstpos of the argument of a _STAR or _PLUS node is in
1028 the follow of every node in the lastpos.
1029 * Every node in the firstpos of the second argument of a _CAT node is in
1030 the follow of every node in the lastpos of the first argument.
1031
1032 Because of the postfix representation of the parse tree, the depth-first
1033 analysis is conveniently done by a linear scan with the aid of a stack.
1034 Sets are stored as arrays of the elements, obeying a stack-like allocation
1035 scheme; the number of elements in each set deeper in the stack can be
1036 used to determine the address of a particular set's array. */
1037void
1038reganalyze(r, searchflag)
1039 struct regexp *r;
1040 int searchflag;
1041{
1042 int *nullable; /* Nullable stack. */
1043 int *nfirstpos; /* Element count stack for firstpos sets. */
1044 _position *firstpos; /* Array where firstpos elements are stored. */
1045 int *nlastpos; /* Element count stack for lastpos sets. */
1046 _position *lastpos; /* Array where lastpos elements are stored. */
1047 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1048 _position_set tmp; /* Temporary set for merging sets. */
1049 _position_set merged; /* Result of merging sets. */
1050 int wants_newline; /* True if some position wants newline info. */
1051 int *o_nullable;
1052 int *o_nfirst, *o_nlast;
1053 _position *o_firstpos, *o_lastpos;
1054 int i, j;
1055 _position *pos;
1056
1057 r->searchflag = searchflag;
1058
1059 MALLOC(nullable, int, r->depth);
1060 o_nullable = nullable;
1061 MALLOC(nfirstpos, int, r->depth);
1062 o_nfirst = nfirstpos;
1063 MALLOC(firstpos, _position, r->nleaves);
1064 o_firstpos = firstpos, firstpos += r->nleaves;
1065 MALLOC(nlastpos, int, r->depth);
1066 o_nlast = nlastpos;
1067 MALLOC(lastpos, _position, r->nleaves);
1068 o_lastpos = lastpos, lastpos += r->nleaves;
1069 MALLOC(nalloc, int, r->tindex);
1070 for (i = 0; i < r->tindex; ++i)
1071 nalloc[i] = 0;
1072 MALLOC(merged.elems, _position, r->nleaves);
1073
1074 CALLOC(r->follows, _position_set, r->tindex);
1075
1076 for (i = 0; i < r->tindex; ++i)
1077 switch (r->tokens[i])
1078 {
1079 case _EMPTY:
1080 /* The empty set is nullable. */
1081 *nullable++ = 1;
1082
1083 /* The firstpos and lastpos of the empty leaf are both empty. */
1084 *nfirstpos++ = *nlastpos++ = 0;
1085 break;
1086
1087 case _STAR:
1088 case _PLUS:
1089 /* Every element in the firstpos of the argument is in the follow
1090 of every element in the lastpos. */
1091 tmp.nelem = nfirstpos[-1];
1092 tmp.elems = firstpos;
1093 pos = lastpos;
1094 for (j = 0; j < nlastpos[-1]; ++j)
1095 {
1096 merge(&tmp, &r->follows[pos[j].index], &merged);
1097 REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,
1098 nalloc[pos[j].index], merged.nelem - 1);
1099 copy(&merged, &r->follows[pos[j].index]);
1100 }
1101
1102 case _QMARK:
1103 /* A _QMARK or _STAR node is automatically nullable. */
1104 if (r->tokens[i] != _PLUS)
1105 nullable[-1] = 1;
1106 break;
1107
1108 case _CAT:
1109 /* Every element in the firstpos of the second argument is in the
1110 follow of every element in the lastpos of the first argument. */
1111 tmp.nelem = nfirstpos[-1];
1112 tmp.elems = firstpos;
1113 pos = lastpos + nlastpos[-1];
1114 for (j = 0; j < nlastpos[-2]; ++j)
1115 {
1116 merge(&tmp, &r->follows[pos[j].index], &merged);
1117 REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position,
1118 nalloc[pos[j].index], merged.nelem - 1);
1119 copy(&merged, &r->follows[pos[j].index]);
1120 }
1121
1122 /* The firstpos of a _CAT node is the firstpos of the first argument,
1123 union that of the second argument if the first is nullable. */
1124 if (nullable[-2])
1125 nfirstpos[-2] += nfirstpos[-1];
1126 else
1127 firstpos += nfirstpos[-1];
1128 --nfirstpos;
1129
1130 /* The lastpos of a _CAT node is the lastpos of the second argument,
1131 union that of the first argument if the second is nullable. */
1132 if (nullable[-1])
1133 nlastpos[-2] += nlastpos[-1];
1134 else
1135 {
1136 pos = lastpos + nlastpos[-2];
1137 for (j = nlastpos[-1] - 1; j >= 0; --j)
1138 pos[j] = lastpos[j];
1139 lastpos += nlastpos[-2];
1140 nlastpos[-2] = nlastpos[-1];
1141 }
1142 --nlastpos;
1143
1144 /* A _CAT node is nullable if both arguments are nullable. */
1145 nullable[-2] = nullable[-1] && nullable[-2];
1146 --nullable;
1147 break;
1148
1149 case _OR:
1150 /* The firstpos is the union of the firstpos of each argument. */
1151 nfirstpos[-2] += nfirstpos[-1];
1152 --nfirstpos;
1153
1154 /* The lastpos is the union of the lastpos of each argument. */
1155 nlastpos[-2] += nlastpos[-1];
1156 --nlastpos;
1157
1158 /* An _OR node is nullable if either argument is nullable. */
1159 nullable[-2] = nullable[-1] || nullable[-2];
1160 --nullable;
1161 break;
1162
1163 default:
1164 /* Anything else is a nonempty position. (Note that special
1165 constructs like \< are treated as nonempty strings here;
1166 an "epsilon closure" effectively makes them nullable later.
1167 Backreferences have to get a real position so we can detect
1168 transitions on them later. But they are nullable. */
1169 *nullable++ = r->tokens[i] == _BACKREF;
1170
1171 /* This position is in its own firstpos and lastpos. */
1172 *nfirstpos++ = *nlastpos++ = 1;
1173 --firstpos, --lastpos;
1174 firstpos->index = lastpos->index = i;
1175 firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT;
1176
1177 /* Allocate the follow set for this position. */
1178 nalloc[i] = 1;
1179 MALLOC(r->follows[i].elems, _position, nalloc[i]);
1180 break;
1181 }
1182
1183 /* For each follow set that is the follow set of a real position, replace
1184 it with its epsilon closure. */
1185 for (i = 0; i < r->tindex; ++i)
1186 if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF
1187 || r->tokens[i] >= _SET)
1188 {
1189 copy(&r->follows[i], &merged);
1190 epsclosure(&merged, r);
1191 if (r->follows[i].nelem < merged.nelem)
1192 REALLOC(r->follows[i].elems, _position, merged.nelem);
1193 copy(&merged, &r->follows[i]);
1194 }
1195
1196 /* Get the epsilon closure of the firstpos of the regexp. The result will
1197 be the set of positions of state 0. */
1198 merged.nelem = 0;
1199 for (i = 0; i < nfirstpos[-1]; ++i)
1200 insert(firstpos[i], &merged);
1201 epsclosure(&merged, r);
1202
1203 /* Check if any of the positions of state 0 will want newline context. */
1204 wants_newline = 0;
1205 for (i = 0; i < merged.nelem; ++i)
1206 if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1207 wants_newline = 1;
1208
1209 /* Build the initial state. */
1210 r->salloc = 1;
1211 r->sindex = 0;
1212 MALLOC(r->states, _dfa_state, r->salloc);
1213 state_index(r, &merged, wants_newline, 0);
1214
1215 free(o_nullable);
1216 free(o_nfirst);
1217 free(o_firstpos);
1218 free(o_nlast);
1219 free(o_lastpos);
1220 free(nalloc);
1221 free(merged.elems);
1222}
1223\f
1224/* Find, for each character, the transition out of state s of r, and store
1225 it in the appropriate slot of trans.
1226
1227 We divide the positions of s into groups (positions can appear in more
1228 than one group). Each group is labeled with a set of characters that
1229 every position in the group matches (taking into account, if necessary,
1230 preceding context information of s). For each group, find the union
1231 of the its elements' follows. This set is the set of positions of the
1232 new state. For each character in the group's label, set the transition
1233 on this character to be to a state corresponding to the set's positions,
1234 and its associated backward context information, if necessary.
1235
1236 If we are building a searching matcher, we include the positions of state
1237 0 in every state.
1238
1239 The collection of groups is constructed by building an equivalence-class
1240 partition of the positions of s.
1241
1242 For each position, find the set of characters C that it matches. Eliminate
1243 any characters from C that fail on grounds of backward context.
1244
1245 Search through the groups, looking for a group whose label L has nonempty
1246 intersection with C. If L - C is nonempty, create a new group labeled
1247 L - C and having the same positions as the current group, and set L to
1248 the intersection of L and C. Insert the position in this group, set
1249 C = C - L, and resume scanning.
1250
1251 If after comparing with every group there are characters remaining in C,
1252 create a new group labeled with the characters of C and insert this
1253 position in that group. */
1254void
1255regstate(s, r, trans)
1256 int s;
1257 struct regexp *r;
1258 int trans[];
1259{
1260 _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */
1261 _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */
1262 int ngrps = 0; /* Number of groups actually used. */
1263 _position pos; /* Current position being considered. */
1264 _charset matches; /* Set of matching characters. */
1265 int matchesf; /* True if matches is nonempty. */
1266 _charset intersect; /* Intersection with some label set. */
1267 int intersectf; /* True if intersect is nonempty. */
1268 _charset leftovers; /* Stuff in the label that didn't match. */
1269 int leftoversf; /* True if leftovers is nonempty. */
1270 static _charset letters; /* Set of characters considered letters. */
1271 static _charset newline; /* Set of characters that aren't newline. */
1272 _position_set follows; /* Union of the follows of some group. */
1273 _position_set tmp; /* Temporary space for merging sets. */
1274 int state; /* New state. */
1275 int wants_newline; /* New state wants to know newline context. */
1276 int state_newline; /* New state on a newline transition. */
1277 int wants_letter; /* New state wants to know letter context. */
1278 int state_letter; /* New state on a letter transition. */
1279 static initialized; /* Flag for static initialization. */
1280 int i, j, k;
1281
1282 /* Initialize the set of letters, if necessary. */
1283 if (! initialized)
1284 {
1285 initialized = 1;
1286 for (i = 0; i < _NOTCHAR; ++i)
1287 if (ISALNUM(i))
1288 setbit(i, letters);
1289 setbit('\n', newline);
1290 }
1291
1292 zeroset(matches);
1293
1294 for (i = 0; i < r->states[s].elems.nelem; ++i)
1295 {
1296 pos = r->states[s].elems.elems[i];
1297 if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR)
1298 setbit(r->tokens[pos.index], matches);
1299 else if (r->tokens[pos.index] >= _SET)
1300 copyset(r->charsets[r->tokens[pos.index] - _SET], matches);
1301 else
1302 continue;
1303
1304 /* Some characters may need to be climinated from matches because
1305 they fail in the current context. */
1306 if (pos.constraint != 0xff)
1307 {
1308 if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,
1309 r->states[s].newline, 1))
1310 clrbit('\n', matches);
1311 if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint,
1312 r->states[s].newline, 0))
1313 for (j = 0; j < _CHARSET_INTS; ++j)
1314 matches[j] &= newline[j];
1315 if (! _MATCHES_LETTER_CONTEXT(pos.constraint,
1316 r->states[s].letter, 1))
1317 for (j = 0; j < _CHARSET_INTS; ++j)
1318 matches[j] &= ~letters[j];
1319 if (! _MATCHES_LETTER_CONTEXT(pos.constraint,
1320 r->states[s].letter, 0))
1321 for (j = 0; j < _CHARSET_INTS; ++j)
1322 matches[j] &= letters[j];
1323
1324 /* If there are no characters left, there's no point in going on. */
1325 for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j)
1326 ;
1327 if (j == _CHARSET_INTS)
1328 continue;
1329 }
1330
1331 for (j = 0; j < ngrps; ++j)
1332 {
1333 /* If matches contains a single character only, and the current
1334 group's label doesn't contain that character, go on to the
1335 next group. */
1336 if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR
1337 && !tstbit(r->tokens[pos.index], labels[j]))
1338 continue;
1339
1340 /* Check if this group's label has a nonempty intersection with
1341 matches. */
1342 intersectf = 0;
1343 for (k = 0; k < _CHARSET_INTS; ++k)
1344 (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0;
1345 if (! intersectf)
1346 continue;
1347
1348 /* It does; now find the set differences both ways. */
1349 leftoversf = matchesf = 0;
1350 for (k = 0; k < _CHARSET_INTS; ++k)
1351 {
1352 /* Even an optimizing compiler can't know this for sure. */
1353 int match = matches[k], label = labels[j][k];
1354
1355 (leftovers[k] = ~match & label) ? leftoversf = 1 : 0;
1356 (matches[k] = match & ~label) ? matchesf = 1 : 0;
1357 }
1358
1359 /* If there were leftovers, create a new group labeled with them. */
1360 if (leftoversf)
1361 {
1362 copyset(leftovers, labels[ngrps]);
1363 copyset(intersect, labels[j]);
1364 MALLOC(grps[ngrps].elems, _position, r->nleaves);
1365 copy(&grps[j], &grps[ngrps]);
1366 ++ngrps;
1367 }
1368
1369 /* Put the position in the current group. Note that there is no
1370 reason to call insert() here. */
1371 grps[j].elems[grps[j].nelem++] = pos;
1372
1373 /* If every character matching the current position has been
1374 accounted for, we're done. */
1375 if (! matchesf)
1376 break;
1377 }
1378
1379 /* If we've passed the last group, and there are still characters
1380 unaccounted for, then we'll have to create a new group. */
1381 if (j == ngrps)
1382 {
1383 copyset(matches, labels[ngrps]);
1384 zeroset(matches);
1385 MALLOC(grps[ngrps].elems, _position, r->nleaves);
1386 grps[ngrps].nelem = 1;
1387 grps[ngrps].elems[0] = pos;
1388 ++ngrps;
1389 }
1390 }
1391
1392 MALLOC(follows.elems, _position, r->nleaves);
1393 MALLOC(tmp.elems, _position, r->nleaves);
1394
1395 /* If we are a searching matcher, the default transition is to a state
1396 containing the positions of state 0, otherwise the default transition
1397 is to fail miserably. */
1398 if (r->searchflag)
1399 {
1400 wants_newline = 0;
1401 wants_letter = 0;
1402 for (i = 0; i < r->states[0].elems.nelem; ++i)
1403 {
1404 if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint))
1405 wants_newline = 1;
1406 if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint))
1407 wants_letter = 1;
1408 }
1409 copy(&r->states[0].elems, &follows);
1410 state = state_index(r, &follows, 0, 0);
1411 if (wants_newline)
1412 state_newline = state_index(r, &follows, 1, 0);
1413 else
1414 state_newline = state;
1415 if (wants_letter)
1416 state_letter = state_index(r, &follows, 0, 1);
1417 else
1418 state_letter = state;
1419 for (i = 0; i < _NOTCHAR; ++i)
1420 trans[i] = (ISALNUM(i)) ? state_letter : state ;
1421 trans['\n'] = state_newline;
1422 }
1423 else
1424 for (i = 0; i < _NOTCHAR; ++i)
1425 trans[i] = -1;
1426
1427 for (i = 0; i < ngrps; ++i)
1428 {
1429 follows.nelem = 0;
1430
1431 /* Find the union of the follows of the positions of the group.
1432 This is a hideously inefficient loop. Fix it someday. */
1433 for (j = 0; j < grps[i].nelem; ++j)
1434 for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k)
1435 insert(r->follows[grps[i].elems[j].index].elems[k], &follows);
1436
1437 /* If we are building a searching matcher, throw in the positions
1438 of state 0 as well. */
1439 if (r->searchflag)
1440 for (j = 0; j < r->states[0].elems.nelem; ++j)
1441 insert(r->states[0].elems.elems[j], &follows);
1442
1443 /* Find out if the new state will want any context information. */
1444 wants_newline = 0;
1445 if (tstbit('\n', labels[i]))
1446 for (j = 0; j < follows.nelem; ++j)
1447 if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1448 wants_newline = 1;
1449
1450 wants_letter = 0;
1451 for (j = 0; j < _CHARSET_INTS; ++j)
1452 if (labels[i][j] & letters[j])
1453 break;
1454 if (j < _CHARSET_INTS)
1455 for (j = 0; j < follows.nelem; ++j)
1456 if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1457 wants_letter = 1;
1458
1459 /* Find the state(s) corresponding to the union of the follows. */
1460 state = state_index(r, &follows, 0, 0);
1461 if (wants_newline)
1462 state_newline = state_index(r, &follows, 1, 0);
1463 else
1464 state_newline = state;
1465 if (wants_letter)
1466 state_letter = state_index(r, &follows, 0, 1);
1467 else
1468 state_letter = state;
1469
1470 /* Set the transitions for each character in the current label. */
1471 for (j = 0; j < _CHARSET_INTS; ++j)
1472 for (k = 0; k < INTBITS; ++k)
1473 if (labels[i][j] & 1 << k)
1474 {
1475 int c = j * INTBITS + k;
1476
1477 if (c == '\n')
1478 trans[c] = state_newline;
1479 else if (ISALNUM(c))
1480 trans[c] = state_letter;
1481 else if (c < _NOTCHAR)
1482 trans[c] = state;
1483 }
1484 }
1485
1486 for (i = 0; i < ngrps; ++i)
1487 free(grps[i].elems);
1488 free(follows.elems);
1489 free(tmp.elems);
1490}
1491\f
1492/* Some routines for manipulating a compiled regexp's transition tables.
1493 Each state may or may not have a transition table; if it does, and it
1494 is a non-accepting state, then r->trans[state] points to its table.
1495 If it is an accepting state then r->fails[state] points to its table.
1496 If it has no table at all, then r->trans[state] is NULL.
1497 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1498
1499static void
1500build_state(s, r)
1501 int s;
1502 struct regexp *r;
1503{
1504 int *trans; /* The new transition table. */
1505 int i;
1506
1507 /* Set an upper limit on the number of transition tables that will ever
1508 exist at once. 1024 is arbitrary. The idea is that the frequently
1509 used transition tables will be quickly rebuilt, whereas the ones that
1510 were only needed once or twice will be cleared away. */
1511 if (r->trcount >= 1024)
1512 {
1513 for (i = 0; i < r->tralloc; ++i)
1514 if (r->trans[i])
1515 {
1516 free((ptr_t) r->trans[i]);
1517 r->trans[i] = NULL;
1518 }
1519 else if (r->fails[i])
1520 {
1521 free((ptr_t) r->fails[i]);
1522 r->fails[i] = NULL;
1523 }
1524 r->trcount = 0;
1525 }
1526
1527 ++r->trcount;
1528
1529 /* Set up the success bits for this state. */
1530 r->success[s] = 0;
1531 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0,
1532 s, *r))
1533 r->success[s] |= 4;
1534 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1,
1535 s, *r))
1536 r->success[s] |= 2;
1537 if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0,
1538 s, *r))
1539 r->success[s] |= 1;
1540
1541 MALLOC(trans, int, _NOTCHAR);
1542 regstate(s, r, trans);
1543
1544 /* Now go through the new transition table, and make sure that the trans
1545 and fail arrays are allocated large enough to hold a pointer for the
1546 largest state mentioned in the table. */
1547 for (i = 0; i < _NOTCHAR; ++i)
1548 if (trans[i] >= r->tralloc)
1549 {
1550 int oldalloc = r->tralloc;
1551
1552 while (trans[i] >= r->tralloc)
1553 r->tralloc *= 2;
1554 REALLOC(r->realtrans, int *, r->tralloc + 1);
1555 r->trans = r->realtrans + 1;
1556 REALLOC(r->fails, int *, r->tralloc);
1557 REALLOC(r->success, int, r->tralloc);
1558 REALLOC(r->newlines, int, r->tralloc);
1559 while (oldalloc < r->tralloc)
1560 {
1561 r->trans[oldalloc] = NULL;
1562 r->fails[oldalloc++] = NULL;
1563 }
1564 }
1565
1566 /* Keep the newline transition in a special place so we can use it as
1567 a sentinel. */
1568 r->newlines[s] = trans['\n'];
1569 trans['\n'] = -1;
1570
1571 if (ACCEPTING(s, *r))
1572 r->fails[s] = trans;
1573 else
1574 r->trans[s] = trans;
1575}
1576
1577static void
1578build_state_zero(r)
1579 struct regexp *r;
1580{
1581 r->tralloc = 1;
1582 r->trcount = 0;
1583 CALLOC(r->realtrans, int *, r->tralloc + 1);
1584 r->trans = r->realtrans + 1;
1585 CALLOC(r->fails, int *, r->tralloc);
1586 MALLOC(r->success, int, r->tralloc);
1587 MALLOC(r->newlines, int, r->tralloc);
1588 build_state(0, r);
1589}
1590\f
1591/* Search through a buffer looking for a match to the given struct regexp.
1592 Find the first occurrence of a string matching the regexp in the buffer,
1593 and the shortest possible version thereof. Return a pointer to the first
1594 character after the match, or NULL if none is found. Begin points to
1595 the beginning of the buffer, and end points to the first character after
1596 its end. We store a newline in *end to act as a sentinel, so end had
1597 better point somewhere valid. Newline is a flag indicating whether to
1598 allow newlines to be in the matching string. If count is non-
1599 NULL it points to a place we're supposed to increment every time we
1600 see a newline. Finally, if backref is non-NULL it points to a place
1601 where we're supposed to store a 1 if backreferencing happened and the
1602 match needs to be verified by a backtracking matcher. Otherwise
1603 we store a 0 in *backref. */
1604char *
1605regexecute(r, begin, end, newline, count, backref)
1606 struct regexp *r;
1607 char *begin;
1608 char *end;
1609 int newline;
1610 int *count;
1611 int *backref;
1612{
1613 register s, s1, tmp; /* Current state. */
1614 register unsigned char *p; /* Current input character. */
1615 register **trans, *t; /* Copy of r->trans so it can be optimized
1616 into a register. */
1617 static sbit[_NOTCHAR]; /* Table for anding with r->success. */
1618 static sbit_init;
1619
1620 if (! sbit_init)
1621 {
1622 int i;
1623
1624 sbit_init = 1;
1625 for (i = 0; i < _NOTCHAR; ++i)
1626 sbit[i] = (ISALNUM(i)) ? 2 : 1;
1627 sbit['\n'] = 4;
1628 }
1629
1630 if (! r->tralloc)
1631 build_state_zero(r);
1632
1633 s = s1 = 0;
1634 p = (unsigned char *) begin;
1635 trans = r->trans;
1636 *end = '\n';
1637
1638 for (;;)
1639 {
1640 while ((t = trans[s]) != 0) { /* hand-optimized loop */
1641 s1 = t[*p++];
1642 if ((t = trans[s1]) == 0) {
1643 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1644 break;
1645 }
1646 s = t[*p++];
1647 }
1648
1649 if (s >= 0 && p <= (unsigned char *) end && r->fails[s])
1650 {
1651 if (r->success[s] & sbit[*p])
1652 {
1653 if (backref)
1654 *backref = (r->states[s].backref != 0);
1655 return (char *) p;
1656 }
1657
1658 s1 = s;
1659 s = r->fails[s][*p++];
1660 continue;
1661 }
1662
1663 /* If the previous character was a newline, count it. */
1664 if (count && (char *) p <= end && p[-1] == '\n')
1665 ++*count;
1666
1667 /* Check if we've run off the end of the buffer. */
1668 if ((char *) p >= end)
1669 return NULL;
1670
1671 if (s >= 0)
1672 {
1673 build_state(s, r);
1674 trans = r->trans;
1675 continue;
1676 }
1677
1678 if (p[-1] == '\n' && newline)
1679 {
1680 s = r->newlines[s1];
1681 continue;
1682 }
1683
1684 s = 0;
1685 }
1686}
1687\f
1688/* Initialize the components of a regexp that the other routines don't
1689 initialize for themselves. */
1690void
1691reginit(r)
1692 struct regexp *r;
1693{
1694 r->calloc = 1;
1695 MALLOC(r->charsets, _charset, r->calloc);
1696 r->cindex = 0;
1697
1698 r->talloc = 1;
1699 MALLOC(r->tokens, _token, r->talloc);
1700 r->tindex = r->depth = r->nleaves = r->nregexps = 0;
1701
1702 r->searchflag = 0;
1703 r->tralloc = 0;
1704}
1705
1706/* Parse and analyze a single string of the given length. */
1707void
1708regcompile(s, len, r, searchflag)
1709 const char *s;
1710 size_t len;
1711 struct regexp *r;
1712 int searchflag;
1713{
1714 if (case_fold) /* dummy folding in service of regmust() */
1715 {
1716 char *regcopy;
1717 int i;
1718
1719 regcopy = malloc(len);
1720 if (!regcopy)
1721 reg_error("out of memory");
1722
1723 /* This is a complete kludge and could potentially break
1724 \<letter> escapes . . . */
1725 case_fold = 0;
1726 for (i = 0; i < len; ++i)
1727 if (ISUPPER(s[i]))
1728 regcopy[i] = tolower(s[i]);
1729 else
1730 regcopy[i] = s[i];
1731
1732 reginit(r);
1733 r->mustn = 0;
1734 r->must[0] = '\0';
1735 regparse(regcopy, len, r);
1736 free(regcopy);
1737 regmust(r);
1738 reganalyze(r, searchflag);
1739 case_fold = 1;
1740 reginit(r);
1741 regparse(s, len, r);
1742 reganalyze(r, searchflag);
1743 }
1744 else
1745 {
1746 reginit(r);
1747 regparse(s, len, r);
1748 regmust(r);
1749 reganalyze(r, searchflag);
1750 }
1751}
1752
1753/* Free the storage held by the components of a regexp. */
1754void
1755reg_free(r)
1756 struct regexp *r;
1757{
1758 int i;
1759
1760 free((ptr_t) r->charsets);
1761 free((ptr_t) r->tokens);
1762 for (i = 0; i < r->sindex; ++i)
1763 free((ptr_t) r->states[i].elems.elems);
1764 free((ptr_t) r->states);
1765 for (i = 0; i < r->tindex; ++i)
1766 if (r->follows[i].elems)
1767 free((ptr_t) r->follows[i].elems);
1768 free((ptr_t) r->follows);
1769 for (i = 0; i < r->tralloc; ++i)
1770 if (r->trans[i])
1771 free((ptr_t) r->trans[i]);
1772 else if (r->fails[i])
1773 free((ptr_t) r->fails[i]);
1774 if (r->realtrans)
1775 free((ptr_t) r->realtrans);
1776 if (r->fails)
1777 free((ptr_t) r->fails);
1778 if (r->newlines)
1779 free((ptr_t) r->newlines);
1780}
1781
1782/*
1783Having found the postfix representation of the regular expression,
1784try to find a long sequence of characters that must appear in any line
1785containing the r.e.
1786Finding a "longest" sequence is beyond the scope here;
1787we take an easy way out and hope for the best.
1788(Take "(ab|a)b"--please.)
1789
1790We do a bottom-up calculation of sequences of characters that must appear
1791in matches of r.e.'s represented by trees rooted at the nodes of the postfix
1792representation:
1793 sequences that must appear at the left of the match ("left")
1794 sequences that must appear at the right of the match ("right")
1795 lists of sequences that must appear somewhere in the match ("in")
1796 sequences that must constitute the match ("is")
1797When we get to the root of the tree, we use one of the longest of its
1798calculated "in" sequences as our answer. The sequence we find is returned in
1799r->must (where "r" is the single argument passed to "regmust");
1800the length of the sequence is returned in r->mustn.
1801
1802The sequences calculated for the various types of node (in pseudo ANSI c)
1803are shown below. "p" is the operand of unary operators (and the left-hand
1804operand of binary operators); "q" is the right-hand operand of binary operators
1805.
1806"ZERO" means "a zero-length sequence" below.
1807
1808Type left right is in
1809---- ---- ----- -- --
1810char c # c # c # c # c
1811
1812SET ZERO ZERO ZERO ZERO
1813
1814STAR ZERO ZERO ZERO ZERO
1815
1816QMARK ZERO ZERO ZERO ZERO
1817
1818PLUS p->left p->right ZERO p->in
1819
1820CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
1821 p->left : q->right : q->is!=ZERO) ? q->in plus
1822 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
1823 ZERO
1824
1825OR longest common longest common (do p->is and substrings common to
1826 leading trailing q->is have same p->in and q->in
1827 (sub)sequence (sub)sequence length and
1828 of p->left of p->right content) ?
1829 and q->left and q->right p->is : NULL
1830
1831If there's anything else we recognize in the tree, all four sequences get set
1832to zero-length sequences. If there's something we don't recognize in the tree,
1833we just return a zero-length sequence.
1834
1835Break ties in favor of infrequent letters (choosing 'zzz' in preference to
1836'aaa')?
1837
1838And. . .is it here or someplace that we might ponder "optimizations" such as
1839 egrep 'psi|epsilon' -> egrep 'psi'
1840 egrep 'pepsi|epsilon' -> egrep 'epsi'
1841 (Yes, we now find "epsi" as a "string
1842 that must occur", but we might also
1843 simplify the *entire* r.e. being sought
1844)
1845 grep '[c]' -> grep 'c'
1846 grep '(ab|a)b' -> grep 'ab'
1847 grep 'ab*' -> grep 'a'
1848 grep 'a*b' -> grep 'b'
1849There are several issues:
1850 Is optimization easy (enough)?
1851
1852 Does optimization actually accomplish anything,
1853 or is the automaton you get from "psi|epsilon" (for example)
1854 the same as the one you get from "psi" (for example)?
1855
1856 Are optimizable r.e.'s likely to be used in real-life situations
1857 (something like 'ab*' is probably unlikely; something like is
1858 'psi|epsilon' is likelier)?
1859*/
1860
1861static char *
1862icatalloc(old, new)
1863char * old;
1864const char * new;
1865{
1866 register char * result;
1867 register int oldsize, newsize;
1868
1869 newsize = (new == NULL) ? 0 : strlen(new);
1870 if (old == NULL)
1871 oldsize = 0;
1872 else if (newsize == 0)
1873 return old;
1874 else oldsize = strlen(old);
1875 if (old == NULL)
1876 result = (char *) malloc(newsize + 1);
1877 else result = (char *) realloc((void *) old, oldsize + newsize + 1);
1878 if (result != NULL && new != NULL)
1879 (void) strcpy(result + oldsize, new);
1880 return result;
1881}
1882
1883static char *
1884icpyalloc(string)
1885const char * string;
1886{
1887 return icatalloc((char *) NULL, string);
1888}
1889
1890static char *
1891istrstr(lookin, lookfor)
1892char * lookin;
1893register char * lookfor;
1894{
1895 register char * cp;
1896 register int len;
1897
1898 len = strlen(lookfor);
1899 for (cp = lookin; *cp != '\0'; ++cp)
1900 if (strncmp(cp, lookfor, len) == 0)
1901 return cp;
1902 return NULL;
1903}
1904
1905static void
1906ifree(cp)
1907char * cp;
1908{
1909 if (cp != NULL)
1910 free(cp);
1911}
1912
1913static void
1914freelist(cpp)
1915register char ** cpp;
1916{
1917 register int i;
1918
1919 if (cpp == NULL)
1920 return;
1921 for (i = 0; cpp[i] != NULL; ++i) {
1922 free(cpp[i]);
1923 cpp[i] = NULL;
1924 }
1925}
1926
1927static char **
1928enlist(cpp, new, len)
1929register char ** cpp;
1930register char * new;
1931#ifdef __STDC__
1932size_t len;
1933#else
1934int len;
1935#endif
1936{
1937 register int i, j;
1938
1939 if (cpp == NULL)
1940 return NULL;
1941 if ((new = icpyalloc(new)) == NULL) {
1942 freelist(cpp);
1943 return NULL;
1944 }
1945 new[len] = '\0';
1946 /*
1947 ** Is there already something in the list that's new (or longer)?
1948 */
1949 for (i = 0; cpp[i] != NULL; ++i)
1950 if (istrstr(cpp[i], new) != NULL) {
1951 free(new);
1952 return cpp;
1953 }
1954 /*
1955 ** Eliminate any obsoleted strings.
1956 */
1957 j = 0;
1958 while (cpp[j] != NULL)
1959 if (istrstr(new, cpp[j]) == NULL)
1960 ++j;
1961 else {
1962 free(cpp[j]);
1963 if (--i == j)
1964 break;
1965 cpp[j] = cpp[i];
1966 }
1967 /*
1968 ** Add the new string.
1969 */
1970 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
1971 if (cpp == NULL)
1972 return NULL;
1973 cpp[i] = new;
1974 cpp[i + 1] = NULL;
1975 return cpp;
1976}
1977
1978/*
1979** Given pointers to two strings,
1980** return a pointer to an allocated list of their distinct common substrings.
1981** Return NULL if something seems wild.
1982*/
1983
1984static char **
1985comsubs(left, right)
1986char * left;
1987char * right;
1988{
1989 register char ** cpp;
1990 register char * lcp;
1991 register char * rcp;
1992 register int i, len;
1993
1994 if (left == NULL || right == NULL)
1995 return NULL;
1996 cpp = (char **) malloc(sizeof *cpp);
1997 if (cpp == NULL)
1998 return NULL;
1999 cpp[0] = NULL;
2000 for (lcp = left; *lcp != '\0'; ++lcp) {
2001 len = 0;
2002 rcp = strchr(right, *lcp);
2003 while (rcp != NULL) {
2004 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2005 ;
2006 if (i > len)
2007 len = i;
2008 rcp = strchr(rcp + 1, *lcp);
2009 }
2010 if (len == 0)
2011 continue;
2012#ifdef __STDC__
2013 if ((cpp = enlist(cpp, lcp, (size_t)len)) == NULL)
2014#else
2015 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2016#endif
2017 break;
2018 }
2019 return cpp;
2020}
2021
2022static char **
2023addlists(old, new)
2024char ** old;
2025char ** new;
2026{
2027 register int i;
2028
2029 if (old == NULL || new == NULL)
2030 return NULL;
2031 for (i = 0; new[i] != NULL; ++i) {
2032 old = enlist(old, new[i], strlen(new[i]));
2033 if (old == NULL)
2034 break;
2035 }
2036 return old;
2037}
2038
2039/*
2040** Given two lists of substrings,
2041** return a new list giving substrings common to both.
2042*/
2043
2044static char **
2045inboth(left, right)
2046char ** left;
2047char ** right;
2048{
2049 register char ** both;
2050 register char ** temp;
2051 register int lnum, rnum;
2052
2053 if (left == NULL || right == NULL)
2054 return NULL;
2055 both = (char **) malloc(sizeof *both);
2056 if (both == NULL)
2057 return NULL;
2058 both[0] = NULL;
2059 for (lnum = 0; left[lnum] != NULL; ++lnum) {
2060 for (rnum = 0; right[rnum] != NULL; ++rnum) {
2061 temp = comsubs(left[lnum], right[rnum]);
2062 if (temp == NULL) {
2063 freelist(both);
2064 return NULL;
2065 }
2066 both = addlists(both, temp);
2067 freelist(temp);
2068 if (both == NULL)
2069 return NULL;
2070 }
2071 }
2072 return both;
2073}
2074
2075/*
2076typedef struct {
2077 char ** in;
2078 char * left;
2079 char * right;
2080 char * is;
2081} must;
2082 */
2083static void
2084resetmust(mp)
2085register must * mp;
2086{
2087 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2088 freelist(mp->in);
2089}
2090
2091static void
2092regmust(r)
2093register struct regexp * r;
2094{
2095 register must * musts;
2096 register must * mp;
2097 register char * result = "";
2098 register int ri;
2099 register int i;
2100 register _token t;
2101 static must must0;
2102
2103 reg->mustn = 0;
2104 reg->must[0] = '\0';
2105 musts = (must *) malloc((reg->tindex + 1) * sizeof *musts);
2106 if (musts == NULL)
2107 return;
2108 mp = musts;
2109 for (i = 0; i <= reg->tindex; ++i)
2110 mp[i] = must0;
2111 for (i = 0; i <= reg->tindex; ++i) {
2112 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2113 mp[i].left = malloc(2);
2114 mp[i].right = malloc(2);
2115 mp[i].is = malloc(2);
2116 if (mp[i].in == NULL || mp[i].left == NULL ||
2117 mp[i].right == NULL || mp[i].is == NULL)
2118 goto done;
2119 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2120 mp[i].in[0] = NULL;
2121 }
2122 for (ri = 0; ri < reg->tindex; ++ri) {
2123 switch (t = reg->tokens[ri]) {
2124 case _ALLBEGLINE:
2125 case _ALLENDLINE:
2126 case _LPAREN:
2127 case _RPAREN:
2128 goto done; /* "cannot happen" */
2129 case _EMPTY:
2130 case _BEGLINE:
2131 case _ENDLINE:
2132 case _BEGWORD:
2133 case _ENDWORD:
2134 case _LIMWORD:
2135 case _NOTLIMWORD:
2136 case _BACKREF:
2137 resetmust(mp);
2138 break;
2139 case _STAR:
2140 case _QMARK:
2141 if (mp <= musts)
2142 goto done; /* "cannot happen" */
2143 --mp;
2144 resetmust(mp);
2145 break;
2146 case _OR:
2147 if (mp < &musts[2])
2148 goto done; /* "cannot happen" */
2149 {
2150 register char ** new;
2151 register must * lmp;
2152 register must * rmp;
2153 register int j, ln, rn, n;
2154
2155 rmp = --mp;
2156 lmp = --mp;
2157 /* Guaranteed to be. Unlikely, but. . . */
2158 if (strcmp(lmp->is, rmp->is) != 0)
2159 lmp->is[0] = '\0';
2160 /* Left side--easy */
2161 i = 0;
2162 while (lmp->left[i] != '\0' &&
2163 lmp->left[i] == rmp->left[i])
2164 ++i;
2165 lmp->left[i] = '\0';
2166 /* Right side */
2167 ln = strlen(lmp->right);
2168 rn = strlen(rmp->right);
2169 n = ln;
2170 if (n > rn)
2171 n = rn;
2172 for (i = 0; i < n; ++i)
2173 if (lmp->right[ln - i - 1] !=
2174 rmp->right[rn - i - 1])
2175 break;
2176 for (j = 0; j < i; ++j)
2177 lmp->right[j] =
2178 lmp->right[(ln - i) + j];
2179 lmp->right[j] = '\0';
2180 new = inboth(lmp->in, rmp->in);
2181 if (new == NULL)
2182 goto done;
2183 freelist(lmp->in);
2184 free((char *) lmp->in);
2185 lmp->in = new;
2186 }
2187 break;
2188 case _PLUS:
2189 if (mp <= musts)
2190 goto done; /* "cannot happen" */
2191 --mp;
2192 mp->is[0] = '\0';
2193 break;
2194 case _END:
2195 if (mp != &musts[1])
2196 goto done; /* "cannot happen" */
2197 for (i = 0; musts[0].in[i] != NULL; ++i)
2198 if (strlen(musts[0].in[i]) > strlen(result))
2199 result = musts[0].in[i];
2200 goto done;
2201 case _CAT:
2202 if (mp < &musts[2])
2203 goto done; /* "cannot happen" */
2204 {
2205 register must * lmp;
2206 register must * rmp;
2207
2208 rmp = --mp;
2209 lmp = --mp;
2210 /*
2211 ** In. Everything in left, plus everything in
2212 ** right, plus catenation of
2213 ** left's right and right's left.
2214 */
2215 lmp->in = addlists(lmp->in, rmp->in);
2216 if (lmp->in == NULL)
2217 goto done;
2218 if (lmp->right[0] != '\0' &&
2219 rmp->left[0] != '\0') {
2220 register char * tp;
2221
2222 tp = icpyalloc(lmp->right);
2223 if (tp == NULL)
2224 goto done;
2225 tp = icatalloc(tp, rmp->left);
2226 if (tp == NULL)
2227 goto done;
2228 lmp->in = enlist(lmp->in, tp,
2229 strlen(tp));
2230 free(tp);
2231 if (lmp->in == NULL)
2232 goto done;
2233 }
2234 /* Left-hand */
2235 if (lmp->is[0] != '\0') {
2236 lmp->left = icatalloc(lmp->left,
2237 rmp->left);
2238 if (lmp->left == NULL)
2239 goto done;
2240 }
2241 /* Right-hand */
2242 if (rmp->is[0] == '\0')
2243 lmp->right[0] = '\0';
2244 lmp->right = icatalloc(lmp->right, rmp->right);
2245 if (lmp->right == NULL)
2246 goto done;
2247 /* Guaranteed to be */
2248 if (lmp->is[0] != '\0' && rmp->is[0] != '\0') {
2249 lmp->is = icatalloc(lmp->is, rmp->is);
2250 if (lmp->is == NULL)
2251 goto done;
2252 }
2253 }
2254 break;
2255 default:
2256 if (t < _END) {
2257 /* "cannot happen" */
2258 goto done;
2259 } else if (t == '\0') {
2260 /* not on *my* shift */
2261 goto done;
2262 } else if (t >= _SET) {
2263 /* easy enough */
2264 resetmust(mp);
2265 } else {
2266 /* plain character */
2267 resetmust(mp);
2268 mp->is[0] = mp->left[0] = mp->right[0] = t;
2269 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2270 mp->in = enlist(mp->in, mp->is, 1);
2271 if (mp->in == NULL)
2272 goto done;
2273 }
2274 break;
2275 }
2276 ++mp;
2277 }
2278done:
2279 (void) strncpy(reg->must, result, MUST_MAX - 1);
2280 reg->must[MUST_MAX - 1] = '\0';
2281 reg->mustn = strlen(reg->must);
2282 mp = musts;
2283 for (i = 0; i <= reg->tindex; ++i) {
2284 freelist(mp[i].in);
2285 ifree((char *) mp[i].in);
2286 ifree(mp[i].left);
2287 ifree(mp[i].right);
2288 ifree(mp[i].is);
2289 }
2290 free((char *) mp);
2291}