BSD 4_4_Lite2 development
[unix-history] / usr / src / contrib / perl-4.036 / regcomp.c
CommitLineData
66961ac9
C
1/* NOTE: this is derived from Henry Spencer's regexp code, and should not
2 * confused with the original package (see point 3 below). Thanks, Henry!
3 */
4
5/* Additional note: this code is very heavily munged from Henry's version
6 * in places. In some spots I've traded clarity for efficiency, so don't
7 * blame Henry for some of the lack of readability.
8 */
9
10/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.5 $$Date: 92/06/08 15:23:36 $
11 *
12 * $Log: regcomp.c,v $
13 * Revision 4.0.1.5 92/06/08 15:23:36 lwall
14 * patch20: Perl now distinguishes overlapped copies from non-overlapped
15 * patch20: /^stuff/ wrongly assumed an implicit $* == 1
16 * patch20: /x{0}/ was wrongly interpreted as /x{0,}/
17 * patch20: added \W, \S and \D inside /[...]/
18 *
19 * Revision 4.0.1.4 91/11/05 22:55:14 lwall
20 * patch11: Erratum
21 *
22 * Revision 4.0.1.3 91/11/05 18:22:28 lwall
23 * patch11: minimum match length calculation in regexp is now cumulative
24 * patch11: initial .* in pattern had dependency on value of $*
25 * patch11: certain patterns made use of garbage pointers from uncleared memory
26 * patch11: prepared for ctype implementations that don't define isascii()
27 *
28 * Revision 4.0.1.2 91/06/07 11:48:24 lwall
29 * patch4: new copyright notice
30 * patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
31 * patch4: // wouldn't use previous pattern if it started with a null character
32 *
33 * Revision 4.0.1.1 91/04/12 09:04:45 lwall
34 * patch1: random cleanup in cpp namespace
35 *
36 * Revision 4.0 91/03/20 01:39:01 lwall
37 * 4.0 baseline.
38 *
39 */
40/*SUPPRESS 112*/
41/*
42 * regcomp and regexec -- regsub and regerror are not used in perl
43 *
44 * Copyright (c) 1986 by University of Toronto.
45 * Written by Henry Spencer. Not derived from licensed software.
46 *
47 * Permission is granted to anyone to use this software for any
48 * purpose on any computer system, and to redistribute it freely,
49 * subject to the following restrictions:
50 *
51 * 1. The author is not responsible for the consequences of use of
52 * this software, no matter how awful, even if they arise
53 * from defects in it.
54 *
55 * 2. The origin of this software must not be misrepresented, either
56 * by explicit claim or by omission.
57 *
58 * 3. Altered versions must be plainly marked as such, and must not
59 * be misrepresented as being the original software.
60 *
61 *
62 **** Alterations to Henry's code are...
63 ****
64 **** Copyright (c) 1991, Larry Wall
65 ****
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
68
69 *
70 * Beware that some of this code is subtly aware of the way operator
71 * precedence is structured in regular expressions. Serious changes in
72 * regular-expression syntax might require a total rethink.
73 */
74#include "EXTERN.h"
75#include "perl.h"
76#include "INTERN.h"
77#include "regcomp.h"
78
79#ifdef MSDOS
80# if defined(BUGGY_MSC6)
81 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
82 # pragma optimize("a",off)
83 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
84 # pragma optimize("w",on )
85# endif /* BUGGY_MSC6 */
86#endif /* MSDOS */
87
88#ifndef STATIC
89#define STATIC static
90#endif
91
92#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
93#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
94 ((*s) == '{' && regcurly(s)))
95#ifdef atarist
96#define PERL_META "^$.[()|?+*\\"
97#else
98#define META "^$.[()|?+*\\"
99#endif
100
101#ifdef SPSTART
102#undef SPSTART /* dratted cpp namespace... */
103#endif
104/*
105 * Flags to be passed up and down.
106 */
107#define HASWIDTH 01 /* Known never to match null string. */
108#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
109#define SPSTART 04 /* Starts with * or +. */
110#define WORST 0 /* Worst case. */
111
112/*
113 * Global work variables for regcomp().
114 */
115static char *regprecomp; /* uncompiled string. */
116static char *regparse; /* Input-scan pointer. */
117static char *regxend; /* End of input for compile */
118static int regnpar; /* () count. */
119static char *regcode; /* Code-emit pointer; &regdummy = don't. */
120static long regsize; /* Code size. */
121static int regfold;
122static int regsawbracket; /* Did we do {d,d} trick? */
123static int regsawback; /* Did we see \1, ...? */
124
125/*
126 * Forward declarations for regcomp()'s friends.
127 */
128STATIC int regcurly();
129STATIC char *reg();
130STATIC char *regbranch();
131STATIC char *regpiece();
132STATIC char *regatom();
133STATIC char *regclass();
134STATIC char *regnode();
135STATIC char *reganode();
136STATIC void regc();
137STATIC void reginsert();
138STATIC void regtail();
139STATIC void regoptail();
140
141/*
142 - regcomp - compile a regular expression into internal code
143 *
144 * We can't allocate space until we know how big the compiled form will be,
145 * but we can't compile it (and thus know how big it is) until we've got a
146 * place to put the code. So we cheat: we compile it twice, once with code
147 * generation turned off and size counting turned on, and once "for real".
148 * This also means that we don't allocate space until we are sure that the
149 * thing really will compile successfully, and we never have to move the
150 * code and thus invalidate pointers into it. (Note that it has to be in
151 * one piece because free() must be able to free it all.) [NB: not true in perl]
152 *
153 * Beware that the optimization-preparation code in here knows about some
154 * of the structure of the compiled regexp. [I'll say.]
155 */
156regexp *
157regcomp(exp,xend,fold)
158char *exp;
159char *xend;
160int fold;
161{
162 register regexp *r;
163 register char *scan;
164 register STR *longish;
165 STR *longest;
166 register int len;
167 register char *first;
168 int flags;
169 int backish;
170 int backest;
171 int curback;
172 int minlen;
173 int sawplus = 0;
174 int sawopen = 0;
175
176 if (exp == NULL)
177 fatal("NULL regexp argument");
178
179 /* First pass: determine size, legality. */
180 regfold = fold;
181 regparse = exp;
182 regxend = xend;
183 regprecomp = nsavestr(exp,xend-exp);
184 regsawbracket = 0;
185 regsawback = 0;
186 regnpar = 1;
187 regsize = 0L;
188 regcode = &regdummy;
189 regc((char)MAGIC);
190 if (reg(0, &flags) == NULL) {
191 Safefree(regprecomp);
192 regprecomp = Nullch;
193 return(NULL);
194 }
195
196 /* Small enough for pointer-storage convention? */
197 if (regsize >= 32767L) /* Probably could be 65535L. */
198 FAIL("regexp too big");
199
200 /* Allocate space. */
201 Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp);
202 if (r == NULL)
203 FAIL("regexp out of space");
204
205 /* Second pass: emit code. */
206 if (regsawbracket)
207 Copy(regprecomp,exp,xend-exp,char);
208 r->prelen = xend-exp;
209 r->precomp = regprecomp;
210 r->subbeg = r->subbase = NULL;
211 regparse = exp;
212 regnpar = 1;
213 regcode = r->program;
214 regc((char)MAGIC);
215 if (reg(0, &flags) == NULL)
216 return(NULL);
217
218 /* Dig out information for optimizations. */
219 r->regstart = Nullstr; /* Worst-case defaults. */
220 r->reganch = 0;
221 r->regmust = Nullstr;
222 r->regback = -1;
223 r->regstclass = Nullch;
224 scan = r->program+1; /* First BRANCH. */
225 if (OP(regnext(scan)) == END) {/* Only one top-level choice. */
226 scan = NEXTOPER(scan);
227
228 first = scan;
229 while ((OP(first) == OPEN && (sawopen = 1)) ||
230 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
231 (OP(first) == PLUS) ||
232 (OP(first) == CURLY && ARG1(first) > 0) ) {
233 if (OP(first) == PLUS)
234 sawplus = 1;
235 else
236 first += regarglen[OP(first)];
237 first = NEXTOPER(first);
238 }
239
240 /* Starting-point info. */
241 again:
242 if (OP(first) == EXACTLY) {
243 r->regstart =
244 str_make(OPERAND(first)+1,*OPERAND(first));
245 if (r->regstart->str_cur > !(sawstudy|fold))
246 fbmcompile(r->regstart,fold);
247 }
248 else if ((exp = index(simple,OP(first))) && exp > simple)
249 r->regstclass = first;
250 else if (OP(first) == BOUND || OP(first) == NBOUND)
251 r->regstclass = first;
252 else if (OP(first) == BOL) {
253 r->reganch = ROPT_ANCH;
254 first = NEXTOPER(first);
255 goto again;
256 }
257 else if ((OP(first) == STAR && OP(NEXTOPER(first)) == ANY) &&
258 !(r->reganch & ROPT_ANCH) ) {
259 /* turn .* into ^.* with an implied $*=1 */
260 r->reganch = ROPT_ANCH | ROPT_IMPLICIT;
261 first = NEXTOPER(first);
262 goto again;
263 }
264 if (sawplus && (!sawopen || !regsawback))
265 r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */
266
267#ifdef DEBUGGING
268 if (debug & 512)
269 fprintf(stderr,"first %d next %d offset %d\n",
270 OP(first), OP(NEXTOPER(first)), first - scan);
271#endif
272 /*
273 * If there's something expensive in the r.e., find the
274 * longest literal string that must appear and make it the
275 * regmust. Resolve ties in favor of later strings, since
276 * the regstart check works with the beginning of the r.e.
277 * and avoiding duplication strengthens checking. Not a
278 * strong reason, but sufficient in the absence of others.
279 * [Now we resolve ties in favor of the earlier string if
280 * it happens that curback has been invalidated, since the
281 * earlier string may buy us something the later one won't.]
282 */
283 longish = str_make("",0);
284 longest = str_make("",0);
285 len = 0;
286 minlen = 0;
287 curback = 0;
288 backish = 0;
289 backest = 0;
290 while (OP(scan) != END) {
291 if (OP(scan) == BRANCH) {
292 if (OP(regnext(scan)) == BRANCH) {
293 curback = -30000;
294 while (OP(scan) == BRANCH)
295 scan = regnext(scan);
296 }
297 else /* single branch is ok */
298 scan = NEXTOPER(scan);
299 }
300 if (OP(scan) == EXACTLY) {
301 char *t;
302
303 first = scan;
304 while (OP(t = regnext(scan)) == CLOSE)
305 scan = t;
306 minlen += *OPERAND(first);
307 if (curback - backish == len) {
308 str_ncat(longish, OPERAND(first)+1,
309 *OPERAND(first));
310 len += *OPERAND(first);
311 curback += *OPERAND(first);
312 first = regnext(scan);
313 }
314 else if (*OPERAND(first) >= len + (curback >= 0)) {
315 len = *OPERAND(first);
316 str_nset(longish, OPERAND(first)+1,len);
317 backish = curback;
318 curback += len;
319 first = regnext(scan);
320 }
321 else
322 curback += *OPERAND(first);
323 }
324 else if (index(varies,OP(scan))) {
325 curback = -30000;
326 len = 0;
327 if (longish->str_cur > longest->str_cur) {
328 str_sset(longest,longish);
329 backest = backish;
330 }
331 str_nset(longish,"",0);
332 if (OP(scan) == PLUS &&
333 index(simple,OP(NEXTOPER(scan))))
334 minlen++;
335 else if (OP(scan) == CURLY &&
336 index(simple,OP(NEXTOPER(scan)+4)))
337 minlen += ARG1(scan);
338 }
339 else if (index(simple,OP(scan))) {
340 curback++;
341 minlen++;
342 len = 0;
343 if (longish->str_cur > longest->str_cur) {
344 str_sset(longest,longish);
345 backest = backish;
346 }
347 str_nset(longish,"",0);
348 }
349 scan = regnext(scan);
350 }
351
352 /* Prefer earlier on tie, unless we can tail match latter */
353
354 if (longish->str_cur + (OP(first) == EOL) > longest->str_cur) {
355 str_sset(longest,longish);
356 backest = backish;
357 }
358 else
359 str_nset(longish,"",0);
360 if (longest->str_cur
361 &&
362 (!r->regstart
363 ||
364 !fbminstr((unsigned char*) r->regstart->str_ptr,
365 (unsigned char *) r->regstart->str_ptr
366 + r->regstart->str_cur,
367 longest)
368 )
369 )
370 {
371 r->regmust = longest;
372 if (backest < 0)
373 backest = -1;
374 r->regback = backest;
375 if (longest->str_cur
376 > !(sawstudy || fold || OP(first) == EOL) )
377 fbmcompile(r->regmust,fold);
378 r->regmust->str_u.str_useful = 100;
379 if (OP(first) == EOL && longish->str_cur)
380 r->regmust->str_pok |= SP_TAIL;
381 }
382 else {
383 str_free(longest);
384 longest = Nullstr;
385 }
386 str_free(longish);
387 }
388
389 r->do_folding = fold;
390 r->nparens = regnpar - 1;
391 r->minlen = minlen;
392 Newz(1002, r->startp, regnpar, char*);
393 Newz(1002, r->endp, regnpar, char*);
394#ifdef DEBUGGING
395 if (debug & 512)
396 regdump(r);
397#endif
398 return(r);
399}
400
401/*
402 - reg - regular expression, i.e. main body or parenthesized thing
403 *
404 * Caller must absorb opening parenthesis.
405 *
406 * Combining parenthesis handling with the base level of regular expression
407 * is a trifle forced, but the need to tie the tails of the branches to what
408 * follows makes it hard to avoid.
409 */
410static char *
411reg(paren, flagp)
412int paren; /* Parenthesized? */
413int *flagp;
414{
415 register char *ret;
416 register char *br;
417 register char *ender;
418 register int parno;
419 int flags;
420
421 *flagp = HASWIDTH; /* Tentatively. */
422
423 /* Make an OPEN node, if parenthesized. */
424 if (paren) {
425 parno = regnpar;
426 regnpar++;
427 ret = reganode(OPEN, parno);
428 } else
429 ret = NULL;
430
431 /* Pick up the branches, linking them together. */
432 br = regbranch(&flags);
433 if (br == NULL)
434 return(NULL);
435 if (ret != NULL)
436 regtail(ret, br); /* OPEN -> first. */
437 else
438 ret = br;
439 if (!(flags&HASWIDTH))
440 *flagp &= ~HASWIDTH;
441 *flagp |= flags&SPSTART;
442 while (*regparse == '|') {
443 regparse++;
444 br = regbranch(&flags);
445 if (br == NULL)
446 return(NULL);
447 regtail(ret, br); /* BRANCH -> BRANCH. */
448 if (!(flags&HASWIDTH))
449 *flagp &= ~HASWIDTH;
450 *flagp |= flags&SPSTART;
451 }
452
453 /* Make a closing node, and hook it on the end. */
454 if (paren)
455 ender = reganode(CLOSE, parno);
456 else
457 ender = regnode(END);
458 regtail(ret, ender);
459
460 /* Hook the tails of the branches to the closing node. */
461 for (br = ret; br != NULL; br = regnext(br))
462 regoptail(br, ender);
463
464 /* Check for proper termination. */
465 if (paren && *regparse++ != ')') {
466 FAIL("unmatched () in regexp");
467 } else if (!paren && regparse < regxend) {
468 if (*regparse == ')') {
469 FAIL("unmatched () in regexp");
470 } else
471 FAIL("junk on end of regexp"); /* "Can't happen". */
472 /* NOTREACHED */
473 }
474
475 return(ret);
476}
477
478/*
479 - regbranch - one alternative of an | operator
480 *
481 * Implements the concatenation operator.
482 */
483static char *
484regbranch(flagp)
485int *flagp;
486{
487 register char *ret;
488 register char *chain;
489 register char *latest;
490 int flags;
491
492 *flagp = WORST; /* Tentatively. */
493
494 ret = regnode(BRANCH);
495 chain = NULL;
496 while (regparse < regxend && *regparse != '|' && *regparse != ')') {
497 latest = regpiece(&flags);
498 if (latest == NULL)
499 return(NULL);
500 *flagp |= flags&HASWIDTH;
501 if (chain == NULL) /* First piece. */
502 *flagp |= flags&SPSTART;
503 else
504 regtail(chain, latest);
505 chain = latest;
506 }
507 if (chain == NULL) /* Loop ran zero times. */
508 (void) regnode(NOTHING);
509
510 return(ret);
511}
512
513/*
514 - regpiece - something followed by possible [*+?]
515 *
516 * Note that the branching code sequences used for ? and the general cases
517 * of * and + are somewhat optimized: they use the same NOTHING node as
518 * both the endmarker for their branch list and the body of the last branch.
519 * It might seem that this node could be dispensed with entirely, but the
520 * endmarker role is not redundant.
521 */
522static char *
523regpiece(flagp)
524int *flagp;
525{
526 register char *ret;
527 register char op;
528 register char *next;
529 int flags;
530 char *origparse = regparse;
531 int orignpar = regnpar;
532 char *max;
533 int iter;
534 char ch;
535
536 ret = regatom(&flags);
537 if (ret == NULL)
538 return(NULL);
539
540 op = *regparse;
541
542 /* Here's a total kludge: if after the atom there's a {\d+,?\d*}
543 * then we decrement the first number by one and reset our
544 * parsing back to the beginning of the same atom. If the first number
545 * is down to 0, decrement the second number instead and fake up
546 * a ? after it. Given the way this compiler doesn't keep track
547 * of offsets on the first pass, this is the only way to replicate
548 * a piece of code. Sigh.
549 */
550 if (op == '{' && regcurly(regparse)) {
551 next = regparse + 1;
552 max = Nullch;
553 while (isDIGIT(*next) || *next == ',') {
554 if (*next == ',') {
555 if (max)
556 break;
557 else
558 max = next;
559 }
560 next++;
561 }
562 if (*next == '}') { /* got one */
563 if (!max)
564 max = next;
565 regparse++;
566 iter = atoi(regparse);
567 if (flags&SIMPLE) { /* we can do it right after all */
568 int tmp;
569
570 reginsert(CURLY, ret);
571 if (iter > 0)
572 *flagp = (WORST|HASWIDTH);
573 if (*max == ',')
574 max++;
575 else
576 max = regparse;
577 tmp = atoi(max);
578 if (!tmp && *max != '0')
579 tmp = 32767; /* meaning "infinity" */
580 if (tmp && tmp < iter)
581 fatal("Can't do {n,m} with n > m");
582 if (regcode != &regdummy) {
583#ifdef REGALIGN
584 *(unsigned short *)(ret+3) = iter;
585 *(unsigned short *)(ret+5) = tmp;
586#else
587 ret[3] = iter >> 8; ret[4] = iter & 0377;
588 ret[5] = tmp >> 8; ret[6] = tmp & 0377;
589#endif
590 }
591 regparse = next;
592 goto nest_check;
593 }
594 regsawbracket++; /* remember we clobbered exp */
595 if (iter > 0) {
596 ch = *max;
597 sprintf(regparse,"%.*d", max-regparse, iter - 1);
598 *max = ch;
599 if (*max == ',' && max[1] != '}') {
600 if (atoi(max+1) <= 0)
601 fatal("Can't do {n,m} with n > m");
602 ch = *next;
603 sprintf(max+1,"%.*d", next-(max+1), atoi(max+1) - 1);
604 *next = ch;
605 }
606 if (iter != 1 || *max == ',') {
607 regparse = origparse; /* back up input pointer */
608 regnpar = orignpar; /* don't make more parens */
609 }
610 else {
611 regparse = next;
612 goto nest_check;
613 }
614 *flagp = flags;
615 return ret;
616 }
617 if (*max == ',') {
618 max++;
619 iter = atoi(max);
620 if (max == next) { /* any number more? */
621 regparse = next;
622 op = '*'; /* fake up one with a star */
623 }
624 else if (iter > 0) {
625 op = '?'; /* fake up optional atom */
626 ch = *next;
627 sprintf(max,"%.*d", next-max, iter - 1);
628 *next = ch;
629 if (iter == 1)
630 regparse = next;
631 else {
632 regparse = origparse - 1; /* offset ++ below */
633 regnpar = orignpar;
634 }
635 }
636 else
637 fatal("Can't do {n,0}");
638 }
639 else
640 fatal("Can't do {0}");
641 }
642 }
643
644 if (!ISMULT1(op)) {
645 *flagp = flags;
646 return(ret);
647 }
648
649 if (!(flags&HASWIDTH) && op != '?')
650 FAIL("regexp *+ operand could be empty");
651 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
652
653 if (op == '*' && (flags&SIMPLE))
654 reginsert(STAR, ret);
655 else if (op == '*') {
656 /* Emit x* as (x&|), where & means "self". */
657 reginsert(BRANCH, ret); /* Either x */
658 regoptail(ret, regnode(BACK)); /* and loop */
659 regoptail(ret, ret); /* back */
660 regtail(ret, regnode(BRANCH)); /* or */
661 regtail(ret, regnode(NOTHING)); /* null. */
662 } else if (op == '+' && (flags&SIMPLE))
663 reginsert(PLUS, ret);
664 else if (op == '+') {
665 /* Emit x+ as x(&|), where & means "self". */
666 next = regnode(BRANCH); /* Either */
667 regtail(ret, next);
668 regtail(regnode(BACK), ret); /* loop back */
669 regtail(next, regnode(BRANCH)); /* or */
670 regtail(ret, regnode(NOTHING)); /* null. */
671 } else if (op == '?') {
672 /* Emit x? as (x|) */
673 reginsert(BRANCH, ret); /* Either x */
674 regtail(ret, regnode(BRANCH)); /* or */
675 next = regnode(NOTHING); /* null. */
676 regtail(ret, next);
677 regoptail(ret, next);
678 }
679 nest_check:
680 regparse++;
681 if (ISMULT2(regparse))
682 FAIL("nested *?+ in regexp");
683
684 return(ret);
685}
686
687/*
688 - regatom - the lowest level
689 *
690 * Optimization: gobbles an entire sequence of ordinary characters so that
691 * it can turn them into a single node, which is smaller to store and
692 * faster to run. Backslashed characters are exceptions, each becoming a
693 * separate node; the code is simpler that way and it's not worth fixing.
694 *
695 * [Yes, it is worth fixing, some scripts can run twice the speed.]
696 */
697static char *
698regatom(flagp)
699int *flagp;
700{
701 register char *ret;
702 int flags;
703
704 *flagp = WORST; /* Tentatively. */
705
706 switch (*regparse++) {
707 case '^':
708 ret = regnode(BOL);
709 break;
710 case '$':
711 ret = regnode(EOL);
712 break;
713 case '.':
714 ret = regnode(ANY);
715 *flagp |= HASWIDTH|SIMPLE;
716 break;
717 case '[':
718 ret = regclass();
719 *flagp |= HASWIDTH|SIMPLE;
720 break;
721 case '(':
722 ret = reg(1, &flags);
723 if (ret == NULL)
724 return(NULL);
725 *flagp |= flags&(HASWIDTH|SPSTART);
726 break;
727 case '|':
728 case ')':
729 FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
730 break;
731 case '?':
732 case '+':
733 case '*':
734 FAIL("?+* follows nothing in regexp");
735 break;
736 case '\\':
737 switch (*regparse) {
738 case 'w':
739 ret = regnode(ALNUM);
740 *flagp |= HASWIDTH|SIMPLE;
741 regparse++;
742 break;
743 case 'W':
744 ret = regnode(NALNUM);
745 *flagp |= HASWIDTH|SIMPLE;
746 regparse++;
747 break;
748 case 'b':
749 ret = regnode(BOUND);
750 *flagp |= SIMPLE;
751 regparse++;
752 break;
753 case 'B':
754 ret = regnode(NBOUND);
755 *flagp |= SIMPLE;
756 regparse++;
757 break;
758 case 's':
759 ret = regnode(SPACE);
760 *flagp |= HASWIDTH|SIMPLE;
761 regparse++;
762 break;
763 case 'S':
764 ret = regnode(NSPACE);
765 *flagp |= HASWIDTH|SIMPLE;
766 regparse++;
767 break;
768 case 'd':
769 ret = regnode(DIGIT);
770 *flagp |= HASWIDTH|SIMPLE;
771 regparse++;
772 break;
773 case 'D':
774 ret = regnode(NDIGIT);
775 *flagp |= HASWIDTH|SIMPLE;
776 regparse++;
777 break;
778 case 'n':
779 case 'r':
780 case 't':
781 case 'f':
782 case 'e':
783 case 'a':
784 case 'x':
785 case 'c':
786 case '0':
787 goto defchar;
788 case '1': case '2': case '3': case '4':
789 case '5': case '6': case '7': case '8': case '9':
790 {
791 int num = atoi(regparse);
792
793 if (num > 9 && num >= regnpar)
794 goto defchar;
795 else {
796 regsawback = 1;
797 ret = reganode(REF, num);
798 while (isDIGIT(*regparse))
799 regparse++;
800 *flagp |= SIMPLE;
801 }
802 }
803 break;
804 case '\0':
805 if (regparse >= regxend)
806 FAIL("trailing \\ in regexp");
807 /* FALL THROUGH */
808 default:
809 goto defchar;
810 }
811 break;
812 default: {
813 register int len;
814 register char ender;
815 register char *p;
816 char *oldp;
817 int numlen;
818
819 defchar:
820 ret = regnode(EXACTLY);
821 regc(0); /* save spot for len */
822 for (len=0, p=regparse-1;
823 len < 127 && p < regxend;
824 len++)
825 {
826 oldp = p;
827 switch (*p) {
828 case '^':
829 case '$':
830 case '.':
831 case '[':
832 case '(':
833 case ')':
834 case '|':
835 goto loopdone;
836 case '\\':
837 switch (*++p) {
838 case 'w':
839 case 'W':
840 case 'b':
841 case 'B':
842 case 's':
843 case 'S':
844 case 'd':
845 case 'D':
846 --p;
847 goto loopdone;
848 case 'n':
849 ender = '\n';
850 p++;
851 break;
852 case 'r':
853 ender = '\r';
854 p++;
855 break;
856 case 't':
857 ender = '\t';
858 p++;
859 break;
860 case 'f':
861 ender = '\f';
862 p++;
863 break;
864 case 'e':
865 ender = '\033';
866 p++;
867 break;
868 case 'a':
869 ender = '\007';
870 p++;
871 break;
872 case 'x':
873 ender = scanhex(++p, 2, &numlen);
874 p += numlen;
875 break;
876 case 'c':
877 p++;
878 ender = *p++;
879 if (isLOWER(ender))
880 ender = toupper(ender);
881 ender ^= 64;
882 break;
883 case '0': case '1': case '2': case '3':case '4':
884 case '5': case '6': case '7': case '8':case '9':
885 if (*p == '0' ||
886 (isDIGIT(p[1]) && atoi(p) >= regnpar) ) {
887 ender = scanoct(p, 3, &numlen);
888 p += numlen;
889 }
890 else {
891 --p;
892 goto loopdone;
893 }
894 break;
895 case '\0':
896 if (p >= regxend)
897 FAIL("trailing \\ in regexp");
898 /* FALL THROUGH */
899 default:
900 ender = *p++;
901 break;
902 }
903 break;
904 default:
905 ender = *p++;
906 break;
907 }
908 if (regfold && isUPPER(ender))
909 ender = tolower(ender);
910 if (ISMULT2(p)) { /* Back off on ?+*. */
911 if (len)
912 p = oldp;
913 else {
914 len++;
915 regc(ender);
916 }
917 break;
918 }
919 regc(ender);
920 }
921 loopdone:
922 regparse = p;
923 if (len <= 0)
924 FAIL("internal disaster in regexp");
925 *flagp |= HASWIDTH;
926 if (len == 1)
927 *flagp |= SIMPLE;
928 if (regcode != &regdummy)
929 *OPERAND(ret) = len;
930 regc('\0');
931 }
932 break;
933 }
934
935 return(ret);
936}
937
938static void
939regset(bits,def,c)
940char *bits;
941int def;
942register int c;
943{
944 if (regcode == &regdummy)
945 return;
946 c &= 255;
947 if (def)
948 bits[c >> 3] &= ~(1 << (c & 7));
949 else
950 bits[c >> 3] |= (1 << (c & 7));
951}
952
953static char *
954regclass()
955{
956 register char *bits;
957 register int class;
958 register int lastclass;
959 register int range = 0;
960 register char *ret;
961 register int def;
962 int numlen;
963
964 ret = regnode(ANYOF);
965 if (*regparse == '^') { /* Complement of range. */
966 regparse++;
967 def = 0;
968 } else {
969 def = 255;
970 }
971 bits = regcode;
972 for (class = 0; class < 32; class++)
973 regc(def);
974 if (*regparse == ']' || *regparse == '-')
975 goto skipcond; /* allow 1st char to be ] or - */
976 while (regparse < regxend && *regparse != ']') {
977 skipcond:
978 class = UCHARAT(regparse++);
979 if (class == '\\') {
980 class = UCHARAT(regparse++);
981 switch (class) {
982 case 'w':
983 for (class = 0; class < 256; class++)
984 if (isALNUM(class))
985 regset(bits,def,class);
986 lastclass = 1234;
987 continue;
988 case 'W':
989 for (class = 0; class < 256; class++)
990 if (!isALNUM(class))
991 regset(bits,def,class);
992 lastclass = 1234;
993 continue;
994 case 's':
995 for (class = 0; class < 256; class++)
996 if (isSPACE(class))
997 regset(bits,def,class);
998 lastclass = 1234;
999 continue;
1000 case 'S':
1001 for (class = 0; class < 256; class++)
1002 if (!isSPACE(class))
1003 regset(bits,def,class);
1004 lastclass = 1234;
1005 continue;
1006 case 'd':
1007 for (class = '0'; class <= '9'; class++)
1008 regset(bits,def,class);
1009 lastclass = 1234;
1010 continue;
1011 case 'D':
1012 for (class = 0; class < '0'; class++)
1013 regset(bits,def,class);
1014 for (class = '9' + 1; class < 256; class++)
1015 regset(bits,def,class);
1016 lastclass = 1234;
1017 continue;
1018 case 'n':
1019 class = '\n';
1020 break;
1021 case 'r':
1022 class = '\r';
1023 break;
1024 case 't':
1025 class = '\t';
1026 break;
1027 case 'f':
1028 class = '\f';
1029 break;
1030 case 'b':
1031 class = '\b';
1032 break;
1033 case 'e':
1034 class = '\033';
1035 break;
1036 case 'a':
1037 class = '\007';
1038 break;
1039 case 'x':
1040 class = scanhex(regparse, 2, &numlen);
1041 regparse += numlen;
1042 break;
1043 case 'c':
1044 class = *regparse++;
1045 if (isLOWER(class))
1046 class = toupper(class);
1047 class ^= 64;
1048 break;
1049 case '0': case '1': case '2': case '3': case '4':
1050 case '5': case '6': case '7': case '8': case '9':
1051 class = scanoct(--regparse, 3, &numlen);
1052 regparse += numlen;
1053 break;
1054 }
1055 }
1056 if (range) {
1057 if (lastclass > class)
1058 FAIL("invalid [] range in regexp");
1059 range = 0;
1060 }
1061 else {
1062 lastclass = class;
1063 if (*regparse == '-' && regparse+1 < regxend &&
1064 regparse[1] != ']') {
1065 regparse++;
1066 range = 1;
1067 continue; /* do it next time */
1068 }
1069 }
1070 for ( ; lastclass <= class; lastclass++) {
1071 regset(bits,def,lastclass);
1072 if (regfold && isUPPER(lastclass))
1073 regset(bits,def,tolower(lastclass));
1074 }
1075 lastclass = class;
1076 }
1077 if (*regparse != ']')
1078 FAIL("unmatched [] in regexp");
1079 regparse++;
1080 return ret;
1081}
1082
1083/*
1084 - regnode - emit a node
1085 */
1086static char * /* Location. */
1087regnode(op)
1088char op;
1089{
1090 register char *ret;
1091 register char *ptr;
1092
1093 ret = regcode;
1094 if (ret == &regdummy) {
1095#ifdef REGALIGN
1096 if (!(regsize & 1))
1097 regsize++;
1098#endif
1099 regsize += 3;
1100 return(ret);
1101 }
1102
1103#ifdef REGALIGN
1104#ifndef lint
1105 if (!((long)ret & 1))
1106 *ret++ = 127;
1107#endif
1108#endif
1109 ptr = ret;
1110 *ptr++ = op;
1111 *ptr++ = '\0'; /* Null "next" pointer. */
1112 *ptr++ = '\0';
1113 regcode = ptr;
1114
1115 return(ret);
1116}
1117
1118/*
1119 - reganode - emit a node with an argument
1120 */
1121static char * /* Location. */
1122reganode(op, arg)
1123char op;
1124unsigned short arg;
1125{
1126 register char *ret;
1127 register char *ptr;
1128
1129 ret = regcode;
1130 if (ret == &regdummy) {
1131#ifdef REGALIGN
1132 if (!(regsize & 1))
1133 regsize++;
1134#endif
1135 regsize += 5;
1136 return(ret);
1137 }
1138
1139#ifdef REGALIGN
1140#ifndef lint
1141 if (!((long)ret & 1))
1142 *ret++ = 127;
1143#endif
1144#endif
1145 ptr = ret;
1146 *ptr++ = op;
1147 *ptr++ = '\0'; /* Null "next" pointer. */
1148 *ptr++ = '\0';
1149#ifdef REGALIGN
1150 *(unsigned short *)(ret+3) = arg;
1151#else
1152 ret[3] = arg >> 8; ret[4] = arg & 0377;
1153#endif
1154 ptr += 2;
1155 regcode = ptr;
1156
1157 return(ret);
1158}
1159
1160/*
1161 - regc - emit (if appropriate) a byte of code
1162 */
1163static void
1164regc(b)
1165char b;
1166{
1167 if (regcode != &regdummy)
1168 *regcode++ = b;
1169 else
1170 regsize++;
1171}
1172
1173/*
1174 - reginsert - insert an operator in front of already-emitted operand
1175 *
1176 * Means relocating the operand.
1177 */
1178static void
1179reginsert(op, opnd)
1180char op;
1181char *opnd;
1182{
1183 register char *src;
1184 register char *dst;
1185 register char *place;
1186 register offset = (op == CURLY ? 4 : 0);
1187
1188 if (regcode == &regdummy) {
1189#ifdef REGALIGN
1190 regsize += 4 + offset;
1191#else
1192 regsize += 3 + offset;
1193#endif
1194 return;
1195 }
1196
1197 src = regcode;
1198#ifdef REGALIGN
1199 regcode += 4 + offset;
1200#else
1201 regcode += 3 + offset;
1202#endif
1203 dst = regcode;
1204 while (src > opnd)
1205 *--dst = *--src;
1206
1207 place = opnd; /* Op node, where operand used to be. */
1208 *place++ = op;
1209 *place++ = '\0';
1210 *place++ = '\0';
1211 while (offset-- > 0)
1212 *place++ = '\0';
1213#ifdef REGALIGN
1214 *place++ = '\177';
1215#endif
1216}
1217
1218/*
1219 - regtail - set the next-pointer at the end of a node chain
1220 */
1221static void
1222regtail(p, val)
1223char *p;
1224char *val;
1225{
1226 register char *scan;
1227 register char *temp;
1228 register int offset;
1229
1230 if (p == &regdummy)
1231 return;
1232
1233 /* Find last node. */
1234 scan = p;
1235 for (;;) {
1236 temp = regnext(scan);
1237 if (temp == NULL)
1238 break;
1239 scan = temp;
1240 }
1241
1242#ifdef REGALIGN
1243 offset = val - scan;
1244#ifndef lint
1245 *(short*)(scan+1) = offset;
1246#else
1247 offset = offset;
1248#endif
1249#else
1250 if (OP(scan) == BACK)
1251 offset = scan - val;
1252 else
1253 offset = val - scan;
1254 *(scan+1) = (offset>>8)&0377;
1255 *(scan+2) = offset&0377;
1256#endif
1257}
1258
1259/*
1260 - regoptail - regtail on operand of first argument; nop if operandless
1261 */
1262static void
1263regoptail(p, val)
1264char *p;
1265char *val;
1266{
1267 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1268 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1269 return;
1270 regtail(NEXTOPER(p), val);
1271}
1272
1273/*
1274 - regcurly - a little FSA that accepts {\d+,?\d*}
1275 */
1276STATIC int
1277regcurly(s)
1278register char *s;
1279{
1280 if (*s++ != '{')
1281 return FALSE;
1282 if (!isDIGIT(*s))
1283 return FALSE;
1284 while (isDIGIT(*s))
1285 s++;
1286 if (*s == ',')
1287 s++;
1288 while (isDIGIT(*s))
1289 s++;
1290 if (*s != '}')
1291 return FALSE;
1292 return TRUE;
1293}
1294
1295#ifdef DEBUGGING
1296
1297/*
1298 - regdump - dump a regexp onto stderr in vaguely comprehensible form
1299 */
1300void
1301regdump(r)
1302regexp *r;
1303{
1304 register char *s;
1305 register char op = EXACTLY; /* Arbitrary non-END op. */
1306 register char *next;
1307
1308
1309 s = r->program + 1;
1310 while (op != END) { /* While that wasn't END last time... */
1311#ifdef REGALIGN
1312 if (!((long)s & 1))
1313 s++;
1314#endif
1315 op = OP(s);
1316 fprintf(stderr,"%2d%s", s-r->program, regprop(s)); /* Where, what. */
1317 next = regnext(s);
1318 s += regarglen[op];
1319 if (next == NULL) /* Next ptr. */
1320 fprintf(stderr,"(0)");
1321 else
1322 fprintf(stderr,"(%d)", (s-r->program)+(next-s));
1323 s += 3;
1324 if (op == ANYOF) {
1325 s += 32;
1326 }
1327 if (op == EXACTLY) {
1328 /* Literal string, where present. */
1329 s++;
1330 while (*s != '\0') {
1331 (void)putchar(*s);
1332 s++;
1333 }
1334 s++;
1335 }
1336 (void)putchar('\n');
1337 }
1338
1339 /* Header fields of interest. */
1340 if (r->regstart)
1341 fprintf(stderr,"start `%s' ", r->regstart->str_ptr);
1342 if (r->regstclass)
1343 fprintf(stderr,"stclass `%s' ", regprop(r->regstclass));
1344 if (r->reganch & ROPT_ANCH)
1345 fprintf(stderr,"anchored ");
1346 if (r->reganch & ROPT_SKIP)
1347 fprintf(stderr,"plus ");
1348 if (r->reganch & ROPT_IMPLICIT)
1349 fprintf(stderr,"implicit ");
1350 if (r->regmust != NULL)
1351 fprintf(stderr,"must have \"%s\" back %d ", r->regmust->str_ptr,
1352 r->regback);
1353 fprintf(stderr, "minlen %d ", r->minlen);
1354 fprintf(stderr,"\n");
1355}
1356
1357/*
1358 - regprop - printable representation of opcode
1359 */
1360char *
1361regprop(op)
1362char *op;
1363{
1364 register char *p;
1365
1366 (void) strcpy(buf, ":");
1367
1368 switch (OP(op)) {
1369 case BOL:
1370 p = "BOL";
1371 break;
1372 case EOL:
1373 p = "EOL";
1374 break;
1375 case ANY:
1376 p = "ANY";
1377 break;
1378 case ANYOF:
1379 p = "ANYOF";
1380 break;
1381 case BRANCH:
1382 p = "BRANCH";
1383 break;
1384 case EXACTLY:
1385 p = "EXACTLY";
1386 break;
1387 case NOTHING:
1388 p = "NOTHING";
1389 break;
1390 case BACK:
1391 p = "BACK";
1392 break;
1393 case END:
1394 p = "END";
1395 break;
1396 case ALNUM:
1397 p = "ALNUM";
1398 break;
1399 case NALNUM:
1400 p = "NALNUM";
1401 break;
1402 case BOUND:
1403 p = "BOUND";
1404 break;
1405 case NBOUND:
1406 p = "NBOUND";
1407 break;
1408 case SPACE:
1409 p = "SPACE";
1410 break;
1411 case NSPACE:
1412 p = "NSPACE";
1413 break;
1414 case DIGIT:
1415 p = "DIGIT";
1416 break;
1417 case NDIGIT:
1418 p = "NDIGIT";
1419 break;
1420 case CURLY:
1421 (void)sprintf(buf+strlen(buf), "CURLY {%d,%d}",
1422 ARG1(op),ARG2(op));
1423 p = NULL;
1424 break;
1425 case REF:
1426 (void)sprintf(buf+strlen(buf), "REF%d", ARG1(op));
1427 p = NULL;
1428 break;
1429 case OPEN:
1430 (void)sprintf(buf+strlen(buf), "OPEN%d", ARG1(op));
1431 p = NULL;
1432 break;
1433 case CLOSE:
1434 (void)sprintf(buf+strlen(buf), "CLOSE%d", ARG1(op));
1435 p = NULL;
1436 break;
1437 case STAR:
1438 p = "STAR";
1439 break;
1440 case PLUS:
1441 p = "PLUS";
1442 break;
1443 default:
1444 FAIL("corrupted regexp opcode");
1445 }
1446 if (p != NULL)
1447 (void) strcat(buf, p);
1448 return(buf);
1449}
1450#endif /* DEBUGGING */
1451
1452void
1453regfree(r)
1454struct regexp *r;
1455{
1456 if (r->precomp) {
1457 Safefree(r->precomp);
1458 r->precomp = Nullch;
1459 }
1460 if (r->subbase) {
1461 Safefree(r->subbase);
1462 r->subbase = Nullch;
1463 }
1464 if (r->regmust) {
1465 str_free(r->regmust);
1466 r->regmust = Nullstr;
1467 }
1468 if (r->regstart) {
1469 str_free(r->regstart);
1470 r->regstart = Nullstr;
1471 }
1472 Safefree(r->startp);
1473 Safefree(r->endp);
1474 Safefree(r);
1475}