/* NOTE: this is derived from Henry Spencer's regexp code, and should not
* confused with the original package (see point 3 below). Thanks, Henry!
/* Additional note: this code is very heavily munged from Henry's version
* in places. In some spots I've traded clarity for efficiency, so don't
* blame Henry for some of the lack of readability.
/* $RCSfile: regcomp.c,v $$Revision: 4.0.1.5 $$Date: 92/06/08 15:23:36 $
* Revision 4.0.1.5 92/06/08 15:23:36 lwall
* patch20: Perl now distinguishes overlapped copies from non-overlapped
* patch20: /^stuff/ wrongly assumed an implicit $* == 1
* patch20: /x{0}/ was wrongly interpreted as /x{0,}/
* patch20: added \W, \S and \D inside /[...]/
* Revision 4.0.1.4 91/11/05 22:55:14 lwall
* Revision 4.0.1.3 91/11/05 18:22:28 lwall
* patch11: minimum match length calculation in regexp is now cumulative
* patch11: initial .* in pattern had dependency on value of $*
* patch11: certain patterns made use of garbage pointers from uncleared memory
* patch11: prepared for ctype implementations that don't define isascii()
* Revision 4.0.1.2 91/06/07 11:48:24 lwall
* patch4: new copyright notice
* patch4: /(x+) \1/ incorrectly optimized to not match "xxx xx"
* patch4: // wouldn't use previous pattern if it started with a null character
* Revision 4.0.1.1 91/04/12 09:04:45 lwall
* patch1: random cleanup in cpp namespace
* Revision 4.0 91/03/20 01:39:01 lwall
* regcomp and regexec -- regsub and regerror are not used in perl
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
**** Alterations to Henry's code are...
**** Copyright (c) 1991, Larry Wall
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
/* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
# pragma optimize("a",off)
/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
# pragma optimize("w",on )
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
((*s) == '{' && regcurly(s)))
#define PERL_META "^$.[()|?+*\\"
#define META "^$.[()|?+*\\"
#undef SPSTART /* dratted cpp namespace... */
* Flags to be passed up and down.
#define HASWIDTH 01 /* Known never to match null string. */
#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
#define SPSTART 04 /* Starts with * or +. */
#define WORST 0 /* Worst case. */
* Global work variables for regcomp().
static char *regprecomp
; /* uncompiled string. */
static char *regparse
; /* Input-scan pointer. */
static char *regxend
; /* End of input for compile */
static int regnpar
; /* () count. */
static char *regcode
; /* Code-emit pointer; ®dummy = don't. */
static long regsize
; /* Code size. */
static int regsawbracket
; /* Did we do {d,d} trick? */
static int regsawback
; /* Did we see \1, ...? */
* Forward declarations for regcomp()'s friends.
STATIC
char *regbranch();
- regcomp - compile a regular expression into internal code
* We can't allocate space until we know how big the compiled form will be,
* but we can't compile it (and thus know how big it is) until we've got a
* place to put the code. So we cheat: we compile it twice, once with code
* generation turned off and size counting turned on, and once "for real".
* This also means that we don't allocate space until we are sure that the
* thing really will compile successfully, and we never have to move the
* code and thus invalidate pointers into it. (Note that it has to be in
* one piece because free() must be able to free it all.) [NB: not true in perl]
* Beware that the optimization-preparation code in here knows about some
* of the structure of the compiled regexp. [I'll say.]
fatal("NULL regexp argument");
/* First pass: determine size, legality. */
regprecomp
= nsavestr(exp
,xend
-exp
);
if (reg(0, &flags
) == NULL
) {
/* Small enough for pointer-storage convention? */
if (regsize
>= 32767L) /* Probably could be 65535L. */
Newc(1001, r
, sizeof(regexp
) + (unsigned)regsize
, char, regexp
);
FAIL("regexp out of space");
/* Second pass: emit code. */
Copy(regprecomp
,exp
,xend
-exp
,char);
r
->subbeg
= r
->subbase
= NULL
;
if (reg(0, &flags
) == NULL
)
/* Dig out information for optimizations. */
r
->regstart
= Nullstr
; /* Worst-case defaults. */
scan
= r
->program
+1; /* First BRANCH. */
if (OP(regnext(scan
)) == END
) {/* Only one top-level choice. */
while ((OP(first
) == OPEN
&& (sawopen
= 1)) ||
(OP(first
) == BRANCH
&& OP(regnext(first
)) != BRANCH
) ||
(OP(first
) == CURLY
&& ARG1(first
) > 0) ) {
first
+= regarglen
[OP(first
)];
/* Starting-point info. */
if (OP(first
) == EXACTLY
) {
str_make(OPERAND(first
)+1,*OPERAND(first
));
if (r
->regstart
->str_cur
> !(sawstudy
|fold
))
fbmcompile(r
->regstart
,fold
);
else if ((exp
= index(simple
,OP(first
))) && exp
> simple
)
else if (OP(first
) == BOUND
|| OP(first
) == NBOUND
)
else if (OP(first
) == BOL
) {
else if ((OP(first
) == STAR
&& OP(NEXTOPER(first
)) == ANY
) &&
!(r
->reganch
& ROPT_ANCH
) ) {
/* turn .* into ^.* with an implied $*=1 */
r
->reganch
= ROPT_ANCH
| ROPT_IMPLICIT
;
if (sawplus
&& (!sawopen
|| !regsawback
))
r
->reganch
|= ROPT_SKIP
; /* x+ must match 1st of run */
fprintf(stderr
,"first %d next %d offset %d\n",
OP(first
), OP(NEXTOPER(first
)), first
- scan
);
* If there's something expensive in the r.e., find the
* longest literal string that must appear and make it the
* regmust. Resolve ties in favor of later strings, since
* the regstart check works with the beginning of the r.e.
* and avoiding duplication strengthens checking. Not a
* strong reason, but sufficient in the absence of others.
* [Now we resolve ties in favor of the earlier string if
* it happens that curback has been invalidated, since the
* earlier string may buy us something the later one won't.]
longish
= str_make("",0);
longest
= str_make("",0);
while (OP(scan
) != END
) {
if (OP(scan
) == BRANCH
) {
if (OP(regnext(scan
)) == BRANCH
) {
while (OP(scan
) == BRANCH
)
else /* single branch is ok */
if (OP(scan
) == EXACTLY
) {
while (OP(t
= regnext(scan
)) == CLOSE
)
minlen
+= *OPERAND(first
);
if (curback
- backish
== len
) {
str_ncat(longish
, OPERAND(first
)+1,
curback
+= *OPERAND(first
);
else if (*OPERAND(first
) >= len
+ (curback
>= 0)) {
str_nset(longish
, OPERAND(first
)+1,len
);
curback
+= *OPERAND(first
);
else if (index(varies
,OP(scan
))) {
if (longish
->str_cur
> longest
->str_cur
) {
str_sset(longest
,longish
);
index(simple
,OP(NEXTOPER(scan
))))
else if (OP(scan
) == CURLY
&&
index(simple
,OP(NEXTOPER(scan
)+4)))
else if (index(simple
,OP(scan
))) {
if (longish
->str_cur
> longest
->str_cur
) {
str_sset(longest
,longish
);
/* Prefer earlier on tie, unless we can tail match latter */
if (longish
->str_cur
+ (OP(first
) == EOL
) > longest
->str_cur
) {
str_sset(longest
,longish
);
!fbminstr((unsigned char*) r
->regstart
->str_ptr
,
(unsigned char *) r
->regstart
->str_ptr
> !(sawstudy
|| fold
|| OP(first
) == EOL
) )
fbmcompile(r
->regmust
,fold
);
r
->regmust
->str_u
.str_useful
= 100;
if (OP(first
) == EOL
&& longish
->str_cur
)
r
->regmust
->str_pok
|= SP_TAIL
;
r
->nparens
= regnpar
- 1;
Newz(1002, r
->startp
, regnpar
, char*);
Newz(1002, r
->endp
, regnpar
, char*);
- reg - regular expression, i.e. main body or parenthesized thing
* Caller must absorb opening parenthesis.
* Combining parenthesis handling with the base level of regular expression
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
int paren
; /* Parenthesized? */
*flagp
= HASWIDTH
; /* Tentatively. */
/* Make an OPEN node, if parenthesized. */
ret
= reganode(OPEN
, parno
);
/* Pick up the branches, linking them together. */
regtail(ret
, br
); /* OPEN -> first. */
while (*regparse
== '|') {
regtail(ret
, br
); /* BRANCH -> BRANCH. */
/* Make a closing node, and hook it on the end. */
ender
= reganode(CLOSE
, parno
);
/* Hook the tails of the branches to the closing node. */
for (br
= ret
; br
!= NULL
; br
= regnext(br
))
/* Check for proper termination. */
if (paren
&& *regparse
++ != ')') {
FAIL("unmatched () in regexp");
} else if (!paren
&& regparse
< regxend
) {
FAIL("unmatched () in regexp");
FAIL("junk on end of regexp"); /* "Can't happen". */
- regbranch - one alternative of an | operator
* Implements the concatenation operator.
*flagp
= WORST
; /* Tentatively. */
while (regparse
< regxend
&& *regparse
!= '|' && *regparse
!= ')') {
latest
= regpiece(&flags
);
*flagp
|= flags
&HASWIDTH
;
if (chain
== NULL
) /* First piece. */
if (chain
== NULL
) /* Loop ran zero times. */
- regpiece - something followed by possible [*+?]
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
char *origparse
= regparse
;
/* Here's a total kludge: if after the atom there's a {\d+,?\d*}
* then we decrement the first number by one and reset our
* parsing back to the beginning of the same atom. If the first number
* is down to 0, decrement the second number instead and fake up
* a ? after it. Given the way this compiler doesn't keep track
* of offsets on the first pass, this is the only way to replicate
if (op
== '{' && regcurly(regparse
)) {
while (isDIGIT(*next
) || *next
== ',') {
if (*next
== '}') { /* got one */
if (flags
&SIMPLE
) { /* we can do it right after all */
*flagp
= (WORST
|HASWIDTH
);
tmp
= 32767; /* meaning "infinity" */
fatal("Can't do {n,m} with n > m");
if (regcode
!= ®dummy
) {
*(unsigned short *)(ret
+3) = iter
;
*(unsigned short *)(ret
+5) = tmp
;
ret
[3] = iter
>> 8; ret
[4] = iter
& 0377;
ret
[5] = tmp
>> 8; ret
[6] = tmp
& 0377;
regsawbracket
++; /* remember we clobbered exp */
sprintf(regparse
,"%.*d", max
-regparse
, iter
- 1);
if (*max
== ',' && max
[1] != '}') {
fatal("Can't do {n,m} with n > m");
sprintf(max
+1,"%.*d", next
-(max
+1), atoi(max
+1) - 1);
if (iter
!= 1 || *max
== ',') {
regparse
= origparse
; /* back up input pointer */
regnpar
= orignpar
; /* don't make more parens */
if (max
== next
) { /* any number more? */
op
= '*'; /* fake up one with a star */
op
= '?'; /* fake up optional atom */
sprintf(max
,"%.*d", next
-max
, iter
- 1);
regparse
= origparse
- 1; /* offset ++ below */
if (!(flags
&HASWIDTH
) && op
!= '?')
FAIL("regexp *+ operand could be empty");
*flagp
= (op
!= '+') ? (WORST
|SPSTART
) : (WORST
|HASWIDTH
);
if (op
== '*' && (flags
&SIMPLE
))
/* Emit x* as (x&|), where & means "self". */
reginsert(BRANCH
, ret
); /* Either x */
regoptail(ret
, regnode(BACK
)); /* and loop */
regoptail(ret
, ret
); /* back */
regtail(ret
, regnode(BRANCH
)); /* or */
regtail(ret
, regnode(NOTHING
)); /* null. */
} else if (op
== '+' && (flags
&SIMPLE
))
/* Emit x+ as x(&|), where & means "self". */
next
= regnode(BRANCH
); /* Either */
regtail(regnode(BACK
), ret
); /* loop back */
regtail(next
, regnode(BRANCH
)); /* or */
regtail(ret
, regnode(NOTHING
)); /* null. */
reginsert(BRANCH
, ret
); /* Either x */
regtail(ret
, regnode(BRANCH
)); /* or */
next
= regnode(NOTHING
); /* null. */
FAIL("nested *?+ in regexp");
- regatom - the lowest level
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
* [Yes, it is worth fixing, some scripts can run twice the speed.]
*flagp
= WORST
; /* Tentatively. */
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= flags
&(HASWIDTH
|SPSTART
);
FAIL("internal urp in regexp"); /* Supposed to be caught earlier. */
FAIL("?+* follows nothing in regexp");
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
*flagp
|= HASWIDTH
|SIMPLE
;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
int num
= atoi(regparse
);
if (num
> 9 && num
>= regnpar
)
ret
= reganode(REF
, num
);
while (isDIGIT(*regparse
))
FAIL("trailing \\ in regexp");
regc(0); /* save spot for len */
for (len
=0, p
=regparse
-1;
len
< 127 && p
< regxend
;
ender
= scanhex(++p
, 2, &numlen
);
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
(isDIGIT(p
[1]) && atoi(p
) >= regnpar
) ) {
ender
= scanoct(p
, 3, &numlen
);
FAIL("trailing \\ in regexp");
if (regfold
&& isUPPER(ender
))
if (ISMULT2(p
)) { /* Back off on ?+*. */
FAIL("internal disaster in regexp");
if (regcode
!= ®dummy
)
if (regcode
== ®dummy
)
bits
[c
>> 3] &= ~(1 << (c
& 7));
bits
[c
>> 3] |= (1 << (c
& 7));
if (*regparse
== '^') { /* Complement of range. */
for (class = 0; class < 32; class++)
if (*regparse
== ']' || *regparse
== '-')
goto skipcond
; /* allow 1st char to be ] or - */
while (regparse
< regxend
&& *regparse
!= ']') {
class = UCHARAT(regparse
++);
class = UCHARAT(regparse
++);
for (class = 0; class < 256; class++)
for (class = 0; class < 256; class++)
for (class = 0; class < 256; class++)
for (class = 0; class < 256; class++)
for (class = '0'; class <= '9'; class++)
for (class = 0; class < '0'; class++)
for (class = '9' + 1; class < 256; class++)
class = scanhex(regparse
, 2, &numlen
);
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
class = scanoct(--regparse
, 3, &numlen
);
FAIL("invalid [] range in regexp");
if (*regparse
== '-' && regparse
+1 < regxend
&&
continue; /* do it next time */
for ( ; lastclass
<= class; lastclass
++) {
regset(bits
,def
,lastclass
);
if (regfold
&& isUPPER(lastclass
))
regset(bits
,def
,tolower(lastclass
));
FAIL("unmatched [] in regexp");
static char * /* Location. */
*ptr
++ = '\0'; /* Null "next" pointer. */
- reganode - emit a node with an argument
static char * /* Location. */
*ptr
++ = '\0'; /* Null "next" pointer. */
*(unsigned short *)(ret
+3) = arg
;
ret
[3] = arg
>> 8; ret
[4] = arg
& 0377;
- regc - emit (if appropriate) a byte of code
if (regcode
!= ®dummy
)
- reginsert - insert an operator in front of already-emitted operand
* Means relocating the operand.
register offset
= (op
== CURLY
? 4 : 0);
if (regcode
== ®dummy
) {
place
= opnd
; /* Op node, where operand used to be. */
- regtail - set the next-pointer at the end of a node chain
*(short*)(scan
+1) = offset
;
*(scan
+1) = (offset
>>8)&0377;
- regoptail - regtail on operand of first argument; nop if operandless
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
if (p
== NULL
|| p
== ®dummy
|| OP(p
) != BRANCH
)
regtail(NEXTOPER(p
), val
);
- regcurly - a little FSA that accepts {\d+,?\d*}
- regdump - dump a regexp onto stderr in vaguely comprehensible form
register char op
= EXACTLY
; /* Arbitrary non-END op. */
while (op
!= END
) { /* While that wasn't END last time... */
fprintf(stderr
,"%2d%s", s
-r
->program
, regprop(s
)); /* Where, what. */
if (next
== NULL
) /* Next ptr. */
fprintf(stderr
,"(%d)", (s
-r
->program
)+(next
-s
));
/* Literal string, where present. */
/* Header fields of interest. */
fprintf(stderr
,"start `%s' ", r
->regstart
->str_ptr
);
fprintf(stderr
,"stclass `%s' ", regprop(r
->regstclass
));
if (r
->reganch
& ROPT_ANCH
)
fprintf(stderr
,"anchored ");
if (r
->reganch
& ROPT_SKIP
)
if (r
->reganch
& ROPT_IMPLICIT
)
fprintf(stderr
,"implicit ");
fprintf(stderr
,"must have \"%s\" back %d ", r
->regmust
->str_ptr
,
fprintf(stderr
, "minlen %d ", r
->minlen
);
- regprop - printable representation of opcode
(void)sprintf(buf
+strlen(buf
), "CURLY {%d,%d}",
(void)sprintf(buf
+strlen(buf
), "REF%d", ARG1(op
));
(void)sprintf(buf
+strlen(buf
), "OPEN%d", ARG1(op
));
(void)sprintf(buf
+strlen(buf
), "CLOSE%d", ARG1(op
));
FAIL("corrupted regexp opcode");