/* NOTE: this is derived from Henry Spencer's regexp code, and should not
* confused with the original package (see point 3 below). Thanks, Henry!
/* Additional note: this code is very heavily munged from Henry's version
* in places. In some spots I've traded clarity for efficiency, so don't
* blame Henry for some of the lack of readability.
/* $RCSfile: regexec.c,v $$Revision: 4.0.1.4 $$Date: 92/06/08 15:25:50 $
* Revision 4.0.1.4 92/06/08 15:25:50 lwall
* patch20: pattern modifiers i and g didn't interact right
* patch20: in some cases $` and $' didn't get set by match
* patch20: /x{0}/ was wrongly interpreted as /x{0,}/
* Revision 4.0.1.3 91/11/05 18:23:55 lwall
* patch11: prepared for ctype implementations that don't define isascii()
* patch11: initial .* in pattern had dependency on value of $*
* Revision 4.0.1.2 91/06/07 11:50:33 lwall
* patch4: new copyright notice
* patch4: // wouldn't use previous pattern if it started with a null character
* Revision 4.0.1.1 91/04/12 09:07:39 lwall
* patch1: regexec only allocated space for 9 subexpresssions
* Revision 4.0 91/03/20 01:39:16 lwall
* regcomp and regexec -- regsub and regerror are not used in perl
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
**** Alterations to Henry's code are...
**** Copyright (c) 1991, Larry Wall
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
* Global work variables for regexec().
static char *reginput
; /* String-input pointer. */
static char regprev
; /* char before regbol, \n if none */
static char *regbol
; /* Beginning of input, for ^ check. */
static char *regeol
; /* End of input, for $ check. */
static char **regstartp
; /* Pointer to startp array. */
static char **regendp
; /* Ditto for endp. */
static char *reglastparen
; /* Similarly for lastparen. */
static int regmyp_size
= 0;
static char **regmystartp
= Null(char**);
static char **regmyendp
= Null(char**);
- regexec - match a regexp against a string
regexec(prog
, stringarg
, strend
, strbeg
, minend
, screamer
, safebase
)
register char *strend
; /* pointer to null at end of string */
char *strbeg
; /* real beginning of string */
int minend
; /* end of match must be at least minend after stringarg */
int safebase
; /* no need to remember string in subbase */
register char *string
= stringarg
;
int minlen
= 0; /* must match at least this many chars */
int dontbother
= 0; /* how many characters not to try at end */
if (prog
== NULL
|| string
== NULL
) {
fatal("NULL regexp parameter");
if (string
== strbeg
) /* is ^ valid at stringarg? */
if (!multiline
&& regprev
== '\n')
regprev
= '\0'; /* force ^ to NOT match */
regprecomp
= prog
->precomp
;
/* Check validity of program. */
if (UCHARAT(prog
->program
) != MAGIC
) {
FAIL("corrupted regexp program");
Copy(string
, c
, i
+1, char);
for (s
= string
; s
< strend
; s
++)
/* If there is a "must appear" string, look for it. */
if (prog
->regmust
!= Nullstr
&&
(!(prog
->reganch
& ROPT_ANCH
)
|| (multiline
&& prog
->regback
>= 0)) ) {
if (stringarg
== strbeg
&& screamer
) {
if (screamfirst
[prog
->regmust
->str_rare
] >= 0)
s
= screaminstr(screamer
,prog
->regmust
);
s
= fbminstr((unsigned char*)s
, (unsigned char*)strend
,
++prog
->regmust
->str_u
.str_useful
; /* hooray */
goto phooey
; /* not present */
else if (prog
->regback
>= 0) {
minlen
= prog
->regback
+ prog
->regmust
->str_cur
;
else if (--prog
->regmust
->str_u
.str_useful
< 0) { /* boo */
prog
->regmust
= Nullstr
; /* disable regmust */
minlen
= prog
->regmust
->str_cur
;
/* Mark beginning of line for ^ . */
/* Mark end of line for $ (and such) */
/* see how far we have to get to not match where we matched before */
/* Allocate our backreference arrays */
if ( regmyp_size
< prog
->nparens
+ 1 ) {
/* Allocate or enlarge the arrays */
regmyp_size
= prog
->nparens
+ 1;
if ( regmyp_size
< 10 ) regmyp_size
= 10; /* minimum */
Renew(regmystartp
,regmyp_size
,char*);
Renew(regmyendp
, regmyp_size
,char*);
New(1102,regmystartp
,regmyp_size
,char*);
New(1102,regmyendp
, regmyp_size
,char*);
/* Simplest case: anchored match need be tried only once. */
/* [unless multiline is set] */
if (prog
->reganch
& ROPT_ANCH
) {
if (regtry(prog
, string
))
else if (multiline
|| (prog
->reganch
& ROPT_IMPLICIT
)) {
/* for multiline we only have to try after newlines */
if (s
< strend
&& regtry(prog
, s
))
/* Messy cases: unanchored match. */
if (prog
->reganch
& ROPT_SKIP
) { /* we have /x+whatever/ */
/* it must be a one character string */
i
= prog
->regstart
->str_ptr
[0];
while (s
< strend
&& *s
== i
)
else if (prog
->regstart
->str_pok
== 3) {
/* We know what string it must start with. */
while ((s
= fbminstr((unsigned char*)s
,
(unsigned char*)strend
, prog
->regstart
)) != NULL
)
c
= prog
->regstart
->str_ptr
;
while ((s
= ninstr(s
, strend
,
c
, c
+ prog
->regstart
->str_cur
)) != NULL
) {
if (c
= prog
->regstclass
) {
int doevery
= (prog
->reganch
& ROPT_SKIP
) == 0;
strend
-= dontbother
; /* don't bother with what can't match */
/* We know what class it must start with. */
if (!(c
[i
>> 3] & (1 << (i
&7)))) {
if (tmp
&& regtry(prog
, s
))
tmp
= isALNUM(regprev
); /* assume not alphanumeric */
if ((minlen
|| tmp
) && regtry(prog
,s
))
tmp
= isALNUM(regprev
); /* assume not alphanumeric */
else if (regtry(prog
, s
))
if ((minlen
|| !tmp
) && regtry(prog
,s
))
if (tmp
&& regtry(prog
, s
))
if (tmp
&& regtry(prog
, s
))
if (tmp
&& regtry(prog
, s
))
if (tmp
&& regtry(prog
, s
))
if (tmp
&& regtry(prog
, s
))
if (tmp
&& regtry(prog
, s
))
/* We don't know much -- general case. */
if ((!safebase
&& (prog
->nparens
|| sawampersand
)) || prog
->do_folding
){
strend
+= dontbother
; /* uncheat */
if (safebase
) /* no need for $digit later */
else if (strbeg
!= prog
->subbase
) {
i
= strend
- string
+ (stringarg
- strbeg
);
s
= nsavestr(strbeg
,i
); /* so $digit will work later */
prog
->subbeg
= prog
->subbase
= s
;
i
= strend
- string
+ (stringarg
- strbeg
);
prog
->subbeg
= s
= prog
->subbase
;
s
+= (stringarg
- strbeg
);
for (i
= 0; i
<= prog
->nparens
; i
++) {
prog
->startp
[i
] = s
+ (prog
->startp
[i
] - string
);
prog
->endp
[i
] = s
+ (prog
->endp
[i
] - string
);
- regtry - try match at specific point
static int /* 0 failure, 1 success */
regstartp
= prog
->startp
;
reglastparen
= &prog
->lastparen
;
for (i
= prog
->nparens
; i
>= 0; i
--) {
if (regmatch(prog
->program
+ 1) && reginput
>= regtill
) {
prog
->startp
[0] = string
;
prog
->endp
[0] = reginput
;
- regmatch - main matching routine
* Conceptually the strategy is simple: check to see whether the current
* node matches, call self recursively to see whether the rest matches,
* and then act accordingly. In practice we make some effort to avoid
* recursion, in particular by going through "ordinary" nodes (that don't
* need to know whether the rest of the match failed) by a loop instead of
/* [lwall] I've hoisted the register declarations to the outer block in order to
* maybe save a little bit of pushing and popping on the stack. It also takes
* advantage of machines that use a register save mask on subroutine entry.
static int /* 0 failure, 1 success */
register char *scan
; /* Current node. */
char *next
; /* Next node. */
register int n
; /* no or next */
register int ln
; /* len or last */
register char *s
; /* operand or save */
register char *locinput
= reginput
;
if (scan
!= NULL
&& regnarrate
)
fprintf(stderr
, "%s(\n", regprop(scan
));
fprintf(stderr
, "%s...\n", regprop(scan
));
next
= scan
+ NEXT(scan
);
if (locinput
== regbol
? regprev
== '\n' :
((nextchar
|| locinput
< regeol
) &&
if ((nextchar
|| locinput
< regeol
) && nextchar
!= '\n')
if (!multiline
&& regeol
- locinput
> 1)
if ((nextchar
== '\0' && locinput
>= regeol
) ||
/* Inline the first character, for speed. */
if (regeol
- locinput
< ln
)
if (ln
> 1 && bcmp(s
, locinput
, ln
) != 0)
nextchar
= UCHARAT(locinput
);
if (s
[nextchar
>> 3] & (1 << (nextchar
&7)))
if (!nextchar
&& locinput
>= regeol
)
if (!nextchar
&& locinput
>= regeol
)
if (locinput
== regbol
) /* was last char in word? */
ln
= isALNUM(locinput
[-1]);
n
= isALNUM(nextchar
); /* is next char in word? */
if ((ln
== n
) == (OP(scan
) == BOUND
))
if (!nextchar
&& locinput
>= regeol
)
if (!nextchar
&& locinput
>= regeol
)
n
= ARG1(scan
); /* which paren pair */
/* Inline the first character, for speed. */
if (locinput
+ ln
> regeol
)
if (ln
> 1 && bcmp(s
, locinput
, ln
) != 0)
n
= ARG1(scan
); /* which paren pair */
regmystartp
[n
] = locinput
; /* for REF */
* Don't set startp if some later
* invocation of the same parentheses
if (regstartp
[n
] == NULL
)
n
= ARG1(scan
); /* which paren pair */
regmyendp
[n
] = locinput
; /* for REF */
* Don't set endp if some later
* invocation of the same parentheses
if (regendp
[n
] == NULL
) {
if (OP(next
) != BRANCH
) /* No choice. */
next
= NEXTOPER(scan
); /* Avoid recursion. */
if (regmatch(NEXTOPER(scan
)))
} while (scan
!= NULL
&& OP(scan
) == BRANCH
);
ln
= ARG1(scan
); /* min to match */
n
= ARG2(scan
); /* max to match */
scan
= NEXTOPER(scan
) + 4;
* Lookahead to avoid useless match attempts
* when we know what character comes next.
nextchar
= *(OPERAND(next
)+1);
if (!multiline
&& OP(next
) == EOL
&& ln
< n
)
ln
= n
; /* why back off? */
/* If it could work, try it. */
if (nextchar
== -1000 || *reginput
== nextchar
)
/* Couldn't or didn't -- back up. */
reginput
= locinput
; /* put where regtry can find it */
return(1); /* Success! */
printf("%x %d\n",scan
,scan
[1]);
FAIL("regexp memory corruption");
* We get here only if there's trouble -- normally "case END" is
FAIL("corrupted regexp pointers");
- regrepeat - repeatedly match something simple, report how many
* [This routine now assumes that it will only match on things of length 1.
* That was true before, but now we assume scan - reginput is the count,
* rather than incrementing count on every character.]
register char *loceol
= regeol
;
if (max
!= 32767 && max
< loceol
- scan
)
while (scan
< loceol
&& *scan
!= '\n')
case EXACTLY
: /* length of string is 1 */
while (scan
< loceol
&& *opnd
== *scan
)
while (scan
< loceol
&& !(opnd
[c
>> 3] & (1 << (c
& 7)))) {
while (scan
< loceol
&& isALNUM(*scan
))
while (scan
< loceol
&& !isALNUM(*scan
))
while (scan
< loceol
&& isSPACE(*scan
))
while (scan
< loceol
&& !isSPACE(*scan
))
while (scan
< loceol
&& isDIGIT(*scan
))
while (scan
< loceol
&& !isDIGIT(*scan
))
default: /* Oh dear. Called inappropriately. */
FAIL("internal regexp foulup");
- regnext - dig the "next" pointer out of a node
* [Note, when REGALIGN is defined there are two places in regmatch()
* that bypass this code for speed.]