/* This file contains the code that compiles regular expressions and executes
* them. It supports the same syntax and features as vi's regular expression
* code. Specifically, the meta characters are:
* ^ matches the beginning of a line
* $ matches the end of a line
* \< matches the beginning of a word
* \> matches the end of a word
* . matches any single character
* [] matches any character in a character class
* \( delimits the start of a subexpression
* \) delimits the end of a subexpression
* * repeats the preceding 0 or more times
* NOTE: You cannot follow a \) with a *.
* The physical structure of a compiled RE is as follows:
* - First, there is a one-byte value that says how many character classes
* are used in this regular expression
* - Next, each character class is stored as a bitmap that is 256 bits
* - A mixture of literal characters and compiled meta characters follows.
* This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
* are stored as a \n followed by a one-byte code, so they take up two
* bytes apiece. Literal characters take up one byte apiece. \n can't
* be used as a literal character.
* If NO_MAGIC is defined, then a different set of functions is used instead.
* That right, this file contains TWO versions of the code.
extern int patlock
; /* from cmd_substitute() module */
static regex_t
*previous
= NULL
; /* the previous regexp, used when null regexp is given */
if (!previous
) regerr("RE error: no previous pattern");
} else if (previous
&& !patlock
)
else if ((previous
= (regex_t
*) malloc(sizeof(regex_t
))) == NULL
) {
regerr("RE error: out of memory");
if (n
= regcomp(previous
, *o_magic
? s
: neuter(s
),
*o_ignorecase
? REG_ICASE
: 0)) {
regerr("RE error: %d", n
);
/* escape BRE meta-characters in a string */
if ((hd
= t
= (char *) malloc(n
+ n
+ 1)) == NULL
)
if (*s
== '.' || *s
== '\\' || *s
== '[' || *s
== '*')
static char *previous
; /* the previous regexp, used when null regexp is given */
/* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */
/* These are used to classify or recognize meta-characters */
#define BASE_META(m) ((m) - 256)
#define INT_META(c) ((c) + 256)
#define IS_META(m) ((m) >= 256)
#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE)
#define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
#define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
/* These are the internal codes used for each type of meta-character */
#define M_BEGLINE 256 /* internal code for ^ */
#define M_ENDLINE 257 /* internal code for $ */
#define M_BEGWORD 258 /* internal code for \< */
#define M_ENDWORD 259 /* internal code for \> */
#define M_ANY 260 /* internal code for . */
#define M_SPLAT 261 /* internal code for * */
#define M_PLUS 262 /* internal code for \+ */
#define M_QMARK 263 /* internal code for \? */
#define M_RANGE 264 /* internal code for \{ */
#define M_CLASS(n) (265+(n)) /* internal code for [] */
#define M_START(n) (275+(n)) /* internal code for \( */
#define M_END(n) (285+(n)) /* internal code for \) */
/* These are used during compilation */
static int class_cnt
; /* used to assign class IDs */
static int start_cnt
; /* used to assign start IDs */
static int end_stk
[NSUBEXP
];/* used to assign end IDs */
static char *retext
; /* points to the text being compiled */
/* error-handling stuff */
#define FAIL(why) regerr(why); longjmp(errorhandler, 1)
/* This function builds a bitmap for a particular class */
static char *makeclass(text
, bmap
)
REG
char *text
; /* start of the class */
REG
char *bmap
; /* the bitmap */
for (i
= 0; bmap
&& i
< 32; i
++)
/* see if we're going to complement this class */
/* add in the characters */
while (*text
&& *text
!= ']')
/* is this a span of characters? */
if (text
[1] == '-' && text
[2])
/* spans can't be backwards */
FAIL("Backwards span in []");
/* add each character in the span to the bitmap */
for (i
= UCHAR(text
[0]); bmap
&& (unsigned)i
<= UCHAR(text
[2]); i
++)
bmap
[i
>> 3] |= (1 << (i
& 7));
/* move past this span */
/* add this single character to the span */
bmap
[UCHAR(i
) >> 3] |= (1 << (UCHAR(i
) & 7));
/* make sure the closing ] is missing */
/* if we're supposed to complement this class, then do so */
/* This function gets the next character or meta character from a string.
* The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
* a bitmap is generated via makeclass() (if re is given), and the
* character-class text is skipped.
static int gettoken(sptr
, re
)
if (start_cnt
>= NSUBEXP
)
end_stk
[end_sp
++] = start_cnt
;
return M_START(start_cnt
++);
return M_END(end_stk
[--end_sp
]);
return (*o_magic
? c
: M_SPLAT
);
return (*o_magic
? c
: M_ANY
);
/* make sure we don't have too many classes */
/* process the character list for this class */
/* generate the bitmap for this class */
*sptr
= makeclass(*sptr
, re
->program
+ 1 + 32 * class_cnt
);
/* skip to end of the class */
*sptr
= makeclass(*sptr
, (char *)0);
return M_CLASS(class_cnt
++);
else /* unquoted nomagic */
/* This function calculates the number of bytes that will be needed for a
* compiled RE. Its argument is the uncompiled version. It is not clever
* about catching syntax errors; that is done in a later pass.
static unsigned calcsize(text
)
while ((token
= gettoken(&text
, (regexp
*)0)) != 0)
else if (token
== M_RANGE
)
while ((token
= gettoken(&text
, (regexp
*)0)) != 0
/* This function compiles a regexp. */
/* prepare for error handling */
if (setjmp(errorhandler
))
/* if an empty regexp string was given, use the previous one */
else /* non-empty regexp given, so remember it */
previous
= (char *)malloc((unsigned)(strlen(exp
) + 1));
size
= calced
+ sizeof(regexp
);
size
= calcsize(exp
) + sizeof(regexp
) + 10; /* !!! 10 bytes for slop */
re
= (regexp
*)malloc((unsigned)size
);
FAIL("Not enough memory for this RE");
build
= &re
->program
[1 + 32 * class_cnt
];
re
->program
[0] = class_cnt
;
for (token
= 0; token
< NSUBEXP
; token
++)
re
->startp
[token
] = re
->endp
[token
] = (char *)0;
for (token
= M_START(0), peek
= gettoken(&exp
, re
);
token
= peek
, peek
= gettoken(&exp
, re
))
/* special processing for the closure operator */
/* detect misuse of closure operator */
FAIL("Closure operator follows nothing");
else if (IS_META(token
) && token
!= M_ANY
&& !IS_CLASS(token
))
FAIL("Closure operators can only follow a normal character or . or []");
/* if \{ \} then read the range */
for (digit
= gettoken(&exp
, re
);
!IS_META(digit
) && isdigit(digit
);
digit
= gettoken(&exp
, re
))
from
= from
* 10 + digit
- '0';
for (digit
= gettoken(&exp
, re
);
!IS_META(digit
) && isdigit(digit
);
digit
= gettoken(&exp
, re
))
to
= to
* 10 + digit
- '0';
FAIL("Bad characters after \\{");
else if (to
< from
|| to
== 0 || from
>= 255)
FAIL("Invalid range for \\{ \\}");
/* it is okay -- make it prefix instead of postfix */
*build
++ = (to
< 255 ? to
: 255);
/* take care of "needfirst" - is this the first char? */
if (needfirst
&& peek
== M_PLUS
&& !IS_META(token
))
/* we used "peek" -- need to refill it */
peek
= gettoken(&exp
, re
);
FAIL("* or \\+ or \\? doubled up");
else if (!IS_META(token
))
/* normal char is NOT argument of closure */
else if (token
== M_ANY
|| IS_CLASS(token
))
/* . or [] is NOT argument of closure */
/* the "token" character is not closure -- process it normally */
/* set the BOL flag instead of storing M_BEGLINE */
/* end it with a \) which MUST MATCH the opening \( */
ADD_META(build
, M_END(0));
if ((int)(build
- re
->program
) != calced
)
msg("regcomp error: calced=%d, actual=%d", calced
, (int)(build
- re
->program
));
/*---------------------------------------------------------------------------*/
/* This function checks for a match between a character and a token which is
* known to represent a single character. It returns 0 if they match, or
int match1(re
, ch
, token
)
/* the end of a line can't match any RE of width 1 */
else if (IS_CLASS(token
))
if (re
->program
[1 + 32 * (token
- M_CLASS(0)) + (UCHAR(ch
) >> 3)] & (1 << (UCHAR(ch
) & 7)))
else if (ch
== token
|| *o_ignorecase
&& tolower(ch
) == tolower(token
))
/* This function checks characters up to and including the next closure, at
* which point it does a recursive call to check the rest of it. This function
* returns 0 if everything matches, or 1 if something doesn't match.
int match(re
, str
, prog
, here
)
regexp
*re
; /* the regular expression */
char *str
; /* the string */
REG
char *prog
; /* a portion of re->program, an compiled RE */
REG
char *here
; /* a portion of str, the string to compare it to */
REG
int token
; /* the roken pointed to by prog */
REG
int nmatched
;/* counter, used during closure matching */
REG
int closure
;/* the token denoting the type of closure */
int from
; /* minimum number of matches in closure */
int to
; /* maximum number of matches in closure */
for (token
= GET_META(prog
); !IS_CLOSURE(token
); prog
++, token
= GET_META(prog
))
/*case M_BEGLINE: can't happen; re->bol is used instead */
(here
[-1] == '_' || isalnum(here
[-1])))
if (here
[0] == '_' || isalnum(here
[0]))
re
->startp
[token
- M_START(0)] = (char *)here
;
re
->endp
[token
- M_END(0)] = (char *)here
;
default: /* literal, M_CLASS(n), or M_ANY */
if (match1(re
, *here
, token
) != 0)
/* step 1: see what we have to match against, and move "prog" to point
* to the remainder of the compiled RE.
to
= strlen(str
); /* infinity */
to
= strlen(str
); /* infinity */
to
= strlen(str
); /* infinity */
/* step 2: see how many times we can match that token against the string */
nmatched
< to
&& *here
&& match1(re
, *here
, token
) == 0;
/* step 3: try to match the remainder, and back off if it doesn't */
while (nmatched
>= from
&& match(re
, str
, prog
, here
) != 0)
/* so how did it work out? */
/* This function searches through a string for text that matches an RE. */
int regexec(re
, str
, bol
)
regexp
*re
; /* the compiled regexp to search for */
char *str
; /* the string to search through */
int bol
; /* boolean: does str start at the beginning of a line? */
char *prog
; /* the entry point of re->program */
int len
; /* length of the string */
/* if must start at the beginning of a line, and this isn't, then fail */
prog
= re
->program
+ 1 + 32 * re
->program
[0];
/* search for the RE in the string */
&& match1(re
, *(char *)str
, re
->first
))/* wrong first letter? */
|| len
< re
->minlen
/* not long enough? */
|| match(re
, (char *)str
, prog
, str
)) /* doesn't match? */
return 0; /* THEN FAIL! */
/* can occur anywhere in the line, noignorecase */
(re
->first
&& re
->first
!= *here
)
|| match(re
, (char *)str
, prog
, here
);
/* can occur anywhere in the line, ignorecase */
(re
->first
&& match1(re
, *here
, (int)re
->first
))
|| match(re
, (char *)str
, prog
, here
);
/* if we didn't fail, then we must have succeeded */
/*============================================================================*/
/* allocate a big enough regexp structure */
re
= (regexp
*)malloc((unsigned)(strlen(exp
) + 1 + sizeof(struct regexp
)));
regerr("Could not malloc a regexp structure");
/* initialize all fields of the structure */
for (i
= 0; i
< NSUBEXP
; i
++)
re
->startp
[i
] = re
->endp
[i
] = (char *)0;
/* copy the string into it, translating ^ and $ as needed */
for (src
= exp
, dest
= re
->program
+ 1; *src
; src
++)
regerr("extra \\ at end of regular expression");
/* This "helper" function checks for a match at a given location. It returns
* 1 if it matches, 0 if it doesn't match here but might match later on in the
* string, or -1 if it could not possibly match
static int reghelp(prog
, string
, bolflag
)
/* if ^, then require bolflag */
if ((prog
->bol
& 1) && !bolflag
)
/* if it matches, then it will start here */
prog
->startp
[0] = string
;
/* compare, possibly ignoring case */
for (scan
= &prog
->program
[1]; *scan
; scan
++, string
++)
if (tolower(*scan
) != tolower(*string
))
for (scan
= &prog
->program
[1]; *scan
; scan
++, string
++)
/* if $, then require string to end here, too */
if ((prog
->bol
& 2) && *string
)
/* if we get to here, it matches */
int regexec(prog
, string
, bolflag
)
/* keep trying to match it */
for (rc
= reghelp(prog
, string
, bolflag
); rc
== 0; rc
= reghelp(prog
, string
, 0))