BSD 4_3 development
[unix-history] / usr / contrib / rn / search.c
/* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $
*
* $Log: search.c,v $
* Revision 4.3 85/05/01 11:50:16 lwall
* Baseline for release with 4.3bsd.
*
*/
/* string search routines */
/* Copyright (c) 1981,1980 James Gosling */
/* Modified Aug. 12, 1981 by Tom London to include regular expressions
as in ed. RE stuff hacked over by jag to correct a few major problems,
mainly dealing with searching within the buffer rather than copying
each line to a separate array. Newlines can now appear in RE's */
/* Ripped to shreds and glued back together to make a search package,
* July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
* Changes include:
* Buffer, window, and mlisp stuff gone.
* Translation tables reduced to 1 table.
* Expression buffer is now dynamically allocated.
* Character classes now implemented with a bitmap.
*/
#include "EXTERN.h"
#include "common.h"
#include "util.h"
#include "INTERN.h"
#include "search.h"
#ifndef BITSPERBYTE
#define BITSPERBYTE 8
#endif
#define BMAPSIZ (127 / BITSPERBYTE + 1)
/* meta characters in the "compiled" form of a regular expression */
#define CBRA 2 /* \( -- begin bracket */
#define CCHR 4 /* a vanilla character */
#define CDOT 6 /* . -- match anything except a newline */
#define CCL 8 /* [...] -- character class */
#define NCCL 10 /* [^...] -- negated character class */
#define CDOL 12 /* $ -- matches the end of a line */
#define CEND 14 /* The end of the pattern */
#define CKET 16 /* \) -- close bracket */
#define CBACK 18 /* \N -- backreference to the Nth bracketed
string */
#define CIRC 20 /* ^ matches the beginning of a line */
#define WORD 32 /* matches word character \w */
#define NWORD 34 /* matches non-word characer \W */
#define WBOUND 36 /* matches word boundary \b */
#define NWBOUND 38 /* matches non-(word boundary) \B */
#define STAR 01 /* * -- Kleene star, repeats the previous
REas many times as possible; the value
ORs with the other operator types */
#define ASCSIZ 0200
typedef char TRANSTABLE[ASCSIZ];
static TRANSTABLE trans = {
0000,0001,0002,0003,0004,0005,0006,0007,
0010,0011,0012,0013,0014,0015,0016,0017,
0020,0021,0022,0023,0024,0025,0026,0027,
0030,0031,0032,0033,0034,0035,0036,0037,
0040,0041,0042,0043,0044,0045,0046,0047,
0050,0051,0052,0053,0054,0055,0056,0057,
0060,0061,0062,0063,0064,0065,0066,0067,
0070,0071,0072,0073,0074,0075,0076,0077,
0100,0101,0102,0103,0104,0105,0106,0107,
0110,0111,0112,0113,0114,0115,0116,0117,
0120,0121,0122,0123,0124,0125,0126,0127,
0130,0131,0132,0133,0134,0135,0136,0137,
0140,0141,0142,0143,0144,0145,0146,0147,
0150,0151,0152,0153,0154,0155,0156,0157,
0160,0161,0162,0163,0164,0165,0166,0167,
0170,0171,0172,0173,0174,0175,0176,0177,
};
static bool folding = FALSE;
static int err;
static char *FirstCharacter;
void
search_init()
{
#ifdef UNDEF
register int i;
for (i = 0; i < ASCSIZ; i++)
trans[i] = i;
#else
;
#endif
}
void
init_compex(compex)
register COMPEX *compex;
{
/* the following must start off zeroed */
compex->eblen = 0;
compex->brastr = Nullch;
}
void
free_compex(compex)
register COMPEX *compex;
{
if (compex->eblen) {
free(compex->expbuf);
compex->eblen = 0;
}
if (compex->brastr) {
free(compex->brastr);
compex->brastr = Nullch;
}
}
static char *gbr_str = Nullch;
static int gbr_siz = 0;
char *
getbracket(compex,n)
register COMPEX *compex;
int n;
{
int length = compex->braelist[n] - compex->braslist[n];
if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0)
return nullstr;
growstr(&gbr_str, &gbr_siz, length+1);
safecpy(gbr_str, compex->braslist[n], length+1);
return gbr_str;
}
void
case_fold(which)
int which;
{
register int i;
if (which != folding) {
if (which) {
for (i = 'A'; i <= 'Z'; i++)
trans[i] = tolower(i);
}
else {
for (i = 'A'; i <= 'Z'; i++)
trans[i] = i;
}
folding = which;
}
}
/* Compile the given regular expression into a [secret] internal format */
char *
compile (compex, strp, RE, fold)
register COMPEX *compex;
register char *strp;
int RE;
int fold;
{
register int c;
register char *ep;
char *lastep;
char bracket[NBRA],
*bracketp;
char **alt = compex->alternatives;
char *retmes = "Badly formed search string";
case_fold(compex->do_folding = fold);
if (!compex->eblen) {
compex->expbuf = safemalloc(84);
compex->eblen = 80;
}
ep = compex->expbuf; /* point at expression buffer */
*alt++ = ep; /* first alternative starts here */
bracketp = bracket; /* first bracket goes here */
if (*strp == 0) { /* nothing to compile? */
if (*ep == 0) /* nothing there yet? */
return "Null search string";
return Nullch; /* just keep old expression */
}
compex->nbra = 0; /* no brackets yet */
lastep = 0;
for (;;) {
if (ep - compex->expbuf >= compex->eblen)
grow_eb(compex);
c = *strp++; /* fetch next char of pattern */
if (c == 0) { /* end of pattern? */
if (bracketp != bracket) { /* balanced brackets? */
#ifdef VERBOSE
retmes = "Unbalanced parens";
#endif
goto cerror;
}
*ep++ = CEND; /* terminate expression */
*alt++ = 0; /* terminal alternative list */
/*
compex->eblen = ep - compex->expbuf + 1;
compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */
return Nullch; /* return success */
}
if (c != '*')
lastep = ep;
if (!RE) { /* just a normal search string? */
*ep++ = CCHR; /* everything is a normal char */
*ep++ = c;
}
else /* it is a regular expression */
switch (c) {
case '\\': /* meta something */
switch (c = *strp++) {
case '(':
if (compex->nbra >= NBRA) {
#ifdef VERBOSE
retmes = "Too many parens";
#endif
goto cerror;
}
*bracketp++ = ++compex->nbra;
*ep++ = CBRA;
*ep++ = compex->nbra;
break;
case '|':
if (bracketp>bracket) {
#ifdef VERBOSE
retmes = "No \\| in parens"; /* Alas! */
#endif
goto cerror;
}
*ep++ = CEND;
*alt++ = ep;
break;
case ')':
if (bracketp <= bracket) {
#ifdef VERBOSE
retmes = "Unmatched right paren";
#endif
goto cerror;
}
*ep++ = CKET;
*ep++ = *--bracketp;
break;
case 'w':
*ep++ = WORD;
break;
case 'W':
*ep++ = NWORD;
break;
case 'b':
*ep++ = WBOUND;
break;
case 'B':
*ep++ = NWBOUND;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
*ep++ = CBACK;
*ep++ = c - '0';
break;
default:
*ep++ = CCHR;
if (c == '\0')
goto cerror;
*ep++ = c;
break;
}
break;
case '.':
*ep++ = CDOT;
continue;
case '*':
if (lastep == 0 || *lastep == CBRA || *lastep == CKET
|| *lastep == CIRC
|| (*lastep&STAR)|| *lastep>NWORD)
goto defchar;
*lastep |= STAR;
continue;
case '^':
if (ep != compex->expbuf && ep[-1] != CEND)
goto defchar;
*ep++ = CIRC;
continue;
case '$':
if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
goto defchar;
*ep++ = CDOL;
continue;
case '[': { /* character class */
register int i;
if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
grow_eb(compex); /* reserve bitmap */
for (i = BMAPSIZ; i; --i)
ep[i] = 0;
if ((c = *strp++) == '^') {
c = *strp++;
*ep++ = NCCL; /* negated */
}
else
*ep++ = CCL; /* normal */
i = 0; /* remember oldchar */
do {
if (c == '\0') {
#ifdef VERBOSE
retmes = "Missing ]";
#endif
goto cerror;
}
if (*strp == '-' && *(++strp))
i = *strp++;
else
i = c;
while (c <= i) {
ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
if (fold && isalpha(c))
ep[(c ^ 32) / BITSPERBYTE] |=
1 << ((c ^ 32) % BITSPERBYTE);
/* set the other bit too */
c++;
}
} while ((c = *strp++) != ']');
ep += BMAPSIZ;
continue;
}
defchar:
default:
*ep++ = CCHR;
*ep++ = c;
}
}
cerror:
compex->expbuf[0] = 0;
compex->nbra = 0;
return retmes;
}
void
grow_eb(compex)
register COMPEX *compex;
{
compex->eblen += 80;
compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
}
char *
execute (compex, addr)
register COMPEX *compex;
char *addr;
{
register char *p1 = addr;
register char *trt = trans;
register int c;
if (addr == Nullch)
return Nullch;
if (compex->nbra) { /* any brackets? */
for (c = 0; c <= compex->nbra; c++)
compex->braslist[c] = compex->braelist[c] = Nullch;
if (compex->brastr)
free(compex->brastr);
compex->brastr = savestr(p1); /* in case p1 is not static */
p1 = compex->brastr; /* ! */
}
case_fold(compex->do_folding); /* make sure table is correct */
FirstCharacter = p1; /* for ^ tests */
if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
c = trt[compex->expbuf[1]]; /* fast check for first character */
do {
if (trt[*p1] == c && advance (compex, p1, compex->expbuf))
return p1;
p1++;
} while (*p1 && !err);
return Nullch;
}
else { /* regular algorithm */
do {
register char **alt = compex->alternatives;
while (*alt) {
if (advance (compex, p1, *alt++))
return p1;
}
p1++;
} while (*p1 && !err);
return Nullch;
}
}
/* advance the match of the regular expression starting at ep along the
string lp, simulates an NDFSA */
bool
advance (compex, lp, ep)
register COMPEX *compex;
register char *ep;
register char *lp;
{
register char *curlp;
register char *trt = trans;
register int i;
while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
switch (*ep++) {
case CCHR:
if (trt[*ep++] != trt[*lp]) return FALSE;
lp++;
continue;
case CDOT:
if (*lp == '\n') return FALSE;
lp++;
continue;
case CDOL:
if (!*lp || *lp == '\n')
continue;
return FALSE;
case CIRC:
if (lp == FirstCharacter || lp[-1]=='\n')
continue;
return FALSE;
case WORD:
if (isalnum(*lp)) {
lp++;
continue;
}
return FALSE;
case NWORD:
if (!isalnum(*lp)) {
lp++;
continue;
}
return FALSE;
case WBOUND:
if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
(!*lp || !isalnum(*lp)) )
continue;
return FALSE;
case NWBOUND:
if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
(!*lp || !isalnum(*lp)))
continue;
return FALSE;
case CEND:
return TRUE;
case CCL:
if (cclass (ep, *lp, 1)) {
ep += BMAPSIZ;
lp++;
continue;
}
return FALSE;
case NCCL:
if (cclass (ep, *lp, 0)) {
ep += BMAPSIZ;
lp++;
continue;
}
return FALSE;
case CBRA:
compex->braslist[*ep++] = lp;
continue;
case CKET:
i = *ep++;
compex->braelist[i] = lp;
compex->braelist[0] = lp;
compex->braslist[0] = compex->braslist[i];
continue;
case CBACK:
if (compex->braelist[i = *ep++] == 0) {
fputs("bad braces\n",stdout) FLUSH;
err = TRUE;
return FALSE;
}
if (backref (compex, i, lp)) {
lp += compex->braelist[i] - compex->braslist[i];
continue;
}
return FALSE;
case CBACK | STAR:
if (compex->braelist[i = *ep++] == 0) {
fputs("bad braces\n",stdout) FLUSH;
err = TRUE;
return FALSE;
}
curlp = lp;
while (backref (compex, i, lp)) {
lp += compex->braelist[i] - compex->braslist[i];
}
while (lp >= curlp) {
if (advance (compex, lp, ep))
return TRUE;
lp -= compex->braelist[i] - compex->braslist[i];
}
continue;
case CDOT | STAR:
curlp = lp;
while (*lp++ && lp[-1] != '\n');
goto star;
case WORD | STAR:
curlp = lp;
while (*lp++ && isalnum(lp[-1]));
goto star;
case NWORD | STAR:
curlp = lp;
while (*lp++ && !isalnum(lp[-1]));
goto star;
case CCHR | STAR:
curlp = lp;
while (*lp++ && trt[lp[-1]] == trt[*ep]);
ep++;
goto star;
case CCL | STAR:
case NCCL | STAR:
curlp = lp;
while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR)));
ep += BMAPSIZ;
goto star;
star:
do {
lp--;
if (advance (compex, lp, ep))
return TRUE;
} while (lp > curlp);
return FALSE;
default:
fputs("Badly compiled pattern\n",stdout) FLUSH;
err = TRUE;
return -1;
}
if (*ep == CEND || *ep == CDOL) {
return TRUE;
}
return FALSE;
}
bool
backref (compex, i, lp)
register COMPEX *compex;
register int i;
register char *lp;
{
register char *bp;
bp = compex->braslist[i];
while (*lp && *bp == *lp) {
bp++;
lp++;
if (bp >= compex->braelist[i])
return TRUE;
}
return FALSE;
}
bool
cclass (set, c, af)
register char *set;
register int c;
{
c &= 0177;
#if BITSPERBYTE == 8
if (set[c >> 3] & 1 << (c & 7))
#else
if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
#endif
return af;
return !af;
}