usr/src/lib/libcompat/4.3/regex.c

/*-
 * Copyright (c) 1980, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 */

#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)regex.c     8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */

#include <unistd.h>

static int advance();
static int backref();
static int cclass();

/*
 * routines to do regular expression matching
 *
 * Entry points:
 *
 *      re_comp(s)
 *              char *s;
 *       ... returns 0 if the string s was compiled successfully,
 *                   a pointer to an error message otherwise.
 *           If passed 0 or a null string returns without changing
 *           the currently compiled re (see note 11 below).
 *
 *      re_exec(s)
 *              char *s;
 *       ... returns 1 if the string s matches the last compiled regular
 *                     expression,
 *                   0 if the string s failed to match the last compiled
 *                     regular expression, and
 *                  -1 if the compiled regular expression was invalid
 *                     (indicating an internal error).
 *
 * The strings passed to both re_comp and re_exec may have trailing or
 * embedded newline characters; they are terminated by nulls.
 *
 * The identity of the author of these routines is lost in antiquity;
 * this is essentially the same as the re code in the original V6 ed.
 *
 * The regular expressions recognized are described below. This description
 * is essentially the same as that for ed.
 *
 *      A regular expression specifies a set of strings of characters.
 *      A member of this set of strings is said to be matched by
 *      the regular expression.  In the following specification for
 *      regular expressions the word `character' means any character but NUL.
 *
 *      1.  Any character except a special character matches itself.
 *          Special characters are the regular expression delimiter plus
 *          \ [ . and sometimes ^ * $.
 *      2.  A . matches any character.
 *      3.  A \ followed by any character except a digit or ( )
 *          matches that character.
 *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
 *          character in (or not in) s. In s, \ has no special meaning,
 *          and ] may only appear as the first letter. A substring
 *          a-b, with a and b in ascending ASCII order, stands for
 *          the inclusive range of ASCII characters.
 *      5.  A regular expression of form 1-4 followed by * matches a
 *          sequence of 0 or more matches of the regular expression.
 *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
 *          matches what x matches.
 *      7.  A \ followed by a digit n matches a copy of the string that the
 *          bracketed regular expression beginning with the nth \( matched.
 *      8.  A regular expression of form 1-8, x, followed by a regular
 *          expression of form 1-7, y matches a match for x followed by
 *          a match for y, with the x match being as long as possible
 *          while still permitting a y match.
 *      9.  A regular expression of form 1-8 preceded by ^ (or followed
 *          by $), is constrained to matches that begin at the left
 *          (or end at the right) end of a line.
 *      10. A regular expression of form 1-9 picks out the longest among
 *          the leftmost matches in a line.
 *      11. An empty regular expression stands for a copy of the last
 *          regular expression encountered.
 */

/*
 * constants for re's
 */
#define CBRA    1
#define CCHR    2
#define CDOT    4
#define CCL     6
#define NCCL    8
#define CDOL    10
#define CEOF    11
#define CKET    12
#define CBACK   18

#define CSTAR   01

#define ESIZE   512
#define NBRA    9

static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
static  char    circf;

/*
 * compile the regular expression argument into a dfa
 */
char *
re_comp(sp)
        register const char     *sp;
{
        register int    c;
        register char   *ep = expbuf;
        int     cclcnt, numbra = 0;
        char    *lastep = 0;
        char    bracket[NBRA];
        char    *bracketp = &bracket[0];
        static  char    *retoolong = "Regular expression too long";

#define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }

        if (sp == 0 || *sp == '\0') {
                if (*ep == 0)
                        return("No previous regular expression");
                return(0);
        }
        if (*sp == '^') {
                circf = 1;
                sp++;
        }
        else
                circf = 0;
        for (;;) {
                if (ep >= &expbuf[ESIZE])
                        comerr(retoolong);
                if ((c = *sp++) == '\0') {
                        if (bracketp != bracket)
                                comerr("unmatched \\(");
                        *ep++ = CEOF;
                        *ep++ = 0;
                        return(0);
                }
                if (c != '*')
                        lastep = ep;
                switch (c) {

                case '.':
                        *ep++ = CDOT;
                        continue;

                case '*':
                        if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
                                goto defchar;
                        *lastep |= CSTAR;
                        continue;

                case '$':
                        if (*sp != '\0')
                                goto defchar;
                        *ep++ = CDOL;
                        continue;

                case '[':
                        *ep++ = CCL;
                        *ep++ = 0;
                        cclcnt = 1;
                        if ((c = *sp++) == '^') {
                                c = *sp++;
                                ep[-2] = NCCL;
                        }
                        do {
                                if (c == '\0')
                                        comerr("missing ]");
                                if (c == '-' && ep [-1] != 0) {
                                        if ((c = *sp++) == ']') {
                                                *ep++ = '-';
                                                cclcnt++;
                                                break;
                                        }
                                        while (ep[-1] < c) {
                                                *ep = ep[-1] + 1;
                                                ep++;
                                                cclcnt++;
                                                if (ep >= &expbuf[ESIZE])
                                                        comerr(retoolong);
                                        }
                                }
                                *ep++ = c;
                                cclcnt++;
                                if (ep >= &expbuf[ESIZE])
                                        comerr(retoolong);
                        } while ((c = *sp++) != ']');
                        lastep[1] = cclcnt;
                        continue;

                case '\\':
                        if ((c = *sp++) == '(') {
                                if (numbra >= NBRA)
                                        comerr("too many \\(\\) pairs");
                                *bracketp++ = numbra;
                                *ep++ = CBRA;
                                *ep++ = numbra++;
                                continue;
                        }
                        if (c == ')') {
                                if (bracketp <= bracket)
                                        comerr("unmatched \\)");
                                *ep++ = CKET;
                                *ep++ = *--bracketp;
                                continue;
                        }
                        if (c >= '1' && c < ('1' + NBRA)) {
                                *ep++ = CBACK;
                                *ep++ = c - '1';
                                continue;
                        }
                        *ep++ = CCHR;
                        *ep++ = c;
                        continue;

                defchar:
                default:
                        *ep++ = CCHR;
                        *ep++ = c;
                }
        }
}

/*
 * match the argument string against the compiled re
 */
int
re_exec(p1)
        register const char     *p1;
{
        register char   *p2 = expbuf;
        register int    c;
        int     rv;

        for (c = 0; c < NBRA; c++) {
                braslist[c] = 0;
                braelist[c] = 0;
        }
        if (circf)
                return((advance(p1, p2)));
        /*
         * fast check for first character
         */
        if (*p2 == CCHR) {
                c = p2[1];
                do {
                        if (*p1 != c)
                                continue;
                        if (rv = advance(p1, p2))
                                return(rv);
                } while (*p1++);
                return(0);
        }
        /*
         * regular algorithm
         */
        do
                if (rv = advance(p1, p2))
                        return(rv);
        while (*p1++);
        return(0);
}

/*
 * try to match the next thing in the dfa
 */
static  int
advance(lp, ep)
        register char   *lp, *ep;
{
        register char   *curlp;
        int     ct, i;
        int     rv;

        for (;;)
                switch (*ep++) {

                case CCHR:
                        if (*ep++ == *lp++)
                                continue;
                        return(0);

                case CDOT:
                        if (*lp++)
                                continue;
                        return(0);

                case CDOL:
                        if (*lp == '\0')
                                continue;
                        return(0);

                case CEOF:
                        return(1);

                case CCL:
                        if (cclass(ep, *lp++, 1)) {
                                ep += *ep;
                                continue;
                        }
                        return(0);

                case NCCL:
                        if (cclass(ep, *lp++, 0)) {
                                ep += *ep;
                                continue;
                        }
                        return(0);

                case CBRA:
                        braslist[*ep++] = lp;
                        continue;

                case CKET:
                        braelist[*ep++] = lp;
                        continue;

                case CBACK:
                        if (braelist[i = *ep++] == 0)
                                return(-1);
                        if (backref(i, lp)) {
                                lp += braelist[i] - braslist[i];
                                continue;
                        }
                        return(0);

                case CBACK|CSTAR:
                        if (braelist[i = *ep++] == 0)
                                return(-1);
                        curlp = lp;
                        ct = braelist[i] - braslist[i];
                        while (backref(i, lp))
                                lp += ct;
                        while (lp >= curlp) {
                                if (rv = advance(lp, ep))
                                        return(rv);
                                lp -= ct;
                        }
                        continue;

                case CDOT|CSTAR:
                        curlp = lp;
                        while (*lp++)
                                ;
                        goto star;

                case CCHR|CSTAR:
                        curlp = lp;
                        while (*lp++ == *ep)
                                ;
                        ep++;
                        goto star;

                case CCL|CSTAR:
                case NCCL|CSTAR:
                        curlp = lp;
                        while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
                                ;
                        ep += *ep;
                        goto star;

                star:
                        do {
                                lp--;
                                if (rv = advance(lp, ep))
                                        return(rv);
                        } while (lp > curlp);
                        return(0);

                default:
                        return(-1);
                }
}

static
backref(i, lp)
        register int    i;
        register char   *lp;
{
        register char   *bp;

        bp = braslist[i];
        while (*bp++ == *lp++)
                if (bp >= braelist[i])
                        return(1);
        return(0);
}

static int
cclass(set, c, af)
        register char   *set, c;
        int     af;
{
        register int    n;

        if (c == 0)
                return(0);
        n = *set++;
        while (--n)
                if (*set++ == c)
                        return(af);
        return(! af);
}