* Copyright (c) 1992 Henry Spencer.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
* This code is derived from software contributed to Berkeley by
* Henry Spencer of the University of Toronto.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* @(#)regcomp.c 8.1 (Berkeley) 6/4/93
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid
[] = "@(#)regcomp.c 8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
* parse structure, passed up and down to avoid global variables and
char *next
; /* next character in RE */
char *end
; /* end of string (-> NUL normally) */
int error
; /* has an error been seen? */
sop
*strip
; /* malloced strip */
sopno ssize
; /* malloced strip size (allocated) */
sopno slen
; /* malloced strip length (used) */
int ncsalloc
; /* number of csets allocated */
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
sopno pbegin
[NPAREN
]; /* -> ( ([0] unused) */
sopno pend
[NPAREN
]; /* -> ) ([0] unused) */
static void p_ere(/*register struct parse *p, int stop*/);
static void p_ere_exp(/*register struct parse *p*/);
static void p_str(/*register struct parse *p*/);
static void p_bre(/*register struct parse *p, register int end1, register int end2*/);
static int p_simp_re(/*register struct parse *p, int starordinary*/);
static int p_count(/*register struct parse *p*/);
static void p_bracket(/*register struct parse *p*/);
static void p_b_term(/*register struct parse *p, register cset *cs*/);
static void p_b_cclass(/*register struct parse *p, register cset *cs*/);
static void p_b_eclass(/*register struct parse *p, register cset *cs*/);
static char p_b_symbol(/*register struct parse *p*/);
static char p_b_coll_elem(/*register struct parse *p, int endc*/);
static char othercase(/*int ch*/);
static void bothcases(/*register struct parse *p, int ch*/);
static void ordinary(/*register struct parse *p, register int ch*/);
static void nonnewline(/*register struct parse *p*/);
static void repeat(/*register struct parse *p, sopno start, int from, int to*/);
static int seterr(/*register struct parse *p, int e*/);
static cset
*allocset(/*register struct parse *p*/);
static void freeset(/*register struct parse *p, register cset *cs*/);
static int freezeset(/*register struct parse *p, register cset *cs*/);
static int firstch(/*register struct parse *p, register cset *cs*/);
static int nch(/*register struct parse *p, register cset *cs*/);
static void mcadd(/*register struct parse *p, register cset *cs, register char *cp*/);
static void mcsub(/*register cset *cs, register char *cp*/);
static int mcin(/*register cset *cs, register char *cp*/);
static char *mcfind(/*register cset *cs, register char *cp*/);
static void mcinvert(/*register cset *cs*/);
static void mccase(/*register cset *cs*/);
static int isinsets(/*register struct re_guts *g, int c*/);
static int samesets(/*register struct re_guts *g, int c1, int c2*/);
static void categorize(/*struct parse *p, register struct re_guts *g*/);
static sopno
dupl(/*register struct parse *p, sopno start, sopno finish*/);
static void doemit(/*register struct parse *p, sop op, size_t opnd*/);
static void doinsert(/*register struct parse *p, sop op, size_t opnd, sopno pos*/);
static void dofwd(/*register struct parse *p, sopno pos, sop value*/);
static void enlarge(/*register struct parse *p, sopno size*/);
static void stripsnug(/*register struct parse *p, register struct re_guts *g*/);
static void findmust(/*register struct parse *p, register struct re_guts *g*/);
static sopno
pluscount(/*register struct parse *p, register struct re_guts *g*/);
static char nuls
[10]; /* place to point scanner in event of error */
* macros for use with parse structure
* BEWARE: these know that the parse structure is named `p' !!!
#define PEEK() (*p->next)
#define PEEK2() (*(p->next+1))
#define MORE() (p->next < p->end)
#define MORE2() (p->next+1 < p->end)
#define SEE(c) (MORE() && PEEK() == (c))
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define NEXT() (p->next++)
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
#define SETERROR(e) seterr(p, (e))
#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
#define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd))
#define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos)
#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
#define THERE() (p->slen - 1)
#define DROP(n) (p->slen -= (n))
static int never
= 0; /* for use in asserts; shuts lint up */
- regcomp - interface for parser and compilation
= extern int regcomp(regex_t *preg, const char *pattern, int cflags);
= #define REG_EXTENDED 0001
= #define REG_NEWLINE 0010
= #define REG_NOSPEC 0020
int /* 0 success, otherwise REG_something */
regcomp(preg
, pattern
, cflags
)
register struct re_guts
*g
;
register struct parse
*p
= &pa
;
if ((cflags
®_EXTENDED
) && (cflags
®_NOSPEC
))
if (preg
->re_endp
< pattern
)
len
= preg
->re_endp
- pattern
;
len
= strlen((char *)pattern
);
/* do the mallocs early so failure handling is easy */
g
= (struct re_guts
*)malloc(sizeof(struct re_guts
) +
p
->ssize
= len
/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
p
->strip
= (sop
*)malloc(p
->ssize
* sizeof(sop
));
p
->next
= (char *)pattern
; /* convenience; we do not modify it */
for (i
= 0; i
< NPAREN
; i
++) {
g
->ncategories
= 1; /* category 0 is "everything else" */
g
->categories
= &g
->catspace
[-(CHAR_MIN
)];
(void) memset((char *)g
->catspace
, 0, NC
*sizeof(cat_t
));
else if (cflags
®_NOSPEC
)
/* tidy up loose ends and fill things in */
g
->nplus
= pluscount(p
, g
);
/* not debugging, so can't rely on the assert() in regexec() */
/* win or lose, we're done */
if (p
->error
!= 0) /* lose */
- p_ere - ERE parser top level, concatenation and alternation
== static void p_ere(register struct parse *p, int stop);
register struct parse
*p
;
int stop
; /* character this ERE should end at */
register int first
= 1; /* is this the first alternative? */
/* do a bunch of concatenated expressions */
while (MORE() && (c
= PEEK()) != '|' && c
!= stop
)
REQUIRE(HERE() != conc
, REG_EMPTY
); /* require nonempty */
break; /* NOTE BREAK OUT */
INSERT(OCH_
, conc
); /* offset is wrong */
AHEAD(prevfwd
); /* fix previous offset */
EMIT(OOR2
, 0); /* offset is very wrong */
if (!first
) { /* tail-end fixups */
assert(!MORE() || SEE(stop
));
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
== static void p_ere_exp(register struct parse *p);
register struct parse
*p
;
assert(MORE()); /* caller should have ensured this */
REQUIRE(MORE(), REG_EPAREN
);
p
->pbegin
[subno
] = HERE();
assert(p
->pend
[subno
] != 0);
MUSTEAT(')', REG_EPAREN
);
case ')': /* happens only if no current unmatched ( */
* You may ask, why the ifndef? Because I didn't notice
* this until slightly too late for 1003.2, and none of the
* other 1003.2 regular-expression reviewers noticed it at
* all. So an unmatched ) is legal POSIX, at least until
if (p
->g
->cflags
®_NEWLINE
)
REQUIRE(MORE(), REG_EESCAPE
);
case '{': /* okay as ordinary except if digit follows */
REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT
);
/* we call { a repetition if followed by a digit */
if (!( c
== '*' || c
== '+' || c
== '?' ||
(c
== '{' && MORE2() && isdigit(PEEK2())) ))
return; /* no repetition, we're done */
REQUIRE(!wascaret
, REG_BADRPT
);
case '*': /* implemented as +? */
REQUIRE(count
<= count2
, REG_BADBR
);
} else /* single number with comma */
} else /* just a single number */
repeat(p
, pos
, count
, count2
);
if (!EAT('}')) { /* error heuristics */
while (MORE() && PEEK() != '}')
REQUIRE(MORE(), REG_EBRACE
);
if (!( c
== '*' || c
== '+' || c
== '?' ||
(c
== '{' && MORE2() && isdigit(PEEK2())) ) )
- p_str - string (no metacharacters) "parser"
== static void p_str(register struct parse *p);
register struct parse
*p
;
REQUIRE(MORE(), REG_EMPTY
);
- p_bre - BRE parser top level, anchoring and concatenation
== static void p_bre(register struct parse *p, register int end1, \
* Giving end1 as OUT essentially eliminates the end1/end2 check.
* This implementation is a bit of a kludge, in that a trailing $ is first
* taken as an ordinary character and then revised to be an anchor. The
* only undesirable side effect is that '$' gets included as a character
* category in such cases. This is fairly harmless; not worth fixing.
* The amount of lookahead needed to avoid this kludge is excessive.
register struct parse
*p
;
register int end1
; /* first terminating character */
register int end2
; /* second terminating character */
register sopno start
= HERE();
register int first
= 1; /* first subexpression? */
register int wasdollar
= 0;
while (MORE() && !SEETWO(end1
, end2
)) {
wasdollar
= p_simp_re(p
, first
);
if (wasdollar
) { /* oops, that was a trailing anchor */
REQUIRE(HERE() != start
, REG_EMPTY
); /* require nonempty */
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
== static int p_simp_re(register struct parse *p, int starordinary);
static int /* was the simple RE an unbackslashed $? */
p_simp_re(p
, starordinary
)
register struct parse
*p
;
int starordinary
; /* is a leading * an ordinary character? */
# define BACKSL (1<<CHAR_BIT)
pos
= HERE(); /* repetion op, if any, covers from here */
assert(MORE()); /* caller should have ensured this */
REQUIRE(MORE(), REG_EESCAPE
);
c
= BACKSL
| (unsigned char)GETNEXT();
if (p
->g
->cflags
®_NEWLINE
)
p
->pbegin
[subno
] = HERE();
/* the MORE here is an error heuristic */
if (MORE() && !SEETWO('\\', ')'))
assert(p
->pend
[subno
] != 0);
REQUIRE(EATTWO('\\', ')'), REG_EPAREN
);
case BACKSL
|')': /* should not get here -- must be user */
assert(p
->pbegin
[i
] != 0);
assert(OP(p
->strip
[p
->pbegin
[i
]]) == OLPAREN
);
assert(OP(p
->strip
[p
->pend
[i
]]) == ORPAREN
);
(void) dupl(p
, p
->pbegin
[i
]+1, p
->pend
[i
]);
REQUIRE(starordinary
, REG_BADRPT
);
ordinary(p
, c
&~ BACKSL
);
if (EAT('*')) { /* implemented as +? */
} else if (EATTWO('\\', '{')) {
if (MORE() && isdigit(PEEK())) {
REQUIRE(count
<= count2
, REG_BADBR
);
} else /* single number with comma */
} else /* just a single number */
repeat(p
, pos
, count
, count2
);
if (!EATTWO('\\', '}')) { /* error heuristics */
while (MORE() && !SEETWO('\\', '}'))
REQUIRE(MORE(), REG_EBRACE
);
} else if (c
== (unsigned char)'$') /* $ (but not \$) ends it */
- p_count - parse a repetition count
== static int p_count(register struct parse *p);
static int /* the value */
register struct parse
*p
;
register int ndigits
= 0;
while (MORE() && isdigit(PEEK()) && count
<= DUPMAX
) {
count
= count
*10 + (GETNEXT() - '0');
REQUIRE(ndigits
> 0 && count
<= DUPMAX
, REG_BADBR
);
- p_bracket - parse a bracketed character list
== static void p_bracket(register struct parse *p);
* Note a significant property of this code: if the allocset() did SETERROR,
* no set operations are done.
register struct parse
*p
;
register cset
*cs
= allocset(p
);
/* Dept of Truly Sickening Special-Case Kludges */
if (p
->next
+ 5 < p
->end
&& strncmp(p
->next
, "[:<:]]", 6) == 0) {
if (p
->next
+ 5 < p
->end
&& strncmp(p
->next
, "[:>:]]", 6) == 0) {
invert
++; /* make note to invert set at end */
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
MUSTEAT(']', REG_EBRACK
);
if (p
->error
!= 0) /* don't mess things up further */
if (p
->g
->cflags
®_ICASE
) {
for (i
= p
->g
->csetsize
- 1; i
>= 0; i
--)
if (CHIN(cs
, i
) && isalpha(i
)) {
for (i
= p
->g
->csetsize
- 1; i
>= 0; i
--)
if (p
->g
->cflags
®_NEWLINE
)
assert(cs
->multis
== NULL
); /* xxx */
if (nch(p
, cs
) == 1) { /* optimize singleton sets */
ordinary(p
, firstch(p
, cs
));
EMIT(OANYOF
, freezeset(p
, cs
));
- p_b_term - parse one term of a bracketed character list
== static void p_b_term(register struct parse *p, register cset *cs);
register struct parse
*p
;
register char start
, finish
;
/* classify what we've got */
switch ((MORE()) ? PEEK() : '\0') {
c
= (MORE2()) ? PEEK2() : '\0';
return; /* NOTE RETURN */
case ':': /* character class */
REQUIRE(MORE(), REG_EBRACK
);
REQUIRE(c
!= '-' && c
!= ']', REG_ECTYPE
);
REQUIRE(MORE(), REG_EBRACK
);
REQUIRE(EATTWO(':', ']'), REG_ECTYPE
);
case '=': /* equivalence class */
REQUIRE(MORE(), REG_EBRACK
);
REQUIRE(c
!= '-' && c
!= ']', REG_ECOLLATE
);
REQUIRE(MORE(), REG_EBRACK
);
REQUIRE(EATTWO('=', ']'), REG_ECOLLATE
);
default: /* symbol, ordinary character, or range */
/* xxx revision needed for multichar stuff */
if (SEE('-') && MORE2() && PEEK2() != ']') {
/* xxx what about signed chars here... */
REQUIRE(start
<= finish
, REG_ERANGE
);
for (i
= start
; i
<= finish
; i
++)
- p_b_cclass - parse a character-class name and deal with it
== static void p_b_cclass(register struct parse *p, register cset *cs);
register struct parse
*p
;
register char *sp
= p
->next
;
register struct cclass
*cp
;
while (MORE() && isalpha(PEEK()))
for (cp
= cclasses
; cp
->name
!= NULL
; cp
++)
if (strncmp(cp
->name
, sp
, len
) == 0 && cp
->name
[len
] == '\0')
/* oops, didn't find it */
while ((c
= *u
++) != '\0')
for (u
= cp
->multis
; *u
!= '\0'; u
+= strlen(u
) + 1)
- p_b_eclass - parse an equivalence-class name and deal with it
== static void p_b_eclass(register struct parse *p, register cset *cs);
* This implementation is incomplete. xxx
register struct parse
*p
;
c
= p_b_coll_elem(p
, '=');
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
== static char p_b_symbol(register struct parse *p);
static char /* value of symbol */
register struct parse
*p
;
REQUIRE(MORE(), REG_EBRACK
);
value
= p_b_coll_elem(p
, '.');
REQUIRE(EATTWO('.', ']'), REG_ECOLLATE
);
- p_b_coll_elem - parse a collating-element name and look it up
== static char p_b_coll_elem(register struct parse *p, int endc);
static char /* value of collating element */
register struct parse
*p
;
int endc
; /* name ended by endc,']' */
register char *sp
= p
->next
;
register struct cname
*cp
;
while (MORE() && !SEETWO(endc
, ']'))
for (cp
= cnames
; cp
->name
!= NULL
; cp
++)
if (strncmp(cp
->name
, sp
, len
) == 0 && cp
->name
[len
] == '\0')
return(cp
->code
); /* known name */
return(*sp
); /* single character */
SETERROR(REG_ECOLLATE
); /* neither */
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
static char /* if no counterpart, return ch */
else /* peculiar, but could happen */
- bothcases - emit a dualcase version of a two-case character
== static void bothcases(register struct parse *p, int ch);
* Boy, is this implementation ever a kludge...
register struct parse
*p
;
register char *oldnext
= p
->next
;
register char *oldend
= p
->end
;
assert(othercase(ch
) != ch
); /* p_bracket() would recurse */
assert(p
->next
== bracket
+2);
- ordinary - emit an ordinary character
== static void ordinary(register struct parse *p, register int ch);
register struct parse
*p
;
register cat_t
*cap
= p
->g
->categories
;
if ((p
->g
->cflags
®_ICASE
) && isalpha(ch
) && othercase(ch
) != ch
)
EMIT(OCHAR
, (unsigned char)ch
);
cap
[ch
] = p
->g
->ncategories
++;
- nonnewline - emit REG_NEWLINE version of OANY
== static void nonnewline(register struct parse *p);
* Boy, is this implementation ever a kludge...
register struct parse
*p
;
register char *oldnext
= p
->next
;
register char *oldend
= p
->end
;
assert(p
->next
== bracket
+3);
- repeat - generate code for a bounded repetition, recursively if needed
== static void repeat(register struct parse *p, sopno start, int from, int to);
repeat(p
, start
, from
, to
)
register struct parse
*p
;
sopno start
; /* operand from here to end of strip */
int from
; /* repeated from this number */
int to
; /* to this number of times (maybe INFINITY) */
register sopno finish
= HERE();
# define REP(f, t) ((f)*8 + (t))
# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
if (p
->error
!= 0) /* head off possible runaway recursion */
switch (REP(MAP(from
), MAP(to
))) {
case REP(0, 0): /* must be user doing this */
DROP(finish
-start
); /* drop the operand */
case REP(0, 1): /* as x{1,1}? */
case REP(0, N
): /* as x{1,n}? */
case REP(0, INF
): /* as x{1,}? */
INSERT(OQUEST_
, start
); /* offset is wrong... */
repeat(p
, start
+1, 1, to
);
AHEAD(start
); /* ... fix it */
case REP(1, 1): /* trivial case */
case REP(1, N
): /* as x?x{1,n-1} */
copy
= dupl(p
, start
+1, finish
+1);
assert(copy
== finish
+2);
repeat(p
, copy
, 1, to
-1);
case REP(1, INF
): /* as x+ */
case REP(N
, N
): /* as xx{m-1,n-1} */
copy
= dupl(p
, start
, finish
);
repeat(p
, copy
, from
-1, to
-1);
case REP(N
, INF
): /* as xx{n-1,INF} */
copy
= dupl(p
, start
, finish
);
repeat(p
, copy
, from
-1, to
);
default: /* "can't happen" */
SETERROR(REG_ASSERT
); /* just in case */
- seterr - set an error condition
== static int seterr(register struct parse *p, int e);
static int /* useless but makes type checking happy */
register struct parse
*p
;
if (p
->error
== 0) /* keep earliest error condition */
p
->next
= nuls
; /* try to bring things to a halt */
return(0); /* make the return value well-defined */
- allocset - allocate a set of characters for []
== static cset *allocset(register struct parse *p);
register struct parse
*p
;
register int no
= p
->g
->ncsets
++;
register size_t css
= (size_t)p
->g
->csetsize
;
if (no
>= p
->ncsalloc
) { /* need another column of space */
assert(nc
% CHAR_BIT
== 0);
nbytes
= nc
/ CHAR_BIT
* css
;
p
->g
->sets
= (cset
*)malloc(nc
* sizeof(cset
));
p
->g
->sets
= (cset
*)realloc((char *)p
->g
->sets
,
if (p
->g
->setbits
== NULL
)
p
->g
->setbits
= (uchar
*)malloc(nbytes
);
p
->g
->setbits
= (uchar
*)realloc((char *)p
->g
->setbits
,
if (p
->g
->sets
!= NULL
&& p
->g
->setbits
!= NULL
)
(void) memset((char *)p
->g
->setbits
+ (nbytes
- css
),
/* caller's responsibility not to do set ops */
assert(p
->g
->sets
!= NULL
); /* xxx */
cs
->ptr
= p
->g
->setbits
+ css
*((no
)/CHAR_BIT
);
cs
->mask
= 1 << ((no
) % CHAR_BIT
);
- freeset - free a now-unused set
== static void freeset(register struct parse *p, register cset *cs);
register struct parse
*p
;
register cset
*top
= &p
->g
->sets
[p
->g
->ncsets
];
register size_t css
= (size_t)p
->g
->csetsize
;
for (i
= 0; i
< css
; i
++)
if (cs
== top
-1) /* recover only the easy case */
- freezeset - final processing on a set of characters
== static int freezeset(register struct parse *p, register cset *cs);
* The main task here is merging identical sets. This is usually a waste
* of time (although the hash code minimizes the overhead), but can win
* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
* is done using addition rather than xor -- all ASCII [aA] sets xor to
static int /* set number */
register struct parse
*p
;
register uchar h
= cs
->hash
;
register cset
*top
= &p
->g
->sets
[p
->g
->ncsets
];
register size_t css
= (size_t)p
->g
->csetsize
;
/* look for an earlier one which is the same */
for (cs2
= &p
->g
->sets
[0]; cs2
< top
; cs2
++)
if (cs2
->hash
== h
&& cs2
!= cs
) {
for (i
= 0; i
< css
; i
++)
if (!!CHIN(cs2
, i
) != !!CHIN(cs
, i
))
if (cs2
< top
) { /* found one */
return((int)(cs
- p
->g
->sets
));
- firstch - return first character in a set (which must have at least one)
== static int firstch(register struct parse *p, register cset *cs);
static int /* character; there is no "none" value */
register struct parse
*p
;
register size_t css
= (size_t)p
->g
->csetsize
;
for (i
= 0; i
< css
; i
++)
return(0); /* arbitrary */
- nch - number of characters in a set
== static int nch(register struct parse *p, register cset *cs);
register struct parse
*p
;
register size_t css
= (size_t)p
->g
->csetsize
;
for (i
= 0; i
< css
; i
++)
- mcadd - add a collating element to a cset
== static void mcadd(register struct parse *p, register cset *cs, \
register struct parse
*p
;
register size_t oldend
= cs
->smultis
;
cs
->smultis
+= strlen(cp
) + 1;
cs
->multis
= malloc(cs
->smultis
);
cs
->multis
= realloc(cs
->multis
, cs
->smultis
);
if (cs
->multis
== NULL
) {
(void) strcpy(cs
->multis
+ oldend
- 1, cp
);
cs
->multis
[cs
->smultis
- 1] = '\0';
- mcsub - subtract a collating element from a cset
== static void mcsub(register cset *cs, register char *cp);
register char *fp
= mcfind(cs
, cp
);
register size_t len
= strlen(fp
);
(void) memmove(fp
, fp
+ len
+ 1,
cs
->smultis
- (fp
+ len
+ 1 - cs
->multis
));
cs
->multis
= realloc(cs
->multis
, cs
->smultis
);
assert(cs
->multis
!= NULL
);
- mcin - is a collating element in a cset?
== static int mcin(register cset *cs, register char *cp);
return(mcfind(cs
, cp
) != NULL
);
- mcfind - find a collating element in a cset
== static char *mcfind(register cset *cs, register char *cp);
for (p
= cs
->multis
; *p
!= '\0'; p
+= strlen(p
) + 1)
- mcinvert - invert the list of collating elements in a cset
== static void mcinvert(register cset *cs);
* This would have to know the set of possibilities. Implementation
assert(cs
->multis
== NULL
); /* xxx */
- mccase - add case counterparts of the list of collating elements in a cset
== static void mccase(register cset *cs);
* This would have to know the set of possibilities. Implementation
assert(cs
->multis
== NULL
); /* xxx */
- isinsets - is this character in any sets?
== static int isinsets(register struct re_guts *g, int c);
static int /* predicate */
register struct re_guts
*g
;
register int ncols
= (g
->ncsets
+(CHAR_BIT
-1)) / CHAR_BIT
;
register unsigned uc
= (unsigned char)c
;
for (i
= 0, col
= g
->setbits
; i
< ncols
; i
++, col
+= g
->csetsize
)
- samesets - are these two characters in exactly the same sets?
== static int samesets(register struct re_guts *g, int c1, int c2);
static int /* predicate */
register struct re_guts
*g
;
register int ncols
= (g
->ncsets
+(CHAR_BIT
-1)) / CHAR_BIT
;
register unsigned uc1
= (unsigned char)c1
;
register unsigned uc2
= (unsigned char)c2
;
for (i
= 0, col
= g
->setbits
; i
< ncols
; i
++, col
+= g
->csetsize
)
if (col
[uc1
] != col
[uc2
])
- categorize - sort out character categories
== static void categorize(struct parse *p, register struct re_guts *g);
register struct re_guts
*g
;
register cat_t
*cats
= g
->categories
;
/* avoid making error situations worse */
for (c
= CHAR_MIN
; c
<= CHAR_MAX
; c
++)
if (cats
[c
] == 0 && isinsets(g
, c
)) {
for (c2
= c
+1; c2
<= CHAR_MAX
; c2
++)
if (cats
[c2
] == 0 && samesets(g
, c
, c2
))
- dupl - emit a duplicate of a bunch of sops
== static sopno dupl(register struct parse *p, sopno start, sopno finish);
static sopno
/* start of duplicate */
register struct parse
*p
;
sopno start
; /* from here */
sopno finish
; /* to this less one */
register sopno ret
= HERE();
register sopno len
= finish
- start
;
enlarge(p
, p
->ssize
+ len
); /* this many unexpected additions */
assert(p
->ssize
>= p
->slen
+ len
);
(void) memcpy((char *)(p
->strip
+ p
->slen
),
(char *)(p
->strip
+ start
), (size_t)len
*sizeof(sop
));
- doemit - emit a strip operator
== static void doemit(register struct parse *p, sop op, size_t opnd);
* It might seem better to implement this as a macro with a function as
* hard-case backup, but it's just too big and messy unless there are
* some changes to the data structures. Maybe later.
register struct parse
*p
;
/* avoid making error situations worse */
/* deal with oversize operands ("can't happen", more or less) */
assert(opnd
< 1<<OPSHIFT
);
/* deal with undersized strip */
enlarge(p
, (p
->ssize
+1) / 2 * 3); /* +50% */
assert(p
->slen
< p
->ssize
);
/* finally, it's all reduced to the easy case */
p
->strip
[p
->slen
++] = SOP(op
, opnd
);
- doinsert - insert a sop into the strip
== static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
doinsert(p
, op
, opnd
, pos
)
register struct parse
*p
;
/* avoid making error situations worse */
EMIT(op
, opnd
); /* do checks, ensure space */
/* adjust paren pointers */
for (i
= 1; i
< NPAREN
; i
++) {
if (p
->pbegin
[i
] >= pos
) {
memmove((char *)&p
->strip
[pos
+1], (char *)&p
->strip
[pos
],
(HERE()-pos
-1)*sizeof(sop
));
- dofwd - complete a forward reference
== static void dofwd(register struct parse *p, sopno pos, sop value);
register struct parse
*p
;
/* avoid making error situations worse */
assert(value
< 1<<OPSHIFT
);
p
->strip
[pos
] = OP(p
->strip
[pos
]) | value
;
- enlarge - enlarge the strip
== static void enlarge(register struct parse *p, sopno size);
register struct parse
*p
;
sp
= (sop
*)realloc(p
->strip
, size
*sizeof(sop
));
- stripsnug - compact the strip
== static void stripsnug(register struct parse *p, register struct re_guts *g);
register struct parse
*p
;
register struct re_guts
*g
;
g
->strip
= (sop
*)realloc((sop
*)p
->strip
, p
->slen
* sizeof(sop
));
- findmust - fill in must and mlen with longest mandatory literal string
== static void findmust(register struct parse *p, register struct re_guts *g);
* This algorithm could do fancy things like analyzing the operands of |
* for common subsequences. Someday. This code is simple and finds most
* of the interesting cases.
* Note that must and mlen got initialized during setup.
register struct re_guts
*g
;
/* avoid making error situations worse */
/* find the longest OCHAR sequence in strip */
case OCHAR
: /* sequence member */
if (newlen
== 0) /* new sequence */
case OPLUS_
: /* things that don't break one */
case OQUEST_
: /* things that must be skipped */
/* assert() interferes w debug printouts */
if (OP(s
) != O_QUEST
&& OP(s
) != O_CH
&&
} while (OP(s
) != O_QUEST
&& OP(s
) != O_CH
);
default: /* things that break a sequence */
if (newlen
> g
->mlen
) { /* ends one */
if (g
->mlen
== 0) /* there isn't one */
/* turn it into a character string */
g
->must
= malloc((size_t)g
->mlen
+ 1);
if (g
->must
== NULL
) { /* argh; just forget it */
for (i
= g
->mlen
; i
> 0; i
--) {
while (OP(s
= *scan
++) != OCHAR
)
*cp
++ = '\0'; /* just on general principles */
- pluscount - count + nesting
== static sopno pluscount(register struct parse *p, register struct re_guts *g);
static sopno
/* nesting depth */
register struct re_guts
*g
;
register sopno plusnest
= 0;
register sopno maxnest
= 0;
return(0); /* there may not be an OEND */