BSD 4_4 release
[unix-history] / usr / src / lib / libc / regex / regcomp.c
CommitLineData
ff09270c
KB
1/*-
2 * Copyright (c) 1992 Henry Spencer.
a586e915
KB
3 * Copyright (c) 1992, 1993
4 * The Regents of the University of California. All rights reserved.
ff09270c
KB
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer of the University of Toronto.
8 *
ad787160
C
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
ff09270c 24 *
ad787160
C
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)regcomp.c 8.1 (Berkeley) 6/4/93
ff09270c
KB
38 */
39
40#if defined(LIBC_SCCS) && !defined(lint)
ad787160 41static char sccsid[] = "@(#)regcomp.c 8.1 (Berkeley) 6/4/93";
ff09270c
KB
42#endif /* LIBC_SCCS and not lint */
43
44#include <sys/types.h>
ff09270c
KB
45#include <stdio.h>
46#include <string.h>
47#include <ctype.h>
48#include <limits.h>
49#include <stdlib.h>
ff09270c
KB
50#include <regex.h>
51
52#include "utils.h"
53#include "regex2.h"
54
55#include "cclass.h"
56#include "cname.h"
57
ff09270c
KB
58/*
59 * parse structure, passed up and down to avoid global variables and
60 * other clumsinesses
61 */
62struct parse {
6175ca7c
KB
63 char *next; /* next character in RE */
64 char *end; /* end of string (-> NUL normally) */
ff09270c
KB
65 int error; /* has an error been seen? */
66 sop *strip; /* malloced strip */
67 sopno ssize; /* malloced strip size (allocated) */
68 sopno slen; /* malloced strip length (used) */
69 int ncsalloc; /* number of csets allocated */
70 struct re_guts *g;
71# define NPAREN 10 /* we need to remember () 1-9 for back refs */
72 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
73 sopno pend[NPAREN]; /* -> ) ([0] unused) */
74};
75
6175ca7c
KB
76
77static void p_ere(/*register struct parse *p, int stop*/);
78static void p_ere_exp(/*register struct parse *p*/);
79static void p_str(/*register struct parse *p*/);
80static void p_bre(/*register struct parse *p, register int end1, register int end2*/);
81static int p_simp_re(/*register struct parse *p, int starordinary*/);
82static int p_count(/*register struct parse *p*/);
83static void p_bracket(/*register struct parse *p*/);
84static void p_b_term(/*register struct parse *p, register cset *cs*/);
85static void p_b_cclass(/*register struct parse *p, register cset *cs*/);
86static void p_b_eclass(/*register struct parse *p, register cset *cs*/);
87static char p_b_symbol(/*register struct parse *p*/);
88static char p_b_coll_elem(/*register struct parse *p, int endc*/);
89static char othercase(/*int ch*/);
90static void bothcases(/*register struct parse *p, int ch*/);
91static void ordinary(/*register struct parse *p, register int ch*/);
92static void nonnewline(/*register struct parse *p*/);
93static void repeat(/*register struct parse *p, sopno start, int from, int to*/);
94static int seterr(/*register struct parse *p, int e*/);
95static cset *allocset(/*register struct parse *p*/);
96static void freeset(/*register struct parse *p, register cset *cs*/);
97static int freezeset(/*register struct parse *p, register cset *cs*/);
98static int firstch(/*register struct parse *p, register cset *cs*/);
99static int nch(/*register struct parse *p, register cset *cs*/);
100static void mcadd(/*register struct parse *p, register cset *cs, register char *cp*/);
101static void mcsub(/*register cset *cs, register char *cp*/);
102static int mcin(/*register cset *cs, register char *cp*/);
103static char *mcfind(/*register cset *cs, register char *cp*/);
104static void mcinvert(/*register cset *cs*/);
105static void mccase(/*register cset *cs*/);
106static int isinsets(/*register struct re_guts *g, int c*/);
107static int samesets(/*register struct re_guts *g, int c1, int c2*/);
108static void categorize(/*struct parse *p, register struct re_guts *g*/);
109static sopno dupl(/*register struct parse *p, sopno start, sopno finish*/);
110static void doemit(/*register struct parse *p, sop op, size_t opnd*/);
111static void doinsert(/*register struct parse *p, sop op, size_t opnd, sopno pos*/);
112static void dofwd(/*register struct parse *p, sopno pos, sop value*/);
113static void enlarge(/*register struct parse *p, sopno size*/);
114static void stripsnug(/*register struct parse *p, register struct re_guts *g*/);
115static void findmust(/*register struct parse *p, register struct re_guts *g*/);
116static sopno pluscount(/*register struct parse *p, register struct re_guts *g*/);
117
118static char nuls[10]; /* place to point scanner in event of error */
ff09270c
KB
119
120/*
121 * macros for use with parse structure
122 * BEWARE: these know that the parse structure is named `p' !!!
123 */
6175ca7c
KB
124#define PEEK() (*p->next)
125#define PEEK2() (*(p->next+1))
126#define MORE() (p->next < p->end)
127#define MORE2() (p->next+1 < p->end)
128#define SEE(c) (MORE() && PEEK() == (c))
129#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
ff09270c
KB
130#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
131#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
132#define NEXT() (p->next++)
133#define NEXT2() (p->next += 2)
134#define NEXTn(n) (p->next += (n))
6175ca7c 135#define GETNEXT() (*p->next++)
ff09270c
KB
136#define SETERROR(e) seterr(p, (e))
137#define REQUIRE(co, e) ((co) || SETERROR(e))
6175ca7c
KB
138#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
139#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
140#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
ff09270c
KB
141#define EMIT(sop, sopnd) doemit(p, sop, (size_t)(sopnd))
142#define INSERT(sop, pos) doinsert(p, sop, HERE()-(pos)+1, pos)
6175ca7c
KB
143#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
144#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
ff09270c
KB
145#define HERE() (p->slen)
146#define THERE() (p->slen - 1)
147#define DROP(n) (p->slen -= (n))
148
6175ca7c
KB
149#ifndef NDEBUG
150static int never = 0; /* for use in asserts; shuts lint up */
151#endif
c62af244 152
ff09270c
KB
153/*
154 - regcomp - interface for parser and compilation
6175ca7c
KB
155 = extern int regcomp(regex_t *preg, const char *pattern, int cflags);
156 = #define REG_BASIC 0000
157 = #define REG_EXTENDED 0001
158 = #define REG_ICASE 0002
159 = #define REG_NOSUB 0004
160 = #define REG_NEWLINE 0010
161 = #define REG_NOSPEC 0020
162 = #define REG_PEND 0040
163 = #define REG_DUMP 0200
ff09270c
KB
164 */
165int /* 0 success, otherwise REG_something */
166regcomp(preg, pattern, cflags)
167regex_t *preg;
168const char *pattern;
169int cflags;
170{
171 struct parse pa;
172 register struct re_guts *g;
173 register struct parse *p = &pa;
ff09270c 174 register int i;
6175ca7c
KB
175 register size_t len;
176
177 if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
178 return(REG_INVARG);
179
180 if (cflags&REG_PEND) {
181 if (preg->re_endp < pattern)
182 return(REG_INVARG);
183 len = preg->re_endp - pattern;
184 } else
185 len = strlen((char *)pattern);
ff09270c
KB
186
187 /* do the mallocs early so failure handling is easy */
6175ca7c
KB
188 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
189 (NC-1)*sizeof(cat_t));
ff09270c
KB
190 if (g == NULL)
191 return(REG_ESPACE);
6175ca7c 192 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
ff09270c
KB
193 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
194 p->slen = 0;
195 if (p->strip == NULL) {
196 free((char *)g);
197 return(REG_ESPACE);
198 }
199
200 /* set things up */
201 p->g = g;
6175ca7c
KB
202 p->next = (char *)pattern; /* convenience; we do not modify it */
203 p->end = p->next + len;
ff09270c
KB
204 p->error = 0;
205 p->ncsalloc = 0;
206 for (i = 0; i < NPAREN; i++) {
207 p->pbegin[i] = 0;
208 p->pend[i] = 0;
209 }
6175ca7c 210 g->csetsize = NC;
ff09270c
KB
211 g->sets = NULL;
212 g->setbits = NULL;
213 g->ncsets = 0;
214 g->cflags = cflags;
215 g->iflags = 0;
6175ca7c
KB
216 g->nbol = 0;
217 g->neol = 0;
ff09270c
KB
218 g->must = NULL;
219 g->mlen = 0;
220 g->nsub = 0;
221 g->ncategories = 1; /* category 0 is "everything else" */
85839952 222 g->categories = &g->catspace[-(CHAR_MIN)];
6175ca7c 223 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
ff09270c 224 g->backrefs = 0;
ff09270c
KB
225
226 /* do it */
227 EMIT(OEND, 0);
228 g->firststate = THERE();
229 if (cflags&REG_EXTENDED)
6175ca7c
KB
230 p_ere(p, OUT);
231 else if (cflags&REG_NOSPEC)
232 p_str(p);
ff09270c 233 else
6175ca7c 234 p_bre(p, OUT, OUT);
ff09270c
KB
235 EMIT(OEND, 0);
236 g->laststate = THERE();
237
238 /* tidy up loose ends and fill things in */
239 categorize(p, g);
240 stripsnug(p, g);
241 findmust(p, g);
242 g->nplus = pluscount(p, g);
243 g->magic = MAGIC2;
244 preg->re_nsub = g->nsub;
245 preg->re_g = g;
246 preg->re_magic = MAGIC1;
4d4cb2b0 247#ifndef REDEBUG
ff09270c
KB
248 /* not debugging, so can't rely on the assert() in regexec() */
249 if (g->iflags&BAD)
250 SETERROR(REG_ASSERT);
251#endif
252
253 /* win or lose, we're done */
254 if (p->error != 0) /* lose */
255 regfree(preg);
256 return(p->error);
257}
258
259/*
260 - p_ere - ERE parser top level, concatenation and alternation
6175ca7c 261 == static void p_ere(register struct parse *p, int stop);
ff09270c
KB
262 */
263static void
264p_ere(p, stop)
265register struct parse *p;
6175ca7c 266int stop; /* character this ERE should end at */
ff09270c 267{
6175ca7c 268 register char c;
ff09270c
KB
269 register sopno prevback;
270 register sopno prevfwd;
271 register sopno conc;
272 register int first = 1; /* is this the first alternative? */
273
274 for (;;) {
275 /* do a bunch of concatenated expressions */
276 conc = HERE();
6175ca7c 277 while (MORE() && (c = PEEK()) != '|' && c != stop)
ff09270c
KB
278 p_ere_exp(p);
279 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
280
281 if (!EAT('|'))
282 break; /* NOTE BREAK OUT */
283
284 if (first) {
285 INSERT(OCH_, conc); /* offset is wrong */
286 prevfwd = conc;
287 prevback = conc;
288 first = 0;
289 }
6175ca7c 290 ASTERN(OOR1, prevback);
ff09270c 291 prevback = THERE();
6175ca7c 292 AHEAD(prevfwd); /* fix previous offset */
ff09270c
KB
293 prevfwd = HERE();
294 EMIT(OOR2, 0); /* offset is very wrong */
295 }
296
297 if (!first) { /* tail-end fixups */
6175ca7c
KB
298 AHEAD(prevfwd);
299 ASTERN(O_CH, prevback);
ff09270c
KB
300 }
301
6175ca7c 302 assert(!MORE() || SEE(stop));
ff09270c
KB
303}
304
305/*
306 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
6175ca7c 307 == static void p_ere_exp(register struct parse *p);
ff09270c
KB
308 */
309static void
310p_ere_exp(p)
311register struct parse *p;
312{
6175ca7c 313 register char c;
ff09270c 314 register sopno pos;
ff09270c
KB
315 register int count;
316 register int count2;
317 register sopno subno;
318 int wascaret = 0;
ff09270c 319
6175ca7c 320 assert(MORE()); /* caller should have ensured this */
ff09270c 321 c = GETNEXT();
ff09270c
KB
322
323 pos = HERE();
324 switch (c) {
325 case '(':
6175ca7c 326 REQUIRE(MORE(), REG_EPAREN);
ff09270c
KB
327 p->g->nsub++;
328 subno = p->g->nsub;
329 if (subno < NPAREN)
330 p->pbegin[subno] = HERE();
331 EMIT(OLPAREN, subno);
332 if (!SEE(')'))
333 p_ere(p, ')');
334 if (subno < NPAREN) {
335 p->pend[subno] = HERE();
336 assert(p->pend[subno] != 0);
337 }
338 EMIT(ORPAREN, subno);
339 MUSTEAT(')', REG_EPAREN);
340 break;
341#ifndef POSIX_MISTAKE
342 case ')': /* happens only if no current unmatched ( */
343 /*
344 * You may ask, why the ifndef? Because I didn't notice
345 * this until slightly too late for 1003.2, and none of the
346 * other 1003.2 regular-expression reviewers noticed it at
347 * all. So an unmatched ) is legal POSIX, at least until
348 * we can get it fixed.
349 */
350 SETERROR(REG_EPAREN);
351 break;
352#endif
353 case '^':
354 EMIT(OBOL, 0);
355 p->g->iflags |= USEBOL;
6175ca7c 356 p->g->nbol++;
ff09270c
KB
357 wascaret = 1;
358 break;
359 case '$':
360 EMIT(OEOL, 0);
361 p->g->iflags |= USEEOL;
6175ca7c 362 p->g->neol++;
ff09270c
KB
363 break;
364 case '|':
365 SETERROR(REG_EMPTY);
366 break;
367 case '*':
368 case '+':
369 case '?':
370 SETERROR(REG_BADRPT);
371 break;
372 case '.':
373 if (p->g->cflags&REG_NEWLINE)
374 nonnewline(p);
375 else
376 EMIT(OANY, 0);
377 break;
378 case '[':
379 p_bracket(p);
380 break;
381 case '\\':
6175ca7c 382 REQUIRE(MORE(), REG_EESCAPE);
ff09270c 383 c = GETNEXT();
6175ca7c 384 ordinary(p, c);
ff09270c
KB
385 break;
386 case '{': /* okay as ordinary except if digit follows */
6175ca7c 387 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
ff09270c
KB
388 /* FALLTHROUGH */
389 default:
390 ordinary(p, c);
391 break;
392 }
393
6175ca7c
KB
394 if (!MORE())
395 return;
ff09270c 396 c = PEEK();
6175ca7c
KB
397 /* we call { a repetition if followed by a digit */
398 if (!( c == '*' || c == '+' || c == '?' ||
399 (c == '{' && MORE2() && isdigit(PEEK2())) ))
ff09270c
KB
400 return; /* no repetition, we're done */
401 NEXT();
402
403 REQUIRE(!wascaret, REG_BADRPT);
404 switch (c) {
405 case '*': /* implemented as +? */
406 INSERT(OPLUS_, pos);
6175ca7c 407 ASTERN(O_PLUS, pos);
ff09270c 408 INSERT(OQUEST_, pos);
6175ca7c 409 ASTERN(O_QUEST, pos);
ff09270c
KB
410 break;
411 case '+':
412 INSERT(OPLUS_, pos);
6175ca7c 413 ASTERN(O_PLUS, pos);
ff09270c
KB
414 break;
415 case '?':
416 INSERT(OQUEST_, pos);
6175ca7c 417 ASTERN(O_QUEST, pos);
ff09270c
KB
418 break;
419 case '{':
420 count = p_count(p);
421 if (EAT(',')) {
422 if (isdigit(PEEK())) {
423 count2 = p_count(p);
424 REQUIRE(count <= count2, REG_BADBR);
425 } else /* single number with comma */
426 count2 = INFINITY;
427 } else /* just a single number */
428 count2 = count;
429 repeat(p, pos, count, count2);
430 if (!EAT('}')) { /* error heuristics */
6175ca7c 431 while (MORE() && PEEK() != '}')
ff09270c 432 NEXT();
6175ca7c
KB
433 REQUIRE(MORE(), REG_EBRACE);
434 SETERROR(REG_BADBR);
ff09270c
KB
435 }
436 break;
437 }
438
6175ca7c
KB
439 if (!MORE())
440 return;
ff09270c 441 c = PEEK();
6175ca7c
KB
442 if (!( c == '*' || c == '+' || c == '?' ||
443 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
444 return;
445 SETERROR(REG_BADRPT);
446}
447
448/*
449 - p_str - string (no metacharacters) "parser"
450 == static void p_str(register struct parse *p);
451 */
452static void
453p_str(p)
454register struct parse *p;
455{
456 REQUIRE(MORE(), REG_EMPTY);
457 while (MORE())
458 ordinary(p, GETNEXT());
ff09270c
KB
459}
460
461/*
462 - p_bre - BRE parser top level, anchoring and concatenation
6175ca7c
KB
463 == static void p_bre(register struct parse *p, register int end1, \
464 == register int end2);
465 * Giving end1 as OUT essentially eliminates the end1/end2 check.
4d4cb2b0
KB
466 *
467 * This implementation is a bit of a kludge, in that a trailing $ is first
468 * taken as an ordinary character and then revised to be an anchor. The
469 * only undesirable side effect is that '$' gets included as a character
470 * category in such cases. This is fairly harmless; not worth fixing.
6175ca7c 471 * The amount of lookahead needed to avoid this kludge is excessive.
ff09270c
KB
472 */
473static void
474p_bre(p, end1, end2)
475register struct parse *p;
6175ca7c
KB
476register int end1; /* first terminating character */
477register int end2; /* second terminating character */
ff09270c 478{
ff09270c
KB
479 register sopno start = HERE();
480 register int first = 1; /* first subexpression? */
4d4cb2b0 481 register int wasdollar = 0;
ff09270c
KB
482
483 if (EAT('^')) {
484 EMIT(OBOL, 0);
485 p->g->iflags |= USEBOL;
6175ca7c 486 p->g->nbol++;
ff09270c 487 }
6175ca7c 488 while (MORE() && !SEETWO(end1, end2)) {
ff09270c
KB
489 wasdollar = p_simp_re(p, first);
490 first = 0;
491 }
492 if (wasdollar) { /* oops, that was a trailing anchor */
493 DROP(1);
494 EMIT(OEOL, 0);
495 p->g->iflags |= USEEOL;
6175ca7c 496 p->g->neol++;
ff09270c
KB
497 }
498
499 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
500}
501
502/*
503 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
6175ca7c 504 == static int p_simp_re(register struct parse *p, int starordinary);
ff09270c
KB
505 */
506static int /* was the simple RE an unbackslashed $? */
507p_simp_re(p, starordinary)
508register struct parse *p;
509int starordinary; /* is a leading * an ordinary character? */
510{
ff09270c
KB
511 register int c;
512 register int count;
513 register int count2;
514 register sopno pos;
ff09270c
KB
515 register int i;
516 register sopno subno;
517# define BACKSL (1<<CHAR_BIT)
518
519 pos = HERE(); /* repetion op, if any, covers from here */
520
6175ca7c 521 assert(MORE()); /* caller should have ensured this */
ff09270c 522 c = GETNEXT();
6175ca7c
KB
523 if (c == '\\') {
524 REQUIRE(MORE(), REG_EESCAPE);
525 c = BACKSL | (unsigned char)GETNEXT();
526 }
ff09270c
KB
527 switch (c) {
528 case '.':
529 if (p->g->cflags&REG_NEWLINE)
530 nonnewline(p);
531 else
532 EMIT(OANY, 0);
533 break;
534 case '[':
535 p_bracket(p);
536 break;
ff09270c
KB
537 case BACKSL|'{':
538 SETERROR(REG_BADRPT);
539 break;
540 case BACKSL|'(':
ff09270c
KB
541 p->g->nsub++;
542 subno = p->g->nsub;
543 if (subno < NPAREN)
544 p->pbegin[subno] = HERE();
545 EMIT(OLPAREN, subno);
6175ca7c
KB
546 /* the MORE here is an error heuristic */
547 if (MORE() && !SEETWO('\\', ')'))
ff09270c
KB
548 p_bre(p, '\\', ')');
549 if (subno < NPAREN) {
550 p->pend[subno] = HERE();
551 assert(p->pend[subno] != 0);
552 }
553 EMIT(ORPAREN, subno);
554 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
555 break;
556 case BACKSL|')': /* should not get here -- must be user */
557 case BACKSL|'}':
558 SETERROR(REG_EPAREN);
559 break;
560 case BACKSL|'1':
561 case BACKSL|'2':
562 case BACKSL|'3':
563 case BACKSL|'4':
564 case BACKSL|'5':
565 case BACKSL|'6':
566 case BACKSL|'7':
567 case BACKSL|'8':
568 case BACKSL|'9':
569 i = (c&~BACKSL) - '0';
570 assert(i < NPAREN);
571 if (p->pend[i] != 0) {
572 assert(i <= p->g->nsub);
573 EMIT(OBACK_, i);
574 assert(p->pbegin[i] != 0);
575 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
576 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
577 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
578 EMIT(O_BACK, i);
579 } else
580 SETERROR(REG_ESUBREG);
581 p->g->backrefs = 1;
4d4cb2b0 582 break;
ff09270c
KB
583 case '*':
584 REQUIRE(starordinary, REG_BADRPT);
585 /* FALLTHROUGH */
586 default:
6175ca7c 587 ordinary(p, c &~ BACKSL);
ff09270c
KB
588 break;
589 }
590
591 if (EAT('*')) { /* implemented as +? */
592 INSERT(OPLUS_, pos);
6175ca7c 593 ASTERN(O_PLUS, pos);
ff09270c 594 INSERT(OQUEST_, pos);
6175ca7c 595 ASTERN(O_QUEST, pos);
ff09270c
KB
596 } else if (EATTWO('\\', '{')) {
597 count = p_count(p);
598 if (EAT(',')) {
6175ca7c 599 if (MORE() && isdigit(PEEK())) {
ff09270c
KB
600 count2 = p_count(p);
601 REQUIRE(count <= count2, REG_BADBR);
602 } else /* single number with comma */
603 count2 = INFINITY;
604 } else /* just a single number */
605 count2 = count;
606 repeat(p, pos, count, count2);
607 if (!EATTWO('\\', '}')) { /* error heuristics */
6175ca7c 608 while (MORE() && !SEETWO('\\', '}'))
ff09270c 609 NEXT();
6175ca7c
KB
610 REQUIRE(MORE(), REG_EBRACE);
611 SETERROR(REG_BADBR);
ff09270c 612 }
6175ca7c 613 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
ff09270c
KB
614 return(1);
615
616 return(0);
617}
618
619/*
620 - p_count - parse a repetition count
6175ca7c 621 == static int p_count(register struct parse *p);
ff09270c
KB
622 */
623static int /* the value */
624p_count(p)
625register struct parse *p;
626{
627 register int count = 0;
628 register int ndigits = 0;
629
6175ca7c 630 while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
ff09270c
KB
631 count = count*10 + (GETNEXT() - '0');
632 ndigits++;
633 }
634
635 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
636 return(count);
637}
638
639/*
640 - p_bracket - parse a bracketed character list
6175ca7c 641 == static void p_bracket(register struct parse *p);
ff09270c
KB
642 *
643 * Note a significant property of this code: if the allocset() did SETERROR,
644 * no set operations are done.
645 */
646static void
647p_bracket(p)
648register struct parse *p;
649{
6175ca7c 650 register char c;
ff09270c
KB
651 register cset *cs = allocset(p);
652 register int invert = 0;
653
6175ca7c
KB
654 /* Dept of Truly Sickening Special-Case Kludges */
655 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
656 EMIT(OBOW, 0);
657 NEXTn(6);
658 return;
659 }
660 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
661 EMIT(OEOW, 0);
662 NEXTn(6);
663 return;
664 }
665
ff09270c
KB
666 if (EAT('^'))
667 invert++; /* make note to invert set at end */
668 if (EAT(']'))
669 CHadd(cs, ']');
10e6394a
KB
670 else if (EAT('-'))
671 CHadd(cs, '-');
6175ca7c 672 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
ff09270c
KB
673 p_b_term(p, cs);
674 if (EAT('-'))
675 CHadd(cs, '-');
676 MUSTEAT(']', REG_EBRACK);
677
6175ca7c
KB
678 if (p->error != 0) /* don't mess things up further */
679 return;
680
681 if (p->g->cflags&REG_ICASE) {
682 register int i;
683 register int ci;
684
685 for (i = p->g->csetsize - 1; i >= 0; i--)
686 if (CHIN(cs, i) && isalpha(i)) {
687 ci = othercase(i);
688 if (ci != i)
689 CHadd(cs, ci);
690 }
691 if (cs->multis != NULL)
692 mccase(p, cs);
693 }
694 if (invert) {
ff09270c
KB
695 register int i;
696
697 for (i = p->g->csetsize - 1; i >= 0; i--)
698 if (CHIN(cs, i))
699 CHsub(cs, i);
700 else
701 CHadd(cs, i);
702 if (p->g->cflags&REG_NEWLINE)
703 CHsub(cs, '\n');
704 if (cs->multis != NULL)
705 mcinvert(p, cs);
706 }
6175ca7c 707
ff09270c 708 assert(cs->multis == NULL); /* xxx */
6175ca7c
KB
709
710 if (nch(p, cs) == 1) { /* optimize singleton sets */
711 ordinary(p, firstch(p, cs));
712 freeset(p, cs);
713 } else
714 EMIT(OANYOF, freezeset(p, cs));
ff09270c
KB
715}
716
717/*
718 - p_b_term - parse one term of a bracketed character list
6175ca7c 719 == static void p_b_term(register struct parse *p, register cset *cs);
ff09270c
KB
720 */
721static void
722p_b_term(p, cs)
723register struct parse *p;
724register cset *cs;
725{
6175ca7c
KB
726 register char c;
727 register char start, finish;
ff09270c
KB
728 register int i;
729
730 /* classify what we've got */
6175ca7c 731 switch ((MORE()) ? PEEK() : '\0') {
ff09270c 732 case '[':
6175ca7c 733 c = (MORE2()) ? PEEK2() : '\0';
ff09270c
KB
734 break;
735 case '-':
736 SETERROR(REG_ERANGE);
737 return; /* NOTE RETURN */
738 break;
739 default:
740 c = '\0';
741 break;
742 }
743
744 switch (c) {
745 case ':': /* character class */
746 NEXT2();
6175ca7c 747 REQUIRE(MORE(), REG_EBRACK);
ff09270c 748 c = PEEK();
ff09270c
KB
749 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
750 p_b_cclass(p, cs);
6175ca7c 751 REQUIRE(MORE(), REG_EBRACK);
ff09270c
KB
752 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
753 break;
754 case '=': /* equivalence class */
755 NEXT2();
6175ca7c 756 REQUIRE(MORE(), REG_EBRACK);
ff09270c 757 c = PEEK();
ff09270c
KB
758 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
759 p_b_eclass(p, cs);
6175ca7c 760 REQUIRE(MORE(), REG_EBRACK);
ff09270c
KB
761 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
762 break;
763 default: /* symbol, ordinary character, or range */
764/* xxx revision needed for multichar stuff */
765 start = p_b_symbol(p);
6175ca7c 766 if (SEE('-') && MORE2() && PEEK2() != ']') {
ff09270c
KB
767 /* range */
768 NEXT();
769 if (EAT('-'))
770 finish = '-';
771 else
772 finish = p_b_symbol(p);
773 } else
774 finish = start;
6175ca7c 775/* xxx what about signed chars here... */
ff09270c 776 REQUIRE(start <= finish, REG_ERANGE);
6175ca7c 777 for (i = start; i <= finish; i++)
ff09270c 778 CHadd(cs, i);
ff09270c
KB
779 break;
780 }
781}
782
783/*
784 - p_b_cclass - parse a character-class name and deal with it
6175ca7c 785 == static void p_b_cclass(register struct parse *p, register cset *cs);
ff09270c
KB
786 */
787static void
788p_b_cclass(p, cs)
789register struct parse *p;
790register cset *cs;
791{
6175ca7c 792 register char *sp = p->next;
ff09270c 793 register struct cclass *cp;
6175ca7c
KB
794 register size_t len;
795 register char *u;
796 register char c;
ff09270c 797
6175ca7c
KB
798 while (MORE() && isalpha(PEEK()))
799 NEXT();
800 len = p->next - sp;
ff09270c 801 for (cp = cclasses; cp->name != NULL; cp++)
6175ca7c 802 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
ff09270c
KB
803 break;
804 if (cp->name == NULL) {
805 /* oops, didn't find it */
806 SETERROR(REG_ECTYPE);
807 return;
808 }
809
6175ca7c 810 u = cp->chars;
ff09270c
KB
811 while ((c = *u++) != '\0')
812 CHadd(cs, c);
6175ca7c 813 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
ff09270c
KB
814 MCadd(cs, u);
815}
816
817/*
818 - p_b_eclass - parse an equivalence-class name and deal with it
6175ca7c 819 == static void p_b_eclass(register struct parse *p, register cset *cs);
ff09270c
KB
820 *
821 * This implementation is incomplete. xxx
822 */
823static void
824p_b_eclass(p, cs)
825register struct parse *p;
826register cset *cs;
827{
6175ca7c 828 register char c;
ff09270c
KB
829
830 c = p_b_coll_elem(p, '=');
831 CHadd(cs, c);
832}
833
834/*
835 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
6175ca7c 836 == static char p_b_symbol(register struct parse *p);
ff09270c 837 */
6175ca7c 838static char /* value of symbol */
ff09270c
KB
839p_b_symbol(p)
840register struct parse *p;
841{
6175ca7c 842 register char value;
ff09270c 843
6175ca7c
KB
844 REQUIRE(MORE(), REG_EBRACK);
845 if (!EATTWO('[', '.'))
ff09270c 846 return(GETNEXT());
ff09270c
KB
847
848 /* collating symbol */
ff09270c
KB
849 value = p_b_coll_elem(p, '.');
850 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
851 return(value);
852}
853
854/*
855 - p_b_coll_elem - parse a collating-element name and look it up
6175ca7c 856 == static char p_b_coll_elem(register struct parse *p, int endc);
ff09270c 857 */
6175ca7c 858static char /* value of collating element */
ff09270c
KB
859p_b_coll_elem(p, endc)
860register struct parse *p;
6175ca7c 861int endc; /* name ended by endc,']' */
ff09270c 862{
6175ca7c 863 register char *sp = p->next;
ff09270c
KB
864 register struct cname *cp;
865 register int len;
6175ca7c 866 register char c;
ff09270c 867
6175ca7c 868 while (MORE() && !SEETWO(endc, ']'))
ff09270c 869 NEXT();
6175ca7c 870 if (!MORE()) {
ff09270c
KB
871 SETERROR(REG_EBRACK);
872 return(0);
873 }
874 len = p->next - sp;
875 for (cp = cnames; cp->name != NULL; cp++)
6175ca7c 876 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
ff09270c
KB
877 return(cp->code); /* known name */
878 if (len == 1)
879 return(*sp); /* single character */
880 SETERROR(REG_ECOLLATE); /* neither */
881 return(0);
882}
883
884/*
885 - othercase - return the case counterpart of an alphabetic
6175ca7c 886 == static char othercase(int ch);
ff09270c 887 */
6175ca7c 888static char /* if no counterpart, return ch */
ff09270c 889othercase(ch)
6175ca7c 890int ch;
ff09270c
KB
891{
892 assert(isalpha(ch));
893 if (isupper(ch))
894 return(tolower(ch));
895 else if (islower(ch))
896 return(toupper(ch));
897 else /* peculiar, but could happen */
898 return(ch);
899}
900
901/*
6175ca7c
KB
902 - bothcases - emit a dualcase version of a two-case character
903 == static void bothcases(register struct parse *p, int ch);
4d4cb2b0 904 *
ff09270c
KB
905 * Boy, is this implementation ever a kludge...
906 */
907static void
908bothcases(p, ch)
909register struct parse *p;
6175ca7c 910int ch;
ff09270c 911{
6175ca7c
KB
912 register char *oldnext = p->next;
913 register char *oldend = p->end;
914 char bracket[3];
ff09270c 915
6175ca7c 916 assert(othercase(ch) != ch); /* p_bracket() would recurse */
ff09270c 917 p->next = bracket;
6175ca7c 918 p->end = bracket+2;
ff09270c
KB
919 bracket[0] = ch;
920 bracket[1] = ']';
921 bracket[2] = '\0';
922 p_bracket(p);
923 assert(p->next == bracket+2);
924 p->next = oldnext;
6175ca7c 925 p->end = oldend;
ff09270c
KB
926}
927
928/*
929 - ordinary - emit an ordinary character
6175ca7c 930 == static void ordinary(register struct parse *p, register int ch);
ff09270c
KB
931 */
932static void
933ordinary(p, ch)
934register struct parse *p;
6175ca7c 935register int ch;
ff09270c 936{
6175ca7c 937 register cat_t *cap = p->g->categories;
ff09270c 938
6175ca7c 939 if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
ff09270c 940 bothcases(p, ch);
6175ca7c
KB
941 else {
942 EMIT(OCHAR, (unsigned char)ch);
943 if (cap[ch] == 0)
944 cap[ch] = p->g->ncategories++;
ff09270c 945 }
ff09270c
KB
946}
947
948/*
949 - nonnewline - emit REG_NEWLINE version of OANY
6175ca7c 950 == static void nonnewline(register struct parse *p);
4d4cb2b0 951 *
ff09270c
KB
952 * Boy, is this implementation ever a kludge...
953 */
954static void
955nonnewline(p)
956register struct parse *p;
957{
6175ca7c
KB
958 register char *oldnext = p->next;
959 register char *oldend = p->end;
960 char bracket[4];
ff09270c 961
ff09270c 962 p->next = bracket;
6175ca7c 963 p->end = bracket+3;
ff09270c
KB
964 bracket[0] = '^';
965 bracket[1] = '\n';
966 bracket[2] = ']';
967 bracket[3] = '\0';
968 p_bracket(p);
969 assert(p->next == bracket+3);
970 p->next = oldnext;
6175ca7c 971 p->end = oldend;
ff09270c
KB
972}
973
974/*
975 - repeat - generate code for a bounded repetition, recursively if needed
6175ca7c 976 == static void repeat(register struct parse *p, sopno start, int from, int to);
ff09270c
KB
977 */
978static void
979repeat(p, start, from, to)
980register struct parse *p;
981sopno start; /* operand from here to end of strip */
982int from; /* repeated from this number */
983int to; /* to this number of times (maybe INFINITY) */
984{
985 register sopno finish = HERE();
986# define N 2
987# define INF 3
988# define REP(f, t) ((f)*8 + (t))
989# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
990 register sopno copy;
991
992 if (p->error != 0) /* head off possible runaway recursion */
993 return;
994
995 assert(from <= to);
996
997 switch (REP(MAP(from), MAP(to))) {
998 case REP(0, 0): /* must be user doing this */
999 DROP(finish-start); /* drop the operand */
1000 break;
1001 case REP(0, 1): /* as x{1,1}? */
1002 case REP(0, N): /* as x{1,n}? */
1003 case REP(0, INF): /* as x{1,}? */
1004 INSERT(OQUEST_, start); /* offset is wrong... */
1005 repeat(p, start+1, 1, to);
6175ca7c
KB
1006 AHEAD(start); /* ... fix it */
1007 ASTERN(O_QUEST, start);
ff09270c
KB
1008 break;
1009 case REP(1, 1): /* trivial case */
1010 /* done */
1011 break;
1012 case REP(1, N): /* as x?x{1,n-1} */
1013 INSERT(OQUEST_, start);
6175ca7c 1014 ASTERN(O_QUEST, start);
ff09270c
KB
1015 copy = dupl(p, start+1, finish+1);
1016 assert(copy == finish+2);
1017 repeat(p, copy, 1, to-1);
1018 break;
1019 case REP(1, INF): /* as x+ */
1020 INSERT(OPLUS_, start);
6175ca7c 1021 ASTERN(O_PLUS, start);
ff09270c
KB
1022 break;
1023 case REP(N, N): /* as xx{m-1,n-1} */
1024 copy = dupl(p, start, finish);
1025 repeat(p, copy, from-1, to-1);
1026 break;
1027 case REP(N, INF): /* as xx{n-1,INF} */
1028 copy = dupl(p, start, finish);
1029 repeat(p, copy, from-1, to);
1030 break;
1031 default: /* "can't happen" */
1032 SETERROR(REG_ASSERT); /* just in case */
1033 break;
1034 }
1035}
1036
1037/*
1038 - seterr - set an error condition
6175ca7c 1039 == static int seterr(register struct parse *p, int e);
ff09270c
KB
1040 */
1041static int /* useless but makes type checking happy */
1042seterr(p, e)
1043register struct parse *p;
1044int e;
1045{
1046 if (p->error == 0) /* keep earliest error condition */
1047 p->error = e;
1048 p->next = nuls; /* try to bring things to a halt */
6175ca7c 1049 p->end = nuls;
ff09270c
KB
1050 return(0); /* make the return value well-defined */
1051}
1052
1053/*
1054 - allocset - allocate a set of characters for []
6175ca7c 1055 == static cset *allocset(register struct parse *p);
ff09270c
KB
1056 */
1057static cset *
1058allocset(p)
1059register struct parse *p;
1060{
1061 register int no = p->g->ncsets++;
1062 register size_t nc;
1063 register size_t nbytes;
1064 register cset *cs;
ff09270c
KB
1065 register size_t css = (size_t)p->g->csetsize;
1066
1067 if (no >= p->ncsalloc) { /* need another column of space */
1068 p->ncsalloc += CHAR_BIT;
1069 nc = p->ncsalloc;
1070 assert(nc % CHAR_BIT == 0);
1071 nbytes = nc / CHAR_BIT * css;
1072 if (p->g->sets == NULL)
1073 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1074 else
1075 p->g->sets = (cset *)realloc((char *)p->g->sets,
1076 nc * sizeof(cset));
1077 if (p->g->setbits == NULL)
1078 p->g->setbits = (uchar *)malloc(nbytes);
1079 else
1080 p->g->setbits = (uchar *)realloc((char *)p->g->setbits,
1081 nbytes);
1082 if (p->g->sets != NULL && p->g->setbits != NULL)
1083 (void) memset((char *)p->g->setbits + (nbytes - css),
1084 0, css);
1085 else {
1086 no = 0;
1087 SETERROR(REG_ESPACE);
1088 /* caller's responsibility not to do set ops */
1089 }
1090 }
1091
1092 assert(p->g->sets != NULL); /* xxx */
1093 cs = &p->g->sets[no];
1094 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1095 cs->mask = 1 << ((no) % CHAR_BIT);
1096 cs->hash = 0;
1097 cs->smultis = 0;
1098 cs->multis = NULL;
1099
1100 return(cs);
1101}
1102
6175ca7c
KB
1103/*
1104 - freeset - free a now-unused set
1105 == static void freeset(register struct parse *p, register cset *cs);
1106 */
1107static void
1108freeset(p, cs)
1109register struct parse *p;
1110register cset *cs;
1111{
1112 register int i;
1113 register cset *top = &p->g->sets[p->g->ncsets];
1114 register size_t css = (size_t)p->g->csetsize;
1115
1116 for (i = 0; i < css; i++)
1117 CHsub(cs, i);
1118 if (cs == top-1) /* recover only the easy case */
1119 p->g->ncsets--;
1120}
1121
ff09270c
KB
1122/*
1123 - freezeset - final processing on a set of characters
6175ca7c 1124 == static int freezeset(register struct parse *p, register cset *cs);
ff09270c
KB
1125 *
1126 * The main task here is merging identical sets. This is usually a waste
1127 * of time (although the hash code minimizes the overhead), but can win
1128 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1129 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1130 * the same value!
1131 */
1132static int /* set number */
1133freezeset(p, cs)
1134register struct parse *p;
1135register cset *cs;
1136{
1137 register uchar h = cs->hash;
1138 register int i;
1139 register cset *top = &p->g->sets[p->g->ncsets];
ff09270c
KB
1140 register cset *cs2;
1141 register size_t css = (size_t)p->g->csetsize;
1142
1143 /* look for an earlier one which is the same */
1144 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1145 if (cs2->hash == h && cs2 != cs) {
1146 /* maybe */
1147 for (i = 0; i < css; i++)
1148 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1149 break; /* no */
1150 if (i == css)
1151 break; /* yes */
1152 }
1153
1154 if (cs2 < top) { /* found one */
6175ca7c 1155 freeset(p, cs);
ff09270c
KB
1156 cs = cs2;
1157 }
1158
1159 return((int)(cs - p->g->sets));
1160}
1161
6175ca7c
KB
1162/*
1163 - firstch - return first character in a set (which must have at least one)
1164 == static int firstch(register struct parse *p, register cset *cs);
1165 */
1166static int /* character; there is no "none" value */
1167firstch(p, cs)
1168register struct parse *p;
1169register cset *cs;
1170{
1171 register int i;
1172 register size_t css = (size_t)p->g->csetsize;
1173
1174 for (i = 0; i < css; i++)
1175 if (CHIN(cs, i))
1176 return((char)i);
1177 assert(never);
1178 return(0); /* arbitrary */
1179}
1180
1181/*
1182 - nch - number of characters in a set
1183 == static int nch(register struct parse *p, register cset *cs);
1184 */
1185static int
1186nch(p, cs)
1187register struct parse *p;
1188register cset *cs;
1189{
1190 register int i;
1191 register size_t css = (size_t)p->g->csetsize;
1192 register int n = 0;
1193
1194 for (i = 0; i < css; i++)
1195 if (CHIN(cs, i))
1196 n++;
1197 return(n);
1198}
1199
ff09270c
KB
1200/*
1201 - mcadd - add a collating element to a cset
6175ca7c
KB
1202 == static void mcadd(register struct parse *p, register cset *cs, \
1203 == register char *cp);
ff09270c
KB
1204 */
1205static void
1206mcadd(p, cs, cp)
1207register struct parse *p;
1208register cset *cs;
6175ca7c 1209register char *cp;
ff09270c
KB
1210{
1211 register size_t oldend = cs->smultis;
1212
6175ca7c 1213 cs->smultis += strlen(cp) + 1;
ff09270c 1214 if (cs->multis == NULL)
6175ca7c 1215 cs->multis = malloc(cs->smultis);
ff09270c 1216 else
6175ca7c 1217 cs->multis = realloc(cs->multis, cs->smultis);
ff09270c
KB
1218 if (cs->multis == NULL) {
1219 SETERROR(REG_ESPACE);
1220 return;
1221 }
1222
6175ca7c 1223 (void) strcpy(cs->multis + oldend - 1, cp);
ff09270c
KB
1224 cs->multis[cs->smultis - 1] = '\0';
1225}
1226
1227/*
1228 - mcsub - subtract a collating element from a cset
6175ca7c 1229 == static void mcsub(register cset *cs, register char *cp);
ff09270c
KB
1230 */
1231static void
6175ca7c 1232mcsub(cs, cp)
ff09270c 1233register cset *cs;
6175ca7c 1234register char *cp;
ff09270c 1235{
6175ca7c
KB
1236 register char *fp = mcfind(cs, cp);
1237 register size_t len = strlen(fp);
ff09270c 1238
6175ca7c
KB
1239 assert(fp != NULL);
1240 (void) memmove(fp, fp + len + 1,
ff09270c
KB
1241 cs->smultis - (fp + len + 1 - cs->multis));
1242 cs->smultis -= len;
1243
1244 if (cs->smultis == 0) {
6175ca7c 1245 free(cs->multis);
ff09270c
KB
1246 cs->multis = NULL;
1247 return;
1248 }
1249
6175ca7c 1250 cs->multis = realloc(cs->multis, cs->smultis);
ff09270c
KB
1251 assert(cs->multis != NULL);
1252}
1253
1254/*
1255 - mcin - is a collating element in a cset?
6175ca7c 1256 == static int mcin(register cset *cs, register char *cp);
ff09270c
KB
1257 */
1258static int
6175ca7c 1259mcin(cs, cp)
ff09270c 1260register cset *cs;
6175ca7c 1261register char *cp;
ff09270c
KB
1262{
1263 return(mcfind(cs, cp) != NULL);
1264}
1265
1266/*
1267 - mcfind - find a collating element in a cset
6175ca7c 1268 == static char *mcfind(register cset *cs, register char *cp);
ff09270c 1269 */
6175ca7c 1270static char *
ff09270c
KB
1271mcfind(cs, cp)
1272register cset *cs;
6175ca7c 1273register char *cp;
ff09270c 1274{
6175ca7c 1275 register char *p;
ff09270c
KB
1276
1277 if (cs->multis == NULL)
1278 return(NULL);
6175ca7c
KB
1279 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1280 if (strcmp(cp, p) == 0)
ff09270c
KB
1281 return(p);
1282 return(NULL);
1283}
1284
1285/*
1286 - mcinvert - invert the list of collating elements in a cset
6175ca7c 1287 == static void mcinvert(register cset *cs);
ff09270c
KB
1288 *
1289 * This would have to know the set of possibilities. Implementation
1290 * is deferred.
1291 */
1292static void
6175ca7c
KB
1293mcinvert(cs)
1294register cset *cs;
1295{
1296 assert(cs->multis == NULL); /* xxx */
1297}
1298
1299/*
1300 - mccase - add case counterparts of the list of collating elements in a cset
1301 == static void mccase(register cset *cs);
1302 *
1303 * This would have to know the set of possibilities. Implementation
1304 * is deferred.
1305 */
1306static void
1307mccase(cs)
ff09270c
KB
1308register cset *cs;
1309{
1310 assert(cs->multis == NULL); /* xxx */
1311}
1312
1313/*
1314 - isinsets - is this character in any sets?
6175ca7c 1315 == static int isinsets(register struct re_guts *g, int c);
ff09270c
KB
1316 */
1317static int /* predicate */
1318isinsets(g, c)
1319register struct re_guts *g;
6175ca7c 1320int c;
ff09270c
KB
1321{
1322 register uchar *col;
1323 register int i;
1324 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
6175ca7c 1325 register unsigned uc = (unsigned char)c;
ff09270c
KB
1326
1327 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
6175ca7c 1328 if (col[uc] != 0)
ff09270c
KB
1329 return(1);
1330 return(0);
1331}
1332
1333/*
1334 - samesets - are these two characters in exactly the same sets?
6175ca7c 1335 == static int samesets(register struct re_guts *g, int c1, int c2);
ff09270c
KB
1336 */
1337static int /* predicate */
1338samesets(g, c1, c2)
1339register struct re_guts *g;
6175ca7c
KB
1340int c1;
1341int c2;
ff09270c
KB
1342{
1343 register uchar *col;
1344 register int i;
1345 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
6175ca7c
KB
1346 register unsigned uc1 = (unsigned char)c1;
1347 register unsigned uc2 = (unsigned char)c2;
ff09270c
KB
1348
1349 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
6175ca7c 1350 if (col[uc1] != col[uc2])
ff09270c
KB
1351 return(0);
1352 return(1);
1353}
1354
1355/*
1356 - categorize - sort out character categories
6175ca7c 1357 == static void categorize(struct parse *p, register struct re_guts *g);
ff09270c
KB
1358 */
1359static void
1360categorize(p, g)
1361struct parse *p;
1362register struct re_guts *g;
1363{
6175ca7c
KB
1364 register cat_t *cats = g->categories;
1365 register int c;
1366 register int c2;
1367 register cat_t cat;
ff09270c
KB
1368
1369 /* avoid making error situations worse */
1370 if (p->error != 0)
1371 return;
1372
6175ca7c 1373 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
ff09270c
KB
1374 if (cats[c] == 0 && isinsets(g, c)) {
1375 cat = g->ncategories++;
1376 cats[c] = cat;
6175ca7c 1377 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
ff09270c
KB
1378 if (cats[c2] == 0 && samesets(g, c, c2))
1379 cats[c2] = cat;
1380 }
1381}
1382
1383/*
1384 - dupl - emit a duplicate of a bunch of sops
6175ca7c 1385 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
ff09270c
KB
1386 */
1387static sopno /* start of duplicate */
1388dupl(p, start, finish)
1389register struct parse *p;
1390sopno start; /* from here */
1391sopno finish; /* to this less one */
1392{
ff09270c
KB
1393 register sopno ret = HERE();
1394 register sopno len = finish - start;
1395
1396 assert(finish >= start);
1397 if (len == 0)
1398 return(ret);
1399 enlarge(p, p->ssize + len); /* this many unexpected additions */
1400 assert(p->ssize >= p->slen + len);
1401 (void) memcpy((char *)(p->strip + p->slen),
1402 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1403 p->slen += len;
1404 return(ret);
1405}
1406
1407/*
1408 - doemit - emit a strip operator
6175ca7c 1409 == static void doemit(register struct parse *p, sop op, size_t opnd);
ff09270c
KB
1410 *
1411 * It might seem better to implement this as a macro with a function as
1412 * hard-case backup, but it's just too big and messy unless there are
1413 * some changes to the data structures. Maybe later.
1414 */
1415static void
1416doemit(p, op, opnd)
1417register struct parse *p;
1418sop op;
1419size_t opnd;
1420{
1421 /* avoid making error situations worse */
1422 if (p->error != 0)
1423 return;
1424
1425 /* deal with oversize operands ("can't happen", more or less) */
1426 assert(opnd < 1<<OPSHIFT);
1427
1428 /* deal with undersized strip */
1429 if (p->slen >= p->ssize)
1430 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1431 assert(p->slen < p->ssize);
1432
1433 /* finally, it's all reduced to the easy case */
1434 p->strip[p->slen++] = SOP(op, opnd);
1435}
1436
1437/*
1438 - doinsert - insert a sop into the strip
6175ca7c 1439 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
ff09270c
KB
1440 */
1441static void
1442doinsert(p, op, opnd, pos)
1443register struct parse *p;
1444sop op;
1445size_t opnd;
1446sopno pos;
1447{
1448 register sopno sn;
1449 register sop s;
1450 register int i;
1451
1452 /* avoid making error situations worse */
1453 if (p->error != 0)
1454 return;
1455
1456 sn = HERE();
1457 EMIT(op, opnd); /* do checks, ensure space */
1458 assert(HERE() == sn+1);
1459 s = p->strip[sn];
1460
1461 /* adjust paren pointers */
1462 assert(pos > 0);
1463 for (i = 1; i < NPAREN; i++) {
1464 if (p->pbegin[i] >= pos) {
1465 p->pbegin[i]++;
1466 }
1467 if (p->pend[i] >= pos) {
1468 p->pend[i]++;
1469 }
1470 }
1471
1472 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1473 (HERE()-pos-1)*sizeof(sop));
1474 p->strip[pos] = s;
1475}
1476
1477/*
1478 - dofwd - complete a forward reference
6175ca7c 1479 == static void dofwd(register struct parse *p, sopno pos, sop value);
ff09270c
KB
1480 */
1481static void
1482dofwd(p, pos, value)
1483register struct parse *p;
1484register sopno pos;
1485sop value;
1486{
1487 /* avoid making error situations worse */
1488 if (p->error != 0)
1489 return;
1490
1491 assert(value < 1<<OPSHIFT);
1492 p->strip[pos] = OP(p->strip[pos]) | value;
1493}
1494
1495/*
1496 - enlarge - enlarge the strip
6175ca7c 1497 == static void enlarge(register struct parse *p, sopno size);
ff09270c
KB
1498 */
1499static void
1500enlarge(p, size)
1501register struct parse *p;
1502register sopno size;
1503{
1504 register sop *sp;
1505
1506 if (p->ssize >= size)
1507 return;
1508
1509 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1510 if (sp == NULL) {
1511 SETERROR(REG_ESPACE);
1512 return;
1513 }
1514 p->strip = sp;
1515 p->ssize = size;
1516}
1517
1518/*
1519 - stripsnug - compact the strip
6175ca7c 1520 == static void stripsnug(register struct parse *p, register struct re_guts *g);
ff09270c
KB
1521 */
1522static void
1523stripsnug(p, g)
1524register struct parse *p;
1525register struct re_guts *g;
1526{
1527 g->nstates = p->slen;
1528 g->strip = (sop *)realloc((sop *)p->strip, p->slen * sizeof(sop));
1529 if (g->strip == NULL) {
1530 SETERROR(REG_ESPACE);
1531 g->strip = p->strip;
1532 }
1533}
1534
1535/*
1536 - findmust - fill in must and mlen with longest mandatory literal string
6175ca7c 1537 == static void findmust(register struct parse *p, register struct re_guts *g);
ff09270c
KB
1538 *
1539 * This algorithm could do fancy things like analyzing the operands of |
1540 * for common subsequences. Someday. This code is simple and finds most
1541 * of the interesting cases.
1542 *
1543 * Note that must and mlen got initialized during setup.
1544 */
4d4cb2b0 1545static void
ff09270c
KB
1546findmust(p, g)
1547struct parse *p;
1548register struct re_guts *g;
1549{
1550 register sop *scan;
1551 sop *start;
1552 register sop *newstart;
1553 register sopno newlen;
1554 register sop s;
1555 register char *cp;
1556 register sopno i;
1557
1558 /* avoid making error situations worse */
1559 if (p->error != 0)
1560 return;
1561
1562 /* find the longest OCHAR sequence in strip */
1563 newlen = 0;
1564 scan = g->strip + 1;
1565 do {
1566 s = *scan++;
1567 switch (OP(s)) {
1568 case OCHAR: /* sequence member */
1569 if (newlen == 0) /* new sequence */
1570 newstart = scan - 1;
1571 newlen++;
1572 break;
1573 case OPLUS_: /* things that don't break one */
1574 case OLPAREN:
1575 case ORPAREN:
1576 break;
1577 case OQUEST_: /* things that must be skipped */
1578 case OCH_:
1579 scan--;
1580 do {
1581 scan += OPND(s);
1582 s = *scan;
1583 /* assert() interferes w debug printouts */
1584 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1585 OP(s) != OOR2) {
1586 g->iflags |= BAD;
1587 return;
1588 }
1589 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1590 /* fallthrough */
1591 default: /* things that break a sequence */
1592 if (newlen > g->mlen) { /* ends one */
1593 start = newstart;
1594 g->mlen = newlen;
1595 }
1596 newlen = 0;
1597 break;
1598 }
1599 } while (OP(s) != OEND);
1600
1601 if (g->mlen == 0) /* there isn't one */
1602 return;
1603
1604 /* turn it into a character string */
1605 g->must = malloc((size_t)g->mlen + 1);
1606 if (g->must == NULL) { /* argh; just forget it */
1607 g->mlen = 0;
1608 return;
1609 }
1610 cp = g->must;
1611 scan = start;
1612 for (i = g->mlen; i > 0; i--) {
1613 while (OP(s = *scan++) != OCHAR)
1614 continue;
6175ca7c 1615 *cp++ = (char)OPND(s);
ff09270c
KB
1616 }
1617 *cp++ = '\0'; /* just on general principles */
1618}
1619
1620/*
1621 - pluscount - count + nesting
6175ca7c 1622 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
ff09270c 1623 */
4d4cb2b0 1624static sopno /* nesting depth */
ff09270c
KB
1625pluscount(p, g)
1626struct parse *p;
1627register struct re_guts *g;
1628{
1629 register sop *scan;
1630 register sop s;
1631 register sopno plusnest = 0;
1632 register sopno maxnest = 0;
1633
1634 if (p->error != 0)
1635 return(0); /* there may not be an OEND */
1636
1637 scan = g->strip + 1;
1638 do {
1639 s = *scan++;
1640 switch (OP(s)) {
1641 case OPLUS_:
1642 plusnest++;
1643 break;
1644 case O_PLUS:
1645 if (plusnest > maxnest)
1646 maxnest = plusnest;
1647 plusnest--;
1648 break;
1649 }
1650 } while (OP(s) != OEND);
1651 if (plusnest != 0)
1652 g->iflags |= BAD;
1653 return(maxnest);
1654}