everybody has the -h flag, now
[unix-history] / usr / src / usr.bin / grep / egrep / egrep.c
CommitLineData
896308ef 1#ifndef lint
ee56711b 2static char sccsid[] = "@(#)egrep.c 5.6 (Berkeley) %G%";
896308ef
KB
3#endif not lint
4
5/*
6 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7 table as in original paper (CACM, October, 1977). No delta1 or delta2.
8 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9 minimal practical value. However, to improve for worst case input,
10 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11 Comput., Feb. 1986) deserves consideration.
12
13 Method: extract longest metacharacter-free string from expression.
14 this is done using a side-effect from henry spencer's regcomp().
15 use boyer-moore to match such, then pass submatching lines
16 to either regexp() or standard 'egrep', depending on certain
17 criteria within execstrategy() below. [this tradeoff is due
18 to the general slowness of the regexp() nondeterministic
19 machine on complex expressions, as well as the startup time
20 of standard 'egrep' on short files.] alternatively, one may
21 change the vendor-supplied 'egrep' automaton to include
22 boyer-moore directly. see accompanying writeup for discussion
23 of kanji expression treatment.
24
25 late addition: apply trickbag for fast match of simple
26 alternations (sublinear, in common low-cardinality cases).
27 trap fgrep into this lair.
28
29 gnu additions: -f, newline as |, \< and \> [in regexec()], more
30 comments. inspire better dfa exec() strategy.
31 serious testing and help with special cases.
32
33 Algorithm amalgam summary:
34
35 dfa e?grep (aho/thompson)
36 ndfa regexp() (spencer/aho)
37 bmg (boyer/moore/gosper)
38 "superimposed" bmg (jaw)
39 fgrep (aho/corrasick)
40
41 sorry, but the knuth/morris/pratt machine, horspool's
42 "frequentist" code, and the rabin/karp matcher, however cute,
43 just don't cut it for this production.
44
45 James A. Woods Copyright (c) 1986
46 NASA Ames Research Center
47*/
48#include <stdio.h>
49#include <ctype.h>
50#include <sys/types.h>
51#include <sys/stat.h>
81922b46 52#include <sys/file.h>
896308ef
KB
53#include <regexp.h> /* must be henry spencer's version */
54
55#define MIN(A, B) ((A) > (B) ? (B) : (A))
56
57#ifdef SLOWSYS
58#define read xread
59#endif
60/*
61 * define existing [ef]?grep program locations below for use by execvp().
62 * [execlp() would be used were it not for the possibility of
63 * installation-dependent recursion.]
64 */
65#ifndef EGREPSTD
66#define EGREPSTD "/usr/bin/egrep"
67#endif
68#ifndef GREPSTD
69#define GREPSTD "/bin/grep"
70#endif
71#ifndef FGREPSTD
72#define FGREPSTD "/usr/bin/fgrep"
73#endif
74
75#define BUFSIZE 8192 /* make higher for cray */
76#define PATSIZE 6000
77#define LARGE BUFSIZE + PATSIZE
78
896308ef
KB
79#define NALT 7 /* tied to scanf() size in alternate() */
80#define NMUSH 6 /* loosely relates to expected alt length */
81
82#define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
83#define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
84 * was processed by regexp(), exec std egrep. */
85#define NL '\n'
86#define EOS '\0'
87#define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
88#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
89#define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
90#define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
91
92extern char *optarg;
93extern int optind;
94char *progname;
95
96int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
97int sflag, hflag; /* v7, v8, bsd */
98
99int firstflag; /* Stop at first match */
100int grepflag; /* Called as "grep" */
101int fgrepflag; /* Called as "fgrep" */
102int altflag; /* Simple alternation in pattern */
103int boyonly; /* No regexp needed -- all simple */
104int flushflag;
105int grepold, egrepold, fgrepold;
106
107int nalt; /* Number of alternatives */
108int nsuccess; /* 1 for match, 2 for error */
109int altmin; /* Minimum length of all the alternate
110 * strings */
111int firstfile; /* argv index of first file argument */
112int patind; /* argv index of pattern */
113long nmatch; /* Number of matches in this file */
114long incount, counted; /* Amount of input consumed */
115long rxcount; /* Bytes of input processed by regexec() */
116int boyfound; /* accumulated partial matches (tripped by
117 * FIRSTFEW) */
118int prevmatch; /* next three lines aid fast -n */
119long nline, prevnline;
120char *prevloc;
121
122regexp *rspencer;
123char *pattern;
124char *patboy; /* Pattern for simple Boyer-Moore */
125char *patfile; /* Filename containing pattern(s) */
126
127int delta0[256]; /* Boyer-Moore algorithm core */
128char cmap[256]; /* Usually 0-255, but if -i, maps upper to
129 * lower case */
130char str[BUFSIZE + 2];
131int nleftover;
132char linetemp[BUFSIZE];
89422f46 133char *altpat[NALT]; /* alternation component storage */
896308ef
KB
134int altlen[NALT];
135short altset[NMUSH + 1][256];
136char preamble[200]; /* match prefix (filename, line no.) */
137
138int fd;
139char *
140strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *sprintf(), *malloc();
141char *
142grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143char **args;
144
145main(argc, argv)
146 int argc;
147 char *argv[];
148{
149 int c;
150 int errflag = 0;
151
152 args = argv;
153
154 if ((progname = strrchr(argv[0], '/')) != 0)
155 progname++;
156 else
157 progname = argv[0];
158 if (strcmp(progname, "grep") == 0)
159 grepflag++;
160 if (strcmp(progname, "fgrep") == 0)
161 fgrepflag++;
162
163 while ((c = getopt(argc, argv, "bchie:f:lnsvwxy1")) != EOF) {
164 switch (c) {
165
166 case 'f':
167 fflag++;
168 patfile = optarg;
169 continue;
170 case 'b':
171 case 'v':
172 egrepold++; /* boyer-moore of little help here */
173 continue;
174 case 'c':
175 cflag++;
176 continue;
177 case 'e':
178 eflag++;
179 pattern = optarg;
180 continue;
181 case 'h':
182 hflag++;
183 continue;
184 case '1': /* Stop at very first match */
185 firstflag++; /* spead freaks only */
186 continue;
187 case 'i':
188 iflag++;
189 continue;
190 case 'l':
191 lflag++;
192 continue;
193 case 'n':
194 nflag++;
195 continue;
196 case 's':
197 sflag++;
198 continue;
199 case 'w':
200 case 'y':
201 if (!grepflag)
202 errflag++;
203 grepold++;
204 continue;
205 case 'x': /* needs more work, like -b above */
206 if (!fgrepflag)
207 errflag++;
208 fgrepold++;
209 continue;
210 case '?':
211 errflag++;
212 }
213 }
214 if (errflag || ((argc <= optind) && !fflag && !eflag)) {
215 if (grepflag)
ee56711b 216oops("usage: grep [-bchilnsvwy] [-e] pattern [file ...]");
896308ef 217 else if (fgrepflag)
ee56711b 218oops("usage: fgrep [-bchilnv] {-f patfile | [-e] strings} [file ...]");
896308ef
KB
219 else /* encourage SVID options, though we provide
220 * others */
ee56711b 221oops("usage: egrep [-bchilnv] {-f patfile | [-e] pattern} [file ...]");
896308ef
KB
222 }
223 patind = optind;
224 if (fflag)
225 pattern = pfile(patfile), patind = 0;
226 else if (!eflag)
227 pattern = argv[optind++];
228
229 if ((argc - optind) <= 1) /* Filename invisible given < 2 files */
230 hflag++;
231 if (pattern[0] == EOS)
232 kernighan(argv); /* same as it ever was */
233 /*
234 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
235 * exprs \( \), word matches \< and \>, -w and -y options, char
236 * classes with '-' at end (egrep bug?), and patterns beginning with
237 * an asterisk (don't ask why). otherwise, characters meaningful to
238 * 'egrep' but not to 'grep' are escaped; the entire expr is then
239 * passed to 'egrep'.
240 */
241 if (grepflag && !grepold) {
242 if (strindex(pattern, "\\(") >= 0 ||
243 strindex(pattern, "\\<") >= 0 ||
244 strindex(pattern, "\\>") >= 0 ||
245 strindex(pattern, "-]") >= 0 ||
246 pattern[0] == '*') /* grep bug */
247 grepold++;
248 else
249 pattern = grepxlat(pattern);
250 }
251 if (grepold || egrepold || fgrepold)
252 kernighan(argv);
253
254 if (iflag)
255 strcpy(pattern, fold(pattern));
256 /*
257 * If the pattern is a plain string, just run boyer-moore. If it
258 * consists of meta-free alternatives, run "superimposed" bmg.
259 * Otherwise, find best string, and compile pattern for regexec().
260 */
261 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
262 boyonly++;
263 patboy = pattern;
264 } else {
265 if ((patboy = alternate(pattern)) != NULL)
266 boyonly++;
267 else {
268 if ((patboy = isolate(pattern)) == NULL)
269 kernighan(argv); /* expr too involved */
270#ifndef NOKANJI
271 for (c = 0; pattern[c] != EOS; c++)
272 if (pattern[c] & NONASCII) /* kanji + meta */
273 kernighan(argv);
274#endif
275 if ((rspencer = regcomp(pattern)) == NULL)
276 oops("regcomp failure");
277 }
278 }
279 gosper(patboy); /* "pre-conditioning is wonderful"
280 * -- v. strassen */
281
282 if ((firstfile = optind) >= argc) {
283 /* Grep standard input */
284 if (lflag) /* We don't know its name! */
285 exit(1);
286 egsecute((char *) NULL);
287 } else {
288 while (optind < argc) {
289 egsecute(argv[optind]);
290 optind++;
291 if (firstflag && (nsuccess == 1))
292 break;
293 }
294 }
295 exit((nsuccess == 2) ? 2 : (nsuccess == 0));
296}
297
298char *
299pfile(pfname) /* absorb expression from file */
300 char *pfname;
301{
81922b46 302 int fd;
896308ef
KB
303 struct stat patstat;
304 static char *pat;
305
81922b46 306 if ((fd = open(pfname, O_RDONLY, 0)) < 0)
896308ef 307 oops("can't read pattern file");
81922b46 308 if (fstat(fd, &patstat) != 0)
896308ef
KB
309 oops("can't stat pattern file");
310 if (patstat.st_size > PATSIZE) {
311 if (fgrepflag) { /* defer to unix version */
312 fgrepold++;
313 return "dummy";
314 } else
315 oops("pattern file too big");
316 }
317 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
318 oops("out of memory to read pattern file");
81922b46 319 if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
896308ef 320 oops("error reading pattern file");
81922b46 321 (void) close(fd);
896308ef
KB
322
323 pat[patstat.st_size] = EOS;
324 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
325 pat[patstat.st_size - 1] = EOS;
326
327 if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
328 if (fgrepflag)
329 fgrepold++; /* "what's it all about, alfie?" */
330 else
331 egrepold++;
332 }
333 return (pat);
334}
335
336egsecute(file)
337 char *file;
338{
339 if (file == NULL)
340 fd = 0;
89422f46 341 else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
896308ef
KB
342 fprintf(stderr, "%s: can't open %s\n", progname, file);
343 nsuccess = 2;
344 return;
345 }
346 chimaera(file, patboy);
347
348 if (!boyonly && !flushflag && file != NULL)
349 flushmatches();
350 if (file != NULL)
351 close(fd);
352}
353
354chimaera(file, pat) /* "reach out and boyer-moore search someone" */
355 char *file, *pat; /* -- soon-to-be-popular bumper sticker */
356{
357 register char *k, *strend, *s;
358 register int j, count;
359 register int *deltazero = delta0;
360 int patlen = altmin;
361 char *t;
362 char *gotamatch(), *kanji(), *linesave(), *submatch();
363
364 nleftover = boyfound = flushflag = 0;
365 nline = 1L;
366 prevmatch = 0;
367 nmatch = counted = rxcount = 0L;
368
369 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
370
371 counted += count;
372 strend = linesave(str, count);
373
374 for (k = str + patlen - 1; k < strend;) {
375 /*
376 * for a large class of patterns, upwards of 80% of
377 * match time is spent on the next line. we beat
378 * existing microcode (vax 'matchc') this way.
379 */
380 while ((k += deltazero[*(unsigned char *) k]) < strend);
381 if (k < (str + LARGE))
382 break;
383 k -= LARGE;
384
385 if (altflag) {
386 /*
387 * Parallel Boyer-Moore. Check whether each
388 * of the previous <altmin> chars COULD be
389 * from one of the alternative strings.
390 */
391 s = k - 1;
392 j = altmin;
393 while (altset[--j][(unsigned char)
394 cmap[*(unsigned char *) s--]]);
395 /*
396 * quick test fails. in this life, compare
397 * 'em all. but, a "reverse trie" would
398 * attenuate worst case (linear w/delta2?).
399 */
400 if (--j < 0) {
401 count = nalt - 1;
402 do {
403 s = k;
404 j = altlen[count];
405 t = altpat[count];
406
407 while
408 (cmap[*(unsigned char *) s--]
409 == t[--j]);
410 if (j < 0)
411 break;
412 }
413 while (count--);
414 }
415 } else {
416 /* One string -- check it */
417 j = patlen - 1;
418 s = k - 1;
419 while (cmap[*(unsigned char *) s--] == pat[--j]);
420 }
421 /*
422 * delta-less shortcut for literati. short shrift for
423 * genetic engineers?
424 */
425 if (j >= 0) {
426 k++; /* no match; restart next char */
427 continue;
428 }
429 k = submatch(file, pat, str, strend, k, count);
430 if (k == NULL)
431 return;
432 }
433 if (nflag) {
434 if (prevmatch)
435 nline = prevnline + nlcount(prevloc, k);
436 else
437 nline = nline + nlcount(str, k);
438 prevmatch = 0;
439 }
440 strncpy(str, linetemp, nleftover);
441 }
442 if (cflag) {
443 /* Bug from old grep: -c overrides -h. We fix the bug. */
444 if (!hflag)
445 printf("%s:", file);
446 printf("%ld\n", nmatch);
447 }
448}
449
450char *
451linesave(str, count) /* accumulate partial line at end of buffer */
452 char str[];
453 register int count;
454{
455 register int j;
456
457 count += nleftover;
458 if (count != BUFSIZE && fd != 0)
459 str[count++] = NL; /* insurance for broken last line */
460 str[count] = EOS;
461 for (j = count - 1; str[j] != NL && j >= 0;)
462 j--;
463 /*
464 * break up these lines: long line (> BUFSIZE), last line of file, or
465 * short return from read(), as from tee(1) input
466 */
467 if (j < 0 && (count == (BUFSIZE - nleftover))) {
468 str[count++] = NL;
469 str[count] = EOS;
470 linetemp[0] = EOS;
471 nleftover = 0;
472 return (str + count);
473 } else {
474 nleftover = count - j - 1;
475 strncpy(linetemp, str + j + 1, nleftover);
476 return (str + j);
477 }
478}
479
480/*
481 * Process partial match. First check for mis-aligned Kanji, then match line
482 * against full compiled r.e. if statistics do not warrant handing off to
483 * standard egrep.
484 */
485char *
486submatch(file, pat, str, strend, k, altindex)
487 char file[], pat[], str[];
488 register char *strend, *k;
489 int altindex;
490{
491 register char *s;
492 char *t, c;
493
494 t = k;
495 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
496#ifndef NOKANJI
497 c = ((altflag) ? altpat[altindex][0] : pat[0]);
498 if (c & NONASCII)
499 if ((s = kanji(str, s, k)) == NULL)
500 return (++k); /* reject false kanji */
501#endif
502 do;
503 while (*s != NL && --s >= str);
504 k = s + 1; /* now at line start */
505
506 if (boyonly)
507 return (gotamatch(file, k));
508
509 incount = counted - (strend - k);
510 if (boyfound++ == FIRSTFEW)
511 execstrategy(file);
512
513 s = t;
514 do
515 rxcount++;
516 while (*s++ != NL);
517 *--s = EOS;
518 /*
519 * "quick henry -- the flit" (after theodor geisel)
520 */
521 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
522 *s = NL;
523 if (gotamatch(file, k) == NULL)
524 return (NULL);
525 }
526 *s = NL;
527 return (s + 1);
528}
529
008a5c7a 530#ifndef NOKANJI
896308ef
KB
531/*
532 * EUC code disambiguation -- scan backwards to first 7-bit code, while
533 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
534 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
535 */
536char *
537kanji(str, s, k)
538 register char *str, *s, *k;
539{
540 register int j = 0;
541
542 for (s--; s >= str; s--) {
543 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
544 break;
545 j++;
546 }
547#ifndef CHINESE
548 if (*s == SS2)
549 j -= 1;
550#endif CHINESE
551 return ((j & 01) ? NULL : k);
552}
008a5c7a 553#endif
896308ef
KB
554
555/*
556 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
557 */
558gosper(pattern)
559 char *pattern; /* ... HAKMEM lives ... */
560{
561 register int i, j;
562 unsigned char c;
563
564 /* Make one-string case look like simple alternatives case */
565 if (!altflag) {
566 nalt = 1;
567 altmin = altlen[0] = strlen(pattern);
89422f46 568 altpat[0] = pattern;
896308ef
KB
569 }
570 /* For chars that aren't in any string, skip by string length. */
571 for (j = 0; j < 256; j++) {
572 delta0[j] = altmin;
573 cmap[j] = j; /* Sneak in initialization of cmap */
574 }
575
576 /* For chars in a string, skip distance from char to end of string. */
577 /* (If char appears more than once, skip minimum distance.) */
578 for (i = 0; i < nalt; i++)
579 for (j = 0; j < altlen[i] - 1; j++) {
580 c = altpat[i][j];
581 delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
582 if (iflag && islower((int) c))
583 delta0[toupper((int) c)] = delta0[c];
584 }
585
586 /* For last char of each string, fall out of search loop. */
587 for (i = 0; i < nalt; i++) {
588 c = altpat[i][altlen[i] - 1];
589 delta0[c] = LARGE;
590 if (iflag && islower((int) c))
591 delta0[toupper((int) c)] = LARGE;
592 }
593 if (iflag)
594 for (j = 'A'; j <= 'Z'; j++)
595 cmap[j] = tolower((int) j);
596}
597
598/*
599 * Print, count, or stop on full match. Result is either the location for
600 * continued search, or NULL to stop.
601 */
602char *
603gotamatch(file, s)
604 register char *file, *s;
605{
606 char *savematch();
607 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
608
609 nmatch++;
610 nsuccess = 1;
611 if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
612 squirrel = 1;
613
614 if (sflag)
615 return (NULL); /* -s usurps all flags (unlike some versions) */
616 if (cflag) { /* -c overrides -l, we guess */
617 do;
618 while (*s++ != NL);
619 } else if (lflag) {
620 puts(file);
621 return (NULL);
622 } else {
623 if (!hflag)
624 if (!squirrel)
625 printf("%s:", file);
626 else
627 sprintf(preamble, "%s:", file);
628 if (nflag) {
629 if (prevmatch)
630 prevnline = prevnline + nlcount(prevloc, s);
631 else
632 prevnline = nline + nlcount(str, s);
633 prevmatch = 1;
634
635 if (!squirrel)
636 printf("%ld:", prevnline);
637 else
638 sprintf(preamble + strlen(preamble),
639 "%ld:", prevnline);
640 }
641 if (!squirrel) {
642 do
643 putchar(*s);
644 while (*s++ != NL);
645 } else
646 s = savematch(s);
647
648 if (nflag)
649 prevloc = s - 1;
650 }
651 return ((firstflag && !cflag) ? NULL : s);
652}
653
654char *
655fold(line)
656 char *line;
657{
658 static char fline[BUFSIZE];
659 register char *s, *t = fline;
660
661 for (s = line; *s != EOS; s++)
662 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
663 *t = EOS;
664 return (fline);
665}
666
667strindex(s, t) /* the easy way, as in K&P, p. 192 */
668 char *s, *t;
669{
670 int i, n;
671
672 n = strlen(t);
673 for (i = 0; s[i] != '\0'; i++)
674 if (strncmp(s + i, t, n) == 0)
675 return (i);
676 return (-1);
677}
678
679char *
680grepxlat(pattern) /* grep pattern meta conversion */
681 char *pattern;
682{
683 register char *p, *s;
684 static char newpat[BUFSIZE];
685
686 for (s = newpat, p = pattern; *p != EOS;) {
687 if (*p == '\\') { /* skip escapes ... */
688 *s++ = *p++;
689 if (*p)
690 *s++ = *p++;
691 } else if (*p == '[') { /* ... and char classes */
692 while (*p != EOS && *p != ']')
693 *s++ = *p++;
694 } else if (strchr("+?|()", *p) != NULL) {
695 *s++ = '\\'; /* insert protection */
696 *s++ = *p++;
697 } else
698 *s++ = *p++;
699 }
700 *s = EOS;
701 grepflag = ((patind) ? 0 : 1);
702 return (newpat);
703}
704
705/*
706 * Test for simple alternation. Result is NULL if it's not so simple, or is
707 * a pointer to the first string if it is. Warning: sscanf size is a
708 * fixpoint, beyond which the speedup linearity starts to break down. In the
709 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
710 * altpat[] to arbitrary size is not useful.
711 */
712char *
713alternate(regexpr)
714 char *regexpr;
715{
89422f46
KB
716 register int i, j;
717 register char *start, *stop;
896308ef 718 unsigned char c;
896308ef
KB
719
720 if (fgrepflag && strchr(regexpr, '|'))
89422f46 721 return (NULL);
896308ef 722
89422f46
KB
723 /*
724 * break pattern up into altpat array; delimit on newline, bar,
725 * or EOS. We know we won't overflow, we've already checked the
726 * number of patterns we're going to find against NALT.
727 * Also, set length of pattern and find minimum pattern length.
728 */
729 nalt = 0;
730 altmin = NMUSH;
731 for (start = stop = regexpr;; ++stop)
732 if (!*stop || *stop == '|' || *stop == NL) {
733 altlen[nalt] = j = stop - start;
734 if (j < altmin)
735 altmin = j;
736 if (!(altpat[nalt] = malloc((u_int)(j + 1))))
737 oops("out of memory");
738 bcopy(start, altpat[nalt], j);
739 altpat[nalt][j] = EOS;
dd9922c6 740 ++nalt;
89422f46
KB
741 if (!*stop)
742 break;
dd9922c6 743 if (nalt == NALT)
89422f46
KB
744 return(NULL);
745 if (*stop == NL)
746 *stop = '|';
747 start = stop + 1;
748 }
896308ef
KB
749 if (!fgrepflag) {
750 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
751 return (NULL);
752 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
753 || strindex(regexpr, "||") >= 0)
754 return (NULL);
755 }
896308ef 756
896308ef
KB
757 if (nalt > 1) { /* build superimposed "pre-match" sets per
758 * char */
759 altflag++;
760 for (j = 0; j < nalt; j++)
761 for (i = 0; i < altmin; i++) {
762 c = altpat[j][altlen[j] - altmin + i];
763 altset[i + 1][c] = 1; /* offset for sentinel */
764 }
765 }
766 return (altpat[0]);
767}
768
769/*
770 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
771 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
772 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
773 * of the input, the r.e. is likely to be underspecified, so do old *grep,
774 * which is faster on complex patterns than regexp(). At FIRSTFEW,
775 * dump the saved matches collected by savematch(). They are saved
776 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
777 * since it's hard to rewind.
778 */
779
780execstrategy(file)
781 char *file;
782{
783 int pctmatch;
784
785 pctmatch = (100 * rxcount) / incount;
786 if (pctmatch > PUNTPERCENT && file != NULL)
787 kernighan(args);
788 if (file != NULL)
789 flushmatches();
790}
791
792nlcount(bstart, bstop) /* flail interval to totalize newlines. */
793 char *bstart, *bstop;
794{
795 register char *s = bstart;
796 register char *t = bstop;
797 register int count = 0;
798
799 do { /* loop unroll for older architectures */
800 if (*t == NL) /* ... ask ames!jaw for sample code */
801 count++;
802 } while (t-- > s);
803
804 return (count);
805}
806
807char *
808isolate(regexpr) /* isolate longest metacharacter-free string */
809 char *regexpr;
810{
811 char *dummyexpr;
812
813 /*
814 * We add (.)* because Henry's regcomp only figures regmust if it
815 * sees a leading * pattern. Foo!
816 */
817 dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
818 sprintf(dummyexpr, "(.)*%s", regexpr);
819 if ((rspencer = regcomp(dummyexpr)) == NULL)
820 kernighan(args);
821 return (rspencer->regmust);
822}
823
824char *matches[FIRSTFEW];
825static int mcount = 0;
826
827char *
828savematch(s) /* horde matches during statistics gathering */
829 register char *s;
830{
831 char *p;
832 char *start = s;
833 int msize = 0;
834 int psize = strlen(preamble);
835
836 while (*s++ != NL)
837 msize++;
838 *--s = EOS;
839
840 p = malloc((unsigned) msize + 1 + psize);
841 strcpy(p, preamble);
842 strcpy(p + psize, start);
843 matches[mcount++] = p;
844
845 preamble[0] = 0;
846 *s = NL;
847 return (s);
848}
849
850flushmatches()
851{
852 int n;
853
854 flushflag = 1;
855 for (n = 0; n < mcount; n++)
856 printf("%s\n", matches[n]);
857 mcount = 0;
858}
859
860oops(message)
861 char *message;
862{
863 fprintf(stderr, "%s: %s\n", progname, message);
864 exit(2);
865}
866
867kernighan(args) /* "let others do the hard part ..." */
868 char *args[];
869{
870 /*
871 * We may have already run grep on some of the files; remove them
872 * from the arg list we pass on. Note that we can't delete them
873 * totally because the number of file names affects the output
874 * (automatic -h).
875 */
876 /* better would be fork/exec per punted file -- jaw */
877
878 while (firstfile && optind > firstfile)
879 args[firstfile++] = "/dev/null";
880 if (patind)
881 args[patind] = pattern;
89422f46 882 (void) fflush(stdout);
896308ef
KB
883
884 if (grepflag)
885 execvp(GREPSTD, args), oops("can't exec old 'grep'");
886 else if (fgrepflag)
887 execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
888 else
889 execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
890}