Berkeley copyright notice
[unix-history] / usr / src / usr.bin / grep / egrep / egrep.c
CommitLineData
896308ef 1#ifndef lint
552f296c 2static char sccsid[] = "@(#)egrep.c 5.11 (Berkeley) %G%";
896308ef
KB
3#endif not lint
4
5/*
6 Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7 table as in original paper (CACM, October, 1977). No delta1 or delta2.
8 According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9 minimal practical value. However, to improve for worst case input,
10 integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11 Comput., Feb. 1986) deserves consideration.
12
13 Method: extract longest metacharacter-free string from expression.
14 this is done using a side-effect from henry spencer's regcomp().
15 use boyer-moore to match such, then pass submatching lines
16 to either regexp() or standard 'egrep', depending on certain
17 criteria within execstrategy() below. [this tradeoff is due
18 to the general slowness of the regexp() nondeterministic
19 machine on complex expressions, as well as the startup time
20 of standard 'egrep' on short files.] alternatively, one may
21 change the vendor-supplied 'egrep' automaton to include
22 boyer-moore directly. see accompanying writeup for discussion
23 of kanji expression treatment.
24
25 late addition: apply trickbag for fast match of simple
26 alternations (sublinear, in common low-cardinality cases).
27 trap fgrep into this lair.
28
29 gnu additions: -f, newline as |, \< and \> [in regexec()], more
30 comments. inspire better dfa exec() strategy.
31 serious testing and help with special cases.
32
33 Algorithm amalgam summary:
34
35 dfa e?grep (aho/thompson)
36 ndfa regexp() (spencer/aho)
37 bmg (boyer/moore/gosper)
38 "superimposed" bmg (jaw)
39 fgrep (aho/corrasick)
40
41 sorry, but the knuth/morris/pratt machine, horspool's
42 "frequentist" code, and the rabin/karp matcher, however cute,
43 just don't cut it for this production.
44
45 James A. Woods Copyright (c) 1986
46 NASA Ames Research Center
47*/
48#include <stdio.h>
49#include <ctype.h>
50#include <sys/types.h>
51#include <sys/stat.h>
81922b46 52#include <sys/file.h>
896308ef
KB
53#include <regexp.h> /* must be henry spencer's version */
54
55#define MIN(A, B) ((A) > (B) ? (B) : (A))
56
57#ifdef SLOWSYS
58#define read xread
59#endif
60/*
61 * define existing [ef]?grep program locations below for use by execvp().
62 * [execlp() would be used were it not for the possibility of
63 * installation-dependent recursion.]
64 */
65#ifndef EGREPSTD
66#define EGREPSTD "/usr/bin/egrep"
67#endif
68#ifndef GREPSTD
69#define GREPSTD "/bin/grep"
70#endif
71#ifndef FGREPSTD
72#define FGREPSTD "/usr/bin/fgrep"
73#endif
74
75#define BUFSIZE 8192 /* make higher for cray */
76#define PATSIZE 6000
77#define LARGE BUFSIZE + PATSIZE
78
896308ef
KB
79#define NALT 7 /* tied to scanf() size in alternate() */
80#define NMUSH 6 /* loosely relates to expected alt length */
81
82#define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
83#define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
84 * was processed by regexp(), exec std egrep. */
85#define NL '\n'
86#define EOS '\0'
87#define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
88#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
89#define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
90#define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
91
92extern char *optarg;
93extern int optind;
94char *progname;
95
96int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
97int sflag, hflag; /* v7, v8, bsd */
98
99int firstflag; /* Stop at first match */
100int grepflag; /* Called as "grep" */
101int fgrepflag; /* Called as "fgrep" */
102int altflag; /* Simple alternation in pattern */
103int boyonly; /* No regexp needed -- all simple */
104int flushflag;
105int grepold, egrepold, fgrepold;
106
107int nalt; /* Number of alternatives */
108int nsuccess; /* 1 for match, 2 for error */
109int altmin; /* Minimum length of all the alternate
110 * strings */
111int firstfile; /* argv index of first file argument */
112int patind; /* argv index of pattern */
113long nmatch; /* Number of matches in this file */
114long incount, counted; /* Amount of input consumed */
115long rxcount; /* Bytes of input processed by regexec() */
116int boyfound; /* accumulated partial matches (tripped by
117 * FIRSTFEW) */
118int prevmatch; /* next three lines aid fast -n */
119long nline, prevnline;
120char *prevloc;
121
122regexp *rspencer;
123char *pattern;
124char *patboy; /* Pattern for simple Boyer-Moore */
125char *patfile; /* Filename containing pattern(s) */
126
127int delta0[256]; /* Boyer-Moore algorithm core */
128char cmap[256]; /* Usually 0-255, but if -i, maps upper to
129 * lower case */
130char str[BUFSIZE + 2];
131int nleftover;
132char linetemp[BUFSIZE];
89422f46 133char *altpat[NALT]; /* alternation component storage */
896308ef
KB
134int altlen[NALT];
135short altset[NMUSH + 1][256];
136char preamble[200]; /* match prefix (filename, line no.) */
137
138int fd;
139char *
a2061e43 140strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
896308ef
KB
141char *
142grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
552f296c 143char *gotamatch(), *kanji(), *linesave(), *submatch();
896308ef
KB
144char **args;
145
146main(argc, argv)
147 int argc;
148 char *argv[];
149{
7df1ddda 150 int c, oflag;
896308ef
KB
151 int errflag = 0;
152
153 args = argv;
154
155 if ((progname = strrchr(argv[0], '/')) != 0)
156 progname++;
157 else
158 progname = argv[0];
159 if (strcmp(progname, "grep") == 0)
160 grepflag++;
7df1ddda 161 else if (strcmp(progname, "fgrep") == 0)
896308ef
KB
162 fgrepflag++;
163
7df1ddda
KB
164 oflag = 0;
165 while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
896308ef
KB
166 switch (c) {
167
168 case 'f':
169 fflag++;
170 patfile = optarg;
171 continue;
172 case 'b':
173 case 'v':
174 egrepold++; /* boyer-moore of little help here */
175 continue;
176 case 'c':
177 cflag++;
178 continue;
179 case 'e':
180 eflag++;
181 pattern = optarg;
182 continue;
183 case 'h':
184 hflag++;
185 continue;
7df1ddda
KB
186 case 'o':
187 oflag++;
188 continue;
896308ef
KB
189 case '1': /* Stop at very first match */
190 firstflag++; /* spead freaks only */
191 continue;
192 case 'i':
193 iflag++;
194 continue;
195 case 'l':
196 lflag++;
197 continue;
198 case 'n':
199 nflag++;
200 continue;
201 case 's':
202 sflag++;
203 continue;
204 case 'w':
205 case 'y':
206 if (!grepflag)
207 errflag++;
208 grepold++;
209 continue;
210 case 'x': /* needs more work, like -b above */
211 if (!fgrepflag)
212 errflag++;
213 fgrepold++;
214 continue;
215 case '?':
216 errflag++;
217 }
218 }
219 if (errflag || ((argc <= optind) && !fflag && !eflag)) {
220 if (grepflag)
7df1ddda 221oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
896308ef 222 else if (fgrepflag)
fc0a6896 223oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
896308ef
KB
224 else /* encourage SVID options, though we provide
225 * others */
fc0a6896 226oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
896308ef 227 }
896308ef 228 if (fflag)
79d3f5d0
KB
229 pattern = pfile(patfile);
230 else if (!eflag) {
231 patind = optind;
896308ef 232 pattern = argv[optind++];
79d3f5d0 233 }
896308ef 234
7df1ddda 235 if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
896308ef
KB
236 hflag++;
237 if (pattern[0] == EOS)
238 kernighan(argv); /* same as it ever was */
239 /*
240 * 'grep/egrep' merger -- "old" grep is called to handle: tagged
241 * exprs \( \), word matches \< and \>, -w and -y options, char
242 * classes with '-' at end (egrep bug?), and patterns beginning with
243 * an asterisk (don't ask why). otherwise, characters meaningful to
244 * 'egrep' but not to 'grep' are escaped; the entire expr is then
245 * passed to 'egrep'.
246 */
247 if (grepflag && !grepold) {
248 if (strindex(pattern, "\\(") >= 0 ||
249 strindex(pattern, "\\<") >= 0 ||
250 strindex(pattern, "\\>") >= 0 ||
251 strindex(pattern, "-]") >= 0 ||
252 pattern[0] == '*') /* grep bug */
253 grepold++;
254 else
255 pattern = grepxlat(pattern);
256 }
257 if (grepold || egrepold || fgrepold)
258 kernighan(argv);
259
260 if (iflag)
261 strcpy(pattern, fold(pattern));
262 /*
263 * If the pattern is a plain string, just run boyer-moore. If it
264 * consists of meta-free alternatives, run "superimposed" bmg.
265 * Otherwise, find best string, and compile pattern for regexec().
266 */
267 if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
268 boyonly++;
269 patboy = pattern;
270 } else {
271 if ((patboy = alternate(pattern)) != NULL)
272 boyonly++;
273 else {
274 if ((patboy = isolate(pattern)) == NULL)
275 kernighan(argv); /* expr too involved */
276#ifndef NOKANJI
277 for (c = 0; pattern[c] != EOS; c++)
278 if (pattern[c] & NONASCII) /* kanji + meta */
279 kernighan(argv);
280#endif
281 if ((rspencer = regcomp(pattern)) == NULL)
282 oops("regcomp failure");
283 }
284 }
285 gosper(patboy); /* "pre-conditioning is wonderful"
286 * -- v. strassen */
287
288 if ((firstfile = optind) >= argc) {
289 /* Grep standard input */
290 if (lflag) /* We don't know its name! */
291 exit(1);
292 egsecute((char *) NULL);
293 } else {
294 while (optind < argc) {
295 egsecute(argv[optind]);
296 optind++;
297 if (firstflag && (nsuccess == 1))
298 break;
299 }
300 }
301 exit((nsuccess == 2) ? 2 : (nsuccess == 0));
302}
303
304char *
305pfile(pfname) /* absorb expression from file */
306 char *pfname;
307{
81922b46 308 int fd;
896308ef
KB
309 struct stat patstat;
310 static char *pat;
311
81922b46 312 if ((fd = open(pfname, O_RDONLY, 0)) < 0)
896308ef 313 oops("can't read pattern file");
81922b46 314 if (fstat(fd, &patstat) != 0)
896308ef
KB
315 oops("can't stat pattern file");
316 if (patstat.st_size > PATSIZE) {
317 if (fgrepflag) { /* defer to unix version */
318 fgrepold++;
319 return "dummy";
320 } else
321 oops("pattern file too big");
322 }
323 if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
324 oops("out of memory to read pattern file");
81922b46 325 if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
896308ef 326 oops("error reading pattern file");
81922b46 327 (void) close(fd);
896308ef
KB
328
329 pat[patstat.st_size] = EOS;
330 if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
331 pat[patstat.st_size - 1] = EOS;
332
333 if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
334 if (fgrepflag)
335 fgrepold++; /* "what's it all about, alfie?" */
336 else
337 egrepold++;
338 }
339 return (pat);
340}
341
342egsecute(file)
343 char *file;
344{
345 if (file == NULL)
346 fd = 0;
89422f46 347 else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
896308ef
KB
348 fprintf(stderr, "%s: can't open %s\n", progname, file);
349 nsuccess = 2;
350 return;
351 }
352 chimaera(file, patboy);
353
354 if (!boyonly && !flushflag && file != NULL)
355 flushmatches();
356 if (file != NULL)
357 close(fd);
358}
359
360chimaera(file, pat) /* "reach out and boyer-moore search someone" */
361 char *file, *pat; /* -- soon-to-be-popular bumper sticker */
362{
363 register char *k, *strend, *s;
364 register int j, count;
365 register int *deltazero = delta0;
366 int patlen = altmin;
367 char *t;
896308ef
KB
368
369 nleftover = boyfound = flushflag = 0;
370 nline = 1L;
371 prevmatch = 0;
372 nmatch = counted = rxcount = 0L;
373
374 while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
375
376 counted += count;
377 strend = linesave(str, count);
378
379 for (k = str + patlen - 1; k < strend;) {
380 /*
381 * for a large class of patterns, upwards of 80% of
382 * match time is spent on the next line. we beat
383 * existing microcode (vax 'matchc') this way.
384 */
385 while ((k += deltazero[*(unsigned char *) k]) < strend);
386 if (k < (str + LARGE))
387 break;
388 k -= LARGE;
389
390 if (altflag) {
391 /*
392 * Parallel Boyer-Moore. Check whether each
393 * of the previous <altmin> chars COULD be
394 * from one of the alternative strings.
395 */
396 s = k - 1;
397 j = altmin;
398 while (altset[--j][(unsigned char)
399 cmap[*(unsigned char *) s--]]);
400 /*
401 * quick test fails. in this life, compare
402 * 'em all. but, a "reverse trie" would
403 * attenuate worst case (linear w/delta2?).
404 */
405 if (--j < 0) {
406 count = nalt - 1;
407 do {
408 s = k;
409 j = altlen[count];
410 t = altpat[count];
411
412 while
413 (cmap[*(unsigned char *) s--]
414 == t[--j]);
415 if (j < 0)
416 break;
417 }
418 while (count--);
419 }
420 } else {
421 /* One string -- check it */
422 j = patlen - 1;
423 s = k - 1;
424 while (cmap[*(unsigned char *) s--] == pat[--j]);
425 }
426 /*
427 * delta-less shortcut for literati. short shrift for
428 * genetic engineers?
429 */
430 if (j >= 0) {
431 k++; /* no match; restart next char */
432 continue;
433 }
434 k = submatch(file, pat, str, strend, k, count);
435 if (k == NULL)
436 return;
437 }
438 if (nflag) {
439 if (prevmatch)
440 nline = prevnline + nlcount(prevloc, k);
441 else
442 nline = nline + nlcount(str, k);
443 prevmatch = 0;
444 }
445 strncpy(str, linetemp, nleftover);
446 }
447 if (cflag) {
448 /* Bug from old grep: -c overrides -h. We fix the bug. */
449 if (!hflag)
450 printf("%s:", file);
451 printf("%ld\n", nmatch);
452 }
453}
454
455char *
456linesave(str, count) /* accumulate partial line at end of buffer */
457 char str[];
458 register int count;
459{
460 register int j;
461
462 count += nleftover;
463 if (count != BUFSIZE && fd != 0)
464 str[count++] = NL; /* insurance for broken last line */
465 str[count] = EOS;
466 for (j = count - 1; str[j] != NL && j >= 0;)
467 j--;
468 /*
469 * break up these lines: long line (> BUFSIZE), last line of file, or
470 * short return from read(), as from tee(1) input
471 */
472 if (j < 0 && (count == (BUFSIZE - nleftover))) {
473 str[count++] = NL;
474 str[count] = EOS;
475 linetemp[0] = EOS;
476 nleftover = 0;
477 return (str + count);
478 } else {
479 nleftover = count - j - 1;
480 strncpy(linetemp, str + j + 1, nleftover);
481 return (str + j);
482 }
483}
484
485/*
486 * Process partial match. First check for mis-aligned Kanji, then match line
487 * against full compiled r.e. if statistics do not warrant handing off to
488 * standard egrep.
489 */
490char *
491submatch(file, pat, str, strend, k, altindex)
492 char file[], pat[], str[];
493 register char *strend, *k;
494 int altindex;
495{
496 register char *s;
497 char *t, c;
498
499 t = k;
500 s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
501#ifndef NOKANJI
502 c = ((altflag) ? altpat[altindex][0] : pat[0]);
503 if (c & NONASCII)
504 if ((s = kanji(str, s, k)) == NULL)
505 return (++k); /* reject false kanji */
506#endif
507 do;
508 while (*s != NL && --s >= str);
509 k = s + 1; /* now at line start */
510
511 if (boyonly)
512 return (gotamatch(file, k));
513
514 incount = counted - (strend - k);
515 if (boyfound++ == FIRSTFEW)
516 execstrategy(file);
517
518 s = t;
519 do
520 rxcount++;
521 while (*s++ != NL);
522 *--s = EOS;
523 /*
524 * "quick henry -- the flit" (after theodor geisel)
525 */
526 if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
527 *s = NL;
528 if (gotamatch(file, k) == NULL)
529 return (NULL);
530 }
531 *s = NL;
532 return (s + 1);
533}
534
008a5c7a 535#ifndef NOKANJI
896308ef
KB
536/*
537 * EUC code disambiguation -- scan backwards to first 7-bit code, while
538 * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
539 * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
540 */
541char *
542kanji(str, s, k)
543 register char *str, *s, *k;
544{
545 register int j = 0;
546
547 for (s--; s >= str; s--) {
548 if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
549 break;
550 j++;
551 }
552#ifndef CHINESE
553 if (*s == SS2)
554 j -= 1;
555#endif CHINESE
556 return ((j & 01) ? NULL : k);
557}
008a5c7a 558#endif
896308ef
KB
559
560/*
561 * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
562 */
563gosper(pattern)
564 char *pattern; /* ... HAKMEM lives ... */
565{
566 register int i, j;
567 unsigned char c;
568
569 /* Make one-string case look like simple alternatives case */
570 if (!altflag) {
571 nalt = 1;
572 altmin = altlen[0] = strlen(pattern);
89422f46 573 altpat[0] = pattern;
896308ef
KB
574 }
575 /* For chars that aren't in any string, skip by string length. */
576 for (j = 0; j < 256; j++) {
577 delta0[j] = altmin;
578 cmap[j] = j; /* Sneak in initialization of cmap */
579 }
580
581 /* For chars in a string, skip distance from char to end of string. */
582 /* (If char appears more than once, skip minimum distance.) */
583 for (i = 0; i < nalt; i++)
584 for (j = 0; j < altlen[i] - 1; j++) {
585 c = altpat[i][j];
586 delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
587 if (iflag && islower((int) c))
588 delta0[toupper((int) c)] = delta0[c];
589 }
590
591 /* For last char of each string, fall out of search loop. */
592 for (i = 0; i < nalt; i++) {
593 c = altpat[i][altlen[i] - 1];
594 delta0[c] = LARGE;
595 if (iflag && islower((int) c))
596 delta0[toupper((int) c)] = LARGE;
597 }
598 if (iflag)
599 for (j = 'A'; j <= 'Z'; j++)
600 cmap[j] = tolower((int) j);
601}
602
603/*
604 * Print, count, or stop on full match. Result is either the location for
605 * continued search, or NULL to stop.
606 */
607char *
608gotamatch(file, s)
609 register char *file, *s;
610{
611 char *savematch();
612 int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
613
614 nmatch++;
615 nsuccess = 1;
616 if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
617 squirrel = 1;
618
619 if (sflag)
620 return (NULL); /* -s usurps all flags (unlike some versions) */
621 if (cflag) { /* -c overrides -l, we guess */
622 do;
623 while (*s++ != NL);
624 } else if (lflag) {
625 puts(file);
626 return (NULL);
627 } else {
628 if (!hflag)
629 if (!squirrel)
630 printf("%s:", file);
631 else
a2061e43 632 (void)sprintf(preamble, "%s:", file);
896308ef
KB
633 if (nflag) {
634 if (prevmatch)
635 prevnline = prevnline + nlcount(prevloc, s);
636 else
637 prevnline = nline + nlcount(str, s);
638 prevmatch = 1;
639
640 if (!squirrel)
641 printf("%ld:", prevnline);
642 else
a2061e43 643 (void)sprintf(preamble + strlen(preamble),
896308ef
KB
644 "%ld:", prevnline);
645 }
646 if (!squirrel) {
647 do
648 putchar(*s);
649 while (*s++ != NL);
650 } else
651 s = savematch(s);
652
653 if (nflag)
654 prevloc = s - 1;
655 }
656 return ((firstflag && !cflag) ? NULL : s);
657}
658
659char *
660fold(line)
661 char *line;
662{
663 static char fline[BUFSIZE];
664 register char *s, *t = fline;
665
666 for (s = line; *s != EOS; s++)
667 *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
668 *t = EOS;
669 return (fline);
670}
671
672strindex(s, t) /* the easy way, as in K&P, p. 192 */
673 char *s, *t;
674{
675 int i, n;
676
677 n = strlen(t);
678 for (i = 0; s[i] != '\0'; i++)
679 if (strncmp(s + i, t, n) == 0)
680 return (i);
681 return (-1);
682}
683
684char *
685grepxlat(pattern) /* grep pattern meta conversion */
686 char *pattern;
687{
688 register char *p, *s;
689 static char newpat[BUFSIZE];
690
691 for (s = newpat, p = pattern; *p != EOS;) {
692 if (*p == '\\') { /* skip escapes ... */
693 *s++ = *p++;
694 if (*p)
695 *s++ = *p++;
696 } else if (*p == '[') { /* ... and char classes */
697 while (*p != EOS && *p != ']')
698 *s++ = *p++;
699 } else if (strchr("+?|()", *p) != NULL) {
700 *s++ = '\\'; /* insert protection */
701 *s++ = *p++;
702 } else
703 *s++ = *p++;
704 }
705 *s = EOS;
706 grepflag = ((patind) ? 0 : 1);
707 return (newpat);
708}
709
710/*
711 * Test for simple alternation. Result is NULL if it's not so simple, or is
712 * a pointer to the first string if it is. Warning: sscanf size is a
713 * fixpoint, beyond which the speedup linearity starts to break down. In the
714 * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
715 * altpat[] to arbitrary size is not useful.
716 */
717char *
718alternate(regexpr)
719 char *regexpr;
720{
89422f46
KB
721 register int i, j;
722 register char *start, *stop;
896308ef 723 unsigned char c;
896308ef
KB
724
725 if (fgrepflag && strchr(regexpr, '|'))
89422f46 726 return (NULL);
896308ef 727
89422f46
KB
728 /*
729 * break pattern up into altpat array; delimit on newline, bar,
730 * or EOS. We know we won't overflow, we've already checked the
731 * number of patterns we're going to find against NALT.
732 * Also, set length of pattern and find minimum pattern length.
733 */
734 nalt = 0;
735 altmin = NMUSH;
736 for (start = stop = regexpr;; ++stop)
737 if (!*stop || *stop == '|' || *stop == NL) {
738 altlen[nalt] = j = stop - start;
739 if (j < altmin)
740 altmin = j;
741 if (!(altpat[nalt] = malloc((u_int)(j + 1))))
742 oops("out of memory");
743 bcopy(start, altpat[nalt], j);
744 altpat[nalt][j] = EOS;
dd9922c6 745 ++nalt;
89422f46
KB
746 if (!*stop)
747 break;
dd9922c6 748 if (nalt == NALT)
89422f46
KB
749 return(NULL);
750 if (*stop == NL)
751 *stop = '|';
752 start = stop + 1;
753 }
896308ef
KB
754 if (!fgrepflag) {
755 if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
756 return (NULL);
757 if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
758 || strindex(regexpr, "||") >= 0)
759 return (NULL);
760 }
896308ef 761
896308ef
KB
762 if (nalt > 1) { /* build superimposed "pre-match" sets per
763 * char */
764 altflag++;
765 for (j = 0; j < nalt; j++)
766 for (i = 0; i < altmin; i++) {
767 c = altpat[j][altlen[j] - altmin + i];
768 altset[i + 1][c] = 1; /* offset for sentinel */
769 }
770 }
771 return (altpat[0]);
772}
773
774/*
775 * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
776 * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
777 * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
778 * of the input, the r.e. is likely to be underspecified, so do old *grep,
779 * which is faster on complex patterns than regexp(). At FIRSTFEW,
780 * dump the saved matches collected by savematch(). They are saved
781 * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
782 * since it's hard to rewind.
783 */
784
785execstrategy(file)
786 char *file;
787{
788 int pctmatch;
789
790 pctmatch = (100 * rxcount) / incount;
791 if (pctmatch > PUNTPERCENT && file != NULL)
792 kernighan(args);
793 if (file != NULL)
794 flushmatches();
795}
796
797nlcount(bstart, bstop) /* flail interval to totalize newlines. */
798 char *bstart, *bstop;
799{
800 register char *s = bstart;
801 register char *t = bstop;
802 register int count = 0;
803
804 do { /* loop unroll for older architectures */
805 if (*t == NL) /* ... ask ames!jaw for sample code */
806 count++;
807 } while (t-- > s);
808
809 return (count);
810}
811
812char *
813isolate(regexpr) /* isolate longest metacharacter-free string */
814 char *regexpr;
815{
816 char *dummyexpr;
817
818 /*
819 * We add (.)* because Henry's regcomp only figures regmust if it
820 * sees a leading * pattern. Foo!
821 */
822 dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
a2061e43 823 (void)sprintf(dummyexpr, "(.)*%s", regexpr);
896308ef
KB
824 if ((rspencer = regcomp(dummyexpr)) == NULL)
825 kernighan(args);
826 return (rspencer->regmust);
827}
828
829char *matches[FIRSTFEW];
830static int mcount = 0;
831
832char *
833savematch(s) /* horde matches during statistics gathering */
834 register char *s;
835{
836 char *p;
837 char *start = s;
838 int msize = 0;
839 int psize = strlen(preamble);
840
841 while (*s++ != NL)
842 msize++;
843 *--s = EOS;
844
845 p = malloc((unsigned) msize + 1 + psize);
846 strcpy(p, preamble);
847 strcpy(p + psize, start);
848 matches[mcount++] = p;
849
850 preamble[0] = 0;
851 *s = NL;
852 return (s);
853}
854
855flushmatches()
856{
857 int n;
858
859 flushflag = 1;
860 for (n = 0; n < mcount; n++)
861 printf("%s\n", matches[n]);
862 mcount = 0;
863}
864
865oops(message)
866 char *message;
867{
868 fprintf(stderr, "%s: %s\n", progname, message);
869 exit(2);
870}
871
872kernighan(args) /* "let others do the hard part ..." */
873 char *args[];
874{
875 /*
876 * We may have already run grep on some of the files; remove them
877 * from the arg list we pass on. Note that we can't delete them
878 * totally because the number of file names affects the output
879 * (automatic -h).
880 */
881 /* better would be fork/exec per punted file -- jaw */
882
883 while (firstfile && optind > firstfile)
884 args[firstfile++] = "/dev/null";
885 if (patind)
886 args[patind] = pattern;
89422f46 887 (void) fflush(stdout);
896308ef
KB
888
889 if (grepflag)
890 execvp(GREPSTD, args), oops("can't exec old 'grep'");
891 else if (fgrepflag)
892 execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
893 else
894 execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
895}