static char sccsid
[] = "@(#)egrep.c 5.10 (Berkeley) 6/30/88";
Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
table as in original paper (CACM, October, 1977). No delta1 or delta2.
According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
minimal practical value. However, to improve for worst case input,
integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
Comput., Feb. 1986) deserves consideration.
Method: extract longest metacharacter-free string from expression.
this is done using a side-effect from henry spencer's regcomp().
use boyer-moore to match such, then pass submatching lines
to either regexp() or standard 'egrep', depending on certain
criteria within execstrategy() below. [this tradeoff is due
to the general slowness of the regexp() nondeterministic
machine on complex expressions, as well as the startup time
of standard 'egrep' on short files.] alternatively, one may
change the vendor-supplied 'egrep' automaton to include
boyer-moore directly. see accompanying writeup for discussion
of kanji expression treatment.
late addition: apply trickbag for fast match of simple
alternations (sublinear, in common low-cardinality cases).
trap fgrep into this lair.
gnu additions: -f, newline as |, \< and \> [in regexec()], more
comments. inspire better dfa exec() strategy.
serious testing and help with special cases.
Algorithm amalgam summary:
dfa e?grep (aho/thompson)
ndfa regexp() (spencer/aho)
sorry, but the knuth/morris/pratt machine, horspool's
"frequentist" code, and the rabin/karp matcher, however cute,
just don't cut it for this production.
James A. Woods Copyright (c) 1986
NASA Ames Research Center
#include <regexp.h> /* must be henry spencer's version */
#define MIN(A, B) ((A) > (B) ? (B) : (A))
* define existing [ef]?grep program locations below for use by execvp().
* [execlp() would be used were it not for the possibility of
* installation-dependent recursion.]
#define EGREPSTD "/usr/bin/egrep"
#define GREPSTD "/bin/grep"
#define FGREPSTD "/usr/bin/fgrep"
#define BUFSIZE 8192 /* make higher for cray */
#define LARGE BUFSIZE + PATSIZE
#define NALT 7 /* tied to scanf() size in alternate() */
#define NMUSH 6 /* loosely relates to expected alt length */
#define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
#define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
* was processed by regexp(), exec std egrep. */
#define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
#define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
#define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
#define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
int cflag
, iflag
, eflag
, fflag
, lflag
, nflag
; /* SVID flags */
int sflag
, hflag
; /* v7, v8, bsd */
int firstflag
; /* Stop at first match */
int grepflag
; /* Called as "grep" */
int fgrepflag
; /* Called as "fgrep" */
int altflag
; /* Simple alternation in pattern */
int boyonly
; /* No regexp needed -- all simple */
int grepold
, egrepold
, fgrepold
;
int nalt
; /* Number of alternatives */
int nsuccess
; /* 1 for match, 2 for error */
int altmin
; /* Minimum length of all the alternate
int firstfile
; /* argv index of first file argument */
int patind
; /* argv index of pattern */
long nmatch
; /* Number of matches in this file */
long incount
, counted
; /* Amount of input consumed */
long rxcount
; /* Bytes of input processed by regexec() */
int boyfound
; /* accumulated partial matches (tripped by
int prevmatch
; /* next three lines aid fast -n */
char *patboy
; /* Pattern for simple Boyer-Moore */
char *patfile
; /* Filename containing pattern(s) */
int delta0
[256]; /* Boyer-Moore algorithm core */
char cmap
[256]; /* Usually 0-255, but if -i, maps upper to
char *altpat
[NALT
]; /* alternation component storage */
short altset
[NMUSH
+ 1][256];
char preamble
[200]; /* match prefix (filename, line no.) */
strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
if ((progname
= strrchr(argv
[0], '/')) != 0)
if (strcmp(progname
, "grep") == 0)
else if (strcmp(progname
, "fgrep") == 0)
while ((c
= getopt(argc
, argv
, "bchie:f:lnosvwxy1")) != EOF
) {
egrepold
++; /* boyer-moore of little help here */
case '1': /* Stop at very first match */
firstflag
++; /* spead freaks only */
case 'x': /* needs more work, like -b above */
if (errflag
|| ((argc
<= optind
) && !fflag
&& !eflag
)) {
oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
else /* encourage SVID options, though we provide
oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
pattern
= pfile(patfile
);
pattern
= argv
[optind
++];
if (!oflag
&& (argc
- optind
) <= 1) /* Filename invisible given < 2 files */
kernighan(argv
); /* same as it ever was */
* 'grep/egrep' merger -- "old" grep is called to handle: tagged
* exprs \( \), word matches \< and \>, -w and -y options, char
* classes with '-' at end (egrep bug?), and patterns beginning with
* an asterisk (don't ask why). otherwise, characters meaningful to
* 'egrep' but not to 'grep' are escaped; the entire expr is then
if (grepflag
&& !grepold
) {
if (strindex(pattern
, "\\(") >= 0 ||
strindex(pattern
, "\\<") >= 0 ||
strindex(pattern
, "\\>") >= 0 ||
strindex(pattern
, "-]") >= 0 ||
pattern
[0] == '*') /* grep bug */
pattern
= grepxlat(pattern
);
if (grepold
|| egrepold
|| fgrepold
)
strcpy(pattern
, fold(pattern
));
* If the pattern is a plain string, just run boyer-moore. If it
* consists of meta-free alternatives, run "superimposed" bmg.
* Otherwise, find best string, and compile pattern for regexec().
if (strpbrk(pattern
, META
) == NULL
) { /* do boyer-moore only */
if ((patboy
= alternate(pattern
)) != NULL
)
if ((patboy
= isolate(pattern
)) == NULL
)
kernighan(argv
); /* expr too involved */
for (c
= 0; pattern
[c
] != EOS
; c
++)
if (pattern
[c
] & NONASCII
) /* kanji + meta */
if ((rspencer
= regcomp(pattern
)) == NULL
)
gosper(patboy
); /* "pre-conditioning is wonderful"
if ((firstfile
= optind
) >= argc
) {
/* Grep standard input */
if (lflag
) /* We don't know its name! */
if (firstflag
&& (nsuccess
== 1))
exit((nsuccess
== 2) ? 2 : (nsuccess
== 0));
pfile(pfname
) /* absorb expression from file */
if ((fd
= open(pfname
, O_RDONLY
, 0)) < 0)
oops("can't read pattern file");
if (fstat(fd
, &patstat
) != 0)
oops("can't stat pattern file");
if (patstat
.st_size
> PATSIZE
) {
if (fgrepflag
) { /* defer to unix version */
oops("pattern file too big");
if ((pat
= malloc((unsigned) patstat
.st_size
+ 1)) == NULL
)
oops("out of memory to read pattern file");
if (patstat
.st_size
!= read(fd
, pat
, (int)patstat
.st_size
))
oops("error reading pattern file");
pat
[patstat
.st_size
] = EOS
;
if (pat
[patstat
.st_size
- 1] == NL
) /* NOP for egrep; helps grep */
pat
[patstat
.st_size
- 1] = EOS
;
if (nlcount(pat
, &pat
[patstat
.st_size
]) > NALT
) {
fgrepold
++; /* "what's it all about, alfie?" */
else if ((fd
= open(file
, O_RDONLY
, 0)) <= 0) {
fprintf(stderr
, "%s: can't open %s\n", progname
, file
);
if (!boyonly
&& !flushflag
&& file
!= NULL
)
chimaera(file
, pat
) /* "reach out and boyer-moore search someone" */
char *file
, *pat
; /* -- soon-to-be-popular bumper sticker */
register char *k
, *strend
, *s
;
register int *deltazero
= delta0
;
char *gotamatch(), *kanji(), *linesave(), *submatch();
nleftover
= boyfound
= flushflag
= 0;
nmatch
= counted
= rxcount
= 0L;
while ((count
= read(fd
, str
+ nleftover
, BUFSIZE
- nleftover
)) > 0) {
strend
= linesave(str
, count
);
for (k
= str
+ patlen
- 1; k
< strend
;) {
* for a large class of patterns, upwards of 80% of
* match time is spent on the next line. we beat
* existing microcode (vax 'matchc') this way.
while ((k
+= deltazero
[*(unsigned char *) k
]) < strend
);
* Parallel Boyer-Moore. Check whether each
* of the previous <altmin> chars COULD be
* from one of the alternative strings.
while (altset
[--j
][(unsigned char)
cmap
[*(unsigned char *) s
--]]);
* quick test fails. in this life, compare
* 'em all. but, a "reverse trie" would
* attenuate worst case (linear w/delta2?).
(cmap
[*(unsigned char *) s
--]
/* One string -- check it */
while (cmap
[*(unsigned char *) s
--] == pat
[--j
]);
* delta-less shortcut for literati. short shrift for
k
++; /* no match; restart next char */
k
= submatch(file
, pat
, str
, strend
, k
, count
);
nline
= prevnline
+ nlcount(prevloc
, k
);
nline
= nline
+ nlcount(str
, k
);
strncpy(str
, linetemp
, nleftover
);
/* Bug from old grep: -c overrides -h. We fix the bug. */
linesave(str
, count
) /* accumulate partial line at end of buffer */
if (count
!= BUFSIZE
&& fd
!= 0)
str
[count
++] = NL
; /* insurance for broken last line */
for (j
= count
- 1; str
[j
] != NL
&& j
>= 0;)
* break up these lines: long line (> BUFSIZE), last line of file, or
* short return from read(), as from tee(1) input
if (j
< 0 && (count
== (BUFSIZE
- nleftover
))) {
nleftover
= count
- j
- 1;
strncpy(linetemp
, str
+ j
+ 1, nleftover
);
* Process partial match. First check for mis-aligned Kanji, then match line
* against full compiled r.e. if statistics do not warrant handing off to
submatch(file
, pat
, str
, strend
, k
, altindex
)
char file
[], pat
[], str
[];
register char *strend
, *k
;
s
= ((altflag
) ? k
- altlen
[altindex
] + 1 : k
- altmin
+ 1);
c
= ((altflag
) ? altpat
[altindex
][0] : pat
[0]);
if ((s
= kanji(str
, s
, k
)) == NULL
)
return (++k
); /* reject false kanji */
while (*s
!= NL
&& --s
>= str
);
k
= s
+ 1; /* now at line start */
return (gotamatch(file
, k
));
incount
= counted
- (strend
- k
);
if (boyfound
++ == FIRSTFEW
)
* "quick henry -- the flit" (after theodor geisel)
if (regexec(rspencer
, ((iflag
) ? fold(k
) : k
)) == 1) {
if (gotamatch(file
, k
) == NULL
)
* EUC code disambiguation -- scan backwards to first 7-bit code, while
* counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
* SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
register char *str
, *s
, *k
;
for (s
--; s
>= str
; s
--) {
if (*s
== SS2
|| *s
== SS3
|| (*s
& NONASCII
) == 0)
return ((j
& 01) ? NULL
: k
);
* Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
char *pattern
; /* ... HAKMEM lives ... */
/* Make one-string case look like simple alternatives case */
altmin
= altlen
[0] = strlen(pattern
);
/* For chars that aren't in any string, skip by string length. */
for (j
= 0; j
< 256; j
++) {
cmap
[j
] = j
; /* Sneak in initialization of cmap */
/* For chars in a string, skip distance from char to end of string. */
/* (If char appears more than once, skip minimum distance.) */
for (i
= 0; i
< nalt
; i
++)
for (j
= 0; j
< altlen
[i
] - 1; j
++) {
delta0
[c
] = MIN(delta0
[c
], altlen
[i
] - j
- 1);
if (iflag
&& islower((int) c
))
delta0
[toupper((int) c
)] = delta0
[c
];
/* For last char of each string, fall out of search loop. */
for (i
= 0; i
< nalt
; i
++) {
c
= altpat
[i
][altlen
[i
] - 1];
if (iflag
&& islower((int) c
))
delta0
[toupper((int) c
)] = LARGE
;
for (j
= 'A'; j
<= 'Z'; j
++)
cmap
[j
] = tolower((int) j
);
* Print, count, or stop on full match. Result is either the location for
* continued search, or NULL to stop.
int squirrel
= 0; /* nonzero to squirrel away FIRSTFEW matches */
if (!boyonly
&& boyfound
<= FIRSTFEW
&& file
!= NULL
)
return (NULL
); /* -s usurps all flags (unlike some versions) */
if (cflag
) { /* -c overrides -l, we guess */
(void)sprintf(preamble
, "%s:", file
);
prevnline
= prevnline
+ nlcount(prevloc
, s
);
prevnline
= nline
+ nlcount(str
, s
);
printf("%ld:", prevnline
);
(void)sprintf(preamble
+ strlen(preamble
),
return ((firstflag
&& !cflag
) ? NULL
: s
);
static char fline
[BUFSIZE
];
register char *s
, *t
= fline
;
for (s
= line
; *s
!= EOS
; s
++)
*t
++ = (isupper((int) *s
) ? (char) tolower((int) *s
) : *s
);
strindex(s
, t
) /* the easy way, as in K&P, p. 192 */
for (i
= 0; s
[i
] != '\0'; i
++)
if (strncmp(s
+ i
, t
, n
) == 0)
grepxlat(pattern
) /* grep pattern meta conversion */
static char newpat
[BUFSIZE
];
for (s
= newpat
, p
= pattern
; *p
!= EOS
;) {
if (*p
== '\\') { /* skip escapes ... */
} else if (*p
== '[') { /* ... and char classes */
while (*p
!= EOS
&& *p
!= ']')
} else if (strchr("+?|()", *p
) != NULL
) {
*s
++ = '\\'; /* insert protection */
grepflag
= ((patind
) ? 0 : 1);
* Test for simple alternation. Result is NULL if it's not so simple, or is
* a pointer to the first string if it is. Warning: sscanf size is a
* fixpoint, beyond which the speedup linearity starts to break down. In the
* wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
* altpat[] to arbitrary size is not useful.
register char *start
, *stop
;
if (fgrepflag
&& strchr(regexpr
, '|'))
* break pattern up into altpat array; delimit on newline, bar,
* or EOS. We know we won't overflow, we've already checked the
* number of patterns we're going to find against NALT.
* Also, set length of pattern and find minimum pattern length.
for (start
= stop
= regexpr
;; ++stop
)
if (!*stop
|| *stop
== '|' || *stop
== NL
) {
altlen
[nalt
] = j
= stop
- start
;
if (!(altpat
[nalt
] = malloc((u_int
)(j
+ 1))))
bcopy(start
, altpat
[nalt
], j
);
if (strchr(regexpr
, '|') == NULL
|| regexpr
[0] == '|')
if (strpbrk(regexpr
, "^$.[]()?+*\\") != NULL
|| strindex(regexpr
, "||") >= 0)
if (nalt
> 1) { /* build superimposed "pre-match" sets per
for (j
= 0; j
< nalt
; j
++)
for (i
= 0; i
< altmin
; i
++) {
c
= altpat
[j
][altlen
[j
] - altmin
+ i
];
altset
[i
+ 1][c
] = 1; /* offset for sentinel */
* Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
* determine whether to use dfa-based egrep: We do FIRSTFEW matches with
* regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
* of the input, the r.e. is likely to be underspecified, so do old *grep,
* which is faster on complex patterns than regexp(). At FIRSTFEW,
* dump the saved matches collected by savematch(). They are saved
* so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
* since it's hard to rewind.
pctmatch
= (100 * rxcount
) / incount
;
if (pctmatch
> PUNTPERCENT
&& file
!= NULL
)
nlcount(bstart
, bstop
) /* flail interval to totalize newlines. */
register char *s
= bstart
;
register char *t
= bstop
;
do { /* loop unroll for older architectures */
if (*t
== NL
) /* ... ask ames!jaw for sample code */
isolate(regexpr
) /* isolate longest metacharacter-free string */
* We add (.)* because Henry's regcomp only figures regmust if it
* sees a leading * pattern. Foo!
dummyexpr
= malloc((unsigned) strlen(regexpr
) + 5);
(void)sprintf(dummyexpr
, "(.)*%s", regexpr
);
if ((rspencer
= regcomp(dummyexpr
)) == NULL
)
return (rspencer
->regmust
);
savematch(s
) /* horde matches during statistics gathering */
int psize
= strlen(preamble
);
p
= malloc((unsigned) msize
+ 1 + psize
);
strcpy(p
+ psize
, start
);
for (n
= 0; n
< mcount
; n
++)
printf("%s\n", matches
[n
]);
fprintf(stderr
, "%s: %s\n", progname
, message
);
kernighan(args
) /* "let others do the hard part ..." */
* We may have already run grep on some of the files; remove them
* from the arg list we pass on. Note that we can't delete them
* totally because the number of file names affects the output
/* better would be fork/exec per punted file -- jaw */
while (firstfile
&& optind
> firstfile
)
args
[firstfile
++] = "/dev/null";
execvp(GREPSTD
, args
), oops("can't exec old 'grep'");
execvp(FGREPSTD
, args
), oops("can't exec old 'fgrep'");
execvp(EGREPSTD
, args
), oops("can't exec old 'egrep'");