386BSD 0.1 development
[unix-history] / usr / src / usr.bin / grep / dfa.h
CommitLineData
e30c5f65
WJ
1/* dfa.h - declarations for GNU deterministic regexp compiler
2 Copyright (C) 1988 Free Software Foundation, Inc.
3 Written June, 1988 by Mike Haertel
4
5 NO WARRANTY
6
7 BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY
8NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT
9WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC,
10RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS"
11WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
12BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
13FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY
14AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
15DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR
16CORRECTION.
17
18 IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M.
19STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY
20WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE
21LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR
22OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
23USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR
24DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR
25A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS
26PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
27DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY.
28
29 GENERAL PUBLIC LICENSE TO COPY
30
31 1. You may copy and distribute verbatim copies of this source file
32as you receive it, in any medium, provided that you conspicuously and
33appropriately publish on each copy a valid copyright notice "Copyright
34 (C) 1988 Free Software Foundation, Inc."; and include following the
35copyright notice a verbatim copy of the above disclaimer of warranty
36and of this License. You may charge a distribution fee for the
37physical act of transferring a copy.
38
39 2. You may modify your copy or copies of this source file or
40any portion of it, and copy and distribute such modifications under
41the terms of Paragraph 1 above, provided that you also do the following:
42
43 a) cause the modified files to carry prominent notices stating
44 that you changed the files and the date of any change; and
45
46 b) cause the whole of any work that you distribute or publish,
47 that in whole or in part contains or is a derivative of this
48 program or any part thereof, to be licensed at no charge to all
49 third parties on terms identical to those contained in this
50 License Agreement (except that you may choose to grant more extensive
51 warranty protection to some or all third parties, at your option).
52
53 c) You may charge a distribution fee for the physical act of
54 transferring a copy, and you may at your option offer warranty
55 protection in exchange for a fee.
56
57Mere aggregation of another unrelated program with this program (or its
58derivative) on a volume of a storage or distribution medium does not bring
59the other program under the scope of these terms.
60
61 3. You may copy and distribute this program or any portion of it in
62compiled, executable or object code form under the terms of Paragraphs
631 and 2 above provided that you do the following:
64
65 a) accompany it with the complete corresponding machine-readable
66 source code, which must be distributed under the terms of
67 Paragraphs 1 and 2 above; or,
68
69 b) accompany it with a written offer, valid for at least three
70 years, to give any third party free (except for a nominal
71 shipping charge) a complete machine-readable copy of the
72 corresponding source code, to be distributed under the terms of
73 Paragraphs 1 and 2 above; or,
74
75 c) accompany it with the information you received as to where the
76 corresponding source code may be obtained. (This alternative is
77 allowed only for noncommercial distribution and only if you
78 received the program in object code or executable form alone.)
79
80For an executable file, complete source code means all the source code for
81all modules it contains; but, as a special exception, it need not include
82source code for modules which are standard libraries that accompany the
83operating system on which the executable file runs.
84
85 4. You may not copy, sublicense, distribute or transfer this program
86except as expressly provided under this License Agreement. Any attempt
87otherwise to copy, sublicense, distribute or transfer this program is void and
88your rights to use the program under this License agreement shall be
89automatically terminated. However, parties who have received computer
90software programs from you with this License Agreement will not have
91their licenses terminated so long as such parties remain in full compliance.
92
93 5. If you wish to incorporate parts of this program into other free
94programs whose distribution conditions are different, write to the Free
95Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet
96worked out a simple rule that can be stated here, but we will often permit
97this. We will be guided by the two goals of preserving the free status of
98all derivatives our free software and of promoting the sharing and reuse of
99software.
100
101
102In other words, you are welcome to use, share and improve this program.
103You are forbidden to forbid anyone else to use, share and improve
104what you give them. Help stamp out software-hoarding! */
105\f
106
107#ifdef USG
108#include <string.h>
109extern char *index();
110#else
111#include <strings.h>
112/*extern char *strchr(), *strrchr(), *memcpy();*/
113#endif
114
115#ifdef __STDC__
116
117/* Missing include files for GNU C. */
118#include <stdlib.h>
119/*typedef int size_t;
120extern void *calloc(int, size_t);
121extern void *malloc(size_t);
122extern void *realloc(void *, size_t);
123extern void free(void *);
124
125extern char *bcopy(), *bzero();*/
126
127#ifdef SOMEDAY
128#define ISALNUM(c) isalnum(c)
129#define ISALPHA(c) isalpha(c)
130#define ISUPPER(c) isupper(c)
131#else
132#define ISALNUM(c) (isascii(c) && isalnum(c))
133#define ISALPHA(c) (isascii(c) && isalpha(c))
134#define ISUPPER(c) (isascii(c) && isupper(c))
135#endif
136
137#else /* ! __STDC__ */
138
139#define const
140typedef int size_t;
141extern char *calloc(), *malloc(), *realloc();
142extern void free();
143
144extern char *bcopy(), *bzero();
145
146#define ISALNUM(c) (isascii(c) && isalnum(c))
147#define ISALPHA(c) (isascii(c) && isalpha(c))
148#define ISUPPER(c) (isascii(c) && isupper(c))
149
150#endif /* ! __STDC__ */
151
152/* 1 means plain parentheses serve as grouping, and backslash
153 parentheses are needed for literal searching.
154 0 means backslash-parentheses are grouping, and plain parentheses
155 are for literal searching. */
156#define RE_NO_BK_PARENS 1
157
158/* 1 means plain | serves as the "or"-operator, and \| is a literal.
159 0 means \| serves as the "or"-operator, and | is a literal. */
160#define RE_NO_BK_VBAR 2
161
162/* 0 means plain + or ? serves as an operator, and \+, \? are literals.
163 1 means \+, \? are operators and plain +, ? are literals. */
164#define RE_BK_PLUS_QM 4
165
166/* 1 means | binds tighter than ^ or $.
167 0 means the contrary. */
168#define RE_TIGHT_VBAR 8
169
170/* 1 means treat \n as an _OR operator
171 0 means treat it as a normal character */
172#define RE_NEWLINE_OR 16
173
174/* 0 means that a special characters (such as *, ^, and $) always have
175 their special meaning regardless of the surrounding context.
176 1 means that special characters may act as normal characters in some
177 contexts. Specifically, this applies to:
178 ^ - only special at the beginning, or after ( or |
179 $ - only special at the end, or before ) or |
180 *, +, ? - only special when not after the beginning, (, or | */
181#define RE_CONTEXT_INDEP_OPS 32
182
183/* Now define combinations of bits for the standard possibilities. */
184#define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS)
185#define RE_SYNTAX_EGREP (RE_SYNTAX_AWK | RE_NEWLINE_OR)
186#define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR)
187#define RE_SYNTAX_EMACS 0
188
189/* The NULL pointer. */
190#define NULL 0
191
192/* Number of bits in an unsigned char. */
193#define CHARBITS 8
194
195/* First integer value that is greater than any character code. */
196#define _NOTCHAR (1 << CHARBITS)
197
198/* INTBITS need not be exact, just a lower bound. */
199#define INTBITS (CHARBITS * sizeof (int))
200
201/* Number of ints required to hold a bit for every character. */
202#define _CHARSET_INTS ((_NOTCHAR + INTBITS - 1) / INTBITS)
203
204/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
205typedef int _charset[_CHARSET_INTS];
206
207/* The regexp is parsed into an array of tokens in postfix form. Some tokens
208 are operators and others are terminal symbols. Most (but not all) of these
209 codes are returned by the lexical analyzer. */
210#ifdef __STDC__
211
212typedef enum
213{
214 _END = -1, /* _END is a terminal symbol that matches the
215 end of input; any value of _END or less in
216 the parse tree is such a symbol. Accepting
217 states of the DFA are those that would have
218 a transition on _END. */
219
220 /* Ordinary character values are terminal symbols that match themselves. */
221
222 _EMPTY = _NOTCHAR, /* _EMPTY is a terminal symbol that matches
223 the empty string. */
224
225 _BACKREF, /* _BACKREF is generated by \<digit>; it
226 it not completely handled. If the scanner
227 detects a transition on backref, it returns
228 a kind of "semi-success" indicating that
229 the match will have to be verified with
230 a backtracking matcher. */
231
232 _BEGLINE, /* _BEGLINE is a terminal symbol that matches
233 the empty string if it is at the beginning
234 of a line. */
235
236 _ALLBEGLINE, /* _ALLBEGLINE is a terminal symbol that
237 matches the empty string if it is at the
238 beginning of a line; _ALLBEGLINE applies
239 to the entire regexp and can only occur
240 as the first token thereof. _ALLBEGLINE
241 never appears in the parse tree; a _BEGLINE
242 is prepended with _CAT to the entire
243 regexp instead. */
244
245 _ENDLINE, /* _ENDLINE is a terminal symbol that matches
246 the empty string if it is at the end of
247 a line. */
248
249 _ALLENDLINE, /* _ALLENDLINE is to _ENDLINE as _ALLBEGLINE
250 is to _BEGLINE. */
251
252 _BEGWORD, /* _BEGWORD is a terminal symbol that matches
253 the empty string if it is at the beginning
254 of a word. */
255
256 _ENDWORD, /* _ENDWORD is a terminal symbol that matches
257 the empty string if it is at the end of
258 a word. */
259
260 _LIMWORD, /* _LIMWORD is a terminal symbol that matches
261 the empty string if it is at the beginning
262 or the end of a word. */
263
264 _NOTLIMWORD, /* _NOTLIMWORD is a terminal symbol that
265 matches the empty string if it is not at
266 the beginning or end of a word. */
267
268 _QMARK, /* _QMARK is an operator of one argument that
269 matches zero or one occurences of its
270 argument. */
271
272 _STAR, /* _STAR is an operator of one argument that
273 matches the Kleene closure (zero or more
274 occurrences) of its argument. */
275
276 _PLUS, /* _PLUS is an operator of one argument that
277 matches the positive closure (one or more
278 occurrences) of its argument. */
279
280 _CAT, /* _CAT is an operator of two arguments that
281 matches the concatenation of its
282 arguments. _CAT is never returned by the
283 lexical analyzer. */
284
285 _OR, /* _OR is an operator of two arguments that
286 matches either of its arguments. */
287
288 _LPAREN, /* _LPAREN never appears in the parse tree,
289 it is only a lexeme. */
290
291 _RPAREN, /* _RPAREN never appears in the parse tree. */
292
293 _SET /* _SET and (and any value greater) is a
294 terminal symbol that matches any of a
295 class of characters. */
296} _token;
297
298#else /* ! __STDC__ */
299
300typedef short _token;
301
302#define _END -1
303#define _EMPTY _NOTCHAR
304#define _BACKREF (_EMPTY + 1)
305#define _BEGLINE (_EMPTY + 2)
306#define _ALLBEGLINE (_EMPTY + 3)
307#define _ENDLINE (_EMPTY + 4)
308#define _ALLENDLINE (_EMPTY + 5)
309#define _BEGWORD (_EMPTY + 6)
310#define _ENDWORD (_EMPTY + 7)
311#define _LIMWORD (_EMPTY + 8)
312#define _NOTLIMWORD (_EMPTY + 9)
313#define _QMARK (_EMPTY + 10)
314#define _STAR (_EMPTY + 11)
315#define _PLUS (_EMPTY + 12)
316#define _CAT (_EMPTY + 13)
317#define _OR (_EMPTY + 14)
318#define _LPAREN (_EMPTY + 15)
319#define _RPAREN (_EMPTY + 16)
320#define _SET (_EMPTY + 17)
321
322#endif /* ! __STDC__ */
323
324/* Sets are stored in an array in the compiled regexp; the index of the
325 array corresponding to a given set token is given by _SET_INDEX(t). */
326#define _SET_INDEX(t) ((t) - _SET)
327
328/* Sometimes characters can only be matched depending on the surrounding
329 context. Such context decisions depend on what the previous character
330 was, and the value of the current (lookahead) character. Context
331 dependent constraints are encoded as 8 bit integers. Each bit that
332 is set indicates that the constraint succeeds in the corresponding
333 context.
334
335 bit 7 - previous and current are newlines
336 bit 6 - previous was newline, current isn't
337 bit 5 - previous wasn't newline, current is
338 bit 4 - neither previous nor current is a newline
339 bit 3 - previous and current are word-constituents
340 bit 2 - previous was word-constituent, current isn't
341 bit 1 - previous wasn't word-constituent, current is
342 bit 0 - neither previous nor current is word-constituent
343
344 Word-constituent characters are those that satisfy isalnum().
345
346 The macro _SUCCEEDS_IN_CONTEXT determines whether a a given constraint
347 succeeds in a particular context. Prevn is true if the previous character
348 was a newline, currn is true if the lookahead character is a newline.
349 Prevl and currl similarly depend upon whether the previous and current
350 characters are word-constituent letters. */
351#define _MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
352 ((constraint) & 1 << ((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4)
353#define _MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
354 ((constraint) & 1 << ((prevl) ? 2 : 0) + ((currl) ? 1 : 0))
355#define _SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
356 (_MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
357 && _MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
358
359/* The following macros give information about what a constraint depends on. */
360#define _PREV_NEWLINE_DEPENDENT(constraint) \
361 (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
362#define _PREV_LETTER_DEPENDENT(constraint) \
363 (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
364
365/* Tokens that match the empty string subject to some constraint actually
366 work by applying that constraint to determine what may follow them,
367 taking into account what has gone before. The following values are
368 the constraints corresponding to the special tokens previously defined. */
369#define _NO_CONSTRAINT 0xff
370#define _BEGLINE_CONSTRAINT 0xcf
371#define _ENDLINE_CONSTRAINT 0xaf
372#define _BEGWORD_CONSTRAINT 0xf2
373#define _ENDWORD_CONSTRAINT 0xf4
374#define _LIMWORD_CONSTRAINT 0xf6
375#define _NOTLIMWORD_CONSTRAINT 0xf9
376
377/* States of the recognizer correspond to sets of positions in the parse
378 tree, together with the constraints under which they may be matched.
379 So a position is encoded as an index into the parse tree together with
380 a constraint. */
381typedef struct
382{
383 unsigned index; /* Index into the parse array. */
384 unsigned constraint; /* Constraint for matching this position. */
385} _position;
386
387/* Sets of positions are stored as arrays. */
388typedef struct
389{
390 _position *elems; /* Elements of this position set. */
391 int nelem; /* Number of elements in this set. */
392} _position_set;
393
394/* A state of the regexp consists of a set of positions, some flags,
395 and the token value of the lowest-numbered position of the state that
396 contains an _END token. */
397typedef struct
398{
399 int hash; /* Hash of the positions of this state. */
400 _position_set elems; /* Positions this state could match. */
401 char newline; /* True if previous state matched newline. */
402 char letter; /* True if previous state matched a letter. */
403 char backref; /* True if this state matches a \<digit>. */
404 unsigned char constraint; /* Constraint for this state to accept. */
405 int first_end; /* Token value of the first _END in elems. */
406} _dfa_state;
407
408/* If an r.e. is at most MUST_MAX characters long, we look for a string which
409 must appear in it; whatever's found is dropped into the struct reg. */
410
411#define MUST_MAX 50
412
413/* A compiled regular expression. */
414struct regexp
415{
416 /* Stuff built by the scanner. */
417 _charset *charsets; /* Array of character sets for _SET tokens. */
418 int cindex; /* Index for adding new charsets. */
419 int calloc; /* Number of charsets currently allocated. */
420
421 /* Stuff built by the parser. */
422 _token *tokens; /* Postfix parse array. */
423 int tindex; /* Index for adding new tokens. */
424 int talloc; /* Number of tokens currently allocated. */
425 int depth; /* Depth required of an evaluation stack
426 used for depth-first traversal of the
427 parse tree. */
428 int nleaves; /* Number of leaves on the parse tree. */
429 int nregexps; /* Count of parallel regexps being built
430 with regparse(). */
431
432 /* Stuff owned by the state builder. */
433 _dfa_state *states; /* States of the regexp. */
434 int sindex; /* Index for adding new states. */
435 int salloc; /* Number of states currently allocated. */
436
437 /* Stuff built by the structure analyzer. */
438 _position_set *follows; /* Array of follow sets, indexed by position
439 index. The follow of a position is the set
440 of positions containing characters that
441 could conceivably follow a character
442 matching the given position in a string
443 matching the regexp. Allocated to the
444 maximum possible position index. */
445 int searchflag; /* True if we are supposed to build a searching
446 as opposed to an exact matcher. A searching
447 matcher finds the first and shortest string
448 matching a regexp anywhere in the buffer,
449 whereas an exact matcher finds the longest
450 string matching, but anchored to the
451 beginning of the buffer. */
452
453 /* Stuff owned by the executor. */
454 int tralloc; /* Number of transition tables that have
455 slots so far. */
456 int trcount; /* Number of transition tables that have
457 actually been built. */
458 int **trans; /* Transition tables for states that can
459 never accept. If the transitions for a
460 state have not yet been computed, or the
461 state could possibly accept, its entry in
462 this table is NULL. */
463 int **realtrans; /* Trans always points to realtrans + 1; this
464 is so trans[-1] can contain NULL. */
465 int **fails; /* Transition tables after failing to accept
466 on a state that potentially could do so. */
467 int *success; /* Table of acceptance conditions used in
468 regexecute and computed in build_state. */
469 int *newlines; /* Transitions on newlines. The entry for a
470 newline in any transition table is always
471 -1 so we can count lines without wasting
472 too many cycles. The transition for a
473 newline is stored separately and handled
474 as a special case. Newline is also used
475 as a sentinel at the end of the buffer. */
476 char must[MUST_MAX];
477 int mustn;
478};
479
480/* Some macros for user access to regexp internals. */
481
482/* ACCEPTING returns true if s could possibly be an accepting state of r. */
483#define ACCEPTING(s, r) ((r).states[s].constraint)
484
485/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
486 specified context. */
487#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, reg) \
488 _SUCCEEDS_IN_CONTEXT((reg).states[state].constraint, \
489 prevn, currn, prevl, currl)
490
491/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
492 regexps that a given state could accept. Parallel regexps are numbered
493 starting at 1. */
494#define FIRST_MATCHING_REGEXP(state, reg) (-(reg).states[state].first_end)
495
496/* Entry points. */
497
498#ifdef __STDC__
499
500/* Regsyntax() takes two arguments; the first sets the syntax bits described
501 earlier in this file, and the second sets the case-folding flag. */
502extern void regsyntax(int, int);
503
504/* Compile the given string of the given length into the given struct regexp.
505 Final argument is a flag specifying whether to build a searching or an
506 exact matcher. */
507extern void regcompile(const char *, size_t, struct regexp *, int);
508
509/* Execute the given struct regexp on the buffer of characters. The
510 first char * points to the beginning, and the second points to the
511 first character after the end of the buffer, which must be a writable
512 place so a sentinel end-of-buffer marker can be stored there. The
513 second-to-last argument is a flag telling whether to allow newlines to
514 be part of a string matching the regexp. The next-to-last argument,
515 if non-NULL, points to a place to increment every time we see a
516 newline. The final argument, if non-NULL, points to a flag that will
517 be set if further examination by a backtracking matcher is needed in
518 order to verify backreferencing; otherwise the flag will be cleared.
519 Returns NULL if no match is found, or a pointer to the first
520 character after the first & shortest matching string in the buffer. */
521extern char *regexecute(struct regexp *, char *, char *, int, int *, int *);
522
523/* Free the storage held by the components of a struct regexp. */
524extern void regfree(struct regexp *);
525
526/* Entry points for people who know what they're doing. */
527
528/* Initialize the components of a struct regexp. */
529extern void reginit(struct regexp *);
530
531/* Incrementally parse a string of given length into a struct regexp. */
532extern void regparse(const char *, size_t, struct regexp *);
533
534/* Analyze a parsed regexp; second argument tells whether to build a searching
535 or an exact matcher. */
536extern void reganalyze(struct regexp *, int);
537
538/* Compute, for each possible character, the transitions out of a given
539 state, storing them in an array of integers. */
540extern void regstate(int, struct regexp *, int []);
541
542/* Error handling. */
543
544/* Regerror() is called by the regexp routines whenever an error occurs. It
545 takes a single argument, a NUL-terminated string describing the error.
546 The default regerror() prints the error message to stderr and exits.
547 The user can provide a different regfree() if so desired. */
548extern void regerror(const char *);
549
550#else /* ! __STDC__ */
551extern void regsyntax(), regcompile(), regfree(), reginit(), regparse();
552extern void reganalyze(), regstate(), regerror();
553extern char *regexecute();
554#endif