Commit | Line | Data |
---|---|---|
e30c5f65 WJ |
1 | /* dfa.h - declarations for GNU deterministic regexp compiler |
2 | Copyright (C) 1988 Free Software Foundation, Inc. | |
3 | Written June, 1988 by Mike Haertel | |
4 | ||
5 | NO WARRANTY | |
6 | ||
7 | BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY | |
8 | NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT | |
9 | WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, | |
10 | RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS" | |
11 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, | |
12 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
13 | FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY | |
14 | AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE | |
15 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR | |
16 | CORRECTION. | |
17 | ||
18 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M. | |
19 | STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY | |
20 | WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE | |
21 | LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR | |
22 | OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE | |
23 | USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR | |
24 | DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR | |
25 | A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS | |
26 | PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH | |
27 | DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. | |
28 | ||
29 | GENERAL PUBLIC LICENSE TO COPY | |
30 | ||
31 | 1. You may copy and distribute verbatim copies of this source file | |
32 | as you receive it, in any medium, provided that you conspicuously and | |
33 | appropriately publish on each copy a valid copyright notice "Copyright | |
34 | (C) 1988 Free Software Foundation, Inc."; and include following the | |
35 | copyright notice a verbatim copy of the above disclaimer of warranty | |
36 | and of this License. You may charge a distribution fee for the | |
37 | physical act of transferring a copy. | |
38 | ||
39 | 2. You may modify your copy or copies of this source file or | |
40 | any portion of it, and copy and distribute such modifications under | |
41 | the terms of Paragraph 1 above, provided that you also do the following: | |
42 | ||
43 | a) cause the modified files to carry prominent notices stating | |
44 | that you changed the files and the date of any change; and | |
45 | ||
46 | b) cause the whole of any work that you distribute or publish, | |
47 | that in whole or in part contains or is a derivative of this | |
48 | program or any part thereof, to be licensed at no charge to all | |
49 | third parties on terms identical to those contained in this | |
50 | License Agreement (except that you may choose to grant more extensive | |
51 | warranty protection to some or all third parties, at your option). | |
52 | ||
53 | c) You may charge a distribution fee for the physical act of | |
54 | transferring a copy, and you may at your option offer warranty | |
55 | protection in exchange for a fee. | |
56 | ||
57 | Mere aggregation of another unrelated program with this program (or its | |
58 | derivative) on a volume of a storage or distribution medium does not bring | |
59 | the other program under the scope of these terms. | |
60 | ||
61 | 3. You may copy and distribute this program or any portion of it in | |
62 | compiled, executable or object code form under the terms of Paragraphs | |
63 | 1 and 2 above provided that you do the following: | |
64 | ||
65 | a) accompany it with the complete corresponding machine-readable | |
66 | source code, which must be distributed under the terms of | |
67 | Paragraphs 1 and 2 above; or, | |
68 | ||
69 | b) accompany it with a written offer, valid for at least three | |
70 | years, to give any third party free (except for a nominal | |
71 | shipping charge) a complete machine-readable copy of the | |
72 | corresponding source code, to be distributed under the terms of | |
73 | Paragraphs 1 and 2 above; or, | |
74 | ||
75 | c) accompany it with the information you received as to where the | |
76 | corresponding source code may be obtained. (This alternative is | |
77 | allowed only for noncommercial distribution and only if you | |
78 | received the program in object code or executable form alone.) | |
79 | ||
80 | For an executable file, complete source code means all the source code for | |
81 | all modules it contains; but, as a special exception, it need not include | |
82 | source code for modules which are standard libraries that accompany the | |
83 | operating system on which the executable file runs. | |
84 | ||
85 | 4. You may not copy, sublicense, distribute or transfer this program | |
86 | except as expressly provided under this License Agreement. Any attempt | |
87 | otherwise to copy, sublicense, distribute or transfer this program is void and | |
88 | your rights to use the program under this License agreement shall be | |
89 | automatically terminated. However, parties who have received computer | |
90 | software programs from you with this License Agreement will not have | |
91 | their licenses terminated so long as such parties remain in full compliance. | |
92 | ||
93 | 5. If you wish to incorporate parts of this program into other free | |
94 | programs whose distribution conditions are different, write to the Free | |
95 | Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet | |
96 | worked out a simple rule that can be stated here, but we will often permit | |
97 | this. We will be guided by the two goals of preserving the free status of | |
98 | all derivatives our free software and of promoting the sharing and reuse of | |
99 | software. | |
100 | ||
101 | ||
102 | In other words, you are welcome to use, share and improve this program. | |
103 | You are forbidden to forbid anyone else to use, share and improve | |
104 | what you give them. Help stamp out software-hoarding! */ | |
105 | \f | |
106 | ||
107 | #ifdef USG | |
108 | #include <string.h> | |
109 | extern char *index(); | |
110 | #else | |
111 | #include <strings.h> | |
112 | /*extern char *strchr(), *strrchr(), *memcpy();*/ | |
113 | #endif | |
114 | ||
115 | #ifdef __STDC__ | |
116 | ||
117 | /* Missing include files for GNU C. */ | |
118 | #include <stdlib.h> | |
119 | /*typedef int size_t; | |
120 | extern void *calloc(int, size_t); | |
121 | extern void *malloc(size_t); | |
122 | extern void *realloc(void *, size_t); | |
123 | extern void free(void *); | |
124 | ||
125 | extern char *bcopy(), *bzero();*/ | |
126 | ||
127 | #ifdef SOMEDAY | |
128 | #define ISALNUM(c) isalnum(c) | |
129 | #define ISALPHA(c) isalpha(c) | |
130 | #define ISUPPER(c) isupper(c) | |
131 | #else | |
132 | #define ISALNUM(c) (isascii(c) && isalnum(c)) | |
133 | #define ISALPHA(c) (isascii(c) && isalpha(c)) | |
134 | #define ISUPPER(c) (isascii(c) && isupper(c)) | |
135 | #endif | |
136 | ||
137 | #else /* ! __STDC__ */ | |
138 | ||
139 | #define const | |
140 | typedef int size_t; | |
141 | extern char *calloc(), *malloc(), *realloc(); | |
142 | extern void free(); | |
143 | ||
144 | extern char *bcopy(), *bzero(); | |
145 | ||
146 | #define ISALNUM(c) (isascii(c) && isalnum(c)) | |
147 | #define ISALPHA(c) (isascii(c) && isalpha(c)) | |
148 | #define ISUPPER(c) (isascii(c) && isupper(c)) | |
149 | ||
150 | #endif /* ! __STDC__ */ | |
151 | ||
152 | /* 1 means plain parentheses serve as grouping, and backslash | |
153 | parentheses are needed for literal searching. | |
154 | 0 means backslash-parentheses are grouping, and plain parentheses | |
155 | are for literal searching. */ | |
156 | #define RE_NO_BK_PARENS 1 | |
157 | ||
158 | /* 1 means plain | serves as the "or"-operator, and \| is a literal. | |
159 | 0 means \| serves as the "or"-operator, and | is a literal. */ | |
160 | #define RE_NO_BK_VBAR 2 | |
161 | ||
162 | /* 0 means plain + or ? serves as an operator, and \+, \? are literals. | |
163 | 1 means \+, \? are operators and plain +, ? are literals. */ | |
164 | #define RE_BK_PLUS_QM 4 | |
165 | ||
166 | /* 1 means | binds tighter than ^ or $. | |
167 | 0 means the contrary. */ | |
168 | #define RE_TIGHT_VBAR 8 | |
169 | ||
170 | /* 1 means treat \n as an _OR operator | |
171 | 0 means treat it as a normal character */ | |
172 | #define RE_NEWLINE_OR 16 | |
173 | ||
174 | /* 0 means that a special characters (such as *, ^, and $) always have | |
175 | their special meaning regardless of the surrounding context. | |
176 | 1 means that special characters may act as normal characters in some | |
177 | contexts. Specifically, this applies to: | |
178 | ^ - only special at the beginning, or after ( or | | |
179 | $ - only special at the end, or before ) or | | |
180 | *, +, ? - only special when not after the beginning, (, or | */ | |
181 | #define RE_CONTEXT_INDEP_OPS 32 | |
182 | ||
183 | /* Now define combinations of bits for the standard possibilities. */ | |
184 | #define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS) | |
185 | #define RE_SYNTAX_EGREP (RE_SYNTAX_AWK | RE_NEWLINE_OR) | |
186 | #define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR) | |
187 | #define RE_SYNTAX_EMACS 0 | |
188 | ||
189 | /* The NULL pointer. */ | |
190 | #define NULL 0 | |
191 | ||
192 | /* Number of bits in an unsigned char. */ | |
193 | #define CHARBITS 8 | |
194 | ||
195 | /* First integer value that is greater than any character code. */ | |
196 | #define _NOTCHAR (1 << CHARBITS) | |
197 | ||
198 | /* INTBITS need not be exact, just a lower bound. */ | |
199 | #define INTBITS (CHARBITS * sizeof (int)) | |
200 | ||
201 | /* Number of ints required to hold a bit for every character. */ | |
202 | #define _CHARSET_INTS ((_NOTCHAR + INTBITS - 1) / INTBITS) | |
203 | ||
204 | /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ | |
205 | typedef int _charset[_CHARSET_INTS]; | |
206 | ||
207 | /* The regexp is parsed into an array of tokens in postfix form. Some tokens | |
208 | are operators and others are terminal symbols. Most (but not all) of these | |
209 | codes are returned by the lexical analyzer. */ | |
210 | #ifdef __STDC__ | |
211 | ||
212 | typedef enum | |
213 | { | |
214 | _END = -1, /* _END is a terminal symbol that matches the | |
215 | end of input; any value of _END or less in | |
216 | the parse tree is such a symbol. Accepting | |
217 | states of the DFA are those that would have | |
218 | a transition on _END. */ | |
219 | ||
220 | /* Ordinary character values are terminal symbols that match themselves. */ | |
221 | ||
222 | _EMPTY = _NOTCHAR, /* _EMPTY is a terminal symbol that matches | |
223 | the empty string. */ | |
224 | ||
225 | _BACKREF, /* _BACKREF is generated by \<digit>; it | |
226 | it not completely handled. If the scanner | |
227 | detects a transition on backref, it returns | |
228 | a kind of "semi-success" indicating that | |
229 | the match will have to be verified with | |
230 | a backtracking matcher. */ | |
231 | ||
232 | _BEGLINE, /* _BEGLINE is a terminal symbol that matches | |
233 | the empty string if it is at the beginning | |
234 | of a line. */ | |
235 | ||
236 | _ALLBEGLINE, /* _ALLBEGLINE is a terminal symbol that | |
237 | matches the empty string if it is at the | |
238 | beginning of a line; _ALLBEGLINE applies | |
239 | to the entire regexp and can only occur | |
240 | as the first token thereof. _ALLBEGLINE | |
241 | never appears in the parse tree; a _BEGLINE | |
242 | is prepended with _CAT to the entire | |
243 | regexp instead. */ | |
244 | ||
245 | _ENDLINE, /* _ENDLINE is a terminal symbol that matches | |
246 | the empty string if it is at the end of | |
247 | a line. */ | |
248 | ||
249 | _ALLENDLINE, /* _ALLENDLINE is to _ENDLINE as _ALLBEGLINE | |
250 | is to _BEGLINE. */ | |
251 | ||
252 | _BEGWORD, /* _BEGWORD is a terminal symbol that matches | |
253 | the empty string if it is at the beginning | |
254 | of a word. */ | |
255 | ||
256 | _ENDWORD, /* _ENDWORD is a terminal symbol that matches | |
257 | the empty string if it is at the end of | |
258 | a word. */ | |
259 | ||
260 | _LIMWORD, /* _LIMWORD is a terminal symbol that matches | |
261 | the empty string if it is at the beginning | |
262 | or the end of a word. */ | |
263 | ||
264 | _NOTLIMWORD, /* _NOTLIMWORD is a terminal symbol that | |
265 | matches the empty string if it is not at | |
266 | the beginning or end of a word. */ | |
267 | ||
268 | _QMARK, /* _QMARK is an operator of one argument that | |
269 | matches zero or one occurences of its | |
270 | argument. */ | |
271 | ||
272 | _STAR, /* _STAR is an operator of one argument that | |
273 | matches the Kleene closure (zero or more | |
274 | occurrences) of its argument. */ | |
275 | ||
276 | _PLUS, /* _PLUS is an operator of one argument that | |
277 | matches the positive closure (one or more | |
278 | occurrences) of its argument. */ | |
279 | ||
280 | _CAT, /* _CAT is an operator of two arguments that | |
281 | matches the concatenation of its | |
282 | arguments. _CAT is never returned by the | |
283 | lexical analyzer. */ | |
284 | ||
285 | _OR, /* _OR is an operator of two arguments that | |
286 | matches either of its arguments. */ | |
287 | ||
288 | _LPAREN, /* _LPAREN never appears in the parse tree, | |
289 | it is only a lexeme. */ | |
290 | ||
291 | _RPAREN, /* _RPAREN never appears in the parse tree. */ | |
292 | ||
293 | _SET /* _SET and (and any value greater) is a | |
294 | terminal symbol that matches any of a | |
295 | class of characters. */ | |
296 | } _token; | |
297 | ||
298 | #else /* ! __STDC__ */ | |
299 | ||
300 | typedef short _token; | |
301 | ||
302 | #define _END -1 | |
303 | #define _EMPTY _NOTCHAR | |
304 | #define _BACKREF (_EMPTY + 1) | |
305 | #define _BEGLINE (_EMPTY + 2) | |
306 | #define _ALLBEGLINE (_EMPTY + 3) | |
307 | #define _ENDLINE (_EMPTY + 4) | |
308 | #define _ALLENDLINE (_EMPTY + 5) | |
309 | #define _BEGWORD (_EMPTY + 6) | |
310 | #define _ENDWORD (_EMPTY + 7) | |
311 | #define _LIMWORD (_EMPTY + 8) | |
312 | #define _NOTLIMWORD (_EMPTY + 9) | |
313 | #define _QMARK (_EMPTY + 10) | |
314 | #define _STAR (_EMPTY + 11) | |
315 | #define _PLUS (_EMPTY + 12) | |
316 | #define _CAT (_EMPTY + 13) | |
317 | #define _OR (_EMPTY + 14) | |
318 | #define _LPAREN (_EMPTY + 15) | |
319 | #define _RPAREN (_EMPTY + 16) | |
320 | #define _SET (_EMPTY + 17) | |
321 | ||
322 | #endif /* ! __STDC__ */ | |
323 | ||
324 | /* Sets are stored in an array in the compiled regexp; the index of the | |
325 | array corresponding to a given set token is given by _SET_INDEX(t). */ | |
326 | #define _SET_INDEX(t) ((t) - _SET) | |
327 | ||
328 | /* Sometimes characters can only be matched depending on the surrounding | |
329 | context. Such context decisions depend on what the previous character | |
330 | was, and the value of the current (lookahead) character. Context | |
331 | dependent constraints are encoded as 8 bit integers. Each bit that | |
332 | is set indicates that the constraint succeeds in the corresponding | |
333 | context. | |
334 | ||
335 | bit 7 - previous and current are newlines | |
336 | bit 6 - previous was newline, current isn't | |
337 | bit 5 - previous wasn't newline, current is | |
338 | bit 4 - neither previous nor current is a newline | |
339 | bit 3 - previous and current are word-constituents | |
340 | bit 2 - previous was word-constituent, current isn't | |
341 | bit 1 - previous wasn't word-constituent, current is | |
342 | bit 0 - neither previous nor current is word-constituent | |
343 | ||
344 | Word-constituent characters are those that satisfy isalnum(). | |
345 | ||
346 | The macro _SUCCEEDS_IN_CONTEXT determines whether a a given constraint | |
347 | succeeds in a particular context. Prevn is true if the previous character | |
348 | was a newline, currn is true if the lookahead character is a newline. | |
349 | Prevl and currl similarly depend upon whether the previous and current | |
350 | characters are word-constituent letters. */ | |
351 | #define _MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ | |
352 | ((constraint) & 1 << ((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4) | |
353 | #define _MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ | |
354 | ((constraint) & 1 << ((prevl) ? 2 : 0) + ((currl) ? 1 : 0)) | |
355 | #define _SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ | |
356 | (_MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ | |
357 | && _MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) | |
358 | ||
359 | /* The following macros give information about what a constraint depends on. */ | |
360 | #define _PREV_NEWLINE_DEPENDENT(constraint) \ | |
361 | (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) | |
362 | #define _PREV_LETTER_DEPENDENT(constraint) \ | |
363 | (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) | |
364 | ||
365 | /* Tokens that match the empty string subject to some constraint actually | |
366 | work by applying that constraint to determine what may follow them, | |
367 | taking into account what has gone before. The following values are | |
368 | the constraints corresponding to the special tokens previously defined. */ | |
369 | #define _NO_CONSTRAINT 0xff | |
370 | #define _BEGLINE_CONSTRAINT 0xcf | |
371 | #define _ENDLINE_CONSTRAINT 0xaf | |
372 | #define _BEGWORD_CONSTRAINT 0xf2 | |
373 | #define _ENDWORD_CONSTRAINT 0xf4 | |
374 | #define _LIMWORD_CONSTRAINT 0xf6 | |
375 | #define _NOTLIMWORD_CONSTRAINT 0xf9 | |
376 | ||
377 | /* States of the recognizer correspond to sets of positions in the parse | |
378 | tree, together with the constraints under which they may be matched. | |
379 | So a position is encoded as an index into the parse tree together with | |
380 | a constraint. */ | |
381 | typedef struct | |
382 | { | |
383 | unsigned index; /* Index into the parse array. */ | |
384 | unsigned constraint; /* Constraint for matching this position. */ | |
385 | } _position; | |
386 | ||
387 | /* Sets of positions are stored as arrays. */ | |
388 | typedef struct | |
389 | { | |
390 | _position *elems; /* Elements of this position set. */ | |
391 | int nelem; /* Number of elements in this set. */ | |
392 | } _position_set; | |
393 | ||
394 | /* A state of the regexp consists of a set of positions, some flags, | |
395 | and the token value of the lowest-numbered position of the state that | |
396 | contains an _END token. */ | |
397 | typedef struct | |
398 | { | |
399 | int hash; /* Hash of the positions of this state. */ | |
400 | _position_set elems; /* Positions this state could match. */ | |
401 | char newline; /* True if previous state matched newline. */ | |
402 | char letter; /* True if previous state matched a letter. */ | |
403 | char backref; /* True if this state matches a \<digit>. */ | |
404 | unsigned char constraint; /* Constraint for this state to accept. */ | |
405 | int first_end; /* Token value of the first _END in elems. */ | |
406 | } _dfa_state; | |
407 | ||
408 | /* If an r.e. is at most MUST_MAX characters long, we look for a string which | |
409 | must appear in it; whatever's found is dropped into the struct reg. */ | |
410 | ||
411 | #define MUST_MAX 50 | |
412 | ||
413 | /* A compiled regular expression. */ | |
414 | struct regexp | |
415 | { | |
416 | /* Stuff built by the scanner. */ | |
417 | _charset *charsets; /* Array of character sets for _SET tokens. */ | |
418 | int cindex; /* Index for adding new charsets. */ | |
419 | int calloc; /* Number of charsets currently allocated. */ | |
420 | ||
421 | /* Stuff built by the parser. */ | |
422 | _token *tokens; /* Postfix parse array. */ | |
423 | int tindex; /* Index for adding new tokens. */ | |
424 | int talloc; /* Number of tokens currently allocated. */ | |
425 | int depth; /* Depth required of an evaluation stack | |
426 | used for depth-first traversal of the | |
427 | parse tree. */ | |
428 | int nleaves; /* Number of leaves on the parse tree. */ | |
429 | int nregexps; /* Count of parallel regexps being built | |
430 | with regparse(). */ | |
431 | ||
432 | /* Stuff owned by the state builder. */ | |
433 | _dfa_state *states; /* States of the regexp. */ | |
434 | int sindex; /* Index for adding new states. */ | |
435 | int salloc; /* Number of states currently allocated. */ | |
436 | ||
437 | /* Stuff built by the structure analyzer. */ | |
438 | _position_set *follows; /* Array of follow sets, indexed by position | |
439 | index. The follow of a position is the set | |
440 | of positions containing characters that | |
441 | could conceivably follow a character | |
442 | matching the given position in a string | |
443 | matching the regexp. Allocated to the | |
444 | maximum possible position index. */ | |
445 | int searchflag; /* True if we are supposed to build a searching | |
446 | as opposed to an exact matcher. A searching | |
447 | matcher finds the first and shortest string | |
448 | matching a regexp anywhere in the buffer, | |
449 | whereas an exact matcher finds the longest | |
450 | string matching, but anchored to the | |
451 | beginning of the buffer. */ | |
452 | ||
453 | /* Stuff owned by the executor. */ | |
454 | int tralloc; /* Number of transition tables that have | |
455 | slots so far. */ | |
456 | int trcount; /* Number of transition tables that have | |
457 | actually been built. */ | |
458 | int **trans; /* Transition tables for states that can | |
459 | never accept. If the transitions for a | |
460 | state have not yet been computed, or the | |
461 | state could possibly accept, its entry in | |
462 | this table is NULL. */ | |
463 | int **realtrans; /* Trans always points to realtrans + 1; this | |
464 | is so trans[-1] can contain NULL. */ | |
465 | int **fails; /* Transition tables after failing to accept | |
466 | on a state that potentially could do so. */ | |
467 | int *success; /* Table of acceptance conditions used in | |
468 | regexecute and computed in build_state. */ | |
469 | int *newlines; /* Transitions on newlines. The entry for a | |
470 | newline in any transition table is always | |
471 | -1 so we can count lines without wasting | |
472 | too many cycles. The transition for a | |
473 | newline is stored separately and handled | |
474 | as a special case. Newline is also used | |
475 | as a sentinel at the end of the buffer. */ | |
476 | char must[MUST_MAX]; | |
477 | int mustn; | |
478 | }; | |
479 | ||
480 | /* Some macros for user access to regexp internals. */ | |
481 | ||
482 | /* ACCEPTING returns true if s could possibly be an accepting state of r. */ | |
483 | #define ACCEPTING(s, r) ((r).states[s].constraint) | |
484 | ||
485 | /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the | |
486 | specified context. */ | |
487 | #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, reg) \ | |
488 | _SUCCEEDS_IN_CONTEXT((reg).states[state].constraint, \ | |
489 | prevn, currn, prevl, currl) | |
490 | ||
491 | /* FIRST_MATCHING_REGEXP returns the index number of the first of parallel | |
492 | regexps that a given state could accept. Parallel regexps are numbered | |
493 | starting at 1. */ | |
494 | #define FIRST_MATCHING_REGEXP(state, reg) (-(reg).states[state].first_end) | |
495 | ||
496 | /* Entry points. */ | |
497 | ||
498 | #ifdef __STDC__ | |
499 | ||
500 | /* Regsyntax() takes two arguments; the first sets the syntax bits described | |
501 | earlier in this file, and the second sets the case-folding flag. */ | |
502 | extern void regsyntax(int, int); | |
503 | ||
504 | /* Compile the given string of the given length into the given struct regexp. | |
505 | Final argument is a flag specifying whether to build a searching or an | |
506 | exact matcher. */ | |
507 | extern void regcompile(const char *, size_t, struct regexp *, int); | |
508 | ||
509 | /* Execute the given struct regexp on the buffer of characters. The | |
510 | first char * points to the beginning, and the second points to the | |
511 | first character after the end of the buffer, which must be a writable | |
512 | place so a sentinel end-of-buffer marker can be stored there. The | |
513 | second-to-last argument is a flag telling whether to allow newlines to | |
514 | be part of a string matching the regexp. The next-to-last argument, | |
515 | if non-NULL, points to a place to increment every time we see a | |
516 | newline. The final argument, if non-NULL, points to a flag that will | |
517 | be set if further examination by a backtracking matcher is needed in | |
518 | order to verify backreferencing; otherwise the flag will be cleared. | |
519 | Returns NULL if no match is found, or a pointer to the first | |
520 | character after the first & shortest matching string in the buffer. */ | |
521 | extern char *regexecute(struct regexp *, char *, char *, int, int *, int *); | |
522 | ||
523 | /* Free the storage held by the components of a struct regexp. */ | |
524 | extern void regfree(struct regexp *); | |
525 | ||
526 | /* Entry points for people who know what they're doing. */ | |
527 | ||
528 | /* Initialize the components of a struct regexp. */ | |
529 | extern void reginit(struct regexp *); | |
530 | ||
531 | /* Incrementally parse a string of given length into a struct regexp. */ | |
532 | extern void regparse(const char *, size_t, struct regexp *); | |
533 | ||
534 | /* Analyze a parsed regexp; second argument tells whether to build a searching | |
535 | or an exact matcher. */ | |
536 | extern void reganalyze(struct regexp *, int); | |
537 | ||
538 | /* Compute, for each possible character, the transitions out of a given | |
539 | state, storing them in an array of integers. */ | |
540 | extern void regstate(int, struct regexp *, int []); | |
541 | ||
542 | /* Error handling. */ | |
543 | ||
544 | /* Regerror() is called by the regexp routines whenever an error occurs. It | |
545 | takes a single argument, a NUL-terminated string describing the error. | |
546 | The default regerror() prints the error message to stderr and exits. | |
547 | The user can provide a different regfree() if so desired. */ | |
548 | extern void regerror(const char *); | |
549 | ||
550 | #else /* ! __STDC__ */ | |
551 | extern void regsyntax(), regcompile(), regfree(), reginit(), regparse(); | |
552 | extern void reganalyze(), regstate(), regerror(); | |
553 | extern char *regexecute(); | |
554 | #endif |