| 1 | /* dfa.h - declarations for GNU deterministic regexp compiler |
| 2 | Copyright (C) 1988 Free Software Foundation, Inc. |
| 3 | Written June, 1988 by Mike Haertel |
| 4 | |
| 5 | NO WARRANTY |
| 6 | |
| 7 | BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY |
| 8 | NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT |
| 9 | WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, |
| 10 | RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS" |
| 11 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, |
| 12 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 13 | FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY |
| 14 | AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE |
| 15 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR |
| 16 | CORRECTION. |
| 17 | |
| 18 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M. |
| 19 | STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY |
| 20 | WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE |
| 21 | LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR |
| 22 | OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE |
| 23 | USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR |
| 24 | DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR |
| 25 | A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS |
| 26 | PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| 27 | DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. |
| 28 | |
| 29 | GENERAL PUBLIC LICENSE TO COPY |
| 30 | |
| 31 | 1. You may copy and distribute verbatim copies of this source file |
| 32 | as you receive it, in any medium, provided that you conspicuously and |
| 33 | appropriately publish on each copy a valid copyright notice "Copyright |
| 34 | (C) 1988 Free Software Foundation, Inc."; and include following the |
| 35 | copyright notice a verbatim copy of the above disclaimer of warranty |
| 36 | and of this License. You may charge a distribution fee for the |
| 37 | physical act of transferring a copy. |
| 38 | |
| 39 | 2. You may modify your copy or copies of this source file or |
| 40 | any portion of it, and copy and distribute such modifications under |
| 41 | the terms of Paragraph 1 above, provided that you also do the following: |
| 42 | |
| 43 | a) cause the modified files to carry prominent notices stating |
| 44 | that you changed the files and the date of any change; and |
| 45 | |
| 46 | b) cause the whole of any work that you distribute or publish, |
| 47 | that in whole or in part contains or is a derivative of this |
| 48 | program or any part thereof, to be licensed at no charge to all |
| 49 | third parties on terms identical to those contained in this |
| 50 | License Agreement (except that you may choose to grant more extensive |
| 51 | warranty protection to some or all third parties, at your option). |
| 52 | |
| 53 | c) You may charge a distribution fee for the physical act of |
| 54 | transferring a copy, and you may at your option offer warranty |
| 55 | protection in exchange for a fee. |
| 56 | |
| 57 | Mere aggregation of another unrelated program with this program (or its |
| 58 | derivative) on a volume of a storage or distribution medium does not bring |
| 59 | the other program under the scope of these terms. |
| 60 | |
| 61 | 3. You may copy and distribute this program or any portion of it in |
| 62 | compiled, executable or object code form under the terms of Paragraphs |
| 63 | 1 and 2 above provided that you do the following: |
| 64 | |
| 65 | a) accompany it with the complete corresponding machine-readable |
| 66 | source code, which must be distributed under the terms of |
| 67 | Paragraphs 1 and 2 above; or, |
| 68 | |
| 69 | b) accompany it with a written offer, valid for at least three |
| 70 | years, to give any third party free (except for a nominal |
| 71 | shipping charge) a complete machine-readable copy of the |
| 72 | corresponding source code, to be distributed under the terms of |
| 73 | Paragraphs 1 and 2 above; or, |
| 74 | |
| 75 | c) accompany it with the information you received as to where the |
| 76 | corresponding source code may be obtained. (This alternative is |
| 77 | allowed only for noncommercial distribution and only if you |
| 78 | received the program in object code or executable form alone.) |
| 79 | |
| 80 | For an executable file, complete source code means all the source code for |
| 81 | all modules it contains; but, as a special exception, it need not include |
| 82 | source code for modules which are standard libraries that accompany the |
| 83 | operating system on which the executable file runs. |
| 84 | |
| 85 | 4. You may not copy, sublicense, distribute or transfer this program |
| 86 | except as expressly provided under this License Agreement. Any attempt |
| 87 | otherwise to copy, sublicense, distribute or transfer this program is void and |
| 88 | your rights to use the program under this License agreement shall be |
| 89 | automatically terminated. However, parties who have received computer |
| 90 | software programs from you with this License Agreement will not have |
| 91 | their licenses terminated so long as such parties remain in full compliance. |
| 92 | |
| 93 | 5. If you wish to incorporate parts of this program into other free |
| 94 | programs whose distribution conditions are different, write to the Free |
| 95 | Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet |
| 96 | worked out a simple rule that can be stated here, but we will often permit |
| 97 | this. We will be guided by the two goals of preserving the free status of |
| 98 | all derivatives our free software and of promoting the sharing and reuse of |
| 99 | software. |
| 100 | |
| 101 | |
| 102 | In other words, you are welcome to use, share and improve this program. |
| 103 | You are forbidden to forbid anyone else to use, share and improve |
| 104 | what you give them. Help stamp out software-hoarding! */ |
| 105 | \f |
| 106 | |
| 107 | #ifdef USG |
| 108 | #include <string.h> |
| 109 | extern char *index(); |
| 110 | #else |
| 111 | #include <strings.h> |
| 112 | /*extern char *strchr(), *strrchr(), *memcpy();*/ |
| 113 | #endif |
| 114 | |
| 115 | #ifdef __STDC__ |
| 116 | |
| 117 | /* Missing include files for GNU C. */ |
| 118 | #include <stdlib.h> |
| 119 | /*typedef int size_t; |
| 120 | extern void *calloc(int, size_t); |
| 121 | extern void *malloc(size_t); |
| 122 | extern void *realloc(void *, size_t); |
| 123 | extern void free(void *); |
| 124 | |
| 125 | extern char *bcopy(), *bzero();*/ |
| 126 | |
| 127 | #ifdef SOMEDAY |
| 128 | #define ISALNUM(c) isalnum(c) |
| 129 | #define ISALPHA(c) isalpha(c) |
| 130 | #define ISUPPER(c) isupper(c) |
| 131 | #else |
| 132 | #define ISALNUM(c) (isascii(c) && isalnum(c)) |
| 133 | #define ISALPHA(c) (isascii(c) && isalpha(c)) |
| 134 | #define ISUPPER(c) (isascii(c) && isupper(c)) |
| 135 | #endif |
| 136 | |
| 137 | #else /* ! __STDC__ */ |
| 138 | |
| 139 | #define const |
| 140 | typedef int size_t; |
| 141 | extern char *calloc(), *malloc(), *realloc(); |
| 142 | extern void free(); |
| 143 | |
| 144 | extern char *bcopy(), *bzero(); |
| 145 | |
| 146 | #define ISALNUM(c) (isascii(c) && isalnum(c)) |
| 147 | #define ISALPHA(c) (isascii(c) && isalpha(c)) |
| 148 | #define ISUPPER(c) (isascii(c) && isupper(c)) |
| 149 | |
| 150 | #endif /* ! __STDC__ */ |
| 151 | |
| 152 | /* 1 means plain parentheses serve as grouping, and backslash |
| 153 | parentheses are needed for literal searching. |
| 154 | 0 means backslash-parentheses are grouping, and plain parentheses |
| 155 | are for literal searching. */ |
| 156 | #define RE_NO_BK_PARENS 1 |
| 157 | |
| 158 | /* 1 means plain | serves as the "or"-operator, and \| is a literal. |
| 159 | 0 means \| serves as the "or"-operator, and | is a literal. */ |
| 160 | #define RE_NO_BK_VBAR 2 |
| 161 | |
| 162 | /* 0 means plain + or ? serves as an operator, and \+, \? are literals. |
| 163 | 1 means \+, \? are operators and plain +, ? are literals. */ |
| 164 | #define RE_BK_PLUS_QM 4 |
| 165 | |
| 166 | /* 1 means | binds tighter than ^ or $. |
| 167 | 0 means the contrary. */ |
| 168 | #define RE_TIGHT_VBAR 8 |
| 169 | |
| 170 | /* 1 means treat \n as an _OR operator |
| 171 | 0 means treat it as a normal character */ |
| 172 | #define RE_NEWLINE_OR 16 |
| 173 | |
| 174 | /* 0 means that a special characters (such as *, ^, and $) always have |
| 175 | their special meaning regardless of the surrounding context. |
| 176 | 1 means that special characters may act as normal characters in some |
| 177 | contexts. Specifically, this applies to: |
| 178 | ^ - only special at the beginning, or after ( or | |
| 179 | $ - only special at the end, or before ) or | |
| 180 | *, +, ? - only special when not after the beginning, (, or | */ |
| 181 | #define RE_CONTEXT_INDEP_OPS 32 |
| 182 | |
| 183 | /* Now define combinations of bits for the standard possibilities. */ |
| 184 | #define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS) |
| 185 | #define RE_SYNTAX_EGREP (RE_SYNTAX_AWK | RE_NEWLINE_OR) |
| 186 | #define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR) |
| 187 | #define RE_SYNTAX_EMACS 0 |
| 188 | |
| 189 | /* The NULL pointer. */ |
| 190 | #define NULL 0 |
| 191 | |
| 192 | /* Number of bits in an unsigned char. */ |
| 193 | #define CHARBITS 8 |
| 194 | |
| 195 | /* First integer value that is greater than any character code. */ |
| 196 | #define _NOTCHAR (1 << CHARBITS) |
| 197 | |
| 198 | /* INTBITS need not be exact, just a lower bound. */ |
| 199 | #define INTBITS (CHARBITS * sizeof (int)) |
| 200 | |
| 201 | /* Number of ints required to hold a bit for every character. */ |
| 202 | #define _CHARSET_INTS ((_NOTCHAR + INTBITS - 1) / INTBITS) |
| 203 | |
| 204 | /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ |
| 205 | typedef int _charset[_CHARSET_INTS]; |
| 206 | |
| 207 | /* The regexp is parsed into an array of tokens in postfix form. Some tokens |
| 208 | are operators and others are terminal symbols. Most (but not all) of these |
| 209 | codes are returned by the lexical analyzer. */ |
| 210 | #ifdef __STDC__ |
| 211 | |
| 212 | typedef enum |
| 213 | { |
| 214 | _END = -1, /* _END is a terminal symbol that matches the |
| 215 | end of input; any value of _END or less in |
| 216 | the parse tree is such a symbol. Accepting |
| 217 | states of the DFA are those that would have |
| 218 | a transition on _END. */ |
| 219 | |
| 220 | /* Ordinary character values are terminal symbols that match themselves. */ |
| 221 | |
| 222 | _EMPTY = _NOTCHAR, /* _EMPTY is a terminal symbol that matches |
| 223 | the empty string. */ |
| 224 | |
| 225 | _BACKREF, /* _BACKREF is generated by \<digit>; it |
| 226 | it not completely handled. If the scanner |
| 227 | detects a transition on backref, it returns |
| 228 | a kind of "semi-success" indicating that |
| 229 | the match will have to be verified with |
| 230 | a backtracking matcher. */ |
| 231 | |
| 232 | _BEGLINE, /* _BEGLINE is a terminal symbol that matches |
| 233 | the empty string if it is at the beginning |
| 234 | of a line. */ |
| 235 | |
| 236 | _ALLBEGLINE, /* _ALLBEGLINE is a terminal symbol that |
| 237 | matches the empty string if it is at the |
| 238 | beginning of a line; _ALLBEGLINE applies |
| 239 | to the entire regexp and can only occur |
| 240 | as the first token thereof. _ALLBEGLINE |
| 241 | never appears in the parse tree; a _BEGLINE |
| 242 | is prepended with _CAT to the entire |
| 243 | regexp instead. */ |
| 244 | |
| 245 | _ENDLINE, /* _ENDLINE is a terminal symbol that matches |
| 246 | the empty string if it is at the end of |
| 247 | a line. */ |
| 248 | |
| 249 | _ALLENDLINE, /* _ALLENDLINE is to _ENDLINE as _ALLBEGLINE |
| 250 | is to _BEGLINE. */ |
| 251 | |
| 252 | _BEGWORD, /* _BEGWORD is a terminal symbol that matches |
| 253 | the empty string if it is at the beginning |
| 254 | of a word. */ |
| 255 | |
| 256 | _ENDWORD, /* _ENDWORD is a terminal symbol that matches |
| 257 | the empty string if it is at the end of |
| 258 | a word. */ |
| 259 | |
| 260 | _LIMWORD, /* _LIMWORD is a terminal symbol that matches |
| 261 | the empty string if it is at the beginning |
| 262 | or the end of a word. */ |
| 263 | |
| 264 | _NOTLIMWORD, /* _NOTLIMWORD is a terminal symbol that |
| 265 | matches the empty string if it is not at |
| 266 | the beginning or end of a word. */ |
| 267 | |
| 268 | _QMARK, /* _QMARK is an operator of one argument that |
| 269 | matches zero or one occurences of its |
| 270 | argument. */ |
| 271 | |
| 272 | _STAR, /* _STAR is an operator of one argument that |
| 273 | matches the Kleene closure (zero or more |
| 274 | occurrences) of its argument. */ |
| 275 | |
| 276 | _PLUS, /* _PLUS is an operator of one argument that |
| 277 | matches the positive closure (one or more |
| 278 | occurrences) of its argument. */ |
| 279 | |
| 280 | _CAT, /* _CAT is an operator of two arguments that |
| 281 | matches the concatenation of its |
| 282 | arguments. _CAT is never returned by the |
| 283 | lexical analyzer. */ |
| 284 | |
| 285 | _OR, /* _OR is an operator of two arguments that |
| 286 | matches either of its arguments. */ |
| 287 | |
| 288 | _LPAREN, /* _LPAREN never appears in the parse tree, |
| 289 | it is only a lexeme. */ |
| 290 | |
| 291 | _RPAREN, /* _RPAREN never appears in the parse tree. */ |
| 292 | |
| 293 | _SET /* _SET and (and any value greater) is a |
| 294 | terminal symbol that matches any of a |
| 295 | class of characters. */ |
| 296 | } _token; |
| 297 | |
| 298 | #else /* ! __STDC__ */ |
| 299 | |
| 300 | typedef short _token; |
| 301 | |
| 302 | #define _END -1 |
| 303 | #define _EMPTY _NOTCHAR |
| 304 | #define _BACKREF (_EMPTY + 1) |
| 305 | #define _BEGLINE (_EMPTY + 2) |
| 306 | #define _ALLBEGLINE (_EMPTY + 3) |
| 307 | #define _ENDLINE (_EMPTY + 4) |
| 308 | #define _ALLENDLINE (_EMPTY + 5) |
| 309 | #define _BEGWORD (_EMPTY + 6) |
| 310 | #define _ENDWORD (_EMPTY + 7) |
| 311 | #define _LIMWORD (_EMPTY + 8) |
| 312 | #define _NOTLIMWORD (_EMPTY + 9) |
| 313 | #define _QMARK (_EMPTY + 10) |
| 314 | #define _STAR (_EMPTY + 11) |
| 315 | #define _PLUS (_EMPTY + 12) |
| 316 | #define _CAT (_EMPTY + 13) |
| 317 | #define _OR (_EMPTY + 14) |
| 318 | #define _LPAREN (_EMPTY + 15) |
| 319 | #define _RPAREN (_EMPTY + 16) |
| 320 | #define _SET (_EMPTY + 17) |
| 321 | |
| 322 | #endif /* ! __STDC__ */ |
| 323 | |
| 324 | /* Sets are stored in an array in the compiled regexp; the index of the |
| 325 | array corresponding to a given set token is given by _SET_INDEX(t). */ |
| 326 | #define _SET_INDEX(t) ((t) - _SET) |
| 327 | |
| 328 | /* Sometimes characters can only be matched depending on the surrounding |
| 329 | context. Such context decisions depend on what the previous character |
| 330 | was, and the value of the current (lookahead) character. Context |
| 331 | dependent constraints are encoded as 8 bit integers. Each bit that |
| 332 | is set indicates that the constraint succeeds in the corresponding |
| 333 | context. |
| 334 | |
| 335 | bit 7 - previous and current are newlines |
| 336 | bit 6 - previous was newline, current isn't |
| 337 | bit 5 - previous wasn't newline, current is |
| 338 | bit 4 - neither previous nor current is a newline |
| 339 | bit 3 - previous and current are word-constituents |
| 340 | bit 2 - previous was word-constituent, current isn't |
| 341 | bit 1 - previous wasn't word-constituent, current is |
| 342 | bit 0 - neither previous nor current is word-constituent |
| 343 | |
| 344 | Word-constituent characters are those that satisfy isalnum(). |
| 345 | |
| 346 | The macro _SUCCEEDS_IN_CONTEXT determines whether a a given constraint |
| 347 | succeeds in a particular context. Prevn is true if the previous character |
| 348 | was a newline, currn is true if the lookahead character is a newline. |
| 349 | Prevl and currl similarly depend upon whether the previous and current |
| 350 | characters are word-constituent letters. */ |
| 351 | #define _MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ |
| 352 | ((constraint) & 1 << ((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4) |
| 353 | #define _MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ |
| 354 | ((constraint) & 1 << ((prevl) ? 2 : 0) + ((currl) ? 1 : 0)) |
| 355 | #define _SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ |
| 356 | (_MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ |
| 357 | && _MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) |
| 358 | |
| 359 | /* The following macros give information about what a constraint depends on. */ |
| 360 | #define _PREV_NEWLINE_DEPENDENT(constraint) \ |
| 361 | (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) |
| 362 | #define _PREV_LETTER_DEPENDENT(constraint) \ |
| 363 | (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) |
| 364 | |
| 365 | /* Tokens that match the empty string subject to some constraint actually |
| 366 | work by applying that constraint to determine what may follow them, |
| 367 | taking into account what has gone before. The following values are |
| 368 | the constraints corresponding to the special tokens previously defined. */ |
| 369 | #define _NO_CONSTRAINT 0xff |
| 370 | #define _BEGLINE_CONSTRAINT 0xcf |
| 371 | #define _ENDLINE_CONSTRAINT 0xaf |
| 372 | #define _BEGWORD_CONSTRAINT 0xf2 |
| 373 | #define _ENDWORD_CONSTRAINT 0xf4 |
| 374 | #define _LIMWORD_CONSTRAINT 0xf6 |
| 375 | #define _NOTLIMWORD_CONSTRAINT 0xf9 |
| 376 | |
| 377 | /* States of the recognizer correspond to sets of positions in the parse |
| 378 | tree, together with the constraints under which they may be matched. |
| 379 | So a position is encoded as an index into the parse tree together with |
| 380 | a constraint. */ |
| 381 | typedef struct |
| 382 | { |
| 383 | unsigned index; /* Index into the parse array. */ |
| 384 | unsigned constraint; /* Constraint for matching this position. */ |
| 385 | } _position; |
| 386 | |
| 387 | /* Sets of positions are stored as arrays. */ |
| 388 | typedef struct |
| 389 | { |
| 390 | _position *elems; /* Elements of this position set. */ |
| 391 | int nelem; /* Number of elements in this set. */ |
| 392 | } _position_set; |
| 393 | |
| 394 | /* A state of the regexp consists of a set of positions, some flags, |
| 395 | and the token value of the lowest-numbered position of the state that |
| 396 | contains an _END token. */ |
| 397 | typedef struct |
| 398 | { |
| 399 | int hash; /* Hash of the positions of this state. */ |
| 400 | _position_set elems; /* Positions this state could match. */ |
| 401 | char newline; /* True if previous state matched newline. */ |
| 402 | char letter; /* True if previous state matched a letter. */ |
| 403 | char backref; /* True if this state matches a \<digit>. */ |
| 404 | unsigned char constraint; /* Constraint for this state to accept. */ |
| 405 | int first_end; /* Token value of the first _END in elems. */ |
| 406 | } _dfa_state; |
| 407 | |
| 408 | /* If an r.e. is at most MUST_MAX characters long, we look for a string which |
| 409 | must appear in it; whatever's found is dropped into the struct reg. */ |
| 410 | |
| 411 | #define MUST_MAX 50 |
| 412 | |
| 413 | /* A compiled regular expression. */ |
| 414 | struct regexp |
| 415 | { |
| 416 | /* Stuff built by the scanner. */ |
| 417 | _charset *charsets; /* Array of character sets for _SET tokens. */ |
| 418 | int cindex; /* Index for adding new charsets. */ |
| 419 | int calloc; /* Number of charsets currently allocated. */ |
| 420 | |
| 421 | /* Stuff built by the parser. */ |
| 422 | _token *tokens; /* Postfix parse array. */ |
| 423 | int tindex; /* Index for adding new tokens. */ |
| 424 | int talloc; /* Number of tokens currently allocated. */ |
| 425 | int depth; /* Depth required of an evaluation stack |
| 426 | used for depth-first traversal of the |
| 427 | parse tree. */ |
| 428 | int nleaves; /* Number of leaves on the parse tree. */ |
| 429 | int nregexps; /* Count of parallel regexps being built |
| 430 | with regparse(). */ |
| 431 | |
| 432 | /* Stuff owned by the state builder. */ |
| 433 | _dfa_state *states; /* States of the regexp. */ |
| 434 | int sindex; /* Index for adding new states. */ |
| 435 | int salloc; /* Number of states currently allocated. */ |
| 436 | |
| 437 | /* Stuff built by the structure analyzer. */ |
| 438 | _position_set *follows; /* Array of follow sets, indexed by position |
| 439 | index. The follow of a position is the set |
| 440 | of positions containing characters that |
| 441 | could conceivably follow a character |
| 442 | matching the given position in a string |
| 443 | matching the regexp. Allocated to the |
| 444 | maximum possible position index. */ |
| 445 | int searchflag; /* True if we are supposed to build a searching |
| 446 | as opposed to an exact matcher. A searching |
| 447 | matcher finds the first and shortest string |
| 448 | matching a regexp anywhere in the buffer, |
| 449 | whereas an exact matcher finds the longest |
| 450 | string matching, but anchored to the |
| 451 | beginning of the buffer. */ |
| 452 | |
| 453 | /* Stuff owned by the executor. */ |
| 454 | int tralloc; /* Number of transition tables that have |
| 455 | slots so far. */ |
| 456 | int trcount; /* Number of transition tables that have |
| 457 | actually been built. */ |
| 458 | int **trans; /* Transition tables for states that can |
| 459 | never accept. If the transitions for a |
| 460 | state have not yet been computed, or the |
| 461 | state could possibly accept, its entry in |
| 462 | this table is NULL. */ |
| 463 | int **realtrans; /* Trans always points to realtrans + 1; this |
| 464 | is so trans[-1] can contain NULL. */ |
| 465 | int **fails; /* Transition tables after failing to accept |
| 466 | on a state that potentially could do so. */ |
| 467 | int *success; /* Table of acceptance conditions used in |
| 468 | regexecute and computed in build_state. */ |
| 469 | int *newlines; /* Transitions on newlines. The entry for a |
| 470 | newline in any transition table is always |
| 471 | -1 so we can count lines without wasting |
| 472 | too many cycles. The transition for a |
| 473 | newline is stored separately and handled |
| 474 | as a special case. Newline is also used |
| 475 | as a sentinel at the end of the buffer. */ |
| 476 | char must[MUST_MAX]; |
| 477 | int mustn; |
| 478 | }; |
| 479 | |
| 480 | /* Some macros for user access to regexp internals. */ |
| 481 | |
| 482 | /* ACCEPTING returns true if s could possibly be an accepting state of r. */ |
| 483 | #define ACCEPTING(s, r) ((r).states[s].constraint) |
| 484 | |
| 485 | /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the |
| 486 | specified context. */ |
| 487 | #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, reg) \ |
| 488 | _SUCCEEDS_IN_CONTEXT((reg).states[state].constraint, \ |
| 489 | prevn, currn, prevl, currl) |
| 490 | |
| 491 | /* FIRST_MATCHING_REGEXP returns the index number of the first of parallel |
| 492 | regexps that a given state could accept. Parallel regexps are numbered |
| 493 | starting at 1. */ |
| 494 | #define FIRST_MATCHING_REGEXP(state, reg) (-(reg).states[state].first_end) |
| 495 | |
| 496 | /* Entry points. */ |
| 497 | |
| 498 | #ifdef __STDC__ |
| 499 | |
| 500 | /* Regsyntax() takes two arguments; the first sets the syntax bits described |
| 501 | earlier in this file, and the second sets the case-folding flag. */ |
| 502 | extern void regsyntax(int, int); |
| 503 | |
| 504 | /* Compile the given string of the given length into the given struct regexp. |
| 505 | Final argument is a flag specifying whether to build a searching or an |
| 506 | exact matcher. */ |
| 507 | extern void regcompile(const char *, size_t, struct regexp *, int); |
| 508 | |
| 509 | /* Execute the given struct regexp on the buffer of characters. The |
| 510 | first char * points to the beginning, and the second points to the |
| 511 | first character after the end of the buffer, which must be a writable |
| 512 | place so a sentinel end-of-buffer marker can be stored there. The |
| 513 | second-to-last argument is a flag telling whether to allow newlines to |
| 514 | be part of a string matching the regexp. The next-to-last argument, |
| 515 | if non-NULL, points to a place to increment every time we see a |
| 516 | newline. The final argument, if non-NULL, points to a flag that will |
| 517 | be set if further examination by a backtracking matcher is needed in |
| 518 | order to verify backreferencing; otherwise the flag will be cleared. |
| 519 | Returns NULL if no match is found, or a pointer to the first |
| 520 | character after the first & shortest matching string in the buffer. */ |
| 521 | extern char *regexecute(struct regexp *, char *, char *, int, int *, int *); |
| 522 | |
| 523 | /* Free the storage held by the components of a struct regexp. */ |
| 524 | extern void regfree(struct regexp *); |
| 525 | |
| 526 | /* Entry points for people who know what they're doing. */ |
| 527 | |
| 528 | /* Initialize the components of a struct regexp. */ |
| 529 | extern void reginit(struct regexp *); |
| 530 | |
| 531 | /* Incrementally parse a string of given length into a struct regexp. */ |
| 532 | extern void regparse(const char *, size_t, struct regexp *); |
| 533 | |
| 534 | /* Analyze a parsed regexp; second argument tells whether to build a searching |
| 535 | or an exact matcher. */ |
| 536 | extern void reganalyze(struct regexp *, int); |
| 537 | |
| 538 | /* Compute, for each possible character, the transitions out of a given |
| 539 | state, storing them in an array of integers. */ |
| 540 | extern void regstate(int, struct regexp *, int []); |
| 541 | |
| 542 | /* Error handling. */ |
| 543 | |
| 544 | /* Regerror() is called by the regexp routines whenever an error occurs. It |
| 545 | takes a single argument, a NUL-terminated string describing the error. |
| 546 | The default regerror() prints the error message to stderr and exits. |
| 547 | The user can provide a different regfree() if so desired. */ |
| 548 | extern void regerror(const char *); |
| 549 | |
| 550 | #else /* ! __STDC__ */ |
| 551 | extern void regsyntax(), regcompile(), regfree(), reginit(), regparse(); |
| 552 | extern void reganalyze(), regstate(), regerror(); |
| 553 | extern char *regexecute(); |
| 554 | #endif |