| 1 | /* dfa.c - determinisitic extended regexp routines for GNU |
| 2 | Copyright (C) 1988 Free Software Foundation, Inc. |
| 3 | Written June, 1988 by Mike Haertel |
| 4 | Modified July, 1988 by Arthur David Olson |
| 5 | to assist BMG speedups |
| 6 | |
| 7 | NO WARRANTY |
| 8 | |
| 9 | BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY |
| 10 | NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT |
| 11 | WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, |
| 12 | RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS" |
| 13 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, |
| 14 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 15 | FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY |
| 16 | AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE |
| 17 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR |
| 18 | CORRECTION. |
| 19 | |
| 20 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M. |
| 21 | STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY |
| 22 | WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE |
| 23 | LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR |
| 24 | OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE |
| 25 | USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR |
| 26 | DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR |
| 27 | A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS |
| 28 | PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| 29 | DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. |
| 30 | |
| 31 | GENERAL PUBLIC LICENSE TO COPY |
| 32 | |
| 33 | 1. You may copy and distribute verbatim copies of this source file |
| 34 | as you receive it, in any medium, provided that you conspicuously and |
| 35 | appropriately publish on each copy a valid copyright notice "Copyright |
| 36 | (C) 1988 Free Software Foundation, Inc."; and include following the |
| 37 | copyright notice a verbatim copy of the above disclaimer of warranty |
| 38 | and of this License. You may charge a distribution fee for the |
| 39 | physical act of transferring a copy. |
| 40 | |
| 41 | 2. You may modify your copy or copies of this source file or |
| 42 | any portion of it, and copy and distribute such modifications under |
| 43 | the terms of Paragraph 1 above, provided that you also do the following: |
| 44 | |
| 45 | a) cause the modified files to carry prominent notices stating |
| 46 | that you changed the files and the date of any change; and |
| 47 | |
| 48 | b) cause the whole of any work that you distribute or publish, |
| 49 | that in whole or in part contains or is a derivative of this |
| 50 | program or any part thereof, to be licensed at no charge to all |
| 51 | third parties on terms identical to those contained in this |
| 52 | License Agreement (except that you may choose to grant more extensive |
| 53 | warranty protection to some or all third parties, at your option). |
| 54 | |
| 55 | c) You may charge a distribution fee for the physical act of |
| 56 | transferring a copy, and you may at your option offer warranty |
| 57 | protection in exchange for a fee. |
| 58 | |
| 59 | Mere aggregation of another unrelated program with this program (or its |
| 60 | derivative) on a volume of a storage or distribution medium does not bring |
| 61 | the other program under the scope of these terms. |
| 62 | |
| 63 | 3. You may copy and distribute this program or any portion of it in |
| 64 | compiled, executable or object code form under the terms of Paragraphs |
| 65 | 1 and 2 above provided that you do the following: |
| 66 | |
| 67 | a) accompany it with the complete corresponding machine-readable |
| 68 | source code, which must be distributed under the terms of |
| 69 | Paragraphs 1 and 2 above; or, |
| 70 | |
| 71 | b) accompany it with a written offer, valid for at least three |
| 72 | years, to give any third party free (except for a nominal |
| 73 | shipping charge) a complete machine-readable copy of the |
| 74 | corresponding source code, to be distributed under the terms of |
| 75 | Paragraphs 1 and 2 above; or, |
| 76 | |
| 77 | c) accompany it with the information you received as to where the |
| 78 | corresponding source code may be obtained. (This alternative is |
| 79 | allowed only for noncommercial distribution and only if you |
| 80 | received the program in object code or executable form alone.) |
| 81 | |
| 82 | For an executable file, complete source code means all the source code for |
| 83 | all modules it contains; but, as a special exception, it need not include |
| 84 | source code for modules which are standard libraries that accompany the |
| 85 | operating system on which the executable file runs. |
| 86 | |
| 87 | 4. You may not copy, sublicense, distribute or transfer this program |
| 88 | except as expressly provided under this License Agreement. Any attempt |
| 89 | otherwise to copy, sublicense, distribute or transfer this program is void and |
| 90 | your rights to use the program under this License agreement shall be |
| 91 | automatically terminated. However, parties who have received computer |
| 92 | software programs from you with this License Agreement will not have |
| 93 | their licenses terminated so long as such parties remain in full compliance. |
| 94 | |
| 95 | 5. If you wish to incorporate parts of this program into other free |
| 96 | programs whose distribution conditions are different, write to the Free |
| 97 | Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet |
| 98 | worked out a simple rule that can be stated here, but we will often permit |
| 99 | this. We will be guided by the two goals of preserving the free status of |
| 100 | all derivatives our free software and of promoting the sharing and reuse of |
| 101 | software. |
| 102 | |
| 103 | |
| 104 | In other words, you are welcome to use, share and improve this program. |
| 105 | You are forbidden to forbid anyone else to use, share and improve |
| 106 | what you give them. Help stamp out software-hoarding! */ |
| 107 | \f |
| 108 | #include <stdio.h> |
| 109 | #include <assert.h> |
| 110 | #include <ctype.h> |
| 111 | #include "dfa.h" |
| 112 | |
| 113 | #ifdef __STDC__ |
| 114 | typedef void *ptr_t; |
| 115 | #else |
| 116 | typedef char *ptr_t; |
| 117 | #endif |
| 118 | |
| 119 | static void regmust(); |
| 120 | |
| 121 | static ptr_t |
| 122 | xcalloc(n, s) |
| 123 | int n; |
| 124 | size_t s; |
| 125 | { |
| 126 | ptr_t r = calloc(n, s); |
| 127 | |
| 128 | if (r) |
| 129 | return r; |
| 130 | else |
| 131 | regerror("Memory exhausted"); |
| 132 | } |
| 133 | |
| 134 | static ptr_t |
| 135 | xmalloc(n) |
| 136 | size_t n; |
| 137 | { |
| 138 | ptr_t r = malloc(n); |
| 139 | |
| 140 | assert(n != 0); |
| 141 | if (r) |
| 142 | return r; |
| 143 | else |
| 144 | regerror("Memory exhausted"); |
| 145 | } |
| 146 | |
| 147 | static ptr_t |
| 148 | xrealloc(p, n) |
| 149 | ptr_t p; |
| 150 | size_t n; |
| 151 | { |
| 152 | ptr_t r = realloc(p, n); |
| 153 | |
| 154 | assert(n != 0); |
| 155 | if (r) |
| 156 | return r; |
| 157 | else |
| 158 | regerror("Memory exhausted"); |
| 159 | } |
| 160 | |
| 161 | #define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t))) |
| 162 | #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) |
| 163 | #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) |
| 164 | |
| 165 | /* Reallocate an array of type t if nalloc is too small for index. */ |
| 166 | #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ |
| 167 | if ((index) >= (nalloc)) \ |
| 168 | { \ |
| 169 | while ((index) >= (nalloc)) \ |
| 170 | (nalloc) *= 2; \ |
| 171 | REALLOC(p, t, nalloc); \ |
| 172 | } |
| 173 | \f |
| 174 | /* Stuff pertaining to charsets. */ |
| 175 | |
| 176 | static |
| 177 | tstbit(b, c) |
| 178 | int b; |
| 179 | _charset c; |
| 180 | { |
| 181 | return c[b / INTBITS] & 1 << b % INTBITS; |
| 182 | } |
| 183 | |
| 184 | static void |
| 185 | setbit(b, c) |
| 186 | int b; |
| 187 | _charset c; |
| 188 | { |
| 189 | c[b / INTBITS] |= 1 << b % INTBITS; |
| 190 | } |
| 191 | |
| 192 | static void |
| 193 | clrbit(b, c) |
| 194 | int b; |
| 195 | _charset c; |
| 196 | { |
| 197 | c[b / INTBITS] &= ~(1 << b % INTBITS); |
| 198 | } |
| 199 | |
| 200 | static void |
| 201 | copyset(src, dst) |
| 202 | const _charset src; |
| 203 | _charset dst; |
| 204 | { |
| 205 | int i; |
| 206 | |
| 207 | for (i = 0; i < _CHARSET_INTS; ++i) |
| 208 | dst[i] = src[i]; |
| 209 | } |
| 210 | |
| 211 | static void |
| 212 | zeroset(s) |
| 213 | _charset s; |
| 214 | { |
| 215 | int i; |
| 216 | |
| 217 | for (i = 0; i < _CHARSET_INTS; ++i) |
| 218 | s[i] = 0; |
| 219 | } |
| 220 | |
| 221 | static void |
| 222 | notset(s) |
| 223 | _charset s; |
| 224 | { |
| 225 | int i; |
| 226 | |
| 227 | for (i = 0; i < _CHARSET_INTS; ++i) |
| 228 | s[i] = ~s[i]; |
| 229 | } |
| 230 | |
| 231 | static |
| 232 | equal(s1, s2) |
| 233 | const _charset s1; |
| 234 | const _charset s2; |
| 235 | { |
| 236 | int i; |
| 237 | |
| 238 | for (i = 0; i < _CHARSET_INTS; ++i) |
| 239 | if (s1[i] != s2[i]) |
| 240 | return 0; |
| 241 | return 1; |
| 242 | } |
| 243 | \f |
| 244 | /* A pointer to the current regexp is kept here during parsing. */ |
| 245 | static struct regexp *reg; |
| 246 | |
| 247 | /* Find the index of charset s in reg->charsets, or allocate a new charset. */ |
| 248 | static |
| 249 | charset_index(s) |
| 250 | const _charset s; |
| 251 | { |
| 252 | int i; |
| 253 | |
| 254 | for (i = 0; i < reg->cindex; ++i) |
| 255 | if (equal(s, reg->charsets[i])) |
| 256 | return i; |
| 257 | REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex); |
| 258 | ++reg->cindex; |
| 259 | copyset(s, reg->charsets[i]); |
| 260 | return i; |
| 261 | } |
| 262 | |
| 263 | /* Syntax bits controlling the behavior of the lexical analyzer. */ |
| 264 | static syntax_bits, syntax_bits_set; |
| 265 | |
| 266 | /* Flag for case-folding letters into sets. */ |
| 267 | static case_fold; |
| 268 | |
| 269 | /* Entry point to set syntax options. */ |
| 270 | void |
| 271 | regsyntax(bits, fold) |
| 272 | int bits; |
| 273 | int fold; |
| 274 | { |
| 275 | syntax_bits_set = 1; |
| 276 | syntax_bits = bits; |
| 277 | case_fold = fold; |
| 278 | } |
| 279 | |
| 280 | /* Lexical analyzer. */ |
| 281 | static const char *lexstart; /* Pointer to beginning of input string. */ |
| 282 | static const char *lexptr; /* Pointer to next input character. */ |
| 283 | static lexleft; /* Number of characters remaining. */ |
| 284 | static caret_allowed; /* True if backward context allows ^ |
| 285 | (meaningful only if RE_CONTEXT_INDEP_OPS |
| 286 | is turned off). */ |
| 287 | static closure_allowed; /* True if backward context allows closures |
| 288 | (meaningful only if RE_CONTEXT_INDEP_OPS |
| 289 | is turned off). */ |
| 290 | |
| 291 | /* Note that characters become unsigned here. */ |
| 292 | #define FETCH(c, eoferr) \ |
| 293 | { \ |
| 294 | if (! lexleft) \ |
| 295 | if (eoferr) \ |
| 296 | regerror(eoferr); \ |
| 297 | else \ |
| 298 | return _END; \ |
| 299 | (c) = (unsigned char) *lexptr++; \ |
| 300 | --lexleft; \ |
| 301 | } |
| 302 | |
| 303 | static _token |
| 304 | lex() |
| 305 | { |
| 306 | _token c, c2; |
| 307 | int invert; |
| 308 | _charset cset; |
| 309 | |
| 310 | FETCH(c, (char *) 0); |
| 311 | switch (c) |
| 312 | { |
| 313 | case '^': |
| 314 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) |
| 315 | && (!caret_allowed || |
| 316 | (syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart)) |
| 317 | goto normal_char; |
| 318 | caret_allowed = 0; |
| 319 | return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE; |
| 320 | |
| 321 | case '$': |
| 322 | if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft |
| 323 | || (! (syntax_bits & RE_TIGHT_VBAR) |
| 324 | && ((syntax_bits & RE_NO_BK_PARENS |
| 325 | ? lexleft > 0 && *lexptr == ')' |
| 326 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')') |
| 327 | || (syntax_bits & RE_NO_BK_VBAR |
| 328 | ? lexleft > 0 && *lexptr == '|' |
| 329 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|')))) |
| 330 | return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE; |
| 331 | goto normal_char; |
| 332 | |
| 333 | case '\\': |
| 334 | FETCH(c, "Unfinished \\ quote"); |
| 335 | switch (c) |
| 336 | { |
| 337 | case '1': |
| 338 | case '2': |
| 339 | case '3': |
| 340 | case '4': |
| 341 | case '5': |
| 342 | case '6': |
| 343 | case '7': |
| 344 | case '8': |
| 345 | case '9': |
| 346 | caret_allowed = 0; |
| 347 | closure_allowed = 1; |
| 348 | return _BACKREF; |
| 349 | |
| 350 | case '<': |
| 351 | caret_allowed = 0; |
| 352 | return _BEGWORD; |
| 353 | |
| 354 | case '>': |
| 355 | caret_allowed = 0; |
| 356 | return _ENDWORD; |
| 357 | |
| 358 | case 'b': |
| 359 | caret_allowed = 0; |
| 360 | return _LIMWORD; |
| 361 | |
| 362 | case 'B': |
| 363 | caret_allowed = 0; |
| 364 | return _NOTLIMWORD; |
| 365 | |
| 366 | case 'w': |
| 367 | case 'W': |
| 368 | zeroset(cset); |
| 369 | for (c2 = 0; c2 < _NOTCHAR; ++c2) |
| 370 | if (ISALNUM(c2)) |
| 371 | setbit(c2, cset); |
| 372 | if (c == 'W') |
| 373 | notset(cset); |
| 374 | caret_allowed = 0; |
| 375 | closure_allowed = 1; |
| 376 | return _SET + charset_index(cset); |
| 377 | |
| 378 | case '?': |
| 379 | if (syntax_bits & RE_BK_PLUS_QM) |
| 380 | goto qmark; |
| 381 | goto normal_char; |
| 382 | |
| 383 | case '+': |
| 384 | if (syntax_bits & RE_BK_PLUS_QM) |
| 385 | goto plus; |
| 386 | goto normal_char; |
| 387 | |
| 388 | case '|': |
| 389 | if (! (syntax_bits & RE_NO_BK_VBAR)) |
| 390 | goto or; |
| 391 | goto normal_char; |
| 392 | |
| 393 | case '(': |
| 394 | if (! (syntax_bits & RE_NO_BK_PARENS)) |
| 395 | goto lparen; |
| 396 | goto normal_char; |
| 397 | |
| 398 | case ')': |
| 399 | if (! (syntax_bits & RE_NO_BK_PARENS)) |
| 400 | goto rparen; |
| 401 | goto normal_char; |
| 402 | |
| 403 | default: |
| 404 | goto normal_char; |
| 405 | } |
| 406 | |
| 407 | case '?': |
| 408 | if (syntax_bits & RE_BK_PLUS_QM) |
| 409 | goto normal_char; |
| 410 | qmark: |
| 411 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) |
| 412 | goto normal_char; |
| 413 | return _QMARK; |
| 414 | |
| 415 | case '*': |
| 416 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) |
| 417 | goto normal_char; |
| 418 | return _STAR; |
| 419 | |
| 420 | case '+': |
| 421 | if (syntax_bits & RE_BK_PLUS_QM) |
| 422 | goto normal_char; |
| 423 | plus: |
| 424 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) |
| 425 | goto normal_char; |
| 426 | return _PLUS; |
| 427 | |
| 428 | case '|': |
| 429 | if (! (syntax_bits & RE_NO_BK_VBAR)) |
| 430 | goto normal_char; |
| 431 | or: |
| 432 | caret_allowed = 1; |
| 433 | closure_allowed = 0; |
| 434 | return _OR; |
| 435 | |
| 436 | case '\n': |
| 437 | if (! (syntax_bits & RE_NEWLINE_OR)) |
| 438 | goto normal_char; |
| 439 | goto or; |
| 440 | |
| 441 | case '(': |
| 442 | if (! (syntax_bits & RE_NO_BK_PARENS)) |
| 443 | goto normal_char; |
| 444 | lparen: |
| 445 | caret_allowed = 1; |
| 446 | closure_allowed = 0; |
| 447 | return _LPAREN; |
| 448 | |
| 449 | case ')': |
| 450 | if (! (syntax_bits & RE_NO_BK_PARENS)) |
| 451 | goto normal_char; |
| 452 | rparen: |
| 453 | caret_allowed = 0; |
| 454 | closure_allowed = 1; |
| 455 | return _RPAREN; |
| 456 | |
| 457 | case '.': |
| 458 | zeroset(cset); |
| 459 | notset(cset); |
| 460 | clrbit('\n', cset); |
| 461 | caret_allowed = 0; |
| 462 | closure_allowed = 1; |
| 463 | return _SET + charset_index(cset); |
| 464 | |
| 465 | case '[': |
| 466 | zeroset(cset); |
| 467 | FETCH(c, "Unbalanced ["); |
| 468 | if (c == '^') |
| 469 | { |
| 470 | FETCH(c, "Unbalanced ["); |
| 471 | invert = 1; |
| 472 | } |
| 473 | else |
| 474 | invert = 0; |
| 475 | do |
| 476 | { |
| 477 | FETCH(c2, "Unbalanced ["); |
| 478 | if (c2 == '-') |
| 479 | { |
| 480 | FETCH(c2, "Unbalanced ["); |
| 481 | while (c <= c2) |
| 482 | setbit(c++, cset); |
| 483 | FETCH(c, "Unbalanced ["); |
| 484 | } |
| 485 | else |
| 486 | { |
| 487 | setbit(c, cset); |
| 488 | c = c2; |
| 489 | } |
| 490 | } |
| 491 | while (c != ']'); |
| 492 | if (invert) |
| 493 | notset(cset); |
| 494 | caret_allowed = 0; |
| 495 | closure_allowed = 1; |
| 496 | return _SET + charset_index(cset); |
| 497 | |
| 498 | default: |
| 499 | normal_char: |
| 500 | caret_allowed = 0; |
| 501 | closure_allowed = 1; |
| 502 | if (case_fold && ISALPHA(c)) |
| 503 | { |
| 504 | zeroset(cset); |
| 505 | if (isupper(c)) |
| 506 | c = tolower(c); |
| 507 | setbit(c, cset); |
| 508 | setbit(toupper(c), cset); |
| 509 | return _SET + charset_index(cset); |
| 510 | } |
| 511 | return c; |
| 512 | } |
| 513 | } |
| 514 | \f |
| 515 | /* Recursive descent parser for regular expressions. */ |
| 516 | |
| 517 | static _token tok; /* Lookahead token. */ |
| 518 | static depth; /* Current depth of a hypothetical stack |
| 519 | holding deferred productions. This is |
| 520 | used to determine the depth that will be |
| 521 | required of the real stack later on in |
| 522 | reganalyze(). */ |
| 523 | |
| 524 | /* Add the given token to the parse tree, maintaining the depth count and |
| 525 | updating the maximum depth if necessary. */ |
| 526 | static void |
| 527 | addtok(t) |
| 528 | _token t; |
| 529 | { |
| 530 | REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex); |
| 531 | reg->tokens[reg->tindex++] = t; |
| 532 | |
| 533 | switch (t) |
| 534 | { |
| 535 | case _QMARK: |
| 536 | case _STAR: |
| 537 | case _PLUS: |
| 538 | break; |
| 539 | |
| 540 | case _CAT: |
| 541 | case _OR: |
| 542 | --depth; |
| 543 | break; |
| 544 | |
| 545 | default: |
| 546 | ++reg->nleaves; |
| 547 | case _EMPTY: |
| 548 | ++depth; |
| 549 | break; |
| 550 | } |
| 551 | if (depth > reg->depth) |
| 552 | reg->depth = depth; |
| 553 | } |
| 554 | |
| 555 | /* The grammar understood by the parser is as follows. |
| 556 | |
| 557 | start: |
| 558 | regexp |
| 559 | _ALLBEGLINE regexp |
| 560 | regexp _ALLENDLINE |
| 561 | _ALLBEGLINE regexp _ALLENDLINE |
| 562 | |
| 563 | regexp: |
| 564 | regexp _OR branch |
| 565 | branch |
| 566 | |
| 567 | branch: |
| 568 | branch closure |
| 569 | closure |
| 570 | |
| 571 | closure: |
| 572 | closure _QMARK |
| 573 | closure _STAR |
| 574 | closure _PLUS |
| 575 | atom |
| 576 | |
| 577 | atom: |
| 578 | <normal character> |
| 579 | _SET |
| 580 | _BACKREF |
| 581 | _BEGLINE |
| 582 | _ENDLINE |
| 583 | _BEGWORD |
| 584 | _ENDWORD |
| 585 | _LIMWORD |
| 586 | _NOTLIMWORD |
| 587 | <empty> |
| 588 | |
| 589 | The parser builds a parse tree in postfix form in an array of tokens. */ |
| 590 | |
| 591 | #ifdef __STDC__ |
| 592 | static void regexp(void); |
| 593 | #else |
| 594 | static void regexp(); |
| 595 | #endif |
| 596 | |
| 597 | static void |
| 598 | atom() |
| 599 | { |
| 600 | if (tok >= 0 && tok < _NOTCHAR || tok >= _SET || tok == _BACKREF |
| 601 | || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD |
| 602 | || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD) |
| 603 | { |
| 604 | addtok(tok); |
| 605 | tok = lex(); |
| 606 | } |
| 607 | else if (tok == _LPAREN) |
| 608 | { |
| 609 | tok = lex(); |
| 610 | regexp(); |
| 611 | if (tok != _RPAREN) |
| 612 | regerror("Unbalanced ("); |
| 613 | tok = lex(); |
| 614 | } |
| 615 | else |
| 616 | addtok(_EMPTY); |
| 617 | } |
| 618 | |
| 619 | static void |
| 620 | closure() |
| 621 | { |
| 622 | atom(); |
| 623 | while (tok == _QMARK || tok == _STAR || tok == _PLUS) |
| 624 | { |
| 625 | addtok(tok); |
| 626 | tok = lex(); |
| 627 | } |
| 628 | } |
| 629 | |
| 630 | static void |
| 631 | branch() |
| 632 | { |
| 633 | closure(); |
| 634 | while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0) |
| 635 | { |
| 636 | closure(); |
| 637 | addtok(_CAT); |
| 638 | } |
| 639 | } |
| 640 | |
| 641 | static void |
| 642 | regexp() |
| 643 | { |
| 644 | branch(); |
| 645 | while (tok == _OR) |
| 646 | { |
| 647 | tok = lex(); |
| 648 | branch(); |
| 649 | addtok(_OR); |
| 650 | } |
| 651 | } |
| 652 | |
| 653 | /* Main entry point for the parser. S is a string to be parsed, len is the |
| 654 | length of the string, so s can include NUL characters. R is a pointer to |
| 655 | the struct regexp to parse into. */ |
| 656 | void |
| 657 | regparse(s, len, r) |
| 658 | const char *s; |
| 659 | size_t len; |
| 660 | struct regexp *r; |
| 661 | { |
| 662 | reg = r; |
| 663 | lexstart = lexptr = s; |
| 664 | lexleft = len; |
| 665 | caret_allowed = 1; |
| 666 | closure_allowed = 0; |
| 667 | |
| 668 | if (! syntax_bits_set) |
| 669 | regerror("No syntax specified"); |
| 670 | |
| 671 | tok = lex(); |
| 672 | depth = r->depth; |
| 673 | |
| 674 | if (tok == _ALLBEGLINE) |
| 675 | { |
| 676 | addtok(_BEGLINE); |
| 677 | tok = lex(); |
| 678 | regexp(); |
| 679 | addtok(_CAT); |
| 680 | } |
| 681 | else |
| 682 | regexp(); |
| 683 | |
| 684 | if (tok == _ALLENDLINE) |
| 685 | { |
| 686 | addtok(_ENDLINE); |
| 687 | addtok(_CAT); |
| 688 | tok = lex(); |
| 689 | } |
| 690 | |
| 691 | if (tok != _END) |
| 692 | regerror("Unbalanced )"); |
| 693 | |
| 694 | addtok(_END - r->nregexps); |
| 695 | addtok(_CAT); |
| 696 | |
| 697 | if (r->nregexps) |
| 698 | addtok(_OR); |
| 699 | |
| 700 | ++r->nregexps; |
| 701 | } |
| 702 | \f |
| 703 | /* Some primitives for operating on sets of positions. */ |
| 704 | |
| 705 | /* Copy one set to another; the destination must be large enough. */ |
| 706 | static void |
| 707 | copy(src, dst) |
| 708 | const _position_set *src; |
| 709 | _position_set *dst; |
| 710 | { |
| 711 | int i; |
| 712 | |
| 713 | for (i = 0; i < src->nelem; ++i) |
| 714 | dst->elems[i] = src->elems[i]; |
| 715 | dst->nelem = src->nelem; |
| 716 | } |
| 717 | |
| 718 | /* Insert a position in a set. Position sets are maintained in sorted |
| 719 | order according to index. If position already exists in the set with |
| 720 | the same index then their constraints are logically or'd together. |
| 721 | S->elems must point to an array large enough to hold the resulting set. */ |
| 722 | static void |
| 723 | insert(p, s) |
| 724 | _position p; |
| 725 | _position_set *s; |
| 726 | { |
| 727 | int i; |
| 728 | _position t1, t2; |
| 729 | |
| 730 | for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) |
| 731 | ; |
| 732 | if (i < s->nelem && p.index == s->elems[i].index) |
| 733 | s->elems[i].constraint |= p.constraint; |
| 734 | else |
| 735 | { |
| 736 | t1 = p; |
| 737 | ++s->nelem; |
| 738 | while (i < s->nelem) |
| 739 | { |
| 740 | t2 = s->elems[i]; |
| 741 | s->elems[i++] = t1; |
| 742 | t1 = t2; |
| 743 | } |
| 744 | } |
| 745 | } |
| 746 | |
| 747 | /* Merge two sets of positions into a third. The result is exactly as if |
| 748 | the positions of both sets were inserted into an initially empty set. */ |
| 749 | static void |
| 750 | merge(s1, s2, m) |
| 751 | _position_set *s1; |
| 752 | _position_set *s2; |
| 753 | _position_set *m; |
| 754 | { |
| 755 | int i = 0, j = 0; |
| 756 | |
| 757 | m->nelem = 0; |
| 758 | while (i < s1->nelem && j < s2->nelem) |
| 759 | if (s1->elems[i].index > s2->elems[j].index) |
| 760 | m->elems[m->nelem++] = s1->elems[i++]; |
| 761 | else if (s1->elems[i].index < s2->elems[j].index) |
| 762 | m->elems[m->nelem++] = s2->elems[j++]; |
| 763 | else |
| 764 | { |
| 765 | m->elems[m->nelem] = s1->elems[i++]; |
| 766 | m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; |
| 767 | } |
| 768 | while (i < s1->nelem) |
| 769 | m->elems[m->nelem++] = s1->elems[i++]; |
| 770 | while (j < s2->nelem) |
| 771 | m->elems[m->nelem++] = s2->elems[j++]; |
| 772 | } |
| 773 | |
| 774 | /* Delete a position from a set. */ |
| 775 | static void |
| 776 | delete(p, s) |
| 777 | _position p; |
| 778 | _position_set *s; |
| 779 | { |
| 780 | int i; |
| 781 | |
| 782 | for (i = 0; i < s->nelem; ++i) |
| 783 | if (p.index == s->elems[i].index) |
| 784 | break; |
| 785 | if (i < s->nelem) |
| 786 | for (--s->nelem; i < s->nelem; ++i) |
| 787 | s->elems[i] = s->elems[i + 1]; |
| 788 | } |
| 789 | \f |
| 790 | /* Find the index of the state corresponding to the given position set with |
| 791 | the given preceding context, or create a new state if there is no such |
| 792 | state. Newline and letter tell whether we got here on a newline or |
| 793 | letter, respectively. */ |
| 794 | static |
| 795 | state_index(r, s, newline, letter) |
| 796 | struct regexp *r; |
| 797 | _position_set *s; |
| 798 | int newline; |
| 799 | int letter; |
| 800 | { |
| 801 | int hash = 0; |
| 802 | int constraint; |
| 803 | int i, j; |
| 804 | |
| 805 | newline = newline ? 1 : 0; |
| 806 | letter = letter ? 1 : 0; |
| 807 | |
| 808 | for (i = 0; i < s->nelem; ++i) |
| 809 | hash ^= s->elems[i].index + s->elems[i].constraint; |
| 810 | |
| 811 | /* Try to find a state that exactly matches the proposed one. */ |
| 812 | for (i = 0; i < r->sindex; ++i) |
| 813 | { |
| 814 | if (hash != r->states[i].hash || s->nelem != r->states[i].elems.nelem |
| 815 | || newline != r->states[i].newline || letter != r->states[i].letter) |
| 816 | continue; |
| 817 | for (j = 0; j < s->nelem; ++j) |
| 818 | if (s->elems[j].constraint |
| 819 | != r->states[i].elems.elems[j].constraint |
| 820 | || s->elems[j].index != r->states[i].elems.elems[j].index) |
| 821 | break; |
| 822 | if (j == s->nelem) |
| 823 | return i; |
| 824 | } |
| 825 | |
| 826 | /* We'll have to create a new state. */ |
| 827 | REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex); |
| 828 | r->states[i].hash = hash; |
| 829 | MALLOC(r->states[i].elems.elems, _position, s->nelem); |
| 830 | copy(s, &r->states[i].elems); |
| 831 | r->states[i].newline = newline; |
| 832 | r->states[i].letter = letter; |
| 833 | r->states[i].backref = 0; |
| 834 | r->states[i].constraint = 0; |
| 835 | r->states[i].first_end = 0; |
| 836 | for (j = 0; j < s->nelem; ++j) |
| 837 | if (r->tokens[s->elems[j].index] < 0) |
| 838 | { |
| 839 | constraint = s->elems[j].constraint; |
| 840 | if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) |
| 841 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) |
| 842 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) |
| 843 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) |
| 844 | r->states[i].constraint |= constraint; |
| 845 | if (! r->states[i].first_end) |
| 846 | r->states[i].first_end = r->tokens[s->elems[j].index]; |
| 847 | } |
| 848 | else if (r->tokens[s->elems[j].index] == _BACKREF) |
| 849 | { |
| 850 | r->states[i].constraint = _NO_CONSTRAINT; |
| 851 | r->states[i].backref = 1; |
| 852 | } |
| 853 | |
| 854 | ++r->sindex; |
| 855 | |
| 856 | return i; |
| 857 | } |
| 858 | \f |
| 859 | /* Find the epsilon closure of a set of positions. If any position of the set |
| 860 | contains a symbol that matches the empty string in some context, replace |
| 861 | that position with the elements of its follow labeled with an appropriate |
| 862 | constraint. Repeat exhaustively until no funny positions are left. |
| 863 | S->elems must be large enough to hold the result. */ |
| 864 | epsclosure(s, r) |
| 865 | _position_set *s; |
| 866 | struct regexp *r; |
| 867 | { |
| 868 | int i, j; |
| 869 | int *visited; |
| 870 | _position p, old; |
| 871 | |
| 872 | MALLOC(visited, int, r->tindex); |
| 873 | for (i = 0; i < r->tindex; ++i) |
| 874 | visited[i] = 0; |
| 875 | |
| 876 | for (i = 0; i < s->nelem; ++i) |
| 877 | if (r->tokens[s->elems[i].index] >= _NOTCHAR |
| 878 | && r->tokens[s->elems[i].index] != _BACKREF |
| 879 | && r->tokens[s->elems[i].index] < _SET) |
| 880 | { |
| 881 | old = s->elems[i]; |
| 882 | p.constraint = old.constraint; |
| 883 | delete(s->elems[i], s); |
| 884 | if (visited[old.index]) |
| 885 | { |
| 886 | --i; |
| 887 | continue; |
| 888 | } |
| 889 | visited[old.index] = 1; |
| 890 | switch (r->tokens[old.index]) |
| 891 | { |
| 892 | case _BEGLINE: |
| 893 | p.constraint &= _BEGLINE_CONSTRAINT; |
| 894 | break; |
| 895 | case _ENDLINE: |
| 896 | p.constraint &= _ENDLINE_CONSTRAINT; |
| 897 | break; |
| 898 | case _BEGWORD: |
| 899 | p.constraint &= _BEGWORD_CONSTRAINT; |
| 900 | break; |
| 901 | case _ENDWORD: |
| 902 | p.constraint &= _ENDWORD_CONSTRAINT; |
| 903 | break; |
| 904 | case _LIMWORD: |
| 905 | p.constraint &= _ENDWORD_CONSTRAINT; |
| 906 | break; |
| 907 | case _NOTLIMWORD: |
| 908 | p.constraint &= _NOTLIMWORD_CONSTRAINT; |
| 909 | break; |
| 910 | } |
| 911 | for (j = 0; j < r->follows[old.index].nelem; ++j) |
| 912 | { |
| 913 | p.index = r->follows[old.index].elems[j].index; |
| 914 | insert(p, s); |
| 915 | } |
| 916 | /* Force rescan to start at the beginning. */ |
| 917 | i = -1; |
| 918 | } |
| 919 | |
| 920 | free(visited); |
| 921 | } |
| 922 | \f |
| 923 | /* Perform bottom-up analysis on the parse tree, computing various functions. |
| 924 | Note that at this point, we're pretending constructs like \< are real |
| 925 | characters rather than constraints on what can follow them. |
| 926 | |
| 927 | Nullable: A node is nullable if it is at the root of a regexp that can |
| 928 | match the empty string. |
| 929 | * _EMPTY leaves are nullable. |
| 930 | * No other leaf is nullable. |
| 931 | * A _QMARK or _STAR node is nullable. |
| 932 | * A _PLUS node is nullable if its argument is nullable. |
| 933 | * A _CAT node is nullable if both its arguments are nullable. |
| 934 | * An _OR node is nullable if either argument is nullable. |
| 935 | |
| 936 | Firstpos: The firstpos of a node is the set of positions (nonempty leaves) |
| 937 | that could correspond to the first character of a string matching the |
| 938 | regexp rooted at the given node. |
| 939 | * _EMPTY leaves have empty firstpos. |
| 940 | * The firstpos of a nonempty leaf is that leaf itself. |
| 941 | * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its |
| 942 | argument. |
| 943 | * The firstpos of a _CAT node is the firstpos of the left argument, union |
| 944 | the firstpos of the right if the left argument is nullable. |
| 945 | * The firstpos of an _OR node is the union of firstpos of each argument. |
| 946 | |
| 947 | Lastpos: The lastpos of a node is the set of positions that could |
| 948 | correspond to the last character of a string matching the regexp at |
| 949 | the given node. |
| 950 | * _EMPTY leaves have empty lastpos. |
| 951 | * The lastpos of a nonempty leaf is that leaf itself. |
| 952 | * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its |
| 953 | argument. |
| 954 | * The lastpos of a _CAT node is the lastpos of its right argument, union |
| 955 | the lastpos of the left if the right argument is nullable. |
| 956 | * The lastpos of an _OR node is the union of the lastpos of each argument. |
| 957 | |
| 958 | Follow: The follow of a position is the set of positions that could |
| 959 | correspond to the character following a character matching the node in |
| 960 | a string matching the regexp. At this point we consider special symbols |
| 961 | that match the empty string in some context to be just normal characters. |
| 962 | Later, if we find that a special symbol is in a follow set, we will |
| 963 | replace it with the elements of its follow, labeled with an appropriate |
| 964 | constraint. |
| 965 | * Every node in the firstpos of the argument of a _STAR or _PLUS node is in |
| 966 | the follow of every node in the lastpos. |
| 967 | * Every node in the firstpos of the second argument of a _CAT node is in |
| 968 | the follow of every node in the lastpos of the first argument. |
| 969 | |
| 970 | Because of the postfix representation of the parse tree, the depth-first |
| 971 | analysis is conveniently done by a linear scan with the aid of a stack. |
| 972 | Sets are stored as arrays of the elements, obeying a stack-like allocation |
| 973 | scheme; the number of elements in each set deeper in the stack can be |
| 974 | used to determine the address of a particular set's array. */ |
| 975 | void |
| 976 | reganalyze(r, searchflag) |
| 977 | struct regexp *r; |
| 978 | int searchflag; |
| 979 | { |
| 980 | int *nullable; /* Nullable stack. */ |
| 981 | int *nfirstpos; /* Element count stack for firstpos sets. */ |
| 982 | _position *firstpos; /* Array where firstpos elements are stored. */ |
| 983 | int *nlastpos; /* Element count stack for lastpos sets. */ |
| 984 | _position *lastpos; /* Array where lastpos elements are stored. */ |
| 985 | int *nalloc; /* Sizes of arrays allocated to follow sets. */ |
| 986 | _position_set tmp; /* Temporary set for merging sets. */ |
| 987 | _position_set merged; /* Result of merging sets. */ |
| 988 | int wants_newline; /* True if some position wants newline info. */ |
| 989 | int *o_nullable; |
| 990 | int *o_nfirst, *o_nlast; |
| 991 | _position *o_firstpos, *o_lastpos; |
| 992 | int i, j; |
| 993 | _position *pos; |
| 994 | |
| 995 | r->searchflag = searchflag; |
| 996 | |
| 997 | MALLOC(nullable, int, r->depth); |
| 998 | o_nullable = nullable; |
| 999 | MALLOC(nfirstpos, int, r->depth); |
| 1000 | o_nfirst = nfirstpos; |
| 1001 | MALLOC(firstpos, _position, r->nleaves); |
| 1002 | o_firstpos = firstpos, firstpos += r->nleaves; |
| 1003 | MALLOC(nlastpos, int, r->depth); |
| 1004 | o_nlast = nlastpos; |
| 1005 | MALLOC(lastpos, _position, r->nleaves); |
| 1006 | o_lastpos = lastpos, lastpos += r->nleaves; |
| 1007 | MALLOC(nalloc, int, r->tindex); |
| 1008 | for (i = 0; i < r->tindex; ++i) |
| 1009 | nalloc[i] = 0; |
| 1010 | MALLOC(merged.elems, _position, r->nleaves); |
| 1011 | |
| 1012 | CALLOC(r->follows, _position_set, r->tindex); |
| 1013 | |
| 1014 | for (i = 0; i < r->tindex; ++i) |
| 1015 | switch (r->tokens[i]) |
| 1016 | { |
| 1017 | case _EMPTY: |
| 1018 | /* The empty set is nullable. */ |
| 1019 | *nullable++ = 1; |
| 1020 | |
| 1021 | /* The firstpos and lastpos of the empty leaf are both empty. */ |
| 1022 | *nfirstpos++ = *nlastpos++ = 0; |
| 1023 | break; |
| 1024 | |
| 1025 | case _STAR: |
| 1026 | case _PLUS: |
| 1027 | /* Every element in the firstpos of the argument is in the follow |
| 1028 | of every element in the lastpos. */ |
| 1029 | tmp.nelem = nfirstpos[-1]; |
| 1030 | tmp.elems = firstpos; |
| 1031 | pos = lastpos; |
| 1032 | for (j = 0; j < nlastpos[-1]; ++j) |
| 1033 | { |
| 1034 | merge(&tmp, &r->follows[pos[j].index], &merged); |
| 1035 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, |
| 1036 | nalloc[pos[j].index], merged.nelem - 1); |
| 1037 | copy(&merged, &r->follows[pos[j].index]); |
| 1038 | } |
| 1039 | |
| 1040 | case _QMARK: |
| 1041 | /* A _QMARK or _STAR node is automatically nullable. */ |
| 1042 | if (r->tokens[i] != _PLUS) |
| 1043 | nullable[-1] = 1; |
| 1044 | break; |
| 1045 | |
| 1046 | case _CAT: |
| 1047 | /* Every element in the firstpos of the second argument is in the |
| 1048 | follow of every element in the lastpos of the first argument. */ |
| 1049 | tmp.nelem = nfirstpos[-1]; |
| 1050 | tmp.elems = firstpos; |
| 1051 | pos = lastpos + nlastpos[-1]; |
| 1052 | for (j = 0; j < nlastpos[-2]; ++j) |
| 1053 | { |
| 1054 | merge(&tmp, &r->follows[pos[j].index], &merged); |
| 1055 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, |
| 1056 | nalloc[pos[j].index], merged.nelem - 1); |
| 1057 | copy(&merged, &r->follows[pos[j].index]); |
| 1058 | } |
| 1059 | |
| 1060 | /* The firstpos of a _CAT node is the firstpos of the first argument, |
| 1061 | union that of the second argument if the first is nullable. */ |
| 1062 | if (nullable[-2]) |
| 1063 | nfirstpos[-2] += nfirstpos[-1]; |
| 1064 | else |
| 1065 | firstpos += nfirstpos[-1]; |
| 1066 | --nfirstpos; |
| 1067 | |
| 1068 | /* The lastpos of a _CAT node is the lastpos of the second argument, |
| 1069 | union that of the first argument if the second is nullable. */ |
| 1070 | if (nullable[-1]) |
| 1071 | nlastpos[-2] += nlastpos[-1]; |
| 1072 | else |
| 1073 | { |
| 1074 | pos = lastpos + nlastpos[-2]; |
| 1075 | for (j = nlastpos[-1] - 1; j >= 0; --j) |
| 1076 | pos[j] = lastpos[j]; |
| 1077 | lastpos += nlastpos[-2]; |
| 1078 | nlastpos[-2] = nlastpos[-1]; |
| 1079 | } |
| 1080 | --nlastpos; |
| 1081 | |
| 1082 | /* A _CAT node is nullable if both arguments are nullable. */ |
| 1083 | nullable[-2] = nullable[-1] && nullable[-2]; |
| 1084 | --nullable; |
| 1085 | break; |
| 1086 | |
| 1087 | case _OR: |
| 1088 | /* The firstpos is the union of the firstpos of each argument. */ |
| 1089 | nfirstpos[-2] += nfirstpos[-1]; |
| 1090 | --nfirstpos; |
| 1091 | |
| 1092 | /* The lastpos is the union of the lastpos of each argument. */ |
| 1093 | nlastpos[-2] += nlastpos[-1]; |
| 1094 | --nlastpos; |
| 1095 | |
| 1096 | /* An _OR node is nullable if either argument is nullable. */ |
| 1097 | nullable[-2] = nullable[-1] || nullable[-2]; |
| 1098 | --nullable; |
| 1099 | break; |
| 1100 | |
| 1101 | default: |
| 1102 | /* Anything else is a nonempty position. (Note that special |
| 1103 | constructs like \< are treated as nonempty strings here; |
| 1104 | an "epsilon closure" effectively makes them nullable later. |
| 1105 | Backreferences have to get a real position so we can detect |
| 1106 | transitions on them later. But they are nullable. */ |
| 1107 | *nullable++ = r->tokens[i] == _BACKREF; |
| 1108 | |
| 1109 | /* This position is in its own firstpos and lastpos. */ |
| 1110 | *nfirstpos++ = *nlastpos++ = 1; |
| 1111 | --firstpos, --lastpos; |
| 1112 | firstpos->index = lastpos->index = i; |
| 1113 | firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT; |
| 1114 | |
| 1115 | /* Allocate the follow set for this position. */ |
| 1116 | nalloc[i] = 1; |
| 1117 | MALLOC(r->follows[i].elems, _position, nalloc[i]); |
| 1118 | break; |
| 1119 | } |
| 1120 | |
| 1121 | /* For each follow set that is the follow set of a real position, replace |
| 1122 | it with its epsilon closure. */ |
| 1123 | for (i = 0; i < r->tindex; ++i) |
| 1124 | if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF |
| 1125 | || r->tokens[i] >= _SET) |
| 1126 | { |
| 1127 | copy(&r->follows[i], &merged); |
| 1128 | epsclosure(&merged, r); |
| 1129 | if (r->follows[i].nelem < merged.nelem) |
| 1130 | REALLOC(r->follows[i].elems, _position, merged.nelem); |
| 1131 | copy(&merged, &r->follows[i]); |
| 1132 | } |
| 1133 | |
| 1134 | /* Get the epsilon closure of the firstpos of the regexp. The result will |
| 1135 | be the set of positions of state 0. */ |
| 1136 | merged.nelem = 0; |
| 1137 | for (i = 0; i < nfirstpos[-1]; ++i) |
| 1138 | insert(firstpos[i], &merged); |
| 1139 | epsclosure(&merged, r); |
| 1140 | |
| 1141 | /* Check if any of the positions of state 0 will want newline context. */ |
| 1142 | wants_newline = 0; |
| 1143 | for (i = 0; i < merged.nelem; ++i) |
| 1144 | if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) |
| 1145 | wants_newline = 1; |
| 1146 | |
| 1147 | /* Build the initial state. */ |
| 1148 | r->salloc = 1; |
| 1149 | r->sindex = 0; |
| 1150 | MALLOC(r->states, _dfa_state, r->salloc); |
| 1151 | state_index(r, &merged, wants_newline, 0); |
| 1152 | |
| 1153 | free(o_nullable); |
| 1154 | free(o_nfirst); |
| 1155 | free(o_firstpos); |
| 1156 | free(o_nlast); |
| 1157 | free(o_lastpos); |
| 1158 | free(nalloc); |
| 1159 | free(merged.elems); |
| 1160 | } |
| 1161 | \f |
| 1162 | /* Find, for each character, the transition out of state s of r, and store |
| 1163 | it in the appropriate slot of trans. |
| 1164 | |
| 1165 | We divide the positions of s into groups (positions can appear in more |
| 1166 | than one group). Each group is labeled with a set of characters that |
| 1167 | every position in the group matches (taking into account, if necessary, |
| 1168 | preceding context information of s). For each group, find the union |
| 1169 | of the its elements' follows. This set is the set of positions of the |
| 1170 | new state. For each character in the group's label, set the transition |
| 1171 | on this character to be to a state corresponding to the set's positions, |
| 1172 | and its associated backward context information, if necessary. |
| 1173 | |
| 1174 | If we are building a searching matcher, we include the positions of state |
| 1175 | 0 in every state. |
| 1176 | |
| 1177 | The collection of groups is constructed by building an equivalence-class |
| 1178 | partition of the positions of s. |
| 1179 | |
| 1180 | For each position, find the set of characters C that it matches. Eliminate |
| 1181 | any characters from C that fail on grounds of backward context. |
| 1182 | |
| 1183 | Search through the groups, looking for a group whose label L has nonempty |
| 1184 | intersection with C. If L - C is nonempty, create a new group labeled |
| 1185 | L - C and having the same positions as the current group, and set L to |
| 1186 | the intersection of L and C. Insert the position in this group, set |
| 1187 | C = C - L, and resume scanning. |
| 1188 | |
| 1189 | If after comparing with every group there are characters remaining in C, |
| 1190 | create a new group labeled with the characters of C and insert this |
| 1191 | position in that group. */ |
| 1192 | void |
| 1193 | regstate(s, r, trans) |
| 1194 | int s; |
| 1195 | struct regexp *r; |
| 1196 | int trans[]; |
| 1197 | { |
| 1198 | _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */ |
| 1199 | _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */ |
| 1200 | int ngrps = 0; /* Number of groups actually used. */ |
| 1201 | _position pos; /* Current position being considered. */ |
| 1202 | _charset matches; /* Set of matching characters. */ |
| 1203 | int matchesf; /* True if matches is nonempty. */ |
| 1204 | _charset intersect; /* Intersection with some label set. */ |
| 1205 | int intersectf; /* True if intersect is nonempty. */ |
| 1206 | _charset leftovers; /* Stuff in the label that didn't match. */ |
| 1207 | int leftoversf; /* True if leftovers is nonempty. */ |
| 1208 | static _charset letters; /* Set of characters considered letters. */ |
| 1209 | static _charset newline; /* Set of characters that aren't newline. */ |
| 1210 | _position_set follows; /* Union of the follows of some group. */ |
| 1211 | _position_set tmp; /* Temporary space for merging sets. */ |
| 1212 | int state; /* New state. */ |
| 1213 | int wants_newline; /* New state wants to know newline context. */ |
| 1214 | int state_newline; /* New state on a newline transition. */ |
| 1215 | int wants_letter; /* New state wants to know letter context. */ |
| 1216 | int state_letter; /* New state on a letter transition. */ |
| 1217 | static initialized; /* Flag for static initialization. */ |
| 1218 | int i, j, k; |
| 1219 | |
| 1220 | /* Initialize the set of letters, if necessary. */ |
| 1221 | if (! initialized) |
| 1222 | { |
| 1223 | initialized = 1; |
| 1224 | for (i = 0; i < _NOTCHAR; ++i) |
| 1225 | if (ISALNUM(i)) |
| 1226 | setbit(i, letters); |
| 1227 | setbit('\n', newline); |
| 1228 | } |
| 1229 | |
| 1230 | zeroset(matches); |
| 1231 | |
| 1232 | for (i = 0; i < r->states[s].elems.nelem; ++i) |
| 1233 | { |
| 1234 | pos = r->states[s].elems.elems[i]; |
| 1235 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR) |
| 1236 | setbit(r->tokens[pos.index], matches); |
| 1237 | else if (r->tokens[pos.index] >= _SET) |
| 1238 | copyset(r->charsets[r->tokens[pos.index] - _SET], matches); |
| 1239 | else |
| 1240 | continue; |
| 1241 | |
| 1242 | /* Some characters may need to be climinated from matches because |
| 1243 | they fail in the current context. */ |
| 1244 | if (pos.constraint != 0xff) |
| 1245 | { |
| 1246 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, |
| 1247 | r->states[s].newline, 1)) |
| 1248 | clrbit('\n', matches); |
| 1249 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, |
| 1250 | r->states[s].newline, 0)) |
| 1251 | for (j = 0; j < _CHARSET_INTS; ++j) |
| 1252 | matches[j] &= newline[j]; |
| 1253 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, |
| 1254 | r->states[s].letter, 1)) |
| 1255 | for (j = 0; j < _CHARSET_INTS; ++j) |
| 1256 | matches[j] &= ~letters[j]; |
| 1257 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, |
| 1258 | r->states[s].letter, 0)) |
| 1259 | for (j = 0; j < _CHARSET_INTS; ++j) |
| 1260 | matches[j] &= letters[j]; |
| 1261 | |
| 1262 | /* If there are no characters left, there's no point in going on. */ |
| 1263 | for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j) |
| 1264 | ; |
| 1265 | if (j == _CHARSET_INTS) |
| 1266 | continue; |
| 1267 | } |
| 1268 | |
| 1269 | for (j = 0; j < ngrps; ++j) |
| 1270 | { |
| 1271 | /* If matches contains a single character only, and the current |
| 1272 | group's label doesn't contain that character, go on to the |
| 1273 | next group. */ |
| 1274 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR |
| 1275 | && !tstbit(r->tokens[pos.index], labels[j])) |
| 1276 | continue; |
| 1277 | |
| 1278 | /* Check if this group's label has a nonempty intersection with |
| 1279 | matches. */ |
| 1280 | intersectf = 0; |
| 1281 | for (k = 0; k < _CHARSET_INTS; ++k) |
| 1282 | (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0; |
| 1283 | if (! intersectf) |
| 1284 | continue; |
| 1285 | |
| 1286 | /* It does; now find the set differences both ways. */ |
| 1287 | leftoversf = matchesf = 0; |
| 1288 | for (k = 0; k < _CHARSET_INTS; ++k) |
| 1289 | { |
| 1290 | /* Even an optimizing compiler can't know this for sure. */ |
| 1291 | int match = matches[k], label = labels[j][k]; |
| 1292 | |
| 1293 | (leftovers[k] = ~match & label) ? leftoversf = 1 : 0; |
| 1294 | (matches[k] = match & ~label) ? matchesf = 1 : 0; |
| 1295 | } |
| 1296 | |
| 1297 | /* If there were leftovers, create a new group labeled with them. */ |
| 1298 | if (leftoversf) |
| 1299 | { |
| 1300 | copyset(leftovers, labels[ngrps]); |
| 1301 | copyset(intersect, labels[j]); |
| 1302 | MALLOC(grps[ngrps].elems, _position, r->nleaves); |
| 1303 | copy(&grps[j], &grps[ngrps]); |
| 1304 | ++ngrps; |
| 1305 | } |
| 1306 | |
| 1307 | /* Put the position in the current group. Note that there is no |
| 1308 | reason to call insert() here. */ |
| 1309 | grps[j].elems[grps[j].nelem++] = pos; |
| 1310 | |
| 1311 | /* If every character matching the current position has been |
| 1312 | accounted for, we're done. */ |
| 1313 | if (! matchesf) |
| 1314 | break; |
| 1315 | } |
| 1316 | |
| 1317 | /* If we've passed the last group, and there are still characters |
| 1318 | unaccounted for, then we'll have to create a new group. */ |
| 1319 | if (j == ngrps) |
| 1320 | { |
| 1321 | copyset(matches, labels[ngrps]); |
| 1322 | zeroset(matches); |
| 1323 | MALLOC(grps[ngrps].elems, _position, r->nleaves); |
| 1324 | grps[ngrps].nelem = 1; |
| 1325 | grps[ngrps].elems[0] = pos; |
| 1326 | ++ngrps; |
| 1327 | } |
| 1328 | } |
| 1329 | |
| 1330 | MALLOC(follows.elems, _position, r->nleaves); |
| 1331 | MALLOC(tmp.elems, _position, r->nleaves); |
| 1332 | |
| 1333 | /* If we are a searching matcher, the default transition is to a state |
| 1334 | containing the positions of state 0, otherwise the default transition |
| 1335 | is to fail miserably. */ |
| 1336 | if (r->searchflag) |
| 1337 | { |
| 1338 | wants_newline = 0; |
| 1339 | wants_letter = 0; |
| 1340 | for (i = 0; i < r->states[0].elems.nelem; ++i) |
| 1341 | { |
| 1342 | if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint)) |
| 1343 | wants_newline = 1; |
| 1344 | if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint)) |
| 1345 | wants_letter = 1; |
| 1346 | } |
| 1347 | copy(&r->states[0].elems, &follows); |
| 1348 | state = state_index(r, &follows, 0, 0); |
| 1349 | if (wants_newline) |
| 1350 | state_newline = state_index(r, &follows, 1, 0); |
| 1351 | else |
| 1352 | state_newline = state; |
| 1353 | if (wants_letter) |
| 1354 | state_letter = state_index(r, &follows, 0, 1); |
| 1355 | else |
| 1356 | state_letter = state; |
| 1357 | for (i = 0; i < _NOTCHAR; ++i) |
| 1358 | if (i == '\n') |
| 1359 | trans[i] = state_newline; |
| 1360 | else if (ISALNUM(i)) |
| 1361 | trans[i] = state_letter; |
| 1362 | else |
| 1363 | trans[i] = state; |
| 1364 | } |
| 1365 | else |
| 1366 | for (i = 0; i < _NOTCHAR; ++i) |
| 1367 | trans[i] = -1; |
| 1368 | |
| 1369 | for (i = 0; i < ngrps; ++i) |
| 1370 | { |
| 1371 | follows.nelem = 0; |
| 1372 | |
| 1373 | /* Find the union of the follows of the positions of the group. |
| 1374 | This is a hideously inefficient loop. Fix it someday. */ |
| 1375 | for (j = 0; j < grps[i].nelem; ++j) |
| 1376 | for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k) |
| 1377 | insert(r->follows[grps[i].elems[j].index].elems[k], &follows); |
| 1378 | |
| 1379 | /* If we are building a searching matcher, throw in the positions |
| 1380 | of state 0 as well. */ |
| 1381 | if (r->searchflag) |
| 1382 | for (j = 0; j < r->states[0].elems.nelem; ++j) |
| 1383 | insert(r->states[0].elems.elems[j], &follows); |
| 1384 | |
| 1385 | /* Find out if the new state will want any context information. */ |
| 1386 | wants_newline = 0; |
| 1387 | if (tstbit('\n', labels[i])) |
| 1388 | for (j = 0; j < follows.nelem; ++j) |
| 1389 | if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) |
| 1390 | wants_newline = 1; |
| 1391 | |
| 1392 | wants_letter = 0; |
| 1393 | for (j = 0; j < _CHARSET_INTS; ++j) |
| 1394 | if (labels[i][j] & letters[j]) |
| 1395 | break; |
| 1396 | if (j < _CHARSET_INTS) |
| 1397 | for (j = 0; j < follows.nelem; ++j) |
| 1398 | if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) |
| 1399 | wants_letter = 1; |
| 1400 | |
| 1401 | /* Find the state(s) corresponding to the union of the follows. */ |
| 1402 | state = state_index(r, &follows, 0, 0); |
| 1403 | if (wants_newline) |
| 1404 | state_newline = state_index(r, &follows, 1, 0); |
| 1405 | else |
| 1406 | state_newline = state; |
| 1407 | if (wants_letter) |
| 1408 | state_letter = state_index(r, &follows, 0, 1); |
| 1409 | else |
| 1410 | state_letter = state; |
| 1411 | |
| 1412 | /* Set the transitions for each character in the current label. */ |
| 1413 | for (j = 0; j < _CHARSET_INTS; ++j) |
| 1414 | for (k = 0; k < INTBITS; ++k) |
| 1415 | if (labels[i][j] & 1 << k) |
| 1416 | { |
| 1417 | int c = j * INTBITS + k; |
| 1418 | |
| 1419 | if (c == '\n') |
| 1420 | trans[c] = state_newline; |
| 1421 | else if (ISALNUM(c)) |
| 1422 | trans[c] = state_letter; |
| 1423 | else if (c < _NOTCHAR) |
| 1424 | trans[c] = state; |
| 1425 | } |
| 1426 | } |
| 1427 | |
| 1428 | for (i = 0; i < ngrps; ++i) |
| 1429 | free(grps[i].elems); |
| 1430 | free(follows.elems); |
| 1431 | free(tmp.elems); |
| 1432 | } |
| 1433 | \f |
| 1434 | /* Some routines for manipulating a compiled regexp's transition tables. |
| 1435 | Each state may or may not have a transition table; if it does, and it |
| 1436 | is a non-accepting state, then r->trans[state] points to its table. |
| 1437 | If it is an accepting state then r->fails[state] points to its table. |
| 1438 | If it has no table at all, then r->trans[state] is NULL. |
| 1439 | TODO: Improve this comment, get rid of the unnecessary redundancy. */ |
| 1440 | |
| 1441 | static void |
| 1442 | build_state(s, r) |
| 1443 | int s; |
| 1444 | struct regexp *r; |
| 1445 | { |
| 1446 | int *trans; /* The new transition table. */ |
| 1447 | int i; |
| 1448 | |
| 1449 | /* Set an upper limit on the number of transition tables that will ever |
| 1450 | exist at once. 1024 is arbitrary. The idea is that the frequently |
| 1451 | used transition tables will be quickly rebuilt, whereas the ones that |
| 1452 | were only needed once or twice will be cleared away. */ |
| 1453 | if (r->trcount >= 1024) |
| 1454 | { |
| 1455 | for (i = 0; i < r->tralloc; ++i) |
| 1456 | if (r->trans[i]) |
| 1457 | { |
| 1458 | free((ptr_t) r->trans[i]); |
| 1459 | r->trans[i] = NULL; |
| 1460 | } |
| 1461 | else if (r->fails[i]) |
| 1462 | { |
| 1463 | free((ptr_t) r->fails[i]); |
| 1464 | r->fails[i] = NULL; |
| 1465 | } |
| 1466 | r->trcount = 0; |
| 1467 | } |
| 1468 | |
| 1469 | ++r->trcount; |
| 1470 | |
| 1471 | /* Set up the success bits for this state. */ |
| 1472 | r->success[s] = 0; |
| 1473 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0, |
| 1474 | s, *r)) |
| 1475 | r->success[s] |= 4; |
| 1476 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1, |
| 1477 | s, *r)) |
| 1478 | r->success[s] |= 2; |
| 1479 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0, |
| 1480 | s, *r)) |
| 1481 | r->success[s] |= 1; |
| 1482 | |
| 1483 | MALLOC(trans, int, _NOTCHAR); |
| 1484 | regstate(s, r, trans); |
| 1485 | |
| 1486 | /* Now go through the new transition table, and make sure that the trans |
| 1487 | and fail arrays are allocated large enough to hold a pointer for the |
| 1488 | largest state mentioned in the table. */ |
| 1489 | for (i = 0; i < _NOTCHAR; ++i) |
| 1490 | if (trans[i] >= r->tralloc) |
| 1491 | { |
| 1492 | int oldalloc = r->tralloc; |
| 1493 | |
| 1494 | while (trans[i] >= r->tralloc) |
| 1495 | r->tralloc *= 2; |
| 1496 | REALLOC(r->realtrans, int *, r->tralloc + 1); |
| 1497 | r->trans = r->realtrans + 1; |
| 1498 | REALLOC(r->fails, int *, r->tralloc); |
| 1499 | REALLOC(r->success, int, r->tralloc); |
| 1500 | REALLOC(r->newlines, int, r->tralloc); |
| 1501 | while (oldalloc < r->tralloc) |
| 1502 | { |
| 1503 | r->trans[oldalloc] = NULL; |
| 1504 | r->fails[oldalloc++] = NULL; |
| 1505 | } |
| 1506 | } |
| 1507 | |
| 1508 | /* Keep the newline transition in a special place so we can use it as |
| 1509 | a sentinel. */ |
| 1510 | r->newlines[s] = trans['\n']; |
| 1511 | trans['\n'] = -1; |
| 1512 | |
| 1513 | if (ACCEPTING(s, *r)) |
| 1514 | r->fails[s] = trans; |
| 1515 | else |
| 1516 | r->trans[s] = trans; |
| 1517 | } |
| 1518 | |
| 1519 | static void |
| 1520 | build_state_zero(r) |
| 1521 | struct regexp *r; |
| 1522 | { |
| 1523 | r->tralloc = 1; |
| 1524 | r->trcount = 0; |
| 1525 | CALLOC(r->realtrans, int *, r->tralloc + 1); |
| 1526 | r->trans = r->realtrans + 1; |
| 1527 | CALLOC(r->fails, int *, r->tralloc); |
| 1528 | MALLOC(r->success, int, r->tralloc); |
| 1529 | MALLOC(r->newlines, int, r->tralloc); |
| 1530 | build_state(0, r); |
| 1531 | } |
| 1532 | \f |
| 1533 | /* Search through a buffer looking for a match to the given struct regexp. |
| 1534 | Find the first occurrence of a string matching the regexp in the buffer, |
| 1535 | and the shortest possible version thereof. Return a pointer to the first |
| 1536 | character after the match, or NULL if none is found. Begin points to |
| 1537 | the beginning of the buffer, and end points to the first character after |
| 1538 | its end. We store a newline in *end to act as a sentinel, so end had |
| 1539 | better point somewhere valid. Newline is a flag indicating whether to |
| 1540 | allow newlines to be in the matching string. If count is non- |
| 1541 | NULL it points to a place we're supposed to increment every time we |
| 1542 | see a newline. Finally, if backref is non-NULL it points to a place |
| 1543 | where we're supposed to store a 1 if backreferencing happened and the |
| 1544 | match needs to be verified by a backtracking matcher. Otherwise |
| 1545 | we store a 0 in *backref. */ |
| 1546 | char * |
| 1547 | regexecute(r, begin, end, newline, count, backref) |
| 1548 | struct regexp *r; |
| 1549 | char *begin; |
| 1550 | char *end; |
| 1551 | int newline; |
| 1552 | int *count; |
| 1553 | int *backref; |
| 1554 | { |
| 1555 | register s, s1, tmp; /* Current state. */ |
| 1556 | register unsigned char *p; /* Current input character. */ |
| 1557 | register **trans, *t; /* Copy of r->trans so it can be optimized |
| 1558 | into a register. */ |
| 1559 | static sbit[_NOTCHAR]; /* Table for anding with r->success. */ |
| 1560 | static sbit_init; |
| 1561 | |
| 1562 | if (! sbit_init) |
| 1563 | { |
| 1564 | int i; |
| 1565 | |
| 1566 | sbit_init = 1; |
| 1567 | for (i = 0; i < _NOTCHAR; ++i) |
| 1568 | if (i == '\n') |
| 1569 | sbit[i] = 4; |
| 1570 | else if (ISALNUM(i)) |
| 1571 | sbit[i] = 2; |
| 1572 | else |
| 1573 | sbit[i] = 1; |
| 1574 | } |
| 1575 | |
| 1576 | if (! r->tralloc) |
| 1577 | build_state_zero(r); |
| 1578 | |
| 1579 | s = 0; |
| 1580 | p = (unsigned char *) begin; |
| 1581 | trans = r->trans; |
| 1582 | *end = '\n'; |
| 1583 | |
| 1584 | for (;;) |
| 1585 | { |
| 1586 | /* The dreaded inner loop. */ |
| 1587 | if (t = trans[s]) |
| 1588 | do |
| 1589 | { |
| 1590 | s1 = t[*p++]; |
| 1591 | if (! (t = trans[s1])) |
| 1592 | goto last_was_s; |
| 1593 | s = t[*p++]; |
| 1594 | } |
| 1595 | while (t = trans[s]); |
| 1596 | goto last_was_s1; |
| 1597 | last_was_s: |
| 1598 | tmp = s, s = s1, s1 = tmp; |
| 1599 | last_was_s1: |
| 1600 | |
| 1601 | if (s >= 0 && p <= (unsigned char *) end && r->fails[s]) |
| 1602 | { |
| 1603 | if (r->success[s] & sbit[*p]) |
| 1604 | { |
| 1605 | if (backref) |
| 1606 | if (r->states[s].backref) |
| 1607 | *backref = 1; |
| 1608 | else |
| 1609 | *backref = 0; |
| 1610 | return (char *) p; |
| 1611 | } |
| 1612 | |
| 1613 | s1 = s; |
| 1614 | s = r->fails[s][*p++]; |
| 1615 | continue; |
| 1616 | } |
| 1617 | |
| 1618 | /* If the previous character was a newline, count it. */ |
| 1619 | if (count && (char *) p <= end && p[-1] == '\n') |
| 1620 | ++*count; |
| 1621 | |
| 1622 | /* Check if we've run off the end of the buffer. */ |
| 1623 | if ((char *) p >= end) |
| 1624 | return NULL; |
| 1625 | |
| 1626 | if (s >= 0) |
| 1627 | { |
| 1628 | build_state(s, r); |
| 1629 | trans = r->trans; |
| 1630 | continue; |
| 1631 | } |
| 1632 | |
| 1633 | if (p[-1] == '\n' && newline) |
| 1634 | { |
| 1635 | s = r->newlines[s1]; |
| 1636 | continue; |
| 1637 | } |
| 1638 | |
| 1639 | s = 0; |
| 1640 | } |
| 1641 | } |
| 1642 | \f |
| 1643 | /* Initialize the components of a regexp that the other routines don't |
| 1644 | initialize for themselves. */ |
| 1645 | void |
| 1646 | reginit(r) |
| 1647 | struct regexp *r; |
| 1648 | { |
| 1649 | r->calloc = 1; |
| 1650 | MALLOC(r->charsets, _charset, r->calloc); |
| 1651 | r->cindex = 0; |
| 1652 | |
| 1653 | r->talloc = 1; |
| 1654 | MALLOC(r->tokens, _token, r->talloc); |
| 1655 | r->tindex = r->depth = r->nleaves = r->nregexps = 0; |
| 1656 | |
| 1657 | r->searchflag = 0; |
| 1658 | r->tralloc = 0; |
| 1659 | } |
| 1660 | |
| 1661 | /* Parse and analyze a single string of the given length. */ |
| 1662 | void |
| 1663 | regcompile(s, len, r, searchflag) |
| 1664 | const char *s; |
| 1665 | size_t len; |
| 1666 | struct regexp *r; |
| 1667 | int searchflag; |
| 1668 | { |
| 1669 | if (case_fold) /* dummy folding in service of regmust() */ |
| 1670 | { |
| 1671 | char *copy; |
| 1672 | int i; |
| 1673 | |
| 1674 | copy = malloc(len); |
| 1675 | if (!copy) |
| 1676 | regerror("out of memory"); |
| 1677 | |
| 1678 | /* This is a complete kludge and could potentially break |
| 1679 | \<letter> escapes . . . */ |
| 1680 | case_fold = 0; |
| 1681 | for (i = 0; i < len; ++i) |
| 1682 | if (ISUPPER(s[i])) |
| 1683 | copy[i] = tolower(s[i]); |
| 1684 | else |
| 1685 | copy[i] = s[i]; |
| 1686 | |
| 1687 | reginit(r); |
| 1688 | r->mustn = 0; |
| 1689 | r->must[0] = '\0'; |
| 1690 | regparse(copy, len, r); |
| 1691 | free(copy); |
| 1692 | regmust(r); |
| 1693 | reganalyze(r, searchflag); |
| 1694 | case_fold = 1; |
| 1695 | reginit(r); |
| 1696 | regparse(s, len, r); |
| 1697 | reganalyze(r, searchflag); |
| 1698 | } |
| 1699 | else |
| 1700 | { |
| 1701 | reginit(r); |
| 1702 | regparse(s, len, r); |
| 1703 | regmust(r); |
| 1704 | reganalyze(r, searchflag); |
| 1705 | } |
| 1706 | } |
| 1707 | |
| 1708 | /* Free the storage held by the components of a regexp. */ |
| 1709 | void |
| 1710 | regfree(r) |
| 1711 | struct regexp *r; |
| 1712 | { |
| 1713 | int i; |
| 1714 | |
| 1715 | free((ptr_t) r->charsets); |
| 1716 | free((ptr_t) r->tokens); |
| 1717 | for (i = 0; i < r->sindex; ++i) |
| 1718 | free((ptr_t) r->states[i].elems.elems); |
| 1719 | free((ptr_t) r->states); |
| 1720 | for (i = 0; i < r->tindex; ++i) |
| 1721 | if (r->follows[i].elems) |
| 1722 | free((ptr_t) r->follows[i].elems); |
| 1723 | free((ptr_t) r->follows); |
| 1724 | for (i = 0; i < r->tralloc; ++i) |
| 1725 | if (r->trans[i]) |
| 1726 | free((ptr_t) r->trans[i]); |
| 1727 | else if (r->fails[i]) |
| 1728 | free((ptr_t) r->fails[i]); |
| 1729 | free((ptr_t) r->realtrans); |
| 1730 | free((ptr_t) r->fails); |
| 1731 | free((ptr_t) r->newlines); |
| 1732 | } |
| 1733 | |
| 1734 | /* |
| 1735 | Having found the postfix representation of the regular expression, |
| 1736 | try to find a long sequence of characters that must appear in any line |
| 1737 | containing the r.e. |
| 1738 | Finding a "longest" sequence is beyond the scope here; |
| 1739 | we take an easy way out and hope for the best. |
| 1740 | (Take "(ab|a)b"--please.) |
| 1741 | |
| 1742 | We do a bottom-up calculation of sequences of characters that must appear |
| 1743 | in matches of r.e.'s represented by trees rooted at the nodes of the postfix |
| 1744 | representation: |
| 1745 | sequences that must appear at the left of the match ("left") |
| 1746 | sequences that must appear at the right of the match ("right") |
| 1747 | lists of sequences that must appear somewhere in the match ("in") |
| 1748 | sequences that must constitute the match ("is") |
| 1749 | When we get to the root of the tree, we use one of the longest of its |
| 1750 | calculated "in" sequences as our answer. The sequence we find is returned in |
| 1751 | r->must (where "r" is the single argument passed to "regmust"); |
| 1752 | the length of the sequence is returned in r->mustn. |
| 1753 | |
| 1754 | The sequences calculated for the various types of node (in pseudo ANSI c) |
| 1755 | are shown below. "p" is the operand of unary operators (and the left-hand |
| 1756 | operand of binary operators); "q" is the right-hand operand of binary operators |
| 1757 | . |
| 1758 | "ZERO" means "a zero-length sequence" below. |
| 1759 | |
| 1760 | Type left right is in |
| 1761 | ---- ---- ----- -- -- |
| 1762 | char c # c # c # c # c |
| 1763 | |
| 1764 | SET ZERO ZERO ZERO ZERO |
| 1765 | |
| 1766 | STAR ZERO ZERO ZERO ZERO |
| 1767 | |
| 1768 | QMARK ZERO ZERO ZERO ZERO |
| 1769 | |
| 1770 | PLUS p->left p->right ZERO p->in |
| 1771 | |
| 1772 | CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus |
| 1773 | p->left : q->right : q->is!=ZERO) ? q->in plus |
| 1774 | p->is##q->left p->right##q->is p->is##q->is : p->right##q->left |
| 1775 | ZERO |
| 1776 | |
| 1777 | OR longest common longest common (do p->is and substrings common to |
| 1778 | leading trailing q->is have same p->in and q->in |
| 1779 | (sub)sequence (sub)sequence length and |
| 1780 | of p->left of p->right content) ? |
| 1781 | and q->left and q->right p->is : NULL |
| 1782 | |
| 1783 | If there's anything else we recognize in the tree, all four sequences get set |
| 1784 | to zero-length sequences. If there's something we don't recognize in the tree, |
| 1785 | we just return a zero-length sequence. |
| 1786 | |
| 1787 | Break ties in favor of infrequent letters (choosing 'zzz' in preference to |
| 1788 | 'aaa')? |
| 1789 | |
| 1790 | And. . .is it here or someplace that we might ponder "optimizations" such as |
| 1791 | egrep 'psi|epsilon' -> egrep 'psi' |
| 1792 | egrep 'pepsi|epsilon' -> egrep 'epsi' |
| 1793 | (Yes, we now find "epsi" as a "string |
| 1794 | that must occur", but we might also |
| 1795 | simplify the *entire* r.e. being sought |
| 1796 | ) |
| 1797 | grep '[c]' -> grep 'c' |
| 1798 | grep '(ab|a)b' -> grep 'ab' |
| 1799 | grep 'ab*' -> grep 'a' |
| 1800 | grep 'a*b' -> grep 'b' |
| 1801 | There are several issues: |
| 1802 | Is optimization easy (enough)? |
| 1803 | |
| 1804 | Does optimization actually accomplish anything, |
| 1805 | or is the automaton you get from "psi|epsilon" (for example) |
| 1806 | the same as the one you get from "psi" (for example)? |
| 1807 | |
| 1808 | Are optimizable r.e.'s likely to be used in real-life situations |
| 1809 | (something like 'ab*' is probably unlikely; something like is |
| 1810 | 'psi|epsilon' is likelier)? |
| 1811 | */ |
| 1812 | |
| 1813 | static char * |
| 1814 | icatalloc(old, new) |
| 1815 | char * old; |
| 1816 | char * new; |
| 1817 | { |
| 1818 | register char * result; |
| 1819 | register int oldsize, newsize; |
| 1820 | |
| 1821 | newsize = (new == NULL) ? 0 : strlen(new); |
| 1822 | if (old == NULL) |
| 1823 | oldsize = 0; |
| 1824 | else if (newsize == 0) |
| 1825 | return old; |
| 1826 | else oldsize = strlen(old); |
| 1827 | if (old == NULL) |
| 1828 | result = (char *) malloc(newsize + 1); |
| 1829 | else result = (char *) realloc((void *) old, oldsize + newsize + 1); |
| 1830 | if (result != NULL && new != NULL) |
| 1831 | (void) strcpy(result + oldsize, new); |
| 1832 | return result; |
| 1833 | } |
| 1834 | |
| 1835 | static char * |
| 1836 | icpyalloc(string) |
| 1837 | const char * string; |
| 1838 | { |
| 1839 | return icatalloc((char *) NULL, string); |
| 1840 | } |
| 1841 | |
| 1842 | static char * |
| 1843 | istrstr(lookin, lookfor) |
| 1844 | char * lookin; |
| 1845 | register char * lookfor; |
| 1846 | { |
| 1847 | register char * cp; |
| 1848 | register int len; |
| 1849 | |
| 1850 | len = strlen(lookfor); |
| 1851 | for (cp = lookin; *cp != '\0'; ++cp) |
| 1852 | if (strncmp(cp, lookfor, len) == 0) |
| 1853 | return cp; |
| 1854 | return NULL; |
| 1855 | } |
| 1856 | |
| 1857 | static void |
| 1858 | ifree(cp) |
| 1859 | char * cp; |
| 1860 | { |
| 1861 | if (cp != NULL) |
| 1862 | free(cp); |
| 1863 | } |
| 1864 | |
| 1865 | static void |
| 1866 | freelist(cpp) |
| 1867 | register char ** cpp; |
| 1868 | { |
| 1869 | register int i; |
| 1870 | |
| 1871 | if (cpp == NULL) |
| 1872 | return; |
| 1873 | for (i = 0; cpp[i] != NULL; ++i) { |
| 1874 | free(cpp[i]); |
| 1875 | cpp[i] = NULL; |
| 1876 | } |
| 1877 | } |
| 1878 | |
| 1879 | static char ** |
| 1880 | enlist(cpp, new, len) |
| 1881 | register char ** cpp; |
| 1882 | register char * new; |
| 1883 | { |
| 1884 | register int i, j; |
| 1885 | |
| 1886 | if (cpp == NULL) |
| 1887 | return NULL; |
| 1888 | if ((new = icpyalloc(new)) == NULL) { |
| 1889 | freelist(cpp); |
| 1890 | return NULL; |
| 1891 | } |
| 1892 | new[len] = '\0'; |
| 1893 | /* |
| 1894 | ** Is there already something in the list that's new (or longer)? |
| 1895 | */ |
| 1896 | for (i = 0; cpp[i] != NULL; ++i) |
| 1897 | if (istrstr(cpp[i], new) != NULL) { |
| 1898 | free(new); |
| 1899 | return cpp; |
| 1900 | } |
| 1901 | /* |
| 1902 | ** Eliminate any obsoleted strings. |
| 1903 | */ |
| 1904 | j = 0; |
| 1905 | while (cpp[j] != NULL) |
| 1906 | if (istrstr(new, cpp[j]) == NULL) |
| 1907 | ++j; |
| 1908 | else { |
| 1909 | free(cpp[j]); |
| 1910 | if (--i == j) |
| 1911 | break; |
| 1912 | cpp[j] = cpp[i]; |
| 1913 | } |
| 1914 | /* |
| 1915 | ** Add the new string. |
| 1916 | */ |
| 1917 | cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); |
| 1918 | if (cpp == NULL) |
| 1919 | return NULL; |
| 1920 | cpp[i] = new; |
| 1921 | cpp[i + 1] = NULL; |
| 1922 | return cpp; |
| 1923 | } |
| 1924 | |
| 1925 | /* |
| 1926 | ** Given pointers to two strings, |
| 1927 | ** return a pointer to an allocated list of their distinct common substrings. |
| 1928 | ** Return NULL if something seems wild. |
| 1929 | */ |
| 1930 | |
| 1931 | static char ** |
| 1932 | comsubs(left, right) |
| 1933 | char * left; |
| 1934 | char * right; |
| 1935 | { |
| 1936 | register char ** cpp; |
| 1937 | register char * lcp; |
| 1938 | register char * rcp; |
| 1939 | register int i, len; |
| 1940 | |
| 1941 | if (left == NULL || right == NULL) |
| 1942 | return NULL; |
| 1943 | cpp = (char **) malloc(sizeof *cpp); |
| 1944 | if (cpp == NULL) |
| 1945 | return NULL; |
| 1946 | cpp[0] = NULL; |
| 1947 | for (lcp = left; *lcp != '\0'; ++lcp) { |
| 1948 | len = 0; |
| 1949 | rcp = strchr(right, *lcp); |
| 1950 | while (rcp != NULL) { |
| 1951 | for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) |
| 1952 | ; |
| 1953 | if (i > len) |
| 1954 | len = i; |
| 1955 | rcp = strchr(rcp + 1, *lcp); |
| 1956 | } |
| 1957 | if (len == 0) |
| 1958 | continue; |
| 1959 | if ((cpp = enlist(cpp, lcp, len)) == NULL) |
| 1960 | break; |
| 1961 | } |
| 1962 | return cpp; |
| 1963 | } |
| 1964 | |
| 1965 | static char ** |
| 1966 | addlists(old, new) |
| 1967 | char ** old; |
| 1968 | char ** new; |
| 1969 | { |
| 1970 | register int i; |
| 1971 | |
| 1972 | if (old == NULL || new == NULL) |
| 1973 | return NULL; |
| 1974 | for (i = 0; new[i] != NULL; ++i) { |
| 1975 | old = enlist(old, new[i], strlen(new[i])); |
| 1976 | if (old == NULL) |
| 1977 | break; |
| 1978 | } |
| 1979 | return old; |
| 1980 | } |
| 1981 | |
| 1982 | /* |
| 1983 | ** Given two lists of substrings, |
| 1984 | ** return a new list giving substrings common to both. |
| 1985 | */ |
| 1986 | |
| 1987 | static char ** |
| 1988 | inboth(left, right) |
| 1989 | char ** left; |
| 1990 | char ** right; |
| 1991 | { |
| 1992 | register char ** both; |
| 1993 | register char ** temp; |
| 1994 | register int lnum, rnum; |
| 1995 | |
| 1996 | if (left == NULL || right == NULL) |
| 1997 | return NULL; |
| 1998 | both = (char **) malloc(sizeof *both); |
| 1999 | if (both == NULL) |
| 2000 | return NULL; |
| 2001 | both[0] = NULL; |
| 2002 | for (lnum = 0; left[lnum] != NULL; ++lnum) { |
| 2003 | for (rnum = 0; right[rnum] != NULL; ++rnum) { |
| 2004 | temp = comsubs(left[lnum], right[rnum]); |
| 2005 | if (temp == NULL) { |
| 2006 | freelist(both); |
| 2007 | return NULL; |
| 2008 | } |
| 2009 | both = addlists(both, temp); |
| 2010 | freelist(temp); |
| 2011 | if (both == NULL) |
| 2012 | return NULL; |
| 2013 | } |
| 2014 | } |
| 2015 | return both; |
| 2016 | } |
| 2017 | |
| 2018 | typedef struct { |
| 2019 | char ** in; |
| 2020 | char * left; |
| 2021 | char * right; |
| 2022 | char * is; |
| 2023 | } must; |
| 2024 | |
| 2025 | static void |
| 2026 | resetmust(mp) |
| 2027 | register must * mp; |
| 2028 | { |
| 2029 | mp->left[0] = mp->right[0] = mp->is[0] = '\0'; |
| 2030 | freelist(mp->in); |
| 2031 | } |
| 2032 | |
| 2033 | static void |
| 2034 | regmust(r) |
| 2035 | register struct regexp * r; |
| 2036 | { |
| 2037 | register must * musts; |
| 2038 | register must * mp; |
| 2039 | register char * result; |
| 2040 | register int ri; |
| 2041 | register int i; |
| 2042 | register _token t; |
| 2043 | static must must0; |
| 2044 | |
| 2045 | reg->mustn = 0; |
| 2046 | reg->must[0] = '\0'; |
| 2047 | musts = (must *) malloc((reg->tindex + 1) * sizeof *musts); |
| 2048 | if (musts == NULL) |
| 2049 | return; |
| 2050 | mp = musts; |
| 2051 | for (i = 0; i <= reg->tindex; ++i) |
| 2052 | mp[i] = must0; |
| 2053 | for (i = 0; i <= reg->tindex; ++i) { |
| 2054 | mp[i].in = (char **) malloc(sizeof *mp[i].in); |
| 2055 | mp[i].left = malloc(2); |
| 2056 | mp[i].right = malloc(2); |
| 2057 | mp[i].is = malloc(2); |
| 2058 | if (mp[i].in == NULL || mp[i].left == NULL || |
| 2059 | mp[i].right == NULL || mp[i].is == NULL) |
| 2060 | goto done; |
| 2061 | mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; |
| 2062 | mp[i].in[0] = NULL; |
| 2063 | } |
| 2064 | result = ""; |
| 2065 | for (ri = 0; ri < reg->tindex; ++ri) { |
| 2066 | switch (t = reg->tokens[ri]) { |
| 2067 | case _ALLBEGLINE: |
| 2068 | case _ALLENDLINE: |
| 2069 | case _LPAREN: |
| 2070 | case _RPAREN: |
| 2071 | goto done; /* "cannot happen" */ |
| 2072 | case _EMPTY: |
| 2073 | case _BEGLINE: |
| 2074 | case _ENDLINE: |
| 2075 | case _BEGWORD: |
| 2076 | case _ENDWORD: |
| 2077 | case _LIMWORD: |
| 2078 | case _NOTLIMWORD: |
| 2079 | case _BACKREF: |
| 2080 | resetmust(mp); |
| 2081 | break; |
| 2082 | case _STAR: |
| 2083 | case _QMARK: |
| 2084 | if (mp <= musts) |
| 2085 | goto done; /* "cannot happen" */ |
| 2086 | --mp; |
| 2087 | resetmust(mp); |
| 2088 | break; |
| 2089 | case _OR: |
| 2090 | if (mp < &musts[2]) |
| 2091 | goto done; /* "cannot happen" */ |
| 2092 | { |
| 2093 | register char ** new; |
| 2094 | register must * lmp; |
| 2095 | register must * rmp; |
| 2096 | register int j, ln, rn, n; |
| 2097 | |
| 2098 | rmp = --mp; |
| 2099 | lmp = --mp; |
| 2100 | /* Guaranteed to be. Unlikely, but. . . */ |
| 2101 | if (strcmp(lmp->is, rmp->is) != 0) |
| 2102 | lmp->is[0] = '\0'; |
| 2103 | /* Left side--easy */ |
| 2104 | i = 0; |
| 2105 | while (lmp->left[i] != '\0' && |
| 2106 | lmp->left[i] == rmp->left[i]) |
| 2107 | ++i; |
| 2108 | lmp->left[i] = '\0'; |
| 2109 | /* Right side */ |
| 2110 | ln = strlen(lmp->right); |
| 2111 | rn = strlen(rmp->right); |
| 2112 | n = ln; |
| 2113 | if (n > rn) |
| 2114 | n = rn; |
| 2115 | for (i = 0; i < n; ++i) |
| 2116 | if (lmp->right[ln - i - 1] != |
| 2117 | rmp->right[rn - i - 1]) |
| 2118 | break; |
| 2119 | for (j = 0; j < i; ++j) |
| 2120 | lmp->right[j] = |
| 2121 | lmp->right[(ln - i) + j]; |
| 2122 | lmp->right[j] = '\0'; |
| 2123 | new = inboth(lmp->in, rmp->in); |
| 2124 | if (new == NULL) |
| 2125 | goto done; |
| 2126 | freelist(lmp->in); |
| 2127 | free((char *) lmp->in); |
| 2128 | lmp->in = new; |
| 2129 | } |
| 2130 | break; |
| 2131 | case _PLUS: |
| 2132 | if (mp <= musts) |
| 2133 | goto done; /* "cannot happen" */ |
| 2134 | --mp; |
| 2135 | mp->is[0] = '\0'; |
| 2136 | break; |
| 2137 | case _END: |
| 2138 | if (mp != &musts[1]) |
| 2139 | goto done; /* "cannot happen" */ |
| 2140 | for (i = 0; musts[0].in[i] != NULL; ++i) |
| 2141 | if (strlen(musts[0].in[i]) > strlen(result)) |
| 2142 | result = musts[0].in[i]; |
| 2143 | goto done; |
| 2144 | case _CAT: |
| 2145 | if (mp < &musts[2]) |
| 2146 | goto done; /* "cannot happen" */ |
| 2147 | { |
| 2148 | register must * lmp; |
| 2149 | register must * rmp; |
| 2150 | |
| 2151 | rmp = --mp; |
| 2152 | lmp = --mp; |
| 2153 | /* |
| 2154 | ** In. Everything in left, plus everything in |
| 2155 | ** right, plus catenation of |
| 2156 | ** left's right and right's left. |
| 2157 | */ |
| 2158 | lmp->in = addlists(lmp->in, rmp->in); |
| 2159 | if (lmp->in == NULL) |
| 2160 | goto done; |
| 2161 | if (lmp->right[0] != '\0' && |
| 2162 | rmp->left[0] != '\0') { |
| 2163 | register char * tp; |
| 2164 | |
| 2165 | tp = icpyalloc(lmp->right); |
| 2166 | if (tp == NULL) |
| 2167 | goto done; |
| 2168 | tp = icatalloc(tp, rmp->left); |
| 2169 | if (tp == NULL) |
| 2170 | goto done; |
| 2171 | lmp->in = enlist(lmp->in, tp, |
| 2172 | strlen(tp)); |
| 2173 | free(tp); |
| 2174 | if (lmp->in == NULL) |
| 2175 | goto done; |
| 2176 | } |
| 2177 | /* Left-hand */ |
| 2178 | if (lmp->is[0] != '\0') { |
| 2179 | lmp->left = icatalloc(lmp->left, |
| 2180 | rmp->left); |
| 2181 | if (lmp->left == NULL) |
| 2182 | goto done; |
| 2183 | } |
| 2184 | /* Right-hand */ |
| 2185 | if (rmp->is[0] == '\0') |
| 2186 | lmp->right[0] = '\0'; |
| 2187 | lmp->right = icatalloc(lmp->right, rmp->right); |
| 2188 | if (lmp->right == NULL) |
| 2189 | goto done; |
| 2190 | /* Guaranteed to be */ |
| 2191 | if (lmp->is[0] != '\0' && rmp->is[0] != '\0') { |
| 2192 | lmp->is = icatalloc(lmp->is, rmp->is); |
| 2193 | if (lmp->is == NULL) |
| 2194 | goto done; |
| 2195 | } |
| 2196 | } |
| 2197 | break; |
| 2198 | default: |
| 2199 | if (t < _END) { |
| 2200 | /* "cannot happen" */ |
| 2201 | goto done; |
| 2202 | } else if (t == '\0') { |
| 2203 | /* not on *my* shift */ |
| 2204 | goto done; |
| 2205 | } else if (t >= _SET) { |
| 2206 | /* easy enough */ |
| 2207 | resetmust(mp); |
| 2208 | } else { |
| 2209 | /* plain character */ |
| 2210 | resetmust(mp); |
| 2211 | mp->is[0] = mp->left[0] = mp->right[0] = t; |
| 2212 | mp->is[1] = mp->left[1] = mp->right[1] = '\0'; |
| 2213 | mp->in = enlist(mp->in, mp->is, 1); |
| 2214 | if (mp->in == NULL) |
| 2215 | goto done; |
| 2216 | } |
| 2217 | break; |
| 2218 | } |
| 2219 | ++mp; |
| 2220 | } |
| 2221 | done: |
| 2222 | (void) strncpy(reg->must, result, MUST_MAX - 1); |
| 2223 | reg->must[MUST_MAX - 1] = '\0'; |
| 2224 | reg->mustn = strlen(reg->must); |
| 2225 | mp = musts; |
| 2226 | for (i = 0; i <= reg->tindex; ++i) { |
| 2227 | freelist(mp[i].in); |
| 2228 | ifree((char *) mp[i].in); |
| 2229 | ifree(mp[i].left); |
| 2230 | ifree(mp[i].right); |
| 2231 | ifree(mp[i].is); |
| 2232 | } |
| 2233 | free((char *) mp); |
| 2234 | } |