Commit | Line | Data |
---|---|---|
fd7e7ce4 WJ |
1 | /* dfa.c - determinisitic extended regexp routines for GNU |
2 | Copyright (C) 1988 Free Software Foundation, Inc. | |
3 | Written June, 1988 by Mike Haertel | |
4 | Modified July, 1988 by Arthur David Olson | |
5 | to assist BMG speedups | |
6 | ||
7 | NO WARRANTY | |
8 | ||
9 | BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY | |
10 | NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT | |
11 | WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, | |
12 | RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS" | |
13 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, | |
14 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
15 | FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY | |
16 | AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE | |
17 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR | |
18 | CORRECTION. | |
19 | ||
20 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M. | |
21 | STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY | |
22 | WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE | |
23 | LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR | |
24 | OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE | |
25 | USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR | |
26 | DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR | |
27 | A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS | |
28 | PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH | |
29 | DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. | |
30 | ||
31 | GENERAL PUBLIC LICENSE TO COPY | |
32 | ||
33 | 1. You may copy and distribute verbatim copies of this source file | |
34 | as you receive it, in any medium, provided that you conspicuously and | |
35 | appropriately publish on each copy a valid copyright notice "Copyright | |
36 | (C) 1988 Free Software Foundation, Inc."; and include following the | |
37 | copyright notice a verbatim copy of the above disclaimer of warranty | |
38 | and of this License. You may charge a distribution fee for the | |
39 | physical act of transferring a copy. | |
40 | ||
41 | 2. You may modify your copy or copies of this source file or | |
42 | any portion of it, and copy and distribute such modifications under | |
43 | the terms of Paragraph 1 above, provided that you also do the following: | |
44 | ||
45 | a) cause the modified files to carry prominent notices stating | |
46 | that you changed the files and the date of any change; and | |
47 | ||
48 | b) cause the whole of any work that you distribute or publish, | |
49 | that in whole or in part contains or is a derivative of this | |
50 | program or any part thereof, to be licensed at no charge to all | |
51 | third parties on terms identical to those contained in this | |
52 | License Agreement (except that you may choose to grant more extensive | |
53 | warranty protection to some or all third parties, at your option). | |
54 | ||
55 | c) You may charge a distribution fee for the physical act of | |
56 | transferring a copy, and you may at your option offer warranty | |
57 | protection in exchange for a fee. | |
58 | ||
59 | Mere aggregation of another unrelated program with this program (or its | |
60 | derivative) on a volume of a storage or distribution medium does not bring | |
61 | the other program under the scope of these terms. | |
62 | ||
63 | 3. You may copy and distribute this program or any portion of it in | |
64 | compiled, executable or object code form under the terms of Paragraphs | |
65 | 1 and 2 above provided that you do the following: | |
66 | ||
67 | a) accompany it with the complete corresponding machine-readable | |
68 | source code, which must be distributed under the terms of | |
69 | Paragraphs 1 and 2 above; or, | |
70 | ||
71 | b) accompany it with a written offer, valid for at least three | |
72 | years, to give any third party free (except for a nominal | |
73 | shipping charge) a complete machine-readable copy of the | |
74 | corresponding source code, to be distributed under the terms of | |
75 | Paragraphs 1 and 2 above; or, | |
76 | ||
77 | c) accompany it with the information you received as to where the | |
78 | corresponding source code may be obtained. (This alternative is | |
79 | allowed only for noncommercial distribution and only if you | |
80 | received the program in object code or executable form alone.) | |
81 | ||
82 | For an executable file, complete source code means all the source code for | |
83 | all modules it contains; but, as a special exception, it need not include | |
84 | source code for modules which are standard libraries that accompany the | |
85 | operating system on which the executable file runs. | |
86 | ||
87 | 4. You may not copy, sublicense, distribute or transfer this program | |
88 | except as expressly provided under this License Agreement. Any attempt | |
89 | otherwise to copy, sublicense, distribute or transfer this program is void and | |
90 | your rights to use the program under this License agreement shall be | |
91 | automatically terminated. However, parties who have received computer | |
92 | software programs from you with this License Agreement will not have | |
93 | their licenses terminated so long as such parties remain in full compliance. | |
94 | ||
95 | 5. If you wish to incorporate parts of this program into other free | |
96 | programs whose distribution conditions are different, write to the Free | |
97 | Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet | |
98 | worked out a simple rule that can be stated here, but we will often permit | |
99 | this. We will be guided by the two goals of preserving the free status of | |
100 | all derivatives our free software and of promoting the sharing and reuse of | |
101 | software. | |
102 | ||
103 | ||
104 | In other words, you are welcome to use, share and improve this program. | |
105 | You are forbidden to forbid anyone else to use, share and improve | |
106 | what you give them. Help stamp out software-hoarding! */ | |
107 | \f | |
108 | #include <stdio.h> | |
109 | #include <assert.h> | |
110 | #include <ctype.h> | |
111 | #include "dfa.h" | |
112 | ||
113 | #ifdef __STDC__ | |
114 | typedef void *ptr_t; | |
115 | #else | |
116 | typedef char *ptr_t; | |
117 | #endif | |
118 | ||
119 | static void regmust(); | |
120 | ||
121 | static ptr_t | |
122 | xcalloc(n, s) | |
123 | int n; | |
124 | size_t s; | |
125 | { | |
126 | ptr_t r = calloc(n, s); | |
127 | ||
128 | if (r) | |
129 | return r; | |
130 | else | |
131 | regerror("Memory exhausted"); | |
132 | } | |
133 | ||
134 | static ptr_t | |
135 | xmalloc(n) | |
136 | size_t n; | |
137 | { | |
138 | ptr_t r = malloc(n); | |
139 | ||
140 | assert(n != 0); | |
141 | if (r) | |
142 | return r; | |
143 | else | |
144 | regerror("Memory exhausted"); | |
145 | } | |
146 | ||
147 | static ptr_t | |
148 | xrealloc(p, n) | |
149 | ptr_t p; | |
150 | size_t n; | |
151 | { | |
152 | ptr_t r = realloc(p, n); | |
153 | ||
154 | assert(n != 0); | |
155 | if (r) | |
156 | return r; | |
157 | else | |
158 | regerror("Memory exhausted"); | |
159 | } | |
160 | ||
161 | #define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t))) | |
162 | #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) | |
163 | #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) | |
164 | ||
165 | /* Reallocate an array of type t if nalloc is too small for index. */ | |
166 | #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ | |
167 | if ((index) >= (nalloc)) \ | |
168 | { \ | |
169 | while ((index) >= (nalloc)) \ | |
170 | (nalloc) *= 2; \ | |
171 | REALLOC(p, t, nalloc); \ | |
172 | } | |
173 | \f | |
174 | /* Stuff pertaining to charsets. */ | |
175 | ||
176 | static | |
177 | tstbit(b, c) | |
178 | int b; | |
179 | _charset c; | |
180 | { | |
181 | return c[b / INTBITS] & 1 << b % INTBITS; | |
182 | } | |
183 | ||
184 | static void | |
185 | setbit(b, c) | |
186 | int b; | |
187 | _charset c; | |
188 | { | |
189 | c[b / INTBITS] |= 1 << b % INTBITS; | |
190 | } | |
191 | ||
192 | static void | |
193 | clrbit(b, c) | |
194 | int b; | |
195 | _charset c; | |
196 | { | |
197 | c[b / INTBITS] &= ~(1 << b % INTBITS); | |
198 | } | |
199 | ||
200 | static void | |
201 | copyset(src, dst) | |
202 | const _charset src; | |
203 | _charset dst; | |
204 | { | |
205 | int i; | |
206 | ||
207 | for (i = 0; i < _CHARSET_INTS; ++i) | |
208 | dst[i] = src[i]; | |
209 | } | |
210 | ||
211 | static void | |
212 | zeroset(s) | |
213 | _charset s; | |
214 | { | |
215 | int i; | |
216 | ||
217 | for (i = 0; i < _CHARSET_INTS; ++i) | |
218 | s[i] = 0; | |
219 | } | |
220 | ||
221 | static void | |
222 | notset(s) | |
223 | _charset s; | |
224 | { | |
225 | int i; | |
226 | ||
227 | for (i = 0; i < _CHARSET_INTS; ++i) | |
228 | s[i] = ~s[i]; | |
229 | } | |
230 | ||
231 | static | |
232 | equal(s1, s2) | |
233 | const _charset s1; | |
234 | const _charset s2; | |
235 | { | |
236 | int i; | |
237 | ||
238 | for (i = 0; i < _CHARSET_INTS; ++i) | |
239 | if (s1[i] != s2[i]) | |
240 | return 0; | |
241 | return 1; | |
242 | } | |
243 | \f | |
244 | /* A pointer to the current regexp is kept here during parsing. */ | |
245 | static struct regexp *reg; | |
246 | ||
247 | /* Find the index of charset s in reg->charsets, or allocate a new charset. */ | |
248 | static | |
249 | charset_index(s) | |
250 | const _charset s; | |
251 | { | |
252 | int i; | |
253 | ||
254 | for (i = 0; i < reg->cindex; ++i) | |
255 | if (equal(s, reg->charsets[i])) | |
256 | return i; | |
257 | REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex); | |
258 | ++reg->cindex; | |
259 | copyset(s, reg->charsets[i]); | |
260 | return i; | |
261 | } | |
262 | ||
263 | /* Syntax bits controlling the behavior of the lexical analyzer. */ | |
264 | static syntax_bits, syntax_bits_set; | |
265 | ||
266 | /* Flag for case-folding letters into sets. */ | |
267 | static case_fold; | |
268 | ||
269 | /* Entry point to set syntax options. */ | |
270 | void | |
271 | regsyntax(bits, fold) | |
272 | int bits; | |
273 | int fold; | |
274 | { | |
275 | syntax_bits_set = 1; | |
276 | syntax_bits = bits; | |
277 | case_fold = fold; | |
278 | } | |
279 | ||
280 | /* Lexical analyzer. */ | |
281 | static const char *lexstart; /* Pointer to beginning of input string. */ | |
282 | static const char *lexptr; /* Pointer to next input character. */ | |
283 | static lexleft; /* Number of characters remaining. */ | |
284 | static caret_allowed; /* True if backward context allows ^ | |
285 | (meaningful only if RE_CONTEXT_INDEP_OPS | |
286 | is turned off). */ | |
287 | static closure_allowed; /* True if backward context allows closures | |
288 | (meaningful only if RE_CONTEXT_INDEP_OPS | |
289 | is turned off). */ | |
290 | ||
291 | /* Note that characters become unsigned here. */ | |
292 | #define FETCH(c, eoferr) \ | |
293 | { \ | |
294 | if (! lexleft) \ | |
295 | if (eoferr) \ | |
296 | regerror(eoferr); \ | |
297 | else \ | |
298 | return _END; \ | |
299 | (c) = (unsigned char) *lexptr++; \ | |
300 | --lexleft; \ | |
301 | } | |
302 | ||
303 | static _token | |
304 | lex() | |
305 | { | |
306 | _token c, c2; | |
307 | int invert; | |
308 | _charset cset; | |
309 | ||
310 | FETCH(c, (char *) 0); | |
311 | switch (c) | |
312 | { | |
313 | case '^': | |
314 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) | |
315 | && (!caret_allowed || | |
316 | (syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart)) | |
317 | goto normal_char; | |
318 | caret_allowed = 0; | |
319 | return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE; | |
320 | ||
321 | case '$': | |
322 | if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft | |
323 | || (! (syntax_bits & RE_TIGHT_VBAR) | |
324 | && ((syntax_bits & RE_NO_BK_PARENS | |
325 | ? lexleft > 0 && *lexptr == ')' | |
326 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')') | |
327 | || (syntax_bits & RE_NO_BK_VBAR | |
328 | ? lexleft > 0 && *lexptr == '|' | |
329 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|')))) | |
330 | return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE; | |
331 | goto normal_char; | |
332 | ||
333 | case '\\': | |
334 | FETCH(c, "Unfinished \\ quote"); | |
335 | switch (c) | |
336 | { | |
337 | case '1': | |
338 | case '2': | |
339 | case '3': | |
340 | case '4': | |
341 | case '5': | |
342 | case '6': | |
343 | case '7': | |
344 | case '8': | |
345 | case '9': | |
346 | caret_allowed = 0; | |
347 | closure_allowed = 1; | |
348 | return _BACKREF; | |
349 | ||
350 | case '<': | |
351 | caret_allowed = 0; | |
352 | return _BEGWORD; | |
353 | ||
354 | case '>': | |
355 | caret_allowed = 0; | |
356 | return _ENDWORD; | |
357 | ||
358 | case 'b': | |
359 | caret_allowed = 0; | |
360 | return _LIMWORD; | |
361 | ||
362 | case 'B': | |
363 | caret_allowed = 0; | |
364 | return _NOTLIMWORD; | |
365 | ||
366 | case 'w': | |
367 | case 'W': | |
368 | zeroset(cset); | |
369 | for (c2 = 0; c2 < _NOTCHAR; ++c2) | |
370 | if (ISALNUM(c2)) | |
371 | setbit(c2, cset); | |
372 | if (c == 'W') | |
373 | notset(cset); | |
374 | caret_allowed = 0; | |
375 | closure_allowed = 1; | |
376 | return _SET + charset_index(cset); | |
377 | ||
378 | case '?': | |
379 | if (syntax_bits & RE_BK_PLUS_QM) | |
380 | goto qmark; | |
381 | goto normal_char; | |
382 | ||
383 | case '+': | |
384 | if (syntax_bits & RE_BK_PLUS_QM) | |
385 | goto plus; | |
386 | goto normal_char; | |
387 | ||
388 | case '|': | |
389 | if (! (syntax_bits & RE_NO_BK_VBAR)) | |
390 | goto or; | |
391 | goto normal_char; | |
392 | ||
393 | case '(': | |
394 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
395 | goto lparen; | |
396 | goto normal_char; | |
397 | ||
398 | case ')': | |
399 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
400 | goto rparen; | |
401 | goto normal_char; | |
402 | ||
403 | default: | |
404 | goto normal_char; | |
405 | } | |
406 | ||
407 | case '?': | |
408 | if (syntax_bits & RE_BK_PLUS_QM) | |
409 | goto normal_char; | |
410 | qmark: | |
411 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
412 | goto normal_char; | |
413 | return _QMARK; | |
414 | ||
415 | case '*': | |
416 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
417 | goto normal_char; | |
418 | return _STAR; | |
419 | ||
420 | case '+': | |
421 | if (syntax_bits & RE_BK_PLUS_QM) | |
422 | goto normal_char; | |
423 | plus: | |
424 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
425 | goto normal_char; | |
426 | return _PLUS; | |
427 | ||
428 | case '|': | |
429 | if (! (syntax_bits & RE_NO_BK_VBAR)) | |
430 | goto normal_char; | |
431 | or: | |
432 | caret_allowed = 1; | |
433 | closure_allowed = 0; | |
434 | return _OR; | |
435 | ||
436 | case '\n': | |
437 | if (! (syntax_bits & RE_NEWLINE_OR)) | |
438 | goto normal_char; | |
439 | goto or; | |
440 | ||
441 | case '(': | |
442 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
443 | goto normal_char; | |
444 | lparen: | |
445 | caret_allowed = 1; | |
446 | closure_allowed = 0; | |
447 | return _LPAREN; | |
448 | ||
449 | case ')': | |
450 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
451 | goto normal_char; | |
452 | rparen: | |
453 | caret_allowed = 0; | |
454 | closure_allowed = 1; | |
455 | return _RPAREN; | |
456 | ||
457 | case '.': | |
458 | zeroset(cset); | |
459 | notset(cset); | |
460 | clrbit('\n', cset); | |
461 | caret_allowed = 0; | |
462 | closure_allowed = 1; | |
463 | return _SET + charset_index(cset); | |
464 | ||
465 | case '[': | |
466 | zeroset(cset); | |
467 | FETCH(c, "Unbalanced ["); | |
468 | if (c == '^') | |
469 | { | |
470 | FETCH(c, "Unbalanced ["); | |
471 | invert = 1; | |
472 | } | |
473 | else | |
474 | invert = 0; | |
475 | do | |
476 | { | |
477 | FETCH(c2, "Unbalanced ["); | |
478 | if (c2 == '-') | |
479 | { | |
480 | FETCH(c2, "Unbalanced ["); | |
481 | while (c <= c2) | |
482 | setbit(c++, cset); | |
483 | FETCH(c, "Unbalanced ["); | |
484 | } | |
485 | else | |
486 | { | |
487 | setbit(c, cset); | |
488 | c = c2; | |
489 | } | |
490 | } | |
491 | while (c != ']'); | |
492 | if (invert) | |
493 | notset(cset); | |
494 | caret_allowed = 0; | |
495 | closure_allowed = 1; | |
496 | return _SET + charset_index(cset); | |
497 | ||
498 | default: | |
499 | normal_char: | |
500 | caret_allowed = 0; | |
501 | closure_allowed = 1; | |
502 | if (case_fold && ISALPHA(c)) | |
503 | { | |
504 | zeroset(cset); | |
505 | if (isupper(c)) | |
506 | c = tolower(c); | |
507 | setbit(c, cset); | |
508 | setbit(toupper(c), cset); | |
509 | return _SET + charset_index(cset); | |
510 | } | |
511 | return c; | |
512 | } | |
513 | } | |
514 | \f | |
515 | /* Recursive descent parser for regular expressions. */ | |
516 | ||
517 | static _token tok; /* Lookahead token. */ | |
518 | static depth; /* Current depth of a hypothetical stack | |
519 | holding deferred productions. This is | |
520 | used to determine the depth that will be | |
521 | required of the real stack later on in | |
522 | reganalyze(). */ | |
523 | ||
524 | /* Add the given token to the parse tree, maintaining the depth count and | |
525 | updating the maximum depth if necessary. */ | |
526 | static void | |
527 | addtok(t) | |
528 | _token t; | |
529 | { | |
530 | REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex); | |
531 | reg->tokens[reg->tindex++] = t; | |
532 | ||
533 | switch (t) | |
534 | { | |
535 | case _QMARK: | |
536 | case _STAR: | |
537 | case _PLUS: | |
538 | break; | |
539 | ||
540 | case _CAT: | |
541 | case _OR: | |
542 | --depth; | |
543 | break; | |
544 | ||
545 | default: | |
546 | ++reg->nleaves; | |
547 | case _EMPTY: | |
548 | ++depth; | |
549 | break; | |
550 | } | |
551 | if (depth > reg->depth) | |
552 | reg->depth = depth; | |
553 | } | |
554 | ||
555 | /* The grammar understood by the parser is as follows. | |
556 | ||
557 | start: | |
558 | regexp | |
559 | _ALLBEGLINE regexp | |
560 | regexp _ALLENDLINE | |
561 | _ALLBEGLINE regexp _ALLENDLINE | |
562 | ||
563 | regexp: | |
564 | regexp _OR branch | |
565 | branch | |
566 | ||
567 | branch: | |
568 | branch closure | |
569 | closure | |
570 | ||
571 | closure: | |
572 | closure _QMARK | |
573 | closure _STAR | |
574 | closure _PLUS | |
575 | atom | |
576 | ||
577 | atom: | |
578 | <normal character> | |
579 | _SET | |
580 | _BACKREF | |
581 | _BEGLINE | |
582 | _ENDLINE | |
583 | _BEGWORD | |
584 | _ENDWORD | |
585 | _LIMWORD | |
586 | _NOTLIMWORD | |
587 | <empty> | |
588 | ||
589 | The parser builds a parse tree in postfix form in an array of tokens. */ | |
590 | ||
591 | #ifdef __STDC__ | |
592 | static void regexp(void); | |
593 | #else | |
594 | static void regexp(); | |
595 | #endif | |
596 | ||
597 | static void | |
598 | atom() | |
599 | { | |
600 | if (tok >= 0 && tok < _NOTCHAR || tok >= _SET || tok == _BACKREF | |
601 | || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD | |
602 | || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD) | |
603 | { | |
604 | addtok(tok); | |
605 | tok = lex(); | |
606 | } | |
607 | else if (tok == _LPAREN) | |
608 | { | |
609 | tok = lex(); | |
610 | regexp(); | |
611 | if (tok != _RPAREN) | |
612 | regerror("Unbalanced ("); | |
613 | tok = lex(); | |
614 | } | |
615 | else | |
616 | addtok(_EMPTY); | |
617 | } | |
618 | ||
619 | static void | |
620 | closure() | |
621 | { | |
622 | atom(); | |
623 | while (tok == _QMARK || tok == _STAR || tok == _PLUS) | |
624 | { | |
625 | addtok(tok); | |
626 | tok = lex(); | |
627 | } | |
628 | } | |
629 | ||
630 | static void | |
631 | branch() | |
632 | { | |
633 | closure(); | |
634 | while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0) | |
635 | { | |
636 | closure(); | |
637 | addtok(_CAT); | |
638 | } | |
639 | } | |
640 | ||
641 | static void | |
642 | regexp() | |
643 | { | |
644 | branch(); | |
645 | while (tok == _OR) | |
646 | { | |
647 | tok = lex(); | |
648 | branch(); | |
649 | addtok(_OR); | |
650 | } | |
651 | } | |
652 | ||
653 | /* Main entry point for the parser. S is a string to be parsed, len is the | |
654 | length of the string, so s can include NUL characters. R is a pointer to | |
655 | the struct regexp to parse into. */ | |
656 | void | |
657 | regparse(s, len, r) | |
658 | const char *s; | |
659 | size_t len; | |
660 | struct regexp *r; | |
661 | { | |
662 | reg = r; | |
663 | lexstart = lexptr = s; | |
664 | lexleft = len; | |
665 | caret_allowed = 1; | |
666 | closure_allowed = 0; | |
667 | ||
668 | if (! syntax_bits_set) | |
669 | regerror("No syntax specified"); | |
670 | ||
671 | tok = lex(); | |
672 | depth = r->depth; | |
673 | ||
674 | if (tok == _ALLBEGLINE) | |
675 | { | |
676 | addtok(_BEGLINE); | |
677 | tok = lex(); | |
678 | regexp(); | |
679 | addtok(_CAT); | |
680 | } | |
681 | else | |
682 | regexp(); | |
683 | ||
684 | if (tok == _ALLENDLINE) | |
685 | { | |
686 | addtok(_ENDLINE); | |
687 | addtok(_CAT); | |
688 | tok = lex(); | |
689 | } | |
690 | ||
691 | if (tok != _END) | |
692 | regerror("Unbalanced )"); | |
693 | ||
694 | addtok(_END - r->nregexps); | |
695 | addtok(_CAT); | |
696 | ||
697 | if (r->nregexps) | |
698 | addtok(_OR); | |
699 | ||
700 | ++r->nregexps; | |
701 | } | |
702 | \f | |
703 | /* Some primitives for operating on sets of positions. */ | |
704 | ||
705 | /* Copy one set to another; the destination must be large enough. */ | |
706 | static void | |
707 | copy(src, dst) | |
708 | const _position_set *src; | |
709 | _position_set *dst; | |
710 | { | |
711 | int i; | |
712 | ||
713 | for (i = 0; i < src->nelem; ++i) | |
714 | dst->elems[i] = src->elems[i]; | |
715 | dst->nelem = src->nelem; | |
716 | } | |
717 | ||
718 | /* Insert a position in a set. Position sets are maintained in sorted | |
719 | order according to index. If position already exists in the set with | |
720 | the same index then their constraints are logically or'd together. | |
721 | S->elems must point to an array large enough to hold the resulting set. */ | |
722 | static void | |
723 | insert(p, s) | |
724 | _position p; | |
725 | _position_set *s; | |
726 | { | |
727 | int i; | |
728 | _position t1, t2; | |
729 | ||
730 | for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) | |
731 | ; | |
732 | if (i < s->nelem && p.index == s->elems[i].index) | |
733 | s->elems[i].constraint |= p.constraint; | |
734 | else | |
735 | { | |
736 | t1 = p; | |
737 | ++s->nelem; | |
738 | while (i < s->nelem) | |
739 | { | |
740 | t2 = s->elems[i]; | |
741 | s->elems[i++] = t1; | |
742 | t1 = t2; | |
743 | } | |
744 | } | |
745 | } | |
746 | ||
747 | /* Merge two sets of positions into a third. The result is exactly as if | |
748 | the positions of both sets were inserted into an initially empty set. */ | |
749 | static void | |
750 | merge(s1, s2, m) | |
751 | _position_set *s1; | |
752 | _position_set *s2; | |
753 | _position_set *m; | |
754 | { | |
755 | int i = 0, j = 0; | |
756 | ||
757 | m->nelem = 0; | |
758 | while (i < s1->nelem && j < s2->nelem) | |
759 | if (s1->elems[i].index > s2->elems[j].index) | |
760 | m->elems[m->nelem++] = s1->elems[i++]; | |
761 | else if (s1->elems[i].index < s2->elems[j].index) | |
762 | m->elems[m->nelem++] = s2->elems[j++]; | |
763 | else | |
764 | { | |
765 | m->elems[m->nelem] = s1->elems[i++]; | |
766 | m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; | |
767 | } | |
768 | while (i < s1->nelem) | |
769 | m->elems[m->nelem++] = s1->elems[i++]; | |
770 | while (j < s2->nelem) | |
771 | m->elems[m->nelem++] = s2->elems[j++]; | |
772 | } | |
773 | ||
774 | /* Delete a position from a set. */ | |
775 | static void | |
776 | delete(p, s) | |
777 | _position p; | |
778 | _position_set *s; | |
779 | { | |
780 | int i; | |
781 | ||
782 | for (i = 0; i < s->nelem; ++i) | |
783 | if (p.index == s->elems[i].index) | |
784 | break; | |
785 | if (i < s->nelem) | |
786 | for (--s->nelem; i < s->nelem; ++i) | |
787 | s->elems[i] = s->elems[i + 1]; | |
788 | } | |
789 | \f | |
790 | /* Find the index of the state corresponding to the given position set with | |
791 | the given preceding context, or create a new state if there is no such | |
792 | state. Newline and letter tell whether we got here on a newline or | |
793 | letter, respectively. */ | |
794 | static | |
795 | state_index(r, s, newline, letter) | |
796 | struct regexp *r; | |
797 | _position_set *s; | |
798 | int newline; | |
799 | int letter; | |
800 | { | |
801 | int hash = 0; | |
802 | int constraint; | |
803 | int i, j; | |
804 | ||
805 | newline = newline ? 1 : 0; | |
806 | letter = letter ? 1 : 0; | |
807 | ||
808 | for (i = 0; i < s->nelem; ++i) | |
809 | hash ^= s->elems[i].index + s->elems[i].constraint; | |
810 | ||
811 | /* Try to find a state that exactly matches the proposed one. */ | |
812 | for (i = 0; i < r->sindex; ++i) | |
813 | { | |
814 | if (hash != r->states[i].hash || s->nelem != r->states[i].elems.nelem | |
815 | || newline != r->states[i].newline || letter != r->states[i].letter) | |
816 | continue; | |
817 | for (j = 0; j < s->nelem; ++j) | |
818 | if (s->elems[j].constraint | |
819 | != r->states[i].elems.elems[j].constraint | |
820 | || s->elems[j].index != r->states[i].elems.elems[j].index) | |
821 | break; | |
822 | if (j == s->nelem) | |
823 | return i; | |
824 | } | |
825 | ||
826 | /* We'll have to create a new state. */ | |
827 | REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex); | |
828 | r->states[i].hash = hash; | |
829 | MALLOC(r->states[i].elems.elems, _position, s->nelem); | |
830 | copy(s, &r->states[i].elems); | |
831 | r->states[i].newline = newline; | |
832 | r->states[i].letter = letter; | |
833 | r->states[i].backref = 0; | |
834 | r->states[i].constraint = 0; | |
835 | r->states[i].first_end = 0; | |
836 | for (j = 0; j < s->nelem; ++j) | |
837 | if (r->tokens[s->elems[j].index] < 0) | |
838 | { | |
839 | constraint = s->elems[j].constraint; | |
840 | if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) | |
841 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) | |
842 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) | |
843 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) | |
844 | r->states[i].constraint |= constraint; | |
845 | if (! r->states[i].first_end) | |
846 | r->states[i].first_end = r->tokens[s->elems[j].index]; | |
847 | } | |
848 | else if (r->tokens[s->elems[j].index] == _BACKREF) | |
849 | { | |
850 | r->states[i].constraint = _NO_CONSTRAINT; | |
851 | r->states[i].backref = 1; | |
852 | } | |
853 | ||
854 | ++r->sindex; | |
855 | ||
856 | return i; | |
857 | } | |
858 | \f | |
859 | /* Find the epsilon closure of a set of positions. If any position of the set | |
860 | contains a symbol that matches the empty string in some context, replace | |
861 | that position with the elements of its follow labeled with an appropriate | |
862 | constraint. Repeat exhaustively until no funny positions are left. | |
863 | S->elems must be large enough to hold the result. */ | |
864 | epsclosure(s, r) | |
865 | _position_set *s; | |
866 | struct regexp *r; | |
867 | { | |
868 | int i, j; | |
869 | int *visited; | |
870 | _position p, old; | |
871 | ||
872 | MALLOC(visited, int, r->tindex); | |
873 | for (i = 0; i < r->tindex; ++i) | |
874 | visited[i] = 0; | |
875 | ||
876 | for (i = 0; i < s->nelem; ++i) | |
877 | if (r->tokens[s->elems[i].index] >= _NOTCHAR | |
878 | && r->tokens[s->elems[i].index] != _BACKREF | |
879 | && r->tokens[s->elems[i].index] < _SET) | |
880 | { | |
881 | old = s->elems[i]; | |
882 | p.constraint = old.constraint; | |
883 | delete(s->elems[i], s); | |
884 | if (visited[old.index]) | |
885 | { | |
886 | --i; | |
887 | continue; | |
888 | } | |
889 | visited[old.index] = 1; | |
890 | switch (r->tokens[old.index]) | |
891 | { | |
892 | case _BEGLINE: | |
893 | p.constraint &= _BEGLINE_CONSTRAINT; | |
894 | break; | |
895 | case _ENDLINE: | |
896 | p.constraint &= _ENDLINE_CONSTRAINT; | |
897 | break; | |
898 | case _BEGWORD: | |
899 | p.constraint &= _BEGWORD_CONSTRAINT; | |
900 | break; | |
901 | case _ENDWORD: | |
902 | p.constraint &= _ENDWORD_CONSTRAINT; | |
903 | break; | |
904 | case _LIMWORD: | |
905 | p.constraint &= _ENDWORD_CONSTRAINT; | |
906 | break; | |
907 | case _NOTLIMWORD: | |
908 | p.constraint &= _NOTLIMWORD_CONSTRAINT; | |
909 | break; | |
910 | } | |
911 | for (j = 0; j < r->follows[old.index].nelem; ++j) | |
912 | { | |
913 | p.index = r->follows[old.index].elems[j].index; | |
914 | insert(p, s); | |
915 | } | |
916 | /* Force rescan to start at the beginning. */ | |
917 | i = -1; | |
918 | } | |
919 | ||
920 | free(visited); | |
921 | } | |
922 | \f | |
923 | /* Perform bottom-up analysis on the parse tree, computing various functions. | |
924 | Note that at this point, we're pretending constructs like \< are real | |
925 | characters rather than constraints on what can follow them. | |
926 | ||
927 | Nullable: A node is nullable if it is at the root of a regexp that can | |
928 | match the empty string. | |
929 | * _EMPTY leaves are nullable. | |
930 | * No other leaf is nullable. | |
931 | * A _QMARK or _STAR node is nullable. | |
932 | * A _PLUS node is nullable if its argument is nullable. | |
933 | * A _CAT node is nullable if both its arguments are nullable. | |
934 | * An _OR node is nullable if either argument is nullable. | |
935 | ||
936 | Firstpos: The firstpos of a node is the set of positions (nonempty leaves) | |
937 | that could correspond to the first character of a string matching the | |
938 | regexp rooted at the given node. | |
939 | * _EMPTY leaves have empty firstpos. | |
940 | * The firstpos of a nonempty leaf is that leaf itself. | |
941 | * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its | |
942 | argument. | |
943 | * The firstpos of a _CAT node is the firstpos of the left argument, union | |
944 | the firstpos of the right if the left argument is nullable. | |
945 | * The firstpos of an _OR node is the union of firstpos of each argument. | |
946 | ||
947 | Lastpos: The lastpos of a node is the set of positions that could | |
948 | correspond to the last character of a string matching the regexp at | |
949 | the given node. | |
950 | * _EMPTY leaves have empty lastpos. | |
951 | * The lastpos of a nonempty leaf is that leaf itself. | |
952 | * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its | |
953 | argument. | |
954 | * The lastpos of a _CAT node is the lastpos of its right argument, union | |
955 | the lastpos of the left if the right argument is nullable. | |
956 | * The lastpos of an _OR node is the union of the lastpos of each argument. | |
957 | ||
958 | Follow: The follow of a position is the set of positions that could | |
959 | correspond to the character following a character matching the node in | |
960 | a string matching the regexp. At this point we consider special symbols | |
961 | that match the empty string in some context to be just normal characters. | |
962 | Later, if we find that a special symbol is in a follow set, we will | |
963 | replace it with the elements of its follow, labeled with an appropriate | |
964 | constraint. | |
965 | * Every node in the firstpos of the argument of a _STAR or _PLUS node is in | |
966 | the follow of every node in the lastpos. | |
967 | * Every node in the firstpos of the second argument of a _CAT node is in | |
968 | the follow of every node in the lastpos of the first argument. | |
969 | ||
970 | Because of the postfix representation of the parse tree, the depth-first | |
971 | analysis is conveniently done by a linear scan with the aid of a stack. | |
972 | Sets are stored as arrays of the elements, obeying a stack-like allocation | |
973 | scheme; the number of elements in each set deeper in the stack can be | |
974 | used to determine the address of a particular set's array. */ | |
975 | void | |
976 | reganalyze(r, searchflag) | |
977 | struct regexp *r; | |
978 | int searchflag; | |
979 | { | |
980 | int *nullable; /* Nullable stack. */ | |
981 | int *nfirstpos; /* Element count stack for firstpos sets. */ | |
982 | _position *firstpos; /* Array where firstpos elements are stored. */ | |
983 | int *nlastpos; /* Element count stack for lastpos sets. */ | |
984 | _position *lastpos; /* Array where lastpos elements are stored. */ | |
985 | int *nalloc; /* Sizes of arrays allocated to follow sets. */ | |
986 | _position_set tmp; /* Temporary set for merging sets. */ | |
987 | _position_set merged; /* Result of merging sets. */ | |
988 | int wants_newline; /* True if some position wants newline info. */ | |
989 | int *o_nullable; | |
990 | int *o_nfirst, *o_nlast; | |
991 | _position *o_firstpos, *o_lastpos; | |
992 | int i, j; | |
993 | _position *pos; | |
994 | ||
995 | r->searchflag = searchflag; | |
996 | ||
997 | MALLOC(nullable, int, r->depth); | |
998 | o_nullable = nullable; | |
999 | MALLOC(nfirstpos, int, r->depth); | |
1000 | o_nfirst = nfirstpos; | |
1001 | MALLOC(firstpos, _position, r->nleaves); | |
1002 | o_firstpos = firstpos, firstpos += r->nleaves; | |
1003 | MALLOC(nlastpos, int, r->depth); | |
1004 | o_nlast = nlastpos; | |
1005 | MALLOC(lastpos, _position, r->nleaves); | |
1006 | o_lastpos = lastpos, lastpos += r->nleaves; | |
1007 | MALLOC(nalloc, int, r->tindex); | |
1008 | for (i = 0; i < r->tindex; ++i) | |
1009 | nalloc[i] = 0; | |
1010 | MALLOC(merged.elems, _position, r->nleaves); | |
1011 | ||
1012 | CALLOC(r->follows, _position_set, r->tindex); | |
1013 | ||
1014 | for (i = 0; i < r->tindex; ++i) | |
1015 | switch (r->tokens[i]) | |
1016 | { | |
1017 | case _EMPTY: | |
1018 | /* The empty set is nullable. */ | |
1019 | *nullable++ = 1; | |
1020 | ||
1021 | /* The firstpos and lastpos of the empty leaf are both empty. */ | |
1022 | *nfirstpos++ = *nlastpos++ = 0; | |
1023 | break; | |
1024 | ||
1025 | case _STAR: | |
1026 | case _PLUS: | |
1027 | /* Every element in the firstpos of the argument is in the follow | |
1028 | of every element in the lastpos. */ | |
1029 | tmp.nelem = nfirstpos[-1]; | |
1030 | tmp.elems = firstpos; | |
1031 | pos = lastpos; | |
1032 | for (j = 0; j < nlastpos[-1]; ++j) | |
1033 | { | |
1034 | merge(&tmp, &r->follows[pos[j].index], &merged); | |
1035 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, | |
1036 | nalloc[pos[j].index], merged.nelem - 1); | |
1037 | copy(&merged, &r->follows[pos[j].index]); | |
1038 | } | |
1039 | ||
1040 | case _QMARK: | |
1041 | /* A _QMARK or _STAR node is automatically nullable. */ | |
1042 | if (r->tokens[i] != _PLUS) | |
1043 | nullable[-1] = 1; | |
1044 | break; | |
1045 | ||
1046 | case _CAT: | |
1047 | /* Every element in the firstpos of the second argument is in the | |
1048 | follow of every element in the lastpos of the first argument. */ | |
1049 | tmp.nelem = nfirstpos[-1]; | |
1050 | tmp.elems = firstpos; | |
1051 | pos = lastpos + nlastpos[-1]; | |
1052 | for (j = 0; j < nlastpos[-2]; ++j) | |
1053 | { | |
1054 | merge(&tmp, &r->follows[pos[j].index], &merged); | |
1055 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, | |
1056 | nalloc[pos[j].index], merged.nelem - 1); | |
1057 | copy(&merged, &r->follows[pos[j].index]); | |
1058 | } | |
1059 | ||
1060 | /* The firstpos of a _CAT node is the firstpos of the first argument, | |
1061 | union that of the second argument if the first is nullable. */ | |
1062 | if (nullable[-2]) | |
1063 | nfirstpos[-2] += nfirstpos[-1]; | |
1064 | else | |
1065 | firstpos += nfirstpos[-1]; | |
1066 | --nfirstpos; | |
1067 | ||
1068 | /* The lastpos of a _CAT node is the lastpos of the second argument, | |
1069 | union that of the first argument if the second is nullable. */ | |
1070 | if (nullable[-1]) | |
1071 | nlastpos[-2] += nlastpos[-1]; | |
1072 | else | |
1073 | { | |
1074 | pos = lastpos + nlastpos[-2]; | |
1075 | for (j = nlastpos[-1] - 1; j >= 0; --j) | |
1076 | pos[j] = lastpos[j]; | |
1077 | lastpos += nlastpos[-2]; | |
1078 | nlastpos[-2] = nlastpos[-1]; | |
1079 | } | |
1080 | --nlastpos; | |
1081 | ||
1082 | /* A _CAT node is nullable if both arguments are nullable. */ | |
1083 | nullable[-2] = nullable[-1] && nullable[-2]; | |
1084 | --nullable; | |
1085 | break; | |
1086 | ||
1087 | case _OR: | |
1088 | /* The firstpos is the union of the firstpos of each argument. */ | |
1089 | nfirstpos[-2] += nfirstpos[-1]; | |
1090 | --nfirstpos; | |
1091 | ||
1092 | /* The lastpos is the union of the lastpos of each argument. */ | |
1093 | nlastpos[-2] += nlastpos[-1]; | |
1094 | --nlastpos; | |
1095 | ||
1096 | /* An _OR node is nullable if either argument is nullable. */ | |
1097 | nullable[-2] = nullable[-1] || nullable[-2]; | |
1098 | --nullable; | |
1099 | break; | |
1100 | ||
1101 | default: | |
1102 | /* Anything else is a nonempty position. (Note that special | |
1103 | constructs like \< are treated as nonempty strings here; | |
1104 | an "epsilon closure" effectively makes them nullable later. | |
1105 | Backreferences have to get a real position so we can detect | |
1106 | transitions on them later. But they are nullable. */ | |
1107 | *nullable++ = r->tokens[i] == _BACKREF; | |
1108 | ||
1109 | /* This position is in its own firstpos and lastpos. */ | |
1110 | *nfirstpos++ = *nlastpos++ = 1; | |
1111 | --firstpos, --lastpos; | |
1112 | firstpos->index = lastpos->index = i; | |
1113 | firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT; | |
1114 | ||
1115 | /* Allocate the follow set for this position. */ | |
1116 | nalloc[i] = 1; | |
1117 | MALLOC(r->follows[i].elems, _position, nalloc[i]); | |
1118 | break; | |
1119 | } | |
1120 | ||
1121 | /* For each follow set that is the follow set of a real position, replace | |
1122 | it with its epsilon closure. */ | |
1123 | for (i = 0; i < r->tindex; ++i) | |
1124 | if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF | |
1125 | || r->tokens[i] >= _SET) | |
1126 | { | |
1127 | copy(&r->follows[i], &merged); | |
1128 | epsclosure(&merged, r); | |
1129 | if (r->follows[i].nelem < merged.nelem) | |
1130 | REALLOC(r->follows[i].elems, _position, merged.nelem); | |
1131 | copy(&merged, &r->follows[i]); | |
1132 | } | |
1133 | ||
1134 | /* Get the epsilon closure of the firstpos of the regexp. The result will | |
1135 | be the set of positions of state 0. */ | |
1136 | merged.nelem = 0; | |
1137 | for (i = 0; i < nfirstpos[-1]; ++i) | |
1138 | insert(firstpos[i], &merged); | |
1139 | epsclosure(&merged, r); | |
1140 | ||
1141 | /* Check if any of the positions of state 0 will want newline context. */ | |
1142 | wants_newline = 0; | |
1143 | for (i = 0; i < merged.nelem; ++i) | |
1144 | if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) | |
1145 | wants_newline = 1; | |
1146 | ||
1147 | /* Build the initial state. */ | |
1148 | r->salloc = 1; | |
1149 | r->sindex = 0; | |
1150 | MALLOC(r->states, _dfa_state, r->salloc); | |
1151 | state_index(r, &merged, wants_newline, 0); | |
1152 | ||
1153 | free(o_nullable); | |
1154 | free(o_nfirst); | |
1155 | free(o_firstpos); | |
1156 | free(o_nlast); | |
1157 | free(o_lastpos); | |
1158 | free(nalloc); | |
1159 | free(merged.elems); | |
1160 | } | |
1161 | \f | |
1162 | /* Find, for each character, the transition out of state s of r, and store | |
1163 | it in the appropriate slot of trans. | |
1164 | ||
1165 | We divide the positions of s into groups (positions can appear in more | |
1166 | than one group). Each group is labeled with a set of characters that | |
1167 | every position in the group matches (taking into account, if necessary, | |
1168 | preceding context information of s). For each group, find the union | |
1169 | of the its elements' follows. This set is the set of positions of the | |
1170 | new state. For each character in the group's label, set the transition | |
1171 | on this character to be to a state corresponding to the set's positions, | |
1172 | and its associated backward context information, if necessary. | |
1173 | ||
1174 | If we are building a searching matcher, we include the positions of state | |
1175 | 0 in every state. | |
1176 | ||
1177 | The collection of groups is constructed by building an equivalence-class | |
1178 | partition of the positions of s. | |
1179 | ||
1180 | For each position, find the set of characters C that it matches. Eliminate | |
1181 | any characters from C that fail on grounds of backward context. | |
1182 | ||
1183 | Search through the groups, looking for a group whose label L has nonempty | |
1184 | intersection with C. If L - C is nonempty, create a new group labeled | |
1185 | L - C and having the same positions as the current group, and set L to | |
1186 | the intersection of L and C. Insert the position in this group, set | |
1187 | C = C - L, and resume scanning. | |
1188 | ||
1189 | If after comparing with every group there are characters remaining in C, | |
1190 | create a new group labeled with the characters of C and insert this | |
1191 | position in that group. */ | |
1192 | void | |
1193 | regstate(s, r, trans) | |
1194 | int s; | |
1195 | struct regexp *r; | |
1196 | int trans[]; | |
1197 | { | |
1198 | _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */ | |
1199 | _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */ | |
1200 | int ngrps = 0; /* Number of groups actually used. */ | |
1201 | _position pos; /* Current position being considered. */ | |
1202 | _charset matches; /* Set of matching characters. */ | |
1203 | int matchesf; /* True if matches is nonempty. */ | |
1204 | _charset intersect; /* Intersection with some label set. */ | |
1205 | int intersectf; /* True if intersect is nonempty. */ | |
1206 | _charset leftovers; /* Stuff in the label that didn't match. */ | |
1207 | int leftoversf; /* True if leftovers is nonempty. */ | |
1208 | static _charset letters; /* Set of characters considered letters. */ | |
1209 | static _charset newline; /* Set of characters that aren't newline. */ | |
1210 | _position_set follows; /* Union of the follows of some group. */ | |
1211 | _position_set tmp; /* Temporary space for merging sets. */ | |
1212 | int state; /* New state. */ | |
1213 | int wants_newline; /* New state wants to know newline context. */ | |
1214 | int state_newline; /* New state on a newline transition. */ | |
1215 | int wants_letter; /* New state wants to know letter context. */ | |
1216 | int state_letter; /* New state on a letter transition. */ | |
1217 | static initialized; /* Flag for static initialization. */ | |
1218 | int i, j, k; | |
1219 | ||
1220 | /* Initialize the set of letters, if necessary. */ | |
1221 | if (! initialized) | |
1222 | { | |
1223 | initialized = 1; | |
1224 | for (i = 0; i < _NOTCHAR; ++i) | |
1225 | if (ISALNUM(i)) | |
1226 | setbit(i, letters); | |
1227 | setbit('\n', newline); | |
1228 | } | |
1229 | ||
1230 | zeroset(matches); | |
1231 | ||
1232 | for (i = 0; i < r->states[s].elems.nelem; ++i) | |
1233 | { | |
1234 | pos = r->states[s].elems.elems[i]; | |
1235 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR) | |
1236 | setbit(r->tokens[pos.index], matches); | |
1237 | else if (r->tokens[pos.index] >= _SET) | |
1238 | copyset(r->charsets[r->tokens[pos.index] - _SET], matches); | |
1239 | else | |
1240 | continue; | |
1241 | ||
1242 | /* Some characters may need to be climinated from matches because | |
1243 | they fail in the current context. */ | |
1244 | if (pos.constraint != 0xff) | |
1245 | { | |
1246 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1247 | r->states[s].newline, 1)) | |
1248 | clrbit('\n', matches); | |
1249 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1250 | r->states[s].newline, 0)) | |
1251 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1252 | matches[j] &= newline[j]; | |
1253 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, | |
1254 | r->states[s].letter, 1)) | |
1255 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1256 | matches[j] &= ~letters[j]; | |
1257 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, | |
1258 | r->states[s].letter, 0)) | |
1259 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1260 | matches[j] &= letters[j]; | |
1261 | ||
1262 | /* If there are no characters left, there's no point in going on. */ | |
1263 | for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j) | |
1264 | ; | |
1265 | if (j == _CHARSET_INTS) | |
1266 | continue; | |
1267 | } | |
1268 | ||
1269 | for (j = 0; j < ngrps; ++j) | |
1270 | { | |
1271 | /* If matches contains a single character only, and the current | |
1272 | group's label doesn't contain that character, go on to the | |
1273 | next group. */ | |
1274 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR | |
1275 | && !tstbit(r->tokens[pos.index], labels[j])) | |
1276 | continue; | |
1277 | ||
1278 | /* Check if this group's label has a nonempty intersection with | |
1279 | matches. */ | |
1280 | intersectf = 0; | |
1281 | for (k = 0; k < _CHARSET_INTS; ++k) | |
1282 | (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0; | |
1283 | if (! intersectf) | |
1284 | continue; | |
1285 | ||
1286 | /* It does; now find the set differences both ways. */ | |
1287 | leftoversf = matchesf = 0; | |
1288 | for (k = 0; k < _CHARSET_INTS; ++k) | |
1289 | { | |
1290 | /* Even an optimizing compiler can't know this for sure. */ | |
1291 | int match = matches[k], label = labels[j][k]; | |
1292 | ||
1293 | (leftovers[k] = ~match & label) ? leftoversf = 1 : 0; | |
1294 | (matches[k] = match & ~label) ? matchesf = 1 : 0; | |
1295 | } | |
1296 | ||
1297 | /* If there were leftovers, create a new group labeled with them. */ | |
1298 | if (leftoversf) | |
1299 | { | |
1300 | copyset(leftovers, labels[ngrps]); | |
1301 | copyset(intersect, labels[j]); | |
1302 | MALLOC(grps[ngrps].elems, _position, r->nleaves); | |
1303 | copy(&grps[j], &grps[ngrps]); | |
1304 | ++ngrps; | |
1305 | } | |
1306 | ||
1307 | /* Put the position in the current group. Note that there is no | |
1308 | reason to call insert() here. */ | |
1309 | grps[j].elems[grps[j].nelem++] = pos; | |
1310 | ||
1311 | /* If every character matching the current position has been | |
1312 | accounted for, we're done. */ | |
1313 | if (! matchesf) | |
1314 | break; | |
1315 | } | |
1316 | ||
1317 | /* If we've passed the last group, and there are still characters | |
1318 | unaccounted for, then we'll have to create a new group. */ | |
1319 | if (j == ngrps) | |
1320 | { | |
1321 | copyset(matches, labels[ngrps]); | |
1322 | zeroset(matches); | |
1323 | MALLOC(grps[ngrps].elems, _position, r->nleaves); | |
1324 | grps[ngrps].nelem = 1; | |
1325 | grps[ngrps].elems[0] = pos; | |
1326 | ++ngrps; | |
1327 | } | |
1328 | } | |
1329 | ||
1330 | MALLOC(follows.elems, _position, r->nleaves); | |
1331 | MALLOC(tmp.elems, _position, r->nleaves); | |
1332 | ||
1333 | /* If we are a searching matcher, the default transition is to a state | |
1334 | containing the positions of state 0, otherwise the default transition | |
1335 | is to fail miserably. */ | |
1336 | if (r->searchflag) | |
1337 | { | |
1338 | wants_newline = 0; | |
1339 | wants_letter = 0; | |
1340 | for (i = 0; i < r->states[0].elems.nelem; ++i) | |
1341 | { | |
1342 | if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint)) | |
1343 | wants_newline = 1; | |
1344 | if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint)) | |
1345 | wants_letter = 1; | |
1346 | } | |
1347 | copy(&r->states[0].elems, &follows); | |
1348 | state = state_index(r, &follows, 0, 0); | |
1349 | if (wants_newline) | |
1350 | state_newline = state_index(r, &follows, 1, 0); | |
1351 | else | |
1352 | state_newline = state; | |
1353 | if (wants_letter) | |
1354 | state_letter = state_index(r, &follows, 0, 1); | |
1355 | else | |
1356 | state_letter = state; | |
1357 | for (i = 0; i < _NOTCHAR; ++i) | |
1358 | if (i == '\n') | |
1359 | trans[i] = state_newline; | |
1360 | else if (ISALNUM(i)) | |
1361 | trans[i] = state_letter; | |
1362 | else | |
1363 | trans[i] = state; | |
1364 | } | |
1365 | else | |
1366 | for (i = 0; i < _NOTCHAR; ++i) | |
1367 | trans[i] = -1; | |
1368 | ||
1369 | for (i = 0; i < ngrps; ++i) | |
1370 | { | |
1371 | follows.nelem = 0; | |
1372 | ||
1373 | /* Find the union of the follows of the positions of the group. | |
1374 | This is a hideously inefficient loop. Fix it someday. */ | |
1375 | for (j = 0; j < grps[i].nelem; ++j) | |
1376 | for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k) | |
1377 | insert(r->follows[grps[i].elems[j].index].elems[k], &follows); | |
1378 | ||
1379 | /* If we are building a searching matcher, throw in the positions | |
1380 | of state 0 as well. */ | |
1381 | if (r->searchflag) | |
1382 | for (j = 0; j < r->states[0].elems.nelem; ++j) | |
1383 | insert(r->states[0].elems.elems[j], &follows); | |
1384 | ||
1385 | /* Find out if the new state will want any context information. */ | |
1386 | wants_newline = 0; | |
1387 | if (tstbit('\n', labels[i])) | |
1388 | for (j = 0; j < follows.nelem; ++j) | |
1389 | if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) | |
1390 | wants_newline = 1; | |
1391 | ||
1392 | wants_letter = 0; | |
1393 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1394 | if (labels[i][j] & letters[j]) | |
1395 | break; | |
1396 | if (j < _CHARSET_INTS) | |
1397 | for (j = 0; j < follows.nelem; ++j) | |
1398 | if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) | |
1399 | wants_letter = 1; | |
1400 | ||
1401 | /* Find the state(s) corresponding to the union of the follows. */ | |
1402 | state = state_index(r, &follows, 0, 0); | |
1403 | if (wants_newline) | |
1404 | state_newline = state_index(r, &follows, 1, 0); | |
1405 | else | |
1406 | state_newline = state; | |
1407 | if (wants_letter) | |
1408 | state_letter = state_index(r, &follows, 0, 1); | |
1409 | else | |
1410 | state_letter = state; | |
1411 | ||
1412 | /* Set the transitions for each character in the current label. */ | |
1413 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1414 | for (k = 0; k < INTBITS; ++k) | |
1415 | if (labels[i][j] & 1 << k) | |
1416 | { | |
1417 | int c = j * INTBITS + k; | |
1418 | ||
1419 | if (c == '\n') | |
1420 | trans[c] = state_newline; | |
1421 | else if (ISALNUM(c)) | |
1422 | trans[c] = state_letter; | |
1423 | else if (c < _NOTCHAR) | |
1424 | trans[c] = state; | |
1425 | } | |
1426 | } | |
1427 | ||
1428 | for (i = 0; i < ngrps; ++i) | |
1429 | free(grps[i].elems); | |
1430 | free(follows.elems); | |
1431 | free(tmp.elems); | |
1432 | } | |
1433 | \f | |
1434 | /* Some routines for manipulating a compiled regexp's transition tables. | |
1435 | Each state may or may not have a transition table; if it does, and it | |
1436 | is a non-accepting state, then r->trans[state] points to its table. | |
1437 | If it is an accepting state then r->fails[state] points to its table. | |
1438 | If it has no table at all, then r->trans[state] is NULL. | |
1439 | TODO: Improve this comment, get rid of the unnecessary redundancy. */ | |
1440 | ||
1441 | static void | |
1442 | build_state(s, r) | |
1443 | int s; | |
1444 | struct regexp *r; | |
1445 | { | |
1446 | int *trans; /* The new transition table. */ | |
1447 | int i; | |
1448 | ||
1449 | /* Set an upper limit on the number of transition tables that will ever | |
1450 | exist at once. 1024 is arbitrary. The idea is that the frequently | |
1451 | used transition tables will be quickly rebuilt, whereas the ones that | |
1452 | were only needed once or twice will be cleared away. */ | |
1453 | if (r->trcount >= 1024) | |
1454 | { | |
1455 | for (i = 0; i < r->tralloc; ++i) | |
1456 | if (r->trans[i]) | |
1457 | { | |
1458 | free((ptr_t) r->trans[i]); | |
1459 | r->trans[i] = NULL; | |
1460 | } | |
1461 | else if (r->fails[i]) | |
1462 | { | |
1463 | free((ptr_t) r->fails[i]); | |
1464 | r->fails[i] = NULL; | |
1465 | } | |
1466 | r->trcount = 0; | |
1467 | } | |
1468 | ||
1469 | ++r->trcount; | |
1470 | ||
1471 | /* Set up the success bits for this state. */ | |
1472 | r->success[s] = 0; | |
1473 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0, | |
1474 | s, *r)) | |
1475 | r->success[s] |= 4; | |
1476 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1, | |
1477 | s, *r)) | |
1478 | r->success[s] |= 2; | |
1479 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0, | |
1480 | s, *r)) | |
1481 | r->success[s] |= 1; | |
1482 | ||
1483 | MALLOC(trans, int, _NOTCHAR); | |
1484 | regstate(s, r, trans); | |
1485 | ||
1486 | /* Now go through the new transition table, and make sure that the trans | |
1487 | and fail arrays are allocated large enough to hold a pointer for the | |
1488 | largest state mentioned in the table. */ | |
1489 | for (i = 0; i < _NOTCHAR; ++i) | |
1490 | if (trans[i] >= r->tralloc) | |
1491 | { | |
1492 | int oldalloc = r->tralloc; | |
1493 | ||
1494 | while (trans[i] >= r->tralloc) | |
1495 | r->tralloc *= 2; | |
1496 | REALLOC(r->realtrans, int *, r->tralloc + 1); | |
1497 | r->trans = r->realtrans + 1; | |
1498 | REALLOC(r->fails, int *, r->tralloc); | |
1499 | REALLOC(r->success, int, r->tralloc); | |
1500 | REALLOC(r->newlines, int, r->tralloc); | |
1501 | while (oldalloc < r->tralloc) | |
1502 | { | |
1503 | r->trans[oldalloc] = NULL; | |
1504 | r->fails[oldalloc++] = NULL; | |
1505 | } | |
1506 | } | |
1507 | ||
1508 | /* Keep the newline transition in a special place so we can use it as | |
1509 | a sentinel. */ | |
1510 | r->newlines[s] = trans['\n']; | |
1511 | trans['\n'] = -1; | |
1512 | ||
1513 | if (ACCEPTING(s, *r)) | |
1514 | r->fails[s] = trans; | |
1515 | else | |
1516 | r->trans[s] = trans; | |
1517 | } | |
1518 | ||
1519 | static void | |
1520 | build_state_zero(r) | |
1521 | struct regexp *r; | |
1522 | { | |
1523 | r->tralloc = 1; | |
1524 | r->trcount = 0; | |
1525 | CALLOC(r->realtrans, int *, r->tralloc + 1); | |
1526 | r->trans = r->realtrans + 1; | |
1527 | CALLOC(r->fails, int *, r->tralloc); | |
1528 | MALLOC(r->success, int, r->tralloc); | |
1529 | MALLOC(r->newlines, int, r->tralloc); | |
1530 | build_state(0, r); | |
1531 | } | |
1532 | \f | |
1533 | /* Search through a buffer looking for a match to the given struct regexp. | |
1534 | Find the first occurrence of a string matching the regexp in the buffer, | |
1535 | and the shortest possible version thereof. Return a pointer to the first | |
1536 | character after the match, or NULL if none is found. Begin points to | |
1537 | the beginning of the buffer, and end points to the first character after | |
1538 | its end. We store a newline in *end to act as a sentinel, so end had | |
1539 | better point somewhere valid. Newline is a flag indicating whether to | |
1540 | allow newlines to be in the matching string. If count is non- | |
1541 | NULL it points to a place we're supposed to increment every time we | |
1542 | see a newline. Finally, if backref is non-NULL it points to a place | |
1543 | where we're supposed to store a 1 if backreferencing happened and the | |
1544 | match needs to be verified by a backtracking matcher. Otherwise | |
1545 | we store a 0 in *backref. */ | |
1546 | char * | |
1547 | regexecute(r, begin, end, newline, count, backref) | |
1548 | struct regexp *r; | |
1549 | char *begin; | |
1550 | char *end; | |
1551 | int newline; | |
1552 | int *count; | |
1553 | int *backref; | |
1554 | { | |
1555 | register s, s1, tmp; /* Current state. */ | |
1556 | register unsigned char *p; /* Current input character. */ | |
1557 | register **trans, *t; /* Copy of r->trans so it can be optimized | |
1558 | into a register. */ | |
1559 | static sbit[_NOTCHAR]; /* Table for anding with r->success. */ | |
1560 | static sbit_init; | |
1561 | ||
1562 | if (! sbit_init) | |
1563 | { | |
1564 | int i; | |
1565 | ||
1566 | sbit_init = 1; | |
1567 | for (i = 0; i < _NOTCHAR; ++i) | |
1568 | if (i == '\n') | |
1569 | sbit[i] = 4; | |
1570 | else if (ISALNUM(i)) | |
1571 | sbit[i] = 2; | |
1572 | else | |
1573 | sbit[i] = 1; | |
1574 | } | |
1575 | ||
1576 | if (! r->tralloc) | |
1577 | build_state_zero(r); | |
1578 | ||
1579 | s = 0; | |
1580 | p = (unsigned char *) begin; | |
1581 | trans = r->trans; | |
1582 | *end = '\n'; | |
1583 | ||
1584 | for (;;) | |
1585 | { | |
1586 | /* The dreaded inner loop. */ | |
1587 | if (t = trans[s]) | |
1588 | do | |
1589 | { | |
1590 | s1 = t[*p++]; | |
1591 | if (! (t = trans[s1])) | |
1592 | goto last_was_s; | |
1593 | s = t[*p++]; | |
1594 | } | |
1595 | while (t = trans[s]); | |
1596 | goto last_was_s1; | |
1597 | last_was_s: | |
1598 | tmp = s, s = s1, s1 = tmp; | |
1599 | last_was_s1: | |
1600 | ||
1601 | if (s >= 0 && p <= (unsigned char *) end && r->fails[s]) | |
1602 | { | |
1603 | if (r->success[s] & sbit[*p]) | |
1604 | { | |
1605 | if (backref) | |
1606 | if (r->states[s].backref) | |
1607 | *backref = 1; | |
1608 | else | |
1609 | *backref = 0; | |
1610 | return (char *) p; | |
1611 | } | |
1612 | ||
1613 | s1 = s; | |
1614 | s = r->fails[s][*p++]; | |
1615 | continue; | |
1616 | } | |
1617 | ||
1618 | /* If the previous character was a newline, count it. */ | |
1619 | if (count && (char *) p <= end && p[-1] == '\n') | |
1620 | ++*count; | |
1621 | ||
1622 | /* Check if we've run off the end of the buffer. */ | |
1623 | if ((char *) p >= end) | |
1624 | return NULL; | |
1625 | ||
1626 | if (s >= 0) | |
1627 | { | |
1628 | build_state(s, r); | |
1629 | trans = r->trans; | |
1630 | continue; | |
1631 | } | |
1632 | ||
1633 | if (p[-1] == '\n' && newline) | |
1634 | { | |
1635 | s = r->newlines[s1]; | |
1636 | continue; | |
1637 | } | |
1638 | ||
1639 | s = 0; | |
1640 | } | |
1641 | } | |
1642 | \f | |
1643 | /* Initialize the components of a regexp that the other routines don't | |
1644 | initialize for themselves. */ | |
1645 | void | |
1646 | reginit(r) | |
1647 | struct regexp *r; | |
1648 | { | |
1649 | r->calloc = 1; | |
1650 | MALLOC(r->charsets, _charset, r->calloc); | |
1651 | r->cindex = 0; | |
1652 | ||
1653 | r->talloc = 1; | |
1654 | MALLOC(r->tokens, _token, r->talloc); | |
1655 | r->tindex = r->depth = r->nleaves = r->nregexps = 0; | |
1656 | ||
1657 | r->searchflag = 0; | |
1658 | r->tralloc = 0; | |
1659 | } | |
1660 | ||
1661 | /* Parse and analyze a single string of the given length. */ | |
1662 | void | |
1663 | regcompile(s, len, r, searchflag) | |
1664 | const char *s; | |
1665 | size_t len; | |
1666 | struct regexp *r; | |
1667 | int searchflag; | |
1668 | { | |
1669 | if (case_fold) /* dummy folding in service of regmust() */ | |
1670 | { | |
1671 | char *copy; | |
1672 | int i; | |
1673 | ||
1674 | copy = malloc(len); | |
1675 | if (!copy) | |
1676 | regerror("out of memory"); | |
1677 | ||
1678 | /* This is a complete kludge and could potentially break | |
1679 | \<letter> escapes . . . */ | |
1680 | case_fold = 0; | |
1681 | for (i = 0; i < len; ++i) | |
1682 | if (ISUPPER(s[i])) | |
1683 | copy[i] = tolower(s[i]); | |
1684 | else | |
1685 | copy[i] = s[i]; | |
1686 | ||
1687 | reginit(r); | |
1688 | r->mustn = 0; | |
1689 | r->must[0] = '\0'; | |
1690 | regparse(copy, len, r); | |
1691 | free(copy); | |
1692 | regmust(r); | |
1693 | reganalyze(r, searchflag); | |
1694 | case_fold = 1; | |
1695 | reginit(r); | |
1696 | regparse(s, len, r); | |
1697 | reganalyze(r, searchflag); | |
1698 | } | |
1699 | else | |
1700 | { | |
1701 | reginit(r); | |
1702 | regparse(s, len, r); | |
1703 | regmust(r); | |
1704 | reganalyze(r, searchflag); | |
1705 | } | |
1706 | } | |
1707 | ||
1708 | /* Free the storage held by the components of a regexp. */ | |
1709 | void | |
1710 | regfree(r) | |
1711 | struct regexp *r; | |
1712 | { | |
1713 | int i; | |
1714 | ||
1715 | free((ptr_t) r->charsets); | |
1716 | free((ptr_t) r->tokens); | |
1717 | for (i = 0; i < r->sindex; ++i) | |
1718 | free((ptr_t) r->states[i].elems.elems); | |
1719 | free((ptr_t) r->states); | |
1720 | for (i = 0; i < r->tindex; ++i) | |
1721 | if (r->follows[i].elems) | |
1722 | free((ptr_t) r->follows[i].elems); | |
1723 | free((ptr_t) r->follows); | |
1724 | for (i = 0; i < r->tralloc; ++i) | |
1725 | if (r->trans[i]) | |
1726 | free((ptr_t) r->trans[i]); | |
1727 | else if (r->fails[i]) | |
1728 | free((ptr_t) r->fails[i]); | |
1729 | free((ptr_t) r->realtrans); | |
1730 | free((ptr_t) r->fails); | |
1731 | free((ptr_t) r->newlines); | |
1732 | } | |
1733 | ||
1734 | /* | |
1735 | Having found the postfix representation of the regular expression, | |
1736 | try to find a long sequence of characters that must appear in any line | |
1737 | containing the r.e. | |
1738 | Finding a "longest" sequence is beyond the scope here; | |
1739 | we take an easy way out and hope for the best. | |
1740 | (Take "(ab|a)b"--please.) | |
1741 | ||
1742 | We do a bottom-up calculation of sequences of characters that must appear | |
1743 | in matches of r.e.'s represented by trees rooted at the nodes of the postfix | |
1744 | representation: | |
1745 | sequences that must appear at the left of the match ("left") | |
1746 | sequences that must appear at the right of the match ("right") | |
1747 | lists of sequences that must appear somewhere in the match ("in") | |
1748 | sequences that must constitute the match ("is") | |
1749 | When we get to the root of the tree, we use one of the longest of its | |
1750 | calculated "in" sequences as our answer. The sequence we find is returned in | |
1751 | r->must (where "r" is the single argument passed to "regmust"); | |
1752 | the length of the sequence is returned in r->mustn. | |
1753 | ||
1754 | The sequences calculated for the various types of node (in pseudo ANSI c) | |
1755 | are shown below. "p" is the operand of unary operators (and the left-hand | |
1756 | operand of binary operators); "q" is the right-hand operand of binary operators | |
1757 | . | |
1758 | "ZERO" means "a zero-length sequence" below. | |
1759 | ||
1760 | Type left right is in | |
1761 | ---- ---- ----- -- -- | |
1762 | char c # c # c # c # c | |
1763 | ||
1764 | SET ZERO ZERO ZERO ZERO | |
1765 | ||
1766 | STAR ZERO ZERO ZERO ZERO | |
1767 | ||
1768 | QMARK ZERO ZERO ZERO ZERO | |
1769 | ||
1770 | PLUS p->left p->right ZERO p->in | |
1771 | ||
1772 | CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus | |
1773 | p->left : q->right : q->is!=ZERO) ? q->in plus | |
1774 | p->is##q->left p->right##q->is p->is##q->is : p->right##q->left | |
1775 | ZERO | |
1776 | ||
1777 | OR longest common longest common (do p->is and substrings common to | |
1778 | leading trailing q->is have same p->in and q->in | |
1779 | (sub)sequence (sub)sequence length and | |
1780 | of p->left of p->right content) ? | |
1781 | and q->left and q->right p->is : NULL | |
1782 | ||
1783 | If there's anything else we recognize in the tree, all four sequences get set | |
1784 | to zero-length sequences. If there's something we don't recognize in the tree, | |
1785 | we just return a zero-length sequence. | |
1786 | ||
1787 | Break ties in favor of infrequent letters (choosing 'zzz' in preference to | |
1788 | 'aaa')? | |
1789 | ||
1790 | And. . .is it here or someplace that we might ponder "optimizations" such as | |
1791 | egrep 'psi|epsilon' -> egrep 'psi' | |
1792 | egrep 'pepsi|epsilon' -> egrep 'epsi' | |
1793 | (Yes, we now find "epsi" as a "string | |
1794 | that must occur", but we might also | |
1795 | simplify the *entire* r.e. being sought | |
1796 | ) | |
1797 | grep '[c]' -> grep 'c' | |
1798 | grep '(ab|a)b' -> grep 'ab' | |
1799 | grep 'ab*' -> grep 'a' | |
1800 | grep 'a*b' -> grep 'b' | |
1801 | There are several issues: | |
1802 | Is optimization easy (enough)? | |
1803 | ||
1804 | Does optimization actually accomplish anything, | |
1805 | or is the automaton you get from "psi|epsilon" (for example) | |
1806 | the same as the one you get from "psi" (for example)? | |
1807 | ||
1808 | Are optimizable r.e.'s likely to be used in real-life situations | |
1809 | (something like 'ab*' is probably unlikely; something like is | |
1810 | 'psi|epsilon' is likelier)? | |
1811 | */ | |
1812 | ||
1813 | static char * | |
1814 | icatalloc(old, new) | |
1815 | char * old; | |
1816 | char * new; | |
1817 | { | |
1818 | register char * result; | |
1819 | register int oldsize, newsize; | |
1820 | ||
1821 | newsize = (new == NULL) ? 0 : strlen(new); | |
1822 | if (old == NULL) | |
1823 | oldsize = 0; | |
1824 | else if (newsize == 0) | |
1825 | return old; | |
1826 | else oldsize = strlen(old); | |
1827 | if (old == NULL) | |
1828 | result = (char *) malloc(newsize + 1); | |
1829 | else result = (char *) realloc((void *) old, oldsize + newsize + 1); | |
1830 | if (result != NULL && new != NULL) | |
1831 | (void) strcpy(result + oldsize, new); | |
1832 | return result; | |
1833 | } | |
1834 | ||
1835 | static char * | |
1836 | icpyalloc(string) | |
1837 | const char * string; | |
1838 | { | |
1839 | return icatalloc((char *) NULL, string); | |
1840 | } | |
1841 | ||
1842 | static char * | |
1843 | istrstr(lookin, lookfor) | |
1844 | char * lookin; | |
1845 | register char * lookfor; | |
1846 | { | |
1847 | register char * cp; | |
1848 | register int len; | |
1849 | ||
1850 | len = strlen(lookfor); | |
1851 | for (cp = lookin; *cp != '\0'; ++cp) | |
1852 | if (strncmp(cp, lookfor, len) == 0) | |
1853 | return cp; | |
1854 | return NULL; | |
1855 | } | |
1856 | ||
1857 | static void | |
1858 | ifree(cp) | |
1859 | char * cp; | |
1860 | { | |
1861 | if (cp != NULL) | |
1862 | free(cp); | |
1863 | } | |
1864 | ||
1865 | static void | |
1866 | freelist(cpp) | |
1867 | register char ** cpp; | |
1868 | { | |
1869 | register int i; | |
1870 | ||
1871 | if (cpp == NULL) | |
1872 | return; | |
1873 | for (i = 0; cpp[i] != NULL; ++i) { | |
1874 | free(cpp[i]); | |
1875 | cpp[i] = NULL; | |
1876 | } | |
1877 | } | |
1878 | ||
1879 | static char ** | |
1880 | enlist(cpp, new, len) | |
1881 | register char ** cpp; | |
1882 | register char * new; | |
1883 | { | |
1884 | register int i, j; | |
1885 | ||
1886 | if (cpp == NULL) | |
1887 | return NULL; | |
1888 | if ((new = icpyalloc(new)) == NULL) { | |
1889 | freelist(cpp); | |
1890 | return NULL; | |
1891 | } | |
1892 | new[len] = '\0'; | |
1893 | /* | |
1894 | ** Is there already something in the list that's new (or longer)? | |
1895 | */ | |
1896 | for (i = 0; cpp[i] != NULL; ++i) | |
1897 | if (istrstr(cpp[i], new) != NULL) { | |
1898 | free(new); | |
1899 | return cpp; | |
1900 | } | |
1901 | /* | |
1902 | ** Eliminate any obsoleted strings. | |
1903 | */ | |
1904 | j = 0; | |
1905 | while (cpp[j] != NULL) | |
1906 | if (istrstr(new, cpp[j]) == NULL) | |
1907 | ++j; | |
1908 | else { | |
1909 | free(cpp[j]); | |
1910 | if (--i == j) | |
1911 | break; | |
1912 | cpp[j] = cpp[i]; | |
1913 | } | |
1914 | /* | |
1915 | ** Add the new string. | |
1916 | */ | |
1917 | cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); | |
1918 | if (cpp == NULL) | |
1919 | return NULL; | |
1920 | cpp[i] = new; | |
1921 | cpp[i + 1] = NULL; | |
1922 | return cpp; | |
1923 | } | |
1924 | ||
1925 | /* | |
1926 | ** Given pointers to two strings, | |
1927 | ** return a pointer to an allocated list of their distinct common substrings. | |
1928 | ** Return NULL if something seems wild. | |
1929 | */ | |
1930 | ||
1931 | static char ** | |
1932 | comsubs(left, right) | |
1933 | char * left; | |
1934 | char * right; | |
1935 | { | |
1936 | register char ** cpp; | |
1937 | register char * lcp; | |
1938 | register char * rcp; | |
1939 | register int i, len; | |
1940 | ||
1941 | if (left == NULL || right == NULL) | |
1942 | return NULL; | |
1943 | cpp = (char **) malloc(sizeof *cpp); | |
1944 | if (cpp == NULL) | |
1945 | return NULL; | |
1946 | cpp[0] = NULL; | |
1947 | for (lcp = left; *lcp != '\0'; ++lcp) { | |
1948 | len = 0; | |
1949 | rcp = strchr(right, *lcp); | |
1950 | while (rcp != NULL) { | |
1951 | for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) | |
1952 | ; | |
1953 | if (i > len) | |
1954 | len = i; | |
1955 | rcp = strchr(rcp + 1, *lcp); | |
1956 | } | |
1957 | if (len == 0) | |
1958 | continue; | |
1959 | if ((cpp = enlist(cpp, lcp, len)) == NULL) | |
1960 | break; | |
1961 | } | |
1962 | return cpp; | |
1963 | } | |
1964 | ||
1965 | static char ** | |
1966 | addlists(old, new) | |
1967 | char ** old; | |
1968 | char ** new; | |
1969 | { | |
1970 | register int i; | |
1971 | ||
1972 | if (old == NULL || new == NULL) | |
1973 | return NULL; | |
1974 | for (i = 0; new[i] != NULL; ++i) { | |
1975 | old = enlist(old, new[i], strlen(new[i])); | |
1976 | if (old == NULL) | |
1977 | break; | |
1978 | } | |
1979 | return old; | |
1980 | } | |
1981 | ||
1982 | /* | |
1983 | ** Given two lists of substrings, | |
1984 | ** return a new list giving substrings common to both. | |
1985 | */ | |
1986 | ||
1987 | static char ** | |
1988 | inboth(left, right) | |
1989 | char ** left; | |
1990 | char ** right; | |
1991 | { | |
1992 | register char ** both; | |
1993 | register char ** temp; | |
1994 | register int lnum, rnum; | |
1995 | ||
1996 | if (left == NULL || right == NULL) | |
1997 | return NULL; | |
1998 | both = (char **) malloc(sizeof *both); | |
1999 | if (both == NULL) | |
2000 | return NULL; | |
2001 | both[0] = NULL; | |
2002 | for (lnum = 0; left[lnum] != NULL; ++lnum) { | |
2003 | for (rnum = 0; right[rnum] != NULL; ++rnum) { | |
2004 | temp = comsubs(left[lnum], right[rnum]); | |
2005 | if (temp == NULL) { | |
2006 | freelist(both); | |
2007 | return NULL; | |
2008 | } | |
2009 | both = addlists(both, temp); | |
2010 | freelist(temp); | |
2011 | if (both == NULL) | |
2012 | return NULL; | |
2013 | } | |
2014 | } | |
2015 | return both; | |
2016 | } | |
2017 | ||
2018 | typedef struct { | |
2019 | char ** in; | |
2020 | char * left; | |
2021 | char * right; | |
2022 | char * is; | |
2023 | } must; | |
2024 | ||
2025 | static void | |
2026 | resetmust(mp) | |
2027 | register must * mp; | |
2028 | { | |
2029 | mp->left[0] = mp->right[0] = mp->is[0] = '\0'; | |
2030 | freelist(mp->in); | |
2031 | } | |
2032 | ||
2033 | static void | |
2034 | regmust(r) | |
2035 | register struct regexp * r; | |
2036 | { | |
2037 | register must * musts; | |
2038 | register must * mp; | |
2039 | register char * result; | |
2040 | register int ri; | |
2041 | register int i; | |
2042 | register _token t; | |
2043 | static must must0; | |
2044 | ||
2045 | reg->mustn = 0; | |
2046 | reg->must[0] = '\0'; | |
2047 | musts = (must *) malloc((reg->tindex + 1) * sizeof *musts); | |
2048 | if (musts == NULL) | |
2049 | return; | |
2050 | mp = musts; | |
2051 | for (i = 0; i <= reg->tindex; ++i) | |
2052 | mp[i] = must0; | |
2053 | for (i = 0; i <= reg->tindex; ++i) { | |
2054 | mp[i].in = (char **) malloc(sizeof *mp[i].in); | |
2055 | mp[i].left = malloc(2); | |
2056 | mp[i].right = malloc(2); | |
2057 | mp[i].is = malloc(2); | |
2058 | if (mp[i].in == NULL || mp[i].left == NULL || | |
2059 | mp[i].right == NULL || mp[i].is == NULL) | |
2060 | goto done; | |
2061 | mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; | |
2062 | mp[i].in[0] = NULL; | |
2063 | } | |
2064 | result = ""; | |
2065 | for (ri = 0; ri < reg->tindex; ++ri) { | |
2066 | switch (t = reg->tokens[ri]) { | |
2067 | case _ALLBEGLINE: | |
2068 | case _ALLENDLINE: | |
2069 | case _LPAREN: | |
2070 | case _RPAREN: | |
2071 | goto done; /* "cannot happen" */ | |
2072 | case _EMPTY: | |
2073 | case _BEGLINE: | |
2074 | case _ENDLINE: | |
2075 | case _BEGWORD: | |
2076 | case _ENDWORD: | |
2077 | case _LIMWORD: | |
2078 | case _NOTLIMWORD: | |
2079 | case _BACKREF: | |
2080 | resetmust(mp); | |
2081 | break; | |
2082 | case _STAR: | |
2083 | case _QMARK: | |
2084 | if (mp <= musts) | |
2085 | goto done; /* "cannot happen" */ | |
2086 | --mp; | |
2087 | resetmust(mp); | |
2088 | break; | |
2089 | case _OR: | |
2090 | if (mp < &musts[2]) | |
2091 | goto done; /* "cannot happen" */ | |
2092 | { | |
2093 | register char ** new; | |
2094 | register must * lmp; | |
2095 | register must * rmp; | |
2096 | register int j, ln, rn, n; | |
2097 | ||
2098 | rmp = --mp; | |
2099 | lmp = --mp; | |
2100 | /* Guaranteed to be. Unlikely, but. . . */ | |
2101 | if (strcmp(lmp->is, rmp->is) != 0) | |
2102 | lmp->is[0] = '\0'; | |
2103 | /* Left side--easy */ | |
2104 | i = 0; | |
2105 | while (lmp->left[i] != '\0' && | |
2106 | lmp->left[i] == rmp->left[i]) | |
2107 | ++i; | |
2108 | lmp->left[i] = '\0'; | |
2109 | /* Right side */ | |
2110 | ln = strlen(lmp->right); | |
2111 | rn = strlen(rmp->right); | |
2112 | n = ln; | |
2113 | if (n > rn) | |
2114 | n = rn; | |
2115 | for (i = 0; i < n; ++i) | |
2116 | if (lmp->right[ln - i - 1] != | |
2117 | rmp->right[rn - i - 1]) | |
2118 | break; | |
2119 | for (j = 0; j < i; ++j) | |
2120 | lmp->right[j] = | |
2121 | lmp->right[(ln - i) + j]; | |
2122 | lmp->right[j] = '\0'; | |
2123 | new = inboth(lmp->in, rmp->in); | |
2124 | if (new == NULL) | |
2125 | goto done; | |
2126 | freelist(lmp->in); | |
2127 | free((char *) lmp->in); | |
2128 | lmp->in = new; | |
2129 | } | |
2130 | break; | |
2131 | case _PLUS: | |
2132 | if (mp <= musts) | |
2133 | goto done; /* "cannot happen" */ | |
2134 | --mp; | |
2135 | mp->is[0] = '\0'; | |
2136 | break; | |
2137 | case _END: | |
2138 | if (mp != &musts[1]) | |
2139 | goto done; /* "cannot happen" */ | |
2140 | for (i = 0; musts[0].in[i] != NULL; ++i) | |
2141 | if (strlen(musts[0].in[i]) > strlen(result)) | |
2142 | result = musts[0].in[i]; | |
2143 | goto done; | |
2144 | case _CAT: | |
2145 | if (mp < &musts[2]) | |
2146 | goto done; /* "cannot happen" */ | |
2147 | { | |
2148 | register must * lmp; | |
2149 | register must * rmp; | |
2150 | ||
2151 | rmp = --mp; | |
2152 | lmp = --mp; | |
2153 | /* | |
2154 | ** In. Everything in left, plus everything in | |
2155 | ** right, plus catenation of | |
2156 | ** left's right and right's left. | |
2157 | */ | |
2158 | lmp->in = addlists(lmp->in, rmp->in); | |
2159 | if (lmp->in == NULL) | |
2160 | goto done; | |
2161 | if (lmp->right[0] != '\0' && | |
2162 | rmp->left[0] != '\0') { | |
2163 | register char * tp; | |
2164 | ||
2165 | tp = icpyalloc(lmp->right); | |
2166 | if (tp == NULL) | |
2167 | goto done; | |
2168 | tp = icatalloc(tp, rmp->left); | |
2169 | if (tp == NULL) | |
2170 | goto done; | |
2171 | lmp->in = enlist(lmp->in, tp, | |
2172 | strlen(tp)); | |
2173 | free(tp); | |
2174 | if (lmp->in == NULL) | |
2175 | goto done; | |
2176 | } | |
2177 | /* Left-hand */ | |
2178 | if (lmp->is[0] != '\0') { | |
2179 | lmp->left = icatalloc(lmp->left, | |
2180 | rmp->left); | |
2181 | if (lmp->left == NULL) | |
2182 | goto done; | |
2183 | } | |
2184 | /* Right-hand */ | |
2185 | if (rmp->is[0] == '\0') | |
2186 | lmp->right[0] = '\0'; | |
2187 | lmp->right = icatalloc(lmp->right, rmp->right); | |
2188 | if (lmp->right == NULL) | |
2189 | goto done; | |
2190 | /* Guaranteed to be */ | |
2191 | if (lmp->is[0] != '\0' && rmp->is[0] != '\0') { | |
2192 | lmp->is = icatalloc(lmp->is, rmp->is); | |
2193 | if (lmp->is == NULL) | |
2194 | goto done; | |
2195 | } | |
2196 | } | |
2197 | break; | |
2198 | default: | |
2199 | if (t < _END) { | |
2200 | /* "cannot happen" */ | |
2201 | goto done; | |
2202 | } else if (t == '\0') { | |
2203 | /* not on *my* shift */ | |
2204 | goto done; | |
2205 | } else if (t >= _SET) { | |
2206 | /* easy enough */ | |
2207 | resetmust(mp); | |
2208 | } else { | |
2209 | /* plain character */ | |
2210 | resetmust(mp); | |
2211 | mp->is[0] = mp->left[0] = mp->right[0] = t; | |
2212 | mp->is[1] = mp->left[1] = mp->right[1] = '\0'; | |
2213 | mp->in = enlist(mp->in, mp->is, 1); | |
2214 | if (mp->in == NULL) | |
2215 | goto done; | |
2216 | } | |
2217 | break; | |
2218 | } | |
2219 | ++mp; | |
2220 | } | |
2221 | done: | |
2222 | (void) strncpy(reg->must, result, MUST_MAX - 1); | |
2223 | reg->must[MUST_MAX - 1] = '\0'; | |
2224 | reg->mustn = strlen(reg->must); | |
2225 | mp = musts; | |
2226 | for (i = 0; i <= reg->tindex; ++i) { | |
2227 | freelist(mp[i].in); | |
2228 | ifree((char *) mp[i].in); | |
2229 | ifree(mp[i].left); | |
2230 | ifree(mp[i].right); | |
2231 | ifree(mp[i].is); | |
2232 | } | |
2233 | free((char *) mp); | |
2234 | } |