Commit | Line | Data |
---|---|---|
9049701c C |
1 | /* dfa.c - determinisitic extended regexp routines for GNU |
2 | Copyright (C) 1988 Free Software Foundation, Inc. | |
3 | Written June, 1988 by Mike Haertel | |
4 | Modified July, 1988 by Arthur David Olson | |
5 | to assist BMG speedups | |
6 | ||
7 | NO WARRANTY | |
8 | ||
9 | BECAUSE THIS PROGRAM IS LICENSED FREE OF CHARGE, WE PROVIDE ABSOLUTELY | |
10 | NO WARRANTY, TO THE EXTENT PERMITTED BY APPLICABLE STATE LAW. EXCEPT | |
11 | WHEN OTHERWISE STATED IN WRITING, FREE SOFTWARE FOUNDATION, INC, | |
12 | RICHARD M. STALLMAN AND/OR OTHER PARTIES PROVIDE THIS PROGRAM "AS IS" | |
13 | WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, | |
14 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | |
15 | FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY | |
16 | AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE | |
17 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR | |
18 | CORRECTION. | |
19 | ||
20 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW WILL RICHARD M. | |
21 | STALLMAN, THE FREE SOFTWARE FOUNDATION, INC., AND/OR ANY OTHER PARTY | |
22 | WHO MAY MODIFY AND REDISTRIBUTE THIS PROGRAM AS PERMITTED BELOW, BE | |
23 | LIABLE TO YOU FOR DAMAGES, INCLUDING ANY LOST PROFITS, LOST MONIES, OR | |
24 | OTHER SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE | |
25 | USE OR INABILITY TO USE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR | |
26 | DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY THIRD PARTIES OR | |
27 | A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS) THIS | |
28 | PROGRAM, EVEN IF YOU HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH | |
29 | DAMAGES, OR FOR ANY CLAIM BY ANY OTHER PARTY. | |
30 | ||
31 | GENERAL PUBLIC LICENSE TO COPY | |
32 | ||
33 | 1. You may copy and distribute verbatim copies of this source file | |
34 | as you receive it, in any medium, provided that you conspicuously and | |
35 | appropriately publish on each copy a valid copyright notice "Copyright | |
36 | (C) 1988 Free Software Foundation, Inc."; and include following the | |
37 | copyright notice a verbatim copy of the above disclaimer of warranty | |
38 | and of this License. You may charge a distribution fee for the | |
39 | physical act of transferring a copy. | |
40 | ||
41 | 2. You may modify your copy or copies of this source file or | |
42 | any portion of it, and copy and distribute such modifications under | |
43 | the terms of Paragraph 1 above, provided that you also do the following: | |
44 | ||
45 | a) cause the modified files to carry prominent notices stating | |
46 | that you changed the files and the date of any change; and | |
47 | ||
48 | b) cause the whole of any work that you distribute or publish, | |
49 | that in whole or in part contains or is a derivative of this | |
50 | program or any part thereof, to be licensed at no charge to all | |
51 | third parties on terms identical to those contained in this | |
52 | License Agreement (except that you may choose to grant more extensive | |
53 | warranty protection to some or all third parties, at your option). | |
54 | ||
55 | c) You may charge a distribution fee for the physical act of | |
56 | transferring a copy, and you may at your option offer warranty | |
57 | protection in exchange for a fee. | |
58 | ||
59 | Mere aggregation of another unrelated program with this program (or its | |
60 | derivative) on a volume of a storage or distribution medium does not bring | |
61 | the other program under the scope of these terms. | |
62 | ||
63 | 3. You may copy and distribute this program or any portion of it in | |
64 | compiled, executable or object code form under the terms of Paragraphs | |
65 | 1 and 2 above provided that you do the following: | |
66 | ||
67 | a) accompany it with the complete corresponding machine-readable | |
68 | source code, which must be distributed under the terms of | |
69 | Paragraphs 1 and 2 above; or, | |
70 | ||
71 | b) accompany it with a written offer, valid for at least three | |
72 | years, to give any third party free (except for a nominal | |
73 | shipping charge) a complete machine-readable copy of the | |
74 | corresponding source code, to be distributed under the terms of | |
75 | Paragraphs 1 and 2 above; or, | |
76 | ||
77 | c) accompany it with the information you received as to where the | |
78 | corresponding source code may be obtained. (This alternative is | |
79 | allowed only for noncommercial distribution and only if you | |
80 | received the program in object code or executable form alone.) | |
81 | ||
82 | For an executable file, complete source code means all the source code for | |
83 | all modules it contains; but, as a special exception, it need not include | |
84 | source code for modules which are standard libraries that accompany the | |
85 | operating system on which the executable file runs. | |
86 | ||
87 | 4. You may not copy, sublicense, distribute or transfer this program | |
88 | except as expressly provided under this License Agreement. Any attempt | |
89 | otherwise to copy, sublicense, distribute or transfer this program is void and | |
90 | your rights to use the program under this License agreement shall be | |
91 | automatically terminated. However, parties who have received computer | |
92 | software programs from you with this License Agreement will not have | |
93 | their licenses terminated so long as such parties remain in full compliance. | |
94 | ||
95 | 5. If you wish to incorporate parts of this program into other free | |
96 | programs whose distribution conditions are different, write to the Free | |
97 | Software Foundation at 675 Mass Ave, Cambridge, MA 02139. We have not yet | |
98 | worked out a simple rule that can be stated here, but we will often permit | |
99 | this. We will be guided by the two goals of preserving the free status of | |
100 | all derivatives our free software and of promoting the sharing and reuse of | |
101 | software. | |
102 | ||
103 | ||
104 | In other words, you are welcome to use, share and improve this program. | |
105 | You are forbidden to forbid anyone else to use, share and improve | |
106 | what you give them. Help stamp out software-hoarding! */ | |
107 | \f | |
108 | #include "awk.h" | |
109 | #include <assert.h> | |
110 | ||
111 | #ifdef setbit /* surprise - setbit and clrbit are macros on NeXT */ | |
112 | #undef setbit | |
113 | #endif | |
114 | #ifdef clrbit | |
115 | #undef clrbit | |
116 | #endif | |
117 | ||
118 | #ifdef __STDC__ | |
119 | typedef void *ptr_t; | |
120 | #else | |
121 | typedef char *ptr_t; | |
122 | #endif | |
123 | ||
124 | typedef struct { | |
125 | char ** in; | |
126 | char * left; | |
127 | char * right; | |
128 | char * is; | |
129 | } must; | |
130 | ||
131 | static ptr_t xcalloc P((int n, size_t s)); | |
132 | static ptr_t xmalloc P((size_t n)); | |
133 | static ptr_t xrealloc P((ptr_t p, size_t n)); | |
134 | static int tstbit P((int b, _charset c)); | |
135 | static void setbit P((int b, _charset c)); | |
136 | static void clrbit P((int b, _charset c)); | |
137 | static void copyset P((const _charset src, _charset dst)); | |
138 | static void zeroset P((_charset s)); | |
139 | static void notset P((_charset s)); | |
140 | static int equal P((const _charset s1, const _charset s2)); | |
141 | static int charset_index P((const _charset s)); | |
142 | static _token lex P((void)); | |
143 | static void addtok P((_token t)); | |
144 | static void atom P((void)); | |
145 | static void closure P((void)); | |
146 | static void branch P((void)); | |
147 | static void regexp P((void)); | |
148 | static void copy P((const _position_set *src, _position_set *dst)); | |
149 | static void insert P((_position p, _position_set *s)); | |
150 | static void merge P((_position_set *s1, _position_set *s2, _position_set *m)); | |
151 | static void delete P((_position p, _position_set *s)); | |
152 | static int state_index P((struct regexp *r, _position_set *s, | |
153 | int newline, int letter)); | |
154 | static void epsclosure P((_position_set *s, struct regexp *r)); | |
155 | static void build_state P((int s, struct regexp *r)); | |
156 | static void build_state_zero P((struct regexp *r)); | |
157 | static char *icatalloc P((char *old, const char *new)); | |
158 | static char *icpyalloc P((const char *string)); | |
159 | static char *istrstr P((char *lookin, char *lookfor)); | |
160 | static void ifree P((char *cp)); | |
161 | static void freelist P((char **cpp)); | |
162 | static char **enlist P((char **cpp, char *new, size_t len)); | |
163 | static char **comsubs P((char *left, char *right)); | |
164 | static char **addlists P((char **old, char **new)); | |
165 | static char **inboth P((char **left, char **right)); | |
166 | static void resetmust P((must *mp)); | |
167 | static void regmust P((struct regexp *r)); | |
168 | ||
169 | #undef P | |
170 | ||
171 | static ptr_t | |
172 | xcalloc(n, s) | |
173 | int n; | |
174 | size_t s; | |
175 | { | |
176 | ptr_t r = calloc(n, s); | |
177 | ||
178 | if (NULL == r) | |
179 | reg_error("Memory exhausted"); /* reg_error does not return */ | |
180 | return r; | |
181 | } | |
182 | ||
183 | static ptr_t | |
184 | xmalloc(n) | |
185 | size_t n; | |
186 | { | |
187 | ptr_t r = malloc(n); | |
188 | ||
189 | assert(n != 0); | |
190 | if (NULL == r) | |
191 | reg_error("Memory exhausted"); | |
192 | return r; | |
193 | } | |
194 | ||
195 | static ptr_t | |
196 | xrealloc(p, n) | |
197 | ptr_t p; | |
198 | size_t n; | |
199 | { | |
200 | ptr_t r = realloc(p, n); | |
201 | ||
202 | assert(n != 0); | |
203 | if (NULL == r) | |
204 | reg_error("Memory exhausted"); | |
205 | return r; | |
206 | } | |
207 | ||
208 | #define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t))) | |
209 | #undef MALLOC | |
210 | #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) | |
211 | #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) | |
212 | ||
213 | /* Reallocate an array of type t if nalloc is too small for index. */ | |
214 | #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ | |
215 | if ((index) >= (nalloc)) \ | |
216 | { \ | |
217 | while ((index) >= (nalloc)) \ | |
218 | (nalloc) *= 2; \ | |
219 | REALLOC(p, t, nalloc); \ | |
220 | } | |
221 | \f | |
222 | /* Stuff pertaining to charsets. */ | |
223 | ||
224 | static int | |
225 | tstbit(b, c) | |
226 | int b; | |
227 | _charset c; | |
228 | { | |
229 | return c[b / INTBITS] & 1 << b % INTBITS; | |
230 | } | |
231 | ||
232 | static void | |
233 | setbit(b, c) | |
234 | int b; | |
235 | _charset c; | |
236 | { | |
237 | c[b / INTBITS] |= 1 << b % INTBITS; | |
238 | } | |
239 | ||
240 | static void | |
241 | clrbit(b, c) | |
242 | int b; | |
243 | _charset c; | |
244 | { | |
245 | c[b / INTBITS] &= ~(1 << b % INTBITS); | |
246 | } | |
247 | ||
248 | static void | |
249 | copyset(src, dst) | |
250 | const _charset src; | |
251 | _charset dst; | |
252 | { | |
253 | int i; | |
254 | ||
255 | for (i = 0; i < _CHARSET_INTS; ++i) | |
256 | dst[i] = src[i]; | |
257 | } | |
258 | ||
259 | static void | |
260 | zeroset(s) | |
261 | _charset s; | |
262 | { | |
263 | int i; | |
264 | ||
265 | for (i = 0; i < _CHARSET_INTS; ++i) | |
266 | s[i] = 0; | |
267 | } | |
268 | ||
269 | static void | |
270 | notset(s) | |
271 | _charset s; | |
272 | { | |
273 | int i; | |
274 | ||
275 | for (i = 0; i < _CHARSET_INTS; ++i) | |
276 | s[i] = ~s[i]; | |
277 | } | |
278 | ||
279 | static int | |
280 | equal(s1, s2) | |
281 | const _charset s1; | |
282 | const _charset s2; | |
283 | { | |
284 | int i; | |
285 | ||
286 | for (i = 0; i < _CHARSET_INTS; ++i) | |
287 | if (s1[i] != s2[i]) | |
288 | return 0; | |
289 | return 1; | |
290 | } | |
291 | \f | |
292 | /* A pointer to the current regexp is kept here during parsing. */ | |
293 | static struct regexp *reg; | |
294 | ||
295 | /* Find the index of charset s in reg->charsets, or allocate a new charset. */ | |
296 | static int | |
297 | charset_index(s) | |
298 | const _charset s; | |
299 | { | |
300 | int i; | |
301 | ||
302 | for (i = 0; i < reg->cindex; ++i) | |
303 | if (equal(s, reg->charsets[i])) | |
304 | return i; | |
305 | REALLOC_IF_NECESSARY(reg->charsets, _charset, reg->calloc, reg->cindex); | |
306 | ++reg->cindex; | |
307 | copyset(s, reg->charsets[i]); | |
308 | return i; | |
309 | } | |
310 | ||
311 | /* Syntax bits controlling the behavior of the lexical analyzer. */ | |
312 | static syntax_bits, syntax_bits_set; | |
313 | ||
314 | /* Flag for case-folding letters into sets. */ | |
315 | static case_fold; | |
316 | ||
317 | /* Entry point to set syntax options. */ | |
318 | void | |
319 | regsyntax(bits, fold) | |
320 | long bits; | |
321 | int fold; | |
322 | { | |
323 | syntax_bits_set = 1; | |
324 | syntax_bits = bits; | |
325 | case_fold = fold; | |
326 | } | |
327 | ||
328 | /* Lexical analyzer. */ | |
329 | static const char *lexstart; /* Pointer to beginning of input string. */ | |
330 | static const char *lexptr; /* Pointer to next input character. */ | |
331 | static lexleft; /* Number of characters remaining. */ | |
332 | static caret_allowed; /* True if backward context allows ^ | |
333 | (meaningful only if RE_CONTEXT_INDEP_OPS | |
334 | is turned off). */ | |
335 | static closure_allowed; /* True if backward context allows closures | |
336 | (meaningful only if RE_CONTEXT_INDEP_OPS | |
337 | is turned off). */ | |
338 | ||
339 | /* Note that characters become unsigned here. */ | |
340 | #define FETCH(c, eoferr) \ | |
341 | { \ | |
342 | if (! lexleft) \ | |
343 | if (eoferr != NULL) \ | |
344 | reg_error(eoferr); \ | |
345 | else \ | |
346 | return _END; \ | |
347 | (c) = (unsigned char) *lexptr++; \ | |
348 | --lexleft; \ | |
349 | } | |
350 | ||
351 | static _token | |
352 | lex() | |
353 | { | |
354 | _token c, c2; | |
355 | int invert; | |
356 | _charset cset; | |
357 | ||
358 | FETCH(c, (char *) 0); | |
359 | switch (c) | |
360 | { | |
361 | case '^': | |
362 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) | |
363 | && (!caret_allowed || | |
364 | ((syntax_bits & RE_TIGHT_VBAR) && lexptr - 1 != lexstart))) | |
365 | goto normal_char; | |
366 | caret_allowed = 0; | |
367 | return syntax_bits & RE_TIGHT_VBAR ? _ALLBEGLINE : _BEGLINE; | |
368 | ||
369 | case '$': | |
370 | if (syntax_bits & RE_CONTEXT_INDEP_OPS || !lexleft | |
371 | || (! (syntax_bits & RE_TIGHT_VBAR) | |
372 | && ((syntax_bits & RE_NO_BK_PARENS | |
373 | ? lexleft > 0 && *lexptr == ')' | |
374 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == ')') | |
375 | || (syntax_bits & RE_NO_BK_VBAR | |
376 | ? lexleft > 0 && *lexptr == '|' | |
377 | : lexleft > 1 && *lexptr == '\\' && lexptr[1] == '|')))) | |
378 | return syntax_bits & RE_TIGHT_VBAR ? _ALLENDLINE : _ENDLINE; | |
379 | goto normal_char; | |
380 | ||
381 | case '\\': | |
382 | FETCH(c, "Unfinished \\ quote"); | |
383 | switch (c) | |
384 | { | |
385 | case '1': | |
386 | case '2': | |
387 | case '3': | |
388 | case '4': | |
389 | case '5': | |
390 | case '6': | |
391 | case '7': | |
392 | case '8': | |
393 | case '9': | |
394 | caret_allowed = 0; | |
395 | closure_allowed = 1; | |
396 | return _BACKREF; | |
397 | ||
398 | case '<': | |
399 | caret_allowed = 0; | |
400 | return _BEGWORD; | |
401 | ||
402 | case '>': | |
403 | caret_allowed = 0; | |
404 | return _ENDWORD; | |
405 | ||
406 | case 'b': | |
407 | caret_allowed = 0; | |
408 | return _LIMWORD; | |
409 | ||
410 | case 'B': | |
411 | caret_allowed = 0; | |
412 | return _NOTLIMWORD; | |
413 | ||
414 | case 'w': | |
415 | case 'W': | |
416 | zeroset(cset); | |
417 | for (c2 = 0; c2 < _NOTCHAR; ++c2) | |
418 | if (ISALNUM(c2)) | |
419 | setbit(c2, cset); | |
420 | if (c == 'W') | |
421 | notset(cset); | |
422 | caret_allowed = 0; | |
423 | closure_allowed = 1; | |
424 | return _SET + charset_index(cset); | |
425 | ||
426 | case '?': | |
427 | if (syntax_bits & RE_BK_PLUS_QM) | |
428 | goto qmark; | |
429 | goto normal_char; | |
430 | ||
431 | case '+': | |
432 | if (syntax_bits & RE_BK_PLUS_QM) | |
433 | goto plus; | |
434 | goto normal_char; | |
435 | ||
436 | case '|': | |
437 | if (! (syntax_bits & RE_NO_BK_VBAR)) | |
438 | goto or; | |
439 | goto normal_char; | |
440 | ||
441 | case '(': | |
442 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
443 | goto lparen; | |
444 | goto normal_char; | |
445 | ||
446 | case ')': | |
447 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
448 | goto rparen; | |
449 | goto normal_char; | |
450 | ||
451 | default: | |
452 | goto normal_char; | |
453 | } | |
454 | ||
455 | case '?': | |
456 | if (syntax_bits & RE_BK_PLUS_QM) | |
457 | goto normal_char; | |
458 | qmark: | |
459 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
460 | goto normal_char; | |
461 | return _QMARK; | |
462 | ||
463 | case '*': | |
464 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
465 | goto normal_char; | |
466 | return _STAR; | |
467 | ||
468 | case '+': | |
469 | if (syntax_bits & RE_BK_PLUS_QM) | |
470 | goto normal_char; | |
471 | plus: | |
472 | if (! (syntax_bits & RE_CONTEXT_INDEP_OPS) && !closure_allowed) | |
473 | goto normal_char; | |
474 | return _PLUS; | |
475 | ||
476 | case '|': | |
477 | if (! (syntax_bits & RE_NO_BK_VBAR)) | |
478 | goto normal_char; | |
479 | or: | |
480 | caret_allowed = 1; | |
481 | closure_allowed = 0; | |
482 | return _OR; | |
483 | ||
484 | case '\n': | |
485 | if (! (syntax_bits & RE_NEWLINE_OR)) | |
486 | goto normal_char; | |
487 | goto or; | |
488 | ||
489 | case '(': | |
490 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
491 | goto normal_char; | |
492 | lparen: | |
493 | caret_allowed = 1; | |
494 | closure_allowed = 0; | |
495 | return _LPAREN; | |
496 | ||
497 | case ')': | |
498 | if (! (syntax_bits & RE_NO_BK_PARENS)) | |
499 | goto normal_char; | |
500 | rparen: | |
501 | caret_allowed = 0; | |
502 | closure_allowed = 1; | |
503 | return _RPAREN; | |
504 | ||
505 | case '.': | |
506 | zeroset(cset); | |
507 | notset(cset); | |
508 | clrbit('\n', cset); | |
509 | caret_allowed = 0; | |
510 | closure_allowed = 1; | |
511 | return _SET + charset_index(cset); | |
512 | ||
513 | case '[': | |
514 | zeroset(cset); | |
515 | FETCH(c, "Unbalanced ["); | |
516 | if (c == '^') | |
517 | { | |
518 | FETCH(c, "Unbalanced ["); | |
519 | invert = 1; | |
520 | } | |
521 | else | |
522 | invert = 0; | |
523 | do | |
524 | { | |
525 | FETCH(c2, "Unbalanced ["); | |
526 | if ((syntax_bits & RE_AWK_CLASS_HACK) && c == '\\') | |
527 | { | |
528 | c = c2; | |
529 | FETCH(c2, "Unbalanced ["); | |
530 | } | |
531 | if (c2 == '-') | |
532 | { | |
533 | FETCH(c2, "Unbalanced ["); | |
534 | if (c2 == ']' && (syntax_bits & RE_AWK_CLASS_HACK)) | |
535 | { | |
536 | setbit(c, cset); | |
537 | setbit('-', cset); | |
538 | break; | |
539 | } | |
540 | while (c <= c2) | |
541 | setbit(c++, cset); | |
542 | FETCH(c, "Unbalanced ["); | |
543 | } | |
544 | else | |
545 | { | |
546 | setbit(c, cset); | |
547 | c = c2; | |
548 | } | |
549 | } | |
550 | while (c != ']'); | |
551 | if (invert) | |
552 | notset(cset); | |
553 | caret_allowed = 0; | |
554 | closure_allowed = 1; | |
555 | return _SET + charset_index(cset); | |
556 | ||
557 | default: | |
558 | normal_char: | |
559 | caret_allowed = 0; | |
560 | closure_allowed = 1; | |
561 | if (case_fold && ISALPHA(c)) | |
562 | { | |
563 | zeroset(cset); | |
564 | if (isupper(c)) | |
565 | c = tolower(c); | |
566 | setbit(c, cset); | |
567 | setbit(toupper(c), cset); | |
568 | return _SET + charset_index(cset); | |
569 | } | |
570 | return c; | |
571 | } | |
572 | } | |
573 | \f | |
574 | /* Recursive descent parser for regular expressions. */ | |
575 | ||
576 | static _token tok; /* Lookahead token. */ | |
577 | static depth; /* Current depth of a hypothetical stack | |
578 | holding deferred productions. This is | |
579 | used to determine the depth that will be | |
580 | required of the real stack later on in | |
581 | reganalyze(). */ | |
582 | ||
583 | /* Add the given token to the parse tree, maintaining the depth count and | |
584 | updating the maximum depth if necessary. */ | |
585 | static void | |
586 | addtok(t) | |
587 | _token t; | |
588 | { | |
589 | REALLOC_IF_NECESSARY(reg->tokens, _token, reg->talloc, reg->tindex); | |
590 | reg->tokens[reg->tindex++] = t; | |
591 | ||
592 | switch (t) | |
593 | { | |
594 | case _QMARK: | |
595 | case _STAR: | |
596 | case _PLUS: | |
597 | break; | |
598 | ||
599 | case _CAT: | |
600 | case _OR: | |
601 | --depth; | |
602 | break; | |
603 | ||
604 | default: | |
605 | ++reg->nleaves; | |
606 | case _EMPTY: | |
607 | ++depth; | |
608 | break; | |
609 | } | |
610 | if (depth > reg->depth) | |
611 | reg->depth = depth; | |
612 | } | |
613 | ||
614 | /* The grammar understood by the parser is as follows. | |
615 | ||
616 | start: | |
617 | regexp | |
618 | _ALLBEGLINE regexp | |
619 | regexp _ALLENDLINE | |
620 | _ALLBEGLINE regexp _ALLENDLINE | |
621 | ||
622 | regexp: | |
623 | regexp _OR branch | |
624 | branch | |
625 | ||
626 | branch: | |
627 | branch closure | |
628 | closure | |
629 | ||
630 | closure: | |
631 | closure _QMARK | |
632 | closure _STAR | |
633 | closure _PLUS | |
634 | atom | |
635 | ||
636 | atom: | |
637 | <normal character> | |
638 | _SET | |
639 | _BACKREF | |
640 | _BEGLINE | |
641 | _ENDLINE | |
642 | _BEGWORD | |
643 | _ENDWORD | |
644 | _LIMWORD | |
645 | _NOTLIMWORD | |
646 | <empty> | |
647 | ||
648 | The parser builds a parse tree in postfix form in an array of tokens. */ | |
649 | ||
650 | #ifdef __STDC__ | |
651 | static void regexp(void); | |
652 | #else | |
653 | static void regexp(); | |
654 | #endif | |
655 | ||
656 | static void | |
657 | atom() | |
658 | { | |
659 | if (tok >= 0 && (tok < _NOTCHAR || tok >= _SET || tok == _BACKREF | |
660 | || tok == _BEGLINE || tok == _ENDLINE || tok == _BEGWORD | |
661 | || tok == _ENDWORD || tok == _LIMWORD || tok == _NOTLIMWORD)) | |
662 | { | |
663 | addtok(tok); | |
664 | tok = lex(); | |
665 | } | |
666 | else if (tok == _LPAREN) | |
667 | { | |
668 | tok = lex(); | |
669 | regexp(); | |
670 | if (tok != _RPAREN) | |
671 | reg_error("Unbalanced ("); | |
672 | tok = lex(); | |
673 | } | |
674 | else | |
675 | addtok(_EMPTY); | |
676 | } | |
677 | ||
678 | static void | |
679 | closure() | |
680 | { | |
681 | atom(); | |
682 | while (tok == _QMARK || tok == _STAR || tok == _PLUS) | |
683 | { | |
684 | addtok(tok); | |
685 | tok = lex(); | |
686 | } | |
687 | } | |
688 | ||
689 | static void | |
690 | branch() | |
691 | { | |
692 | closure(); | |
693 | while (tok != _RPAREN && tok != _OR && tok != _ALLENDLINE && tok >= 0) | |
694 | { | |
695 | closure(); | |
696 | addtok(_CAT); | |
697 | } | |
698 | } | |
699 | ||
700 | static void | |
701 | regexp() | |
702 | { | |
703 | branch(); | |
704 | while (tok == _OR) | |
705 | { | |
706 | tok = lex(); | |
707 | branch(); | |
708 | addtok(_OR); | |
709 | } | |
710 | } | |
711 | ||
712 | /* Main entry point for the parser. S is a string to be parsed, len is the | |
713 | length of the string, so s can include NUL characters. R is a pointer to | |
714 | the struct regexp to parse into. */ | |
715 | void | |
716 | regparse(s, len, r) | |
717 | const char *s; | |
718 | size_t len; | |
719 | struct regexp *r; | |
720 | { | |
721 | reg = r; | |
722 | lexstart = lexptr = s; | |
723 | lexleft = len; | |
724 | caret_allowed = 1; | |
725 | closure_allowed = 0; | |
726 | ||
727 | if (! syntax_bits_set) | |
728 | reg_error("No syntax specified"); | |
729 | ||
730 | tok = lex(); | |
731 | depth = r->depth; | |
732 | ||
733 | if (tok == _ALLBEGLINE) | |
734 | { | |
735 | addtok(_BEGLINE); | |
736 | tok = lex(); | |
737 | regexp(); | |
738 | addtok(_CAT); | |
739 | } | |
740 | else | |
741 | regexp(); | |
742 | ||
743 | if (tok == _ALLENDLINE) | |
744 | { | |
745 | addtok(_ENDLINE); | |
746 | addtok(_CAT); | |
747 | tok = lex(); | |
748 | } | |
749 | ||
750 | if (tok != _END) | |
751 | reg_error("Unbalanced )"); | |
752 | ||
753 | addtok(_END - r->nregexps); | |
754 | addtok(_CAT); | |
755 | ||
756 | if (r->nregexps) | |
757 | addtok(_OR); | |
758 | ||
759 | ++r->nregexps; | |
760 | } | |
761 | \f | |
762 | /* Some primitives for operating on sets of positions. */ | |
763 | ||
764 | /* Copy one set to another; the destination must be large enough. */ | |
765 | static void | |
766 | copy(src, dst) | |
767 | const _position_set *src; | |
768 | _position_set *dst; | |
769 | { | |
770 | int i; | |
771 | ||
772 | for (i = 0; i < src->nelem; ++i) | |
773 | dst->elems[i] = src->elems[i]; | |
774 | dst->nelem = src->nelem; | |
775 | } | |
776 | ||
777 | /* Insert a position in a set. Position sets are maintained in sorted | |
778 | order according to index. If position already exists in the set with | |
779 | the same index then their constraints are logically or'd together. | |
780 | S->elems must point to an array large enough to hold the resulting set. */ | |
781 | static void | |
782 | insert(p, s) | |
783 | _position p; | |
784 | _position_set *s; | |
785 | { | |
786 | int i; | |
787 | _position t1, t2; | |
788 | ||
789 | for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) | |
790 | ; | |
791 | if (i < s->nelem && p.index == s->elems[i].index) | |
792 | s->elems[i].constraint |= p.constraint; | |
793 | else | |
794 | { | |
795 | t1 = p; | |
796 | ++s->nelem; | |
797 | while (i < s->nelem) | |
798 | { | |
799 | t2 = s->elems[i]; | |
800 | s->elems[i++] = t1; | |
801 | t1 = t2; | |
802 | } | |
803 | } | |
804 | } | |
805 | ||
806 | /* Merge two sets of positions into a third. The result is exactly as if | |
807 | the positions of both sets were inserted into an initially empty set. */ | |
808 | static void | |
809 | merge(s1, s2, m) | |
810 | _position_set *s1; | |
811 | _position_set *s2; | |
812 | _position_set *m; | |
813 | { | |
814 | int i = 0, j = 0; | |
815 | ||
816 | m->nelem = 0; | |
817 | while (i < s1->nelem && j < s2->nelem) | |
818 | if (s1->elems[i].index > s2->elems[j].index) | |
819 | m->elems[m->nelem++] = s1->elems[i++]; | |
820 | else if (s1->elems[i].index < s2->elems[j].index) | |
821 | m->elems[m->nelem++] = s2->elems[j++]; | |
822 | else | |
823 | { | |
824 | m->elems[m->nelem] = s1->elems[i++]; | |
825 | m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; | |
826 | } | |
827 | while (i < s1->nelem) | |
828 | m->elems[m->nelem++] = s1->elems[i++]; | |
829 | while (j < s2->nelem) | |
830 | m->elems[m->nelem++] = s2->elems[j++]; | |
831 | } | |
832 | ||
833 | /* Delete a position from a set. */ | |
834 | static void | |
835 | delete(p, s) | |
836 | _position p; | |
837 | _position_set *s; | |
838 | { | |
839 | int i; | |
840 | ||
841 | for (i = 0; i < s->nelem; ++i) | |
842 | if (p.index == s->elems[i].index) | |
843 | break; | |
844 | if (i < s->nelem) | |
845 | for (--s->nelem; i < s->nelem; ++i) | |
846 | s->elems[i] = s->elems[i + 1]; | |
847 | } | |
848 | \f | |
849 | /* Find the index of the state corresponding to the given position set with | |
850 | the given preceding context, or create a new state if there is no such | |
851 | state. Newline and letter tell whether we got here on a newline or | |
852 | letter, respectively. */ | |
853 | static int | |
854 | state_index(r, s, newline, letter) | |
855 | struct regexp *r; | |
856 | _position_set *s; | |
857 | int newline; | |
858 | int letter; | |
859 | { | |
860 | int lhash = 0; | |
861 | int constraint; | |
862 | int i, j; | |
863 | ||
864 | newline = newline ? 1 : 0; | |
865 | letter = letter ? 1 : 0; | |
866 | ||
867 | for (i = 0; i < s->nelem; ++i) | |
868 | lhash ^= s->elems[i].index + s->elems[i].constraint; | |
869 | ||
870 | /* Try to find a state that exactly matches the proposed one. */ | |
871 | for (i = 0; i < r->sindex; ++i) | |
872 | { | |
873 | if (lhash != r->states[i].hash || s->nelem != r->states[i].elems.nelem | |
874 | || newline != r->states[i].newline || letter != r->states[i].letter) | |
875 | continue; | |
876 | for (j = 0; j < s->nelem; ++j) | |
877 | if (s->elems[j].constraint | |
878 | != r->states[i].elems.elems[j].constraint | |
879 | || s->elems[j].index != r->states[i].elems.elems[j].index) | |
880 | break; | |
881 | if (j == s->nelem) | |
882 | return i; | |
883 | } | |
884 | ||
885 | /* We'll have to create a new state. */ | |
886 | REALLOC_IF_NECESSARY(r->states, _dfa_state, r->salloc, r->sindex); | |
887 | r->states[i].hash = lhash; | |
888 | MALLOC(r->states[i].elems.elems, _position, s->nelem); | |
889 | copy(s, &r->states[i].elems); | |
890 | r->states[i].newline = newline; | |
891 | r->states[i].letter = letter; | |
892 | r->states[i].backref = 0; | |
893 | r->states[i].constraint = 0; | |
894 | r->states[i].first_end = 0; | |
895 | for (j = 0; j < s->nelem; ++j) | |
896 | if (r->tokens[s->elems[j].index] < 0) | |
897 | { | |
898 | constraint = s->elems[j].constraint; | |
899 | if (_SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) | |
900 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) | |
901 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) | |
902 | || _SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) | |
903 | r->states[i].constraint |= constraint; | |
904 | if (! r->states[i].first_end) | |
905 | r->states[i].first_end = r->tokens[s->elems[j].index]; | |
906 | } | |
907 | else if (r->tokens[s->elems[j].index] == _BACKREF) | |
908 | { | |
909 | r->states[i].constraint = _NO_CONSTRAINT; | |
910 | r->states[i].backref = 1; | |
911 | } | |
912 | ||
913 | ++r->sindex; | |
914 | ||
915 | return i; | |
916 | } | |
917 | \f | |
918 | /* Find the epsilon closure of a set of positions. If any position of the set | |
919 | contains a symbol that matches the empty string in some context, replace | |
920 | that position with the elements of its follow labeled with an appropriate | |
921 | constraint. Repeat exhaustively until no funny positions are left. | |
922 | S->elems must be large enough to hold the result. */ | |
923 | static void | |
924 | epsclosure(s, r) | |
925 | _position_set *s; | |
926 | struct regexp *r; | |
927 | { | |
928 | int i, j; | |
929 | int *visited; | |
930 | _position p, old; | |
931 | ||
932 | MALLOC(visited, int, r->tindex); | |
933 | for (i = 0; i < r->tindex; ++i) | |
934 | visited[i] = 0; | |
935 | ||
936 | for (i = 0; i < s->nelem; ++i) | |
937 | if (r->tokens[s->elems[i].index] >= _NOTCHAR | |
938 | && r->tokens[s->elems[i].index] != _BACKREF | |
939 | && r->tokens[s->elems[i].index] < _SET) | |
940 | { | |
941 | old = s->elems[i]; | |
942 | p.constraint = old.constraint; | |
943 | delete(s->elems[i], s); | |
944 | if (visited[old.index]) | |
945 | { | |
946 | --i; | |
947 | continue; | |
948 | } | |
949 | visited[old.index] = 1; | |
950 | switch (r->tokens[old.index]) | |
951 | { | |
952 | case _BEGLINE: | |
953 | p.constraint &= _BEGLINE_CONSTRAINT; | |
954 | break; | |
955 | case _ENDLINE: | |
956 | p.constraint &= _ENDLINE_CONSTRAINT; | |
957 | break; | |
958 | case _BEGWORD: | |
959 | p.constraint &= _BEGWORD_CONSTRAINT; | |
960 | break; | |
961 | case _ENDWORD: | |
962 | p.constraint &= _ENDWORD_CONSTRAINT; | |
963 | break; | |
964 | case _LIMWORD: | |
965 | p.constraint &= _ENDWORD_CONSTRAINT; | |
966 | break; | |
967 | case _NOTLIMWORD: | |
968 | p.constraint &= _NOTLIMWORD_CONSTRAINT; | |
969 | break; | |
970 | default: | |
971 | break; | |
972 | } | |
973 | for (j = 0; j < r->follows[old.index].nelem; ++j) | |
974 | { | |
975 | p.index = r->follows[old.index].elems[j].index; | |
976 | insert(p, s); | |
977 | } | |
978 | /* Force rescan to start at the beginning. */ | |
979 | i = -1; | |
980 | } | |
981 | ||
982 | free(visited); | |
983 | } | |
984 | \f | |
985 | /* Perform bottom-up analysis on the parse tree, computing various functions. | |
986 | Note that at this point, we're pretending constructs like \< are real | |
987 | characters rather than constraints on what can follow them. | |
988 | ||
989 | Nullable: A node is nullable if it is at the root of a regexp that can | |
990 | match the empty string. | |
991 | * _EMPTY leaves are nullable. | |
992 | * No other leaf is nullable. | |
993 | * A _QMARK or _STAR node is nullable. | |
994 | * A _PLUS node is nullable if its argument is nullable. | |
995 | * A _CAT node is nullable if both its arguments are nullable. | |
996 | * An _OR node is nullable if either argument is nullable. | |
997 | ||
998 | Firstpos: The firstpos of a node is the set of positions (nonempty leaves) | |
999 | that could correspond to the first character of a string matching the | |
1000 | regexp rooted at the given node. | |
1001 | * _EMPTY leaves have empty firstpos. | |
1002 | * The firstpos of a nonempty leaf is that leaf itself. | |
1003 | * The firstpos of a _QMARK, _STAR, or _PLUS node is the firstpos of its | |
1004 | argument. | |
1005 | * The firstpos of a _CAT node is the firstpos of the left argument, union | |
1006 | the firstpos of the right if the left argument is nullable. | |
1007 | * The firstpos of an _OR node is the union of firstpos of each argument. | |
1008 | ||
1009 | Lastpos: The lastpos of a node is the set of positions that could | |
1010 | correspond to the last character of a string matching the regexp at | |
1011 | the given node. | |
1012 | * _EMPTY leaves have empty lastpos. | |
1013 | * The lastpos of a nonempty leaf is that leaf itself. | |
1014 | * The lastpos of a _QMARK, _STAR, or _PLUS node is the lastpos of its | |
1015 | argument. | |
1016 | * The lastpos of a _CAT node is the lastpos of its right argument, union | |
1017 | the lastpos of the left if the right argument is nullable. | |
1018 | * The lastpos of an _OR node is the union of the lastpos of each argument. | |
1019 | ||
1020 | Follow: The follow of a position is the set of positions that could | |
1021 | correspond to the character following a character matching the node in | |
1022 | a string matching the regexp. At this point we consider special symbols | |
1023 | that match the empty string in some context to be just normal characters. | |
1024 | Later, if we find that a special symbol is in a follow set, we will | |
1025 | replace it with the elements of its follow, labeled with an appropriate | |
1026 | constraint. | |
1027 | * Every node in the firstpos of the argument of a _STAR or _PLUS node is in | |
1028 | the follow of every node in the lastpos. | |
1029 | * Every node in the firstpos of the second argument of a _CAT node is in | |
1030 | the follow of every node in the lastpos of the first argument. | |
1031 | ||
1032 | Because of the postfix representation of the parse tree, the depth-first | |
1033 | analysis is conveniently done by a linear scan with the aid of a stack. | |
1034 | Sets are stored as arrays of the elements, obeying a stack-like allocation | |
1035 | scheme; the number of elements in each set deeper in the stack can be | |
1036 | used to determine the address of a particular set's array. */ | |
1037 | void | |
1038 | reganalyze(r, searchflag) | |
1039 | struct regexp *r; | |
1040 | int searchflag; | |
1041 | { | |
1042 | int *nullable; /* Nullable stack. */ | |
1043 | int *nfirstpos; /* Element count stack for firstpos sets. */ | |
1044 | _position *firstpos; /* Array where firstpos elements are stored. */ | |
1045 | int *nlastpos; /* Element count stack for lastpos sets. */ | |
1046 | _position *lastpos; /* Array where lastpos elements are stored. */ | |
1047 | int *nalloc; /* Sizes of arrays allocated to follow sets. */ | |
1048 | _position_set tmp; /* Temporary set for merging sets. */ | |
1049 | _position_set merged; /* Result of merging sets. */ | |
1050 | int wants_newline; /* True if some position wants newline info. */ | |
1051 | int *o_nullable; | |
1052 | int *o_nfirst, *o_nlast; | |
1053 | _position *o_firstpos, *o_lastpos; | |
1054 | int i, j; | |
1055 | _position *pos; | |
1056 | ||
1057 | r->searchflag = searchflag; | |
1058 | ||
1059 | MALLOC(nullable, int, r->depth); | |
1060 | o_nullable = nullable; | |
1061 | MALLOC(nfirstpos, int, r->depth); | |
1062 | o_nfirst = nfirstpos; | |
1063 | MALLOC(firstpos, _position, r->nleaves); | |
1064 | o_firstpos = firstpos, firstpos += r->nleaves; | |
1065 | MALLOC(nlastpos, int, r->depth); | |
1066 | o_nlast = nlastpos; | |
1067 | MALLOC(lastpos, _position, r->nleaves); | |
1068 | o_lastpos = lastpos, lastpos += r->nleaves; | |
1069 | MALLOC(nalloc, int, r->tindex); | |
1070 | for (i = 0; i < r->tindex; ++i) | |
1071 | nalloc[i] = 0; | |
1072 | MALLOC(merged.elems, _position, r->nleaves); | |
1073 | ||
1074 | CALLOC(r->follows, _position_set, r->tindex); | |
1075 | ||
1076 | for (i = 0; i < r->tindex; ++i) | |
1077 | switch (r->tokens[i]) | |
1078 | { | |
1079 | case _EMPTY: | |
1080 | /* The empty set is nullable. */ | |
1081 | *nullable++ = 1; | |
1082 | ||
1083 | /* The firstpos and lastpos of the empty leaf are both empty. */ | |
1084 | *nfirstpos++ = *nlastpos++ = 0; | |
1085 | break; | |
1086 | ||
1087 | case _STAR: | |
1088 | case _PLUS: | |
1089 | /* Every element in the firstpos of the argument is in the follow | |
1090 | of every element in the lastpos. */ | |
1091 | tmp.nelem = nfirstpos[-1]; | |
1092 | tmp.elems = firstpos; | |
1093 | pos = lastpos; | |
1094 | for (j = 0; j < nlastpos[-1]; ++j) | |
1095 | { | |
1096 | merge(&tmp, &r->follows[pos[j].index], &merged); | |
1097 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, | |
1098 | nalloc[pos[j].index], merged.nelem - 1); | |
1099 | copy(&merged, &r->follows[pos[j].index]); | |
1100 | } | |
1101 | ||
1102 | case _QMARK: | |
1103 | /* A _QMARK or _STAR node is automatically nullable. */ | |
1104 | if (r->tokens[i] != _PLUS) | |
1105 | nullable[-1] = 1; | |
1106 | break; | |
1107 | ||
1108 | case _CAT: | |
1109 | /* Every element in the firstpos of the second argument is in the | |
1110 | follow of every element in the lastpos of the first argument. */ | |
1111 | tmp.nelem = nfirstpos[-1]; | |
1112 | tmp.elems = firstpos; | |
1113 | pos = lastpos + nlastpos[-1]; | |
1114 | for (j = 0; j < nlastpos[-2]; ++j) | |
1115 | { | |
1116 | merge(&tmp, &r->follows[pos[j].index], &merged); | |
1117 | REALLOC_IF_NECESSARY(r->follows[pos[j].index].elems, _position, | |
1118 | nalloc[pos[j].index], merged.nelem - 1); | |
1119 | copy(&merged, &r->follows[pos[j].index]); | |
1120 | } | |
1121 | ||
1122 | /* The firstpos of a _CAT node is the firstpos of the first argument, | |
1123 | union that of the second argument if the first is nullable. */ | |
1124 | if (nullable[-2]) | |
1125 | nfirstpos[-2] += nfirstpos[-1]; | |
1126 | else | |
1127 | firstpos += nfirstpos[-1]; | |
1128 | --nfirstpos; | |
1129 | ||
1130 | /* The lastpos of a _CAT node is the lastpos of the second argument, | |
1131 | union that of the first argument if the second is nullable. */ | |
1132 | if (nullable[-1]) | |
1133 | nlastpos[-2] += nlastpos[-1]; | |
1134 | else | |
1135 | { | |
1136 | pos = lastpos + nlastpos[-2]; | |
1137 | for (j = nlastpos[-1] - 1; j >= 0; --j) | |
1138 | pos[j] = lastpos[j]; | |
1139 | lastpos += nlastpos[-2]; | |
1140 | nlastpos[-2] = nlastpos[-1]; | |
1141 | } | |
1142 | --nlastpos; | |
1143 | ||
1144 | /* A _CAT node is nullable if both arguments are nullable. */ | |
1145 | nullable[-2] = nullable[-1] && nullable[-2]; | |
1146 | --nullable; | |
1147 | break; | |
1148 | ||
1149 | case _OR: | |
1150 | /* The firstpos is the union of the firstpos of each argument. */ | |
1151 | nfirstpos[-2] += nfirstpos[-1]; | |
1152 | --nfirstpos; | |
1153 | ||
1154 | /* The lastpos is the union of the lastpos of each argument. */ | |
1155 | nlastpos[-2] += nlastpos[-1]; | |
1156 | --nlastpos; | |
1157 | ||
1158 | /* An _OR node is nullable if either argument is nullable. */ | |
1159 | nullable[-2] = nullable[-1] || nullable[-2]; | |
1160 | --nullable; | |
1161 | break; | |
1162 | ||
1163 | default: | |
1164 | /* Anything else is a nonempty position. (Note that special | |
1165 | constructs like \< are treated as nonempty strings here; | |
1166 | an "epsilon closure" effectively makes them nullable later. | |
1167 | Backreferences have to get a real position so we can detect | |
1168 | transitions on them later. But they are nullable. */ | |
1169 | *nullable++ = r->tokens[i] == _BACKREF; | |
1170 | ||
1171 | /* This position is in its own firstpos and lastpos. */ | |
1172 | *nfirstpos++ = *nlastpos++ = 1; | |
1173 | --firstpos, --lastpos; | |
1174 | firstpos->index = lastpos->index = i; | |
1175 | firstpos->constraint = lastpos->constraint = _NO_CONSTRAINT; | |
1176 | ||
1177 | /* Allocate the follow set for this position. */ | |
1178 | nalloc[i] = 1; | |
1179 | MALLOC(r->follows[i].elems, _position, nalloc[i]); | |
1180 | break; | |
1181 | } | |
1182 | ||
1183 | /* For each follow set that is the follow set of a real position, replace | |
1184 | it with its epsilon closure. */ | |
1185 | for (i = 0; i < r->tindex; ++i) | |
1186 | if (r->tokens[i] < _NOTCHAR || r->tokens[i] == _BACKREF | |
1187 | || r->tokens[i] >= _SET) | |
1188 | { | |
1189 | copy(&r->follows[i], &merged); | |
1190 | epsclosure(&merged, r); | |
1191 | if (r->follows[i].nelem < merged.nelem) | |
1192 | REALLOC(r->follows[i].elems, _position, merged.nelem); | |
1193 | copy(&merged, &r->follows[i]); | |
1194 | } | |
1195 | ||
1196 | /* Get the epsilon closure of the firstpos of the regexp. The result will | |
1197 | be the set of positions of state 0. */ | |
1198 | merged.nelem = 0; | |
1199 | for (i = 0; i < nfirstpos[-1]; ++i) | |
1200 | insert(firstpos[i], &merged); | |
1201 | epsclosure(&merged, r); | |
1202 | ||
1203 | /* Check if any of the positions of state 0 will want newline context. */ | |
1204 | wants_newline = 0; | |
1205 | for (i = 0; i < merged.nelem; ++i) | |
1206 | if (_PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) | |
1207 | wants_newline = 1; | |
1208 | ||
1209 | /* Build the initial state. */ | |
1210 | r->salloc = 1; | |
1211 | r->sindex = 0; | |
1212 | MALLOC(r->states, _dfa_state, r->salloc); | |
1213 | state_index(r, &merged, wants_newline, 0); | |
1214 | ||
1215 | free(o_nullable); | |
1216 | free(o_nfirst); | |
1217 | free(o_firstpos); | |
1218 | free(o_nlast); | |
1219 | free(o_lastpos); | |
1220 | free(nalloc); | |
1221 | free(merged.elems); | |
1222 | } | |
1223 | \f | |
1224 | /* Find, for each character, the transition out of state s of r, and store | |
1225 | it in the appropriate slot of trans. | |
1226 | ||
1227 | We divide the positions of s into groups (positions can appear in more | |
1228 | than one group). Each group is labeled with a set of characters that | |
1229 | every position in the group matches (taking into account, if necessary, | |
1230 | preceding context information of s). For each group, find the union | |
1231 | of the its elements' follows. This set is the set of positions of the | |
1232 | new state. For each character in the group's label, set the transition | |
1233 | on this character to be to a state corresponding to the set's positions, | |
1234 | and its associated backward context information, if necessary. | |
1235 | ||
1236 | If we are building a searching matcher, we include the positions of state | |
1237 | 0 in every state. | |
1238 | ||
1239 | The collection of groups is constructed by building an equivalence-class | |
1240 | partition of the positions of s. | |
1241 | ||
1242 | For each position, find the set of characters C that it matches. Eliminate | |
1243 | any characters from C that fail on grounds of backward context. | |
1244 | ||
1245 | Search through the groups, looking for a group whose label L has nonempty | |
1246 | intersection with C. If L - C is nonempty, create a new group labeled | |
1247 | L - C and having the same positions as the current group, and set L to | |
1248 | the intersection of L and C. Insert the position in this group, set | |
1249 | C = C - L, and resume scanning. | |
1250 | ||
1251 | If after comparing with every group there are characters remaining in C, | |
1252 | create a new group labeled with the characters of C and insert this | |
1253 | position in that group. */ | |
1254 | void | |
1255 | regstate(s, r, trans) | |
1256 | int s; | |
1257 | struct regexp *r; | |
1258 | int trans[]; | |
1259 | { | |
1260 | _position_set grps[_NOTCHAR]; /* As many as will ever be needed. */ | |
1261 | _charset labels[_NOTCHAR]; /* Labels corresponding to the groups. */ | |
1262 | int ngrps = 0; /* Number of groups actually used. */ | |
1263 | _position pos; /* Current position being considered. */ | |
1264 | _charset matches; /* Set of matching characters. */ | |
1265 | int matchesf; /* True if matches is nonempty. */ | |
1266 | _charset intersect; /* Intersection with some label set. */ | |
1267 | int intersectf; /* True if intersect is nonempty. */ | |
1268 | _charset leftovers; /* Stuff in the label that didn't match. */ | |
1269 | int leftoversf; /* True if leftovers is nonempty. */ | |
1270 | static _charset letters; /* Set of characters considered letters. */ | |
1271 | static _charset newline; /* Set of characters that aren't newline. */ | |
1272 | _position_set follows; /* Union of the follows of some group. */ | |
1273 | _position_set tmp; /* Temporary space for merging sets. */ | |
1274 | int state; /* New state. */ | |
1275 | int wants_newline; /* New state wants to know newline context. */ | |
1276 | int state_newline; /* New state on a newline transition. */ | |
1277 | int wants_letter; /* New state wants to know letter context. */ | |
1278 | int state_letter; /* New state on a letter transition. */ | |
1279 | static initialized; /* Flag for static initialization. */ | |
1280 | int i, j, k; | |
1281 | ||
1282 | /* Initialize the set of letters, if necessary. */ | |
1283 | if (! initialized) | |
1284 | { | |
1285 | initialized = 1; | |
1286 | for (i = 0; i < _NOTCHAR; ++i) | |
1287 | if (ISALNUM(i)) | |
1288 | setbit(i, letters); | |
1289 | setbit('\n', newline); | |
1290 | } | |
1291 | ||
1292 | zeroset(matches); | |
1293 | ||
1294 | for (i = 0; i < r->states[s].elems.nelem; ++i) | |
1295 | { | |
1296 | pos = r->states[s].elems.elems[i]; | |
1297 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR) | |
1298 | setbit(r->tokens[pos.index], matches); | |
1299 | else if (r->tokens[pos.index] >= _SET) | |
1300 | copyset(r->charsets[r->tokens[pos.index] - _SET], matches); | |
1301 | else | |
1302 | continue; | |
1303 | ||
1304 | /* Some characters may need to be climinated from matches because | |
1305 | they fail in the current context. */ | |
1306 | if (pos.constraint != 0xff) | |
1307 | { | |
1308 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1309 | r->states[s].newline, 1)) | |
1310 | clrbit('\n', matches); | |
1311 | if (! _MATCHES_NEWLINE_CONTEXT(pos.constraint, | |
1312 | r->states[s].newline, 0)) | |
1313 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1314 | matches[j] &= newline[j]; | |
1315 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, | |
1316 | r->states[s].letter, 1)) | |
1317 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1318 | matches[j] &= ~letters[j]; | |
1319 | if (! _MATCHES_LETTER_CONTEXT(pos.constraint, | |
1320 | r->states[s].letter, 0)) | |
1321 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1322 | matches[j] &= letters[j]; | |
1323 | ||
1324 | /* If there are no characters left, there's no point in going on. */ | |
1325 | for (j = 0; j < _CHARSET_INTS && !matches[j]; ++j) | |
1326 | ; | |
1327 | if (j == _CHARSET_INTS) | |
1328 | continue; | |
1329 | } | |
1330 | ||
1331 | for (j = 0; j < ngrps; ++j) | |
1332 | { | |
1333 | /* If matches contains a single character only, and the current | |
1334 | group's label doesn't contain that character, go on to the | |
1335 | next group. */ | |
1336 | if (r->tokens[pos.index] >= 0 && r->tokens[pos.index] < _NOTCHAR | |
1337 | && !tstbit(r->tokens[pos.index], labels[j])) | |
1338 | continue; | |
1339 | ||
1340 | /* Check if this group's label has a nonempty intersection with | |
1341 | matches. */ | |
1342 | intersectf = 0; | |
1343 | for (k = 0; k < _CHARSET_INTS; ++k) | |
1344 | (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0; | |
1345 | if (! intersectf) | |
1346 | continue; | |
1347 | ||
1348 | /* It does; now find the set differences both ways. */ | |
1349 | leftoversf = matchesf = 0; | |
1350 | for (k = 0; k < _CHARSET_INTS; ++k) | |
1351 | { | |
1352 | /* Even an optimizing compiler can't know this for sure. */ | |
1353 | int match = matches[k], label = labels[j][k]; | |
1354 | ||
1355 | (leftovers[k] = ~match & label) ? leftoversf = 1 : 0; | |
1356 | (matches[k] = match & ~label) ? matchesf = 1 : 0; | |
1357 | } | |
1358 | ||
1359 | /* If there were leftovers, create a new group labeled with them. */ | |
1360 | if (leftoversf) | |
1361 | { | |
1362 | copyset(leftovers, labels[ngrps]); | |
1363 | copyset(intersect, labels[j]); | |
1364 | MALLOC(grps[ngrps].elems, _position, r->nleaves); | |
1365 | copy(&grps[j], &grps[ngrps]); | |
1366 | ++ngrps; | |
1367 | } | |
1368 | ||
1369 | /* Put the position in the current group. Note that there is no | |
1370 | reason to call insert() here. */ | |
1371 | grps[j].elems[grps[j].nelem++] = pos; | |
1372 | ||
1373 | /* If every character matching the current position has been | |
1374 | accounted for, we're done. */ | |
1375 | if (! matchesf) | |
1376 | break; | |
1377 | } | |
1378 | ||
1379 | /* If we've passed the last group, and there are still characters | |
1380 | unaccounted for, then we'll have to create a new group. */ | |
1381 | if (j == ngrps) | |
1382 | { | |
1383 | copyset(matches, labels[ngrps]); | |
1384 | zeroset(matches); | |
1385 | MALLOC(grps[ngrps].elems, _position, r->nleaves); | |
1386 | grps[ngrps].nelem = 1; | |
1387 | grps[ngrps].elems[0] = pos; | |
1388 | ++ngrps; | |
1389 | } | |
1390 | } | |
1391 | ||
1392 | MALLOC(follows.elems, _position, r->nleaves); | |
1393 | MALLOC(tmp.elems, _position, r->nleaves); | |
1394 | ||
1395 | /* If we are a searching matcher, the default transition is to a state | |
1396 | containing the positions of state 0, otherwise the default transition | |
1397 | is to fail miserably. */ | |
1398 | if (r->searchflag) | |
1399 | { | |
1400 | wants_newline = 0; | |
1401 | wants_letter = 0; | |
1402 | for (i = 0; i < r->states[0].elems.nelem; ++i) | |
1403 | { | |
1404 | if (_PREV_NEWLINE_DEPENDENT(r->states[0].elems.elems[i].constraint)) | |
1405 | wants_newline = 1; | |
1406 | if (_PREV_LETTER_DEPENDENT(r->states[0].elems.elems[i].constraint)) | |
1407 | wants_letter = 1; | |
1408 | } | |
1409 | copy(&r->states[0].elems, &follows); | |
1410 | state = state_index(r, &follows, 0, 0); | |
1411 | if (wants_newline) | |
1412 | state_newline = state_index(r, &follows, 1, 0); | |
1413 | else | |
1414 | state_newline = state; | |
1415 | if (wants_letter) | |
1416 | state_letter = state_index(r, &follows, 0, 1); | |
1417 | else | |
1418 | state_letter = state; | |
1419 | for (i = 0; i < _NOTCHAR; ++i) | |
1420 | trans[i] = (ISALNUM(i)) ? state_letter : state ; | |
1421 | trans['\n'] = state_newline; | |
1422 | } | |
1423 | else | |
1424 | for (i = 0; i < _NOTCHAR; ++i) | |
1425 | trans[i] = -1; | |
1426 | ||
1427 | for (i = 0; i < ngrps; ++i) | |
1428 | { | |
1429 | follows.nelem = 0; | |
1430 | ||
1431 | /* Find the union of the follows of the positions of the group. | |
1432 | This is a hideously inefficient loop. Fix it someday. */ | |
1433 | for (j = 0; j < grps[i].nelem; ++j) | |
1434 | for (k = 0; k < r->follows[grps[i].elems[j].index].nelem; ++k) | |
1435 | insert(r->follows[grps[i].elems[j].index].elems[k], &follows); | |
1436 | ||
1437 | /* If we are building a searching matcher, throw in the positions | |
1438 | of state 0 as well. */ | |
1439 | if (r->searchflag) | |
1440 | for (j = 0; j < r->states[0].elems.nelem; ++j) | |
1441 | insert(r->states[0].elems.elems[j], &follows); | |
1442 | ||
1443 | /* Find out if the new state will want any context information. */ | |
1444 | wants_newline = 0; | |
1445 | if (tstbit('\n', labels[i])) | |
1446 | for (j = 0; j < follows.nelem; ++j) | |
1447 | if (_PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) | |
1448 | wants_newline = 1; | |
1449 | ||
1450 | wants_letter = 0; | |
1451 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1452 | if (labels[i][j] & letters[j]) | |
1453 | break; | |
1454 | if (j < _CHARSET_INTS) | |
1455 | for (j = 0; j < follows.nelem; ++j) | |
1456 | if (_PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) | |
1457 | wants_letter = 1; | |
1458 | ||
1459 | /* Find the state(s) corresponding to the union of the follows. */ | |
1460 | state = state_index(r, &follows, 0, 0); | |
1461 | if (wants_newline) | |
1462 | state_newline = state_index(r, &follows, 1, 0); | |
1463 | else | |
1464 | state_newline = state; | |
1465 | if (wants_letter) | |
1466 | state_letter = state_index(r, &follows, 0, 1); | |
1467 | else | |
1468 | state_letter = state; | |
1469 | ||
1470 | /* Set the transitions for each character in the current label. */ | |
1471 | for (j = 0; j < _CHARSET_INTS; ++j) | |
1472 | for (k = 0; k < INTBITS; ++k) | |
1473 | if (labels[i][j] & 1 << k) | |
1474 | { | |
1475 | int c = j * INTBITS + k; | |
1476 | ||
1477 | if (c == '\n') | |
1478 | trans[c] = state_newline; | |
1479 | else if (ISALNUM(c)) | |
1480 | trans[c] = state_letter; | |
1481 | else if (c < _NOTCHAR) | |
1482 | trans[c] = state; | |
1483 | } | |
1484 | } | |
1485 | ||
1486 | for (i = 0; i < ngrps; ++i) | |
1487 | free(grps[i].elems); | |
1488 | free(follows.elems); | |
1489 | free(tmp.elems); | |
1490 | } | |
1491 | \f | |
1492 | /* Some routines for manipulating a compiled regexp's transition tables. | |
1493 | Each state may or may not have a transition table; if it does, and it | |
1494 | is a non-accepting state, then r->trans[state] points to its table. | |
1495 | If it is an accepting state then r->fails[state] points to its table. | |
1496 | If it has no table at all, then r->trans[state] is NULL. | |
1497 | TODO: Improve this comment, get rid of the unnecessary redundancy. */ | |
1498 | ||
1499 | static void | |
1500 | build_state(s, r) | |
1501 | int s; | |
1502 | struct regexp *r; | |
1503 | { | |
1504 | int *trans; /* The new transition table. */ | |
1505 | int i; | |
1506 | ||
1507 | /* Set an upper limit on the number of transition tables that will ever | |
1508 | exist at once. 1024 is arbitrary. The idea is that the frequently | |
1509 | used transition tables will be quickly rebuilt, whereas the ones that | |
1510 | were only needed once or twice will be cleared away. */ | |
1511 | if (r->trcount >= 1024) | |
1512 | { | |
1513 | for (i = 0; i < r->tralloc; ++i) | |
1514 | if (r->trans[i]) | |
1515 | { | |
1516 | free((ptr_t) r->trans[i]); | |
1517 | r->trans[i] = NULL; | |
1518 | } | |
1519 | else if (r->fails[i]) | |
1520 | { | |
1521 | free((ptr_t) r->fails[i]); | |
1522 | r->fails[i] = NULL; | |
1523 | } | |
1524 | r->trcount = 0; | |
1525 | } | |
1526 | ||
1527 | ++r->trcount; | |
1528 | ||
1529 | /* Set up the success bits for this state. */ | |
1530 | r->success[s] = 0; | |
1531 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 1, r->states[s].letter, 0, | |
1532 | s, *r)) | |
1533 | r->success[s] |= 4; | |
1534 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 1, | |
1535 | s, *r)) | |
1536 | r->success[s] |= 2; | |
1537 | if (ACCEPTS_IN_CONTEXT(r->states[s].newline, 0, r->states[s].letter, 0, | |
1538 | s, *r)) | |
1539 | r->success[s] |= 1; | |
1540 | ||
1541 | MALLOC(trans, int, _NOTCHAR); | |
1542 | regstate(s, r, trans); | |
1543 | ||
1544 | /* Now go through the new transition table, and make sure that the trans | |
1545 | and fail arrays are allocated large enough to hold a pointer for the | |
1546 | largest state mentioned in the table. */ | |
1547 | for (i = 0; i < _NOTCHAR; ++i) | |
1548 | if (trans[i] >= r->tralloc) | |
1549 | { | |
1550 | int oldalloc = r->tralloc; | |
1551 | ||
1552 | while (trans[i] >= r->tralloc) | |
1553 | r->tralloc *= 2; | |
1554 | REALLOC(r->realtrans, int *, r->tralloc + 1); | |
1555 | r->trans = r->realtrans + 1; | |
1556 | REALLOC(r->fails, int *, r->tralloc); | |
1557 | REALLOC(r->success, int, r->tralloc); | |
1558 | REALLOC(r->newlines, int, r->tralloc); | |
1559 | while (oldalloc < r->tralloc) | |
1560 | { | |
1561 | r->trans[oldalloc] = NULL; | |
1562 | r->fails[oldalloc++] = NULL; | |
1563 | } | |
1564 | } | |
1565 | ||
1566 | /* Keep the newline transition in a special place so we can use it as | |
1567 | a sentinel. */ | |
1568 | r->newlines[s] = trans['\n']; | |
1569 | trans['\n'] = -1; | |
1570 | ||
1571 | if (ACCEPTING(s, *r)) | |
1572 | r->fails[s] = trans; | |
1573 | else | |
1574 | r->trans[s] = trans; | |
1575 | } | |
1576 | ||
1577 | static void | |
1578 | build_state_zero(r) | |
1579 | struct regexp *r; | |
1580 | { | |
1581 | r->tralloc = 1; | |
1582 | r->trcount = 0; | |
1583 | CALLOC(r->realtrans, int *, r->tralloc + 1); | |
1584 | r->trans = r->realtrans + 1; | |
1585 | CALLOC(r->fails, int *, r->tralloc); | |
1586 | MALLOC(r->success, int, r->tralloc); | |
1587 | MALLOC(r->newlines, int, r->tralloc); | |
1588 | build_state(0, r); | |
1589 | } | |
1590 | \f | |
1591 | /* Search through a buffer looking for a match to the given struct regexp. | |
1592 | Find the first occurrence of a string matching the regexp in the buffer, | |
1593 | and the shortest possible version thereof. Return a pointer to the first | |
1594 | character after the match, or NULL if none is found. Begin points to | |
1595 | the beginning of the buffer, and end points to the first character after | |
1596 | its end. We store a newline in *end to act as a sentinel, so end had | |
1597 | better point somewhere valid. Newline is a flag indicating whether to | |
1598 | allow newlines to be in the matching string. If count is non- | |
1599 | NULL it points to a place we're supposed to increment every time we | |
1600 | see a newline. Finally, if backref is non-NULL it points to a place | |
1601 | where we're supposed to store a 1 if backreferencing happened and the | |
1602 | match needs to be verified by a backtracking matcher. Otherwise | |
1603 | we store a 0 in *backref. */ | |
1604 | char * | |
1605 | regexecute(r, begin, end, newline, count, backref) | |
1606 | struct regexp *r; | |
1607 | char *begin; | |
1608 | char *end; | |
1609 | int newline; | |
1610 | int *count; | |
1611 | int *backref; | |
1612 | { | |
1613 | register s, s1, tmp; /* Current state. */ | |
1614 | register unsigned char *p; /* Current input character. */ | |
1615 | register **trans, *t; /* Copy of r->trans so it can be optimized | |
1616 | into a register. */ | |
1617 | static sbit[_NOTCHAR]; /* Table for anding with r->success. */ | |
1618 | static sbit_init; | |
1619 | ||
1620 | if (! sbit_init) | |
1621 | { | |
1622 | int i; | |
1623 | ||
1624 | sbit_init = 1; | |
1625 | for (i = 0; i < _NOTCHAR; ++i) | |
1626 | sbit[i] = (ISALNUM(i)) ? 2 : 1; | |
1627 | sbit['\n'] = 4; | |
1628 | } | |
1629 | ||
1630 | if (! r->tralloc) | |
1631 | build_state_zero(r); | |
1632 | ||
1633 | s = s1 = 0; | |
1634 | p = (unsigned char *) begin; | |
1635 | trans = r->trans; | |
1636 | *end = '\n'; | |
1637 | ||
1638 | for (;;) | |
1639 | { | |
1640 | while ((t = trans[s]) != 0) { /* hand-optimized loop */ | |
1641 | s1 = t[*p++]; | |
1642 | if ((t = trans[s1]) == 0) { | |
1643 | tmp = s ; s = s1 ; s1 = tmp ; /* swap */ | |
1644 | break; | |
1645 | } | |
1646 | s = t[*p++]; | |
1647 | } | |
1648 | ||
1649 | if (s >= 0 && p <= (unsigned char *) end && r->fails[s]) | |
1650 | { | |
1651 | if (r->success[s] & sbit[*p]) | |
1652 | { | |
1653 | if (backref) | |
1654 | *backref = (r->states[s].backref != 0); | |
1655 | return (char *) p; | |
1656 | } | |
1657 | ||
1658 | s1 = s; | |
1659 | s = r->fails[s][*p++]; | |
1660 | continue; | |
1661 | } | |
1662 | ||
1663 | /* If the previous character was a newline, count it. */ | |
1664 | if (count && (char *) p <= end && p[-1] == '\n') | |
1665 | ++*count; | |
1666 | ||
1667 | /* Check if we've run off the end of the buffer. */ | |
1668 | if ((char *) p >= end) | |
1669 | return NULL; | |
1670 | ||
1671 | if (s >= 0) | |
1672 | { | |
1673 | build_state(s, r); | |
1674 | trans = r->trans; | |
1675 | continue; | |
1676 | } | |
1677 | ||
1678 | if (p[-1] == '\n' && newline) | |
1679 | { | |
1680 | s = r->newlines[s1]; | |
1681 | continue; | |
1682 | } | |
1683 | ||
1684 | s = 0; | |
1685 | } | |
1686 | } | |
1687 | \f | |
1688 | /* Initialize the components of a regexp that the other routines don't | |
1689 | initialize for themselves. */ | |
1690 | void | |
1691 | reginit(r) | |
1692 | struct regexp *r; | |
1693 | { | |
1694 | r->calloc = 1; | |
1695 | MALLOC(r->charsets, _charset, r->calloc); | |
1696 | r->cindex = 0; | |
1697 | ||
1698 | r->talloc = 1; | |
1699 | MALLOC(r->tokens, _token, r->talloc); | |
1700 | r->tindex = r->depth = r->nleaves = r->nregexps = 0; | |
1701 | ||
1702 | r->searchflag = 0; | |
1703 | r->tralloc = 0; | |
1704 | } | |
1705 | ||
1706 | /* Parse and analyze a single string of the given length. */ | |
1707 | void | |
1708 | regcompile(s, len, r, searchflag) | |
1709 | const char *s; | |
1710 | size_t len; | |
1711 | struct regexp *r; | |
1712 | int searchflag; | |
1713 | { | |
1714 | if (case_fold) /* dummy folding in service of regmust() */ | |
1715 | { | |
1716 | char *regcopy; | |
1717 | int i; | |
1718 | ||
1719 | regcopy = malloc(len); | |
1720 | if (!regcopy) | |
1721 | reg_error("out of memory"); | |
1722 | ||
1723 | /* This is a complete kludge and could potentially break | |
1724 | \<letter> escapes . . . */ | |
1725 | case_fold = 0; | |
1726 | for (i = 0; i < len; ++i) | |
1727 | if (ISUPPER(s[i])) | |
1728 | regcopy[i] = tolower(s[i]); | |
1729 | else | |
1730 | regcopy[i] = s[i]; | |
1731 | ||
1732 | reginit(r); | |
1733 | r->mustn = 0; | |
1734 | r->must[0] = '\0'; | |
1735 | regparse(regcopy, len, r); | |
1736 | free(regcopy); | |
1737 | regmust(r); | |
1738 | reganalyze(r, searchflag); | |
1739 | case_fold = 1; | |
1740 | reginit(r); | |
1741 | regparse(s, len, r); | |
1742 | reganalyze(r, searchflag); | |
1743 | } | |
1744 | else | |
1745 | { | |
1746 | reginit(r); | |
1747 | regparse(s, len, r); | |
1748 | regmust(r); | |
1749 | reganalyze(r, searchflag); | |
1750 | } | |
1751 | } | |
1752 | ||
1753 | /* Free the storage held by the components of a regexp. */ | |
1754 | void | |
1755 | reg_free(r) | |
1756 | struct regexp *r; | |
1757 | { | |
1758 | int i; | |
1759 | ||
1760 | free((ptr_t) r->charsets); | |
1761 | free((ptr_t) r->tokens); | |
1762 | for (i = 0; i < r->sindex; ++i) | |
1763 | free((ptr_t) r->states[i].elems.elems); | |
1764 | free((ptr_t) r->states); | |
1765 | for (i = 0; i < r->tindex; ++i) | |
1766 | if (r->follows[i].elems) | |
1767 | free((ptr_t) r->follows[i].elems); | |
1768 | free((ptr_t) r->follows); | |
1769 | for (i = 0; i < r->tralloc; ++i) | |
1770 | if (r->trans[i]) | |
1771 | free((ptr_t) r->trans[i]); | |
1772 | else if (r->fails[i]) | |
1773 | free((ptr_t) r->fails[i]); | |
1774 | if (r->realtrans) | |
1775 | free((ptr_t) r->realtrans); | |
1776 | if (r->fails) | |
1777 | free((ptr_t) r->fails); | |
1778 | if (r->newlines) | |
1779 | free((ptr_t) r->newlines); | |
1780 | } | |
1781 | ||
1782 | /* | |
1783 | Having found the postfix representation of the regular expression, | |
1784 | try to find a long sequence of characters that must appear in any line | |
1785 | containing the r.e. | |
1786 | Finding a "longest" sequence is beyond the scope here; | |
1787 | we take an easy way out and hope for the best. | |
1788 | (Take "(ab|a)b"--please.) | |
1789 | ||
1790 | We do a bottom-up calculation of sequences of characters that must appear | |
1791 | in matches of r.e.'s represented by trees rooted at the nodes of the postfix | |
1792 | representation: | |
1793 | sequences that must appear at the left of the match ("left") | |
1794 | sequences that must appear at the right of the match ("right") | |
1795 | lists of sequences that must appear somewhere in the match ("in") | |
1796 | sequences that must constitute the match ("is") | |
1797 | When we get to the root of the tree, we use one of the longest of its | |
1798 | calculated "in" sequences as our answer. The sequence we find is returned in | |
1799 | r->must (where "r" is the single argument passed to "regmust"); | |
1800 | the length of the sequence is returned in r->mustn. | |
1801 | ||
1802 | The sequences calculated for the various types of node (in pseudo ANSI c) | |
1803 | are shown below. "p" is the operand of unary operators (and the left-hand | |
1804 | operand of binary operators); "q" is the right-hand operand of binary operators | |
1805 | . | |
1806 | "ZERO" means "a zero-length sequence" below. | |
1807 | ||
1808 | Type left right is in | |
1809 | ---- ---- ----- -- -- | |
1810 | char c # c # c # c # c | |
1811 | ||
1812 | SET ZERO ZERO ZERO ZERO | |
1813 | ||
1814 | STAR ZERO ZERO ZERO ZERO | |
1815 | ||
1816 | QMARK ZERO ZERO ZERO ZERO | |
1817 | ||
1818 | PLUS p->left p->right ZERO p->in | |
1819 | ||
1820 | CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus | |
1821 | p->left : q->right : q->is!=ZERO) ? q->in plus | |
1822 | p->is##q->left p->right##q->is p->is##q->is : p->right##q->left | |
1823 | ZERO | |
1824 | ||
1825 | OR longest common longest common (do p->is and substrings common to | |
1826 | leading trailing q->is have same p->in and q->in | |
1827 | (sub)sequence (sub)sequence length and | |
1828 | of p->left of p->right content) ? | |
1829 | and q->left and q->right p->is : NULL | |
1830 | ||
1831 | If there's anything else we recognize in the tree, all four sequences get set | |
1832 | to zero-length sequences. If there's something we don't recognize in the tree, | |
1833 | we just return a zero-length sequence. | |
1834 | ||
1835 | Break ties in favor of infrequent letters (choosing 'zzz' in preference to | |
1836 | 'aaa')? | |
1837 | ||
1838 | And. . .is it here or someplace that we might ponder "optimizations" such as | |
1839 | egrep 'psi|epsilon' -> egrep 'psi' | |
1840 | egrep 'pepsi|epsilon' -> egrep 'epsi' | |
1841 | (Yes, we now find "epsi" as a "string | |
1842 | that must occur", but we might also | |
1843 | simplify the *entire* r.e. being sought | |
1844 | ) | |
1845 | grep '[c]' -> grep 'c' | |
1846 | grep '(ab|a)b' -> grep 'ab' | |
1847 | grep 'ab*' -> grep 'a' | |
1848 | grep 'a*b' -> grep 'b' | |
1849 | There are several issues: | |
1850 | Is optimization easy (enough)? | |
1851 | ||
1852 | Does optimization actually accomplish anything, | |
1853 | or is the automaton you get from "psi|epsilon" (for example) | |
1854 | the same as the one you get from "psi" (for example)? | |
1855 | ||
1856 | Are optimizable r.e.'s likely to be used in real-life situations | |
1857 | (something like 'ab*' is probably unlikely; something like is | |
1858 | 'psi|epsilon' is likelier)? | |
1859 | */ | |
1860 | ||
1861 | static char * | |
1862 | icatalloc(old, new) | |
1863 | char * old; | |
1864 | const char * new; | |
1865 | { | |
1866 | register char * result; | |
1867 | register int oldsize, newsize; | |
1868 | ||
1869 | newsize = (new == NULL) ? 0 : strlen(new); | |
1870 | if (old == NULL) | |
1871 | oldsize = 0; | |
1872 | else if (newsize == 0) | |
1873 | return old; | |
1874 | else oldsize = strlen(old); | |
1875 | if (old == NULL) | |
1876 | result = (char *) malloc(newsize + 1); | |
1877 | else result = (char *) realloc((void *) old, oldsize + newsize + 1); | |
1878 | if (result != NULL && new != NULL) | |
1879 | (void) strcpy(result + oldsize, new); | |
1880 | return result; | |
1881 | } | |
1882 | ||
1883 | static char * | |
1884 | icpyalloc(string) | |
1885 | const char * string; | |
1886 | { | |
1887 | return icatalloc((char *) NULL, string); | |
1888 | } | |
1889 | ||
1890 | static char * | |
1891 | istrstr(lookin, lookfor) | |
1892 | char * lookin; | |
1893 | register char * lookfor; | |
1894 | { | |
1895 | register char * cp; | |
1896 | register int len; | |
1897 | ||
1898 | len = strlen(lookfor); | |
1899 | for (cp = lookin; *cp != '\0'; ++cp) | |
1900 | if (strncmp(cp, lookfor, len) == 0) | |
1901 | return cp; | |
1902 | return NULL; | |
1903 | } | |
1904 | ||
1905 | static void | |
1906 | ifree(cp) | |
1907 | char * cp; | |
1908 | { | |
1909 | if (cp != NULL) | |
1910 | free(cp); | |
1911 | } | |
1912 | ||
1913 | static void | |
1914 | freelist(cpp) | |
1915 | register char ** cpp; | |
1916 | { | |
1917 | register int i; | |
1918 | ||
1919 | if (cpp == NULL) | |
1920 | return; | |
1921 | for (i = 0; cpp[i] != NULL; ++i) { | |
1922 | free(cpp[i]); | |
1923 | cpp[i] = NULL; | |
1924 | } | |
1925 | } | |
1926 | ||
1927 | static char ** | |
1928 | enlist(cpp, new, len) | |
1929 | register char ** cpp; | |
1930 | register char * new; | |
1931 | #ifdef __STDC__ | |
1932 | size_t len; | |
1933 | #else | |
1934 | int len; | |
1935 | #endif | |
1936 | { | |
1937 | register int i, j; | |
1938 | ||
1939 | if (cpp == NULL) | |
1940 | return NULL; | |
1941 | if ((new = icpyalloc(new)) == NULL) { | |
1942 | freelist(cpp); | |
1943 | return NULL; | |
1944 | } | |
1945 | new[len] = '\0'; | |
1946 | /* | |
1947 | ** Is there already something in the list that's new (or longer)? | |
1948 | */ | |
1949 | for (i = 0; cpp[i] != NULL; ++i) | |
1950 | if (istrstr(cpp[i], new) != NULL) { | |
1951 | free(new); | |
1952 | return cpp; | |
1953 | } | |
1954 | /* | |
1955 | ** Eliminate any obsoleted strings. | |
1956 | */ | |
1957 | j = 0; | |
1958 | while (cpp[j] != NULL) | |
1959 | if (istrstr(new, cpp[j]) == NULL) | |
1960 | ++j; | |
1961 | else { | |
1962 | free(cpp[j]); | |
1963 | if (--i == j) | |
1964 | break; | |
1965 | cpp[j] = cpp[i]; | |
1966 | } | |
1967 | /* | |
1968 | ** Add the new string. | |
1969 | */ | |
1970 | cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); | |
1971 | if (cpp == NULL) | |
1972 | return NULL; | |
1973 | cpp[i] = new; | |
1974 | cpp[i + 1] = NULL; | |
1975 | return cpp; | |
1976 | } | |
1977 | ||
1978 | /* | |
1979 | ** Given pointers to two strings, | |
1980 | ** return a pointer to an allocated list of their distinct common substrings. | |
1981 | ** Return NULL if something seems wild. | |
1982 | */ | |
1983 | ||
1984 | static char ** | |
1985 | comsubs(left, right) | |
1986 | char * left; | |
1987 | char * right; | |
1988 | { | |
1989 | register char ** cpp; | |
1990 | register char * lcp; | |
1991 | register char * rcp; | |
1992 | register int i, len; | |
1993 | ||
1994 | if (left == NULL || right == NULL) | |
1995 | return NULL; | |
1996 | cpp = (char **) malloc(sizeof *cpp); | |
1997 | if (cpp == NULL) | |
1998 | return NULL; | |
1999 | cpp[0] = NULL; | |
2000 | for (lcp = left; *lcp != '\0'; ++lcp) { | |
2001 | len = 0; | |
2002 | rcp = strchr(right, *lcp); | |
2003 | while (rcp != NULL) { | |
2004 | for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) | |
2005 | ; | |
2006 | if (i > len) | |
2007 | len = i; | |
2008 | rcp = strchr(rcp + 1, *lcp); | |
2009 | } | |
2010 | if (len == 0) | |
2011 | continue; | |
2012 | #ifdef __STDC__ | |
2013 | if ((cpp = enlist(cpp, lcp, (size_t)len)) == NULL) | |
2014 | #else | |
2015 | if ((cpp = enlist(cpp, lcp, len)) == NULL) | |
2016 | #endif | |
2017 | break; | |
2018 | } | |
2019 | return cpp; | |
2020 | } | |
2021 | ||
2022 | static char ** | |
2023 | addlists(old, new) | |
2024 | char ** old; | |
2025 | char ** new; | |
2026 | { | |
2027 | register int i; | |
2028 | ||
2029 | if (old == NULL || new == NULL) | |
2030 | return NULL; | |
2031 | for (i = 0; new[i] != NULL; ++i) { | |
2032 | old = enlist(old, new[i], strlen(new[i])); | |
2033 | if (old == NULL) | |
2034 | break; | |
2035 | } | |
2036 | return old; | |
2037 | } | |
2038 | ||
2039 | /* | |
2040 | ** Given two lists of substrings, | |
2041 | ** return a new list giving substrings common to both. | |
2042 | */ | |
2043 | ||
2044 | static char ** | |
2045 | inboth(left, right) | |
2046 | char ** left; | |
2047 | char ** right; | |
2048 | { | |
2049 | register char ** both; | |
2050 | register char ** temp; | |
2051 | register int lnum, rnum; | |
2052 | ||
2053 | if (left == NULL || right == NULL) | |
2054 | return NULL; | |
2055 | both = (char **) malloc(sizeof *both); | |
2056 | if (both == NULL) | |
2057 | return NULL; | |
2058 | both[0] = NULL; | |
2059 | for (lnum = 0; left[lnum] != NULL; ++lnum) { | |
2060 | for (rnum = 0; right[rnum] != NULL; ++rnum) { | |
2061 | temp = comsubs(left[lnum], right[rnum]); | |
2062 | if (temp == NULL) { | |
2063 | freelist(both); | |
2064 | return NULL; | |
2065 | } | |
2066 | both = addlists(both, temp); | |
2067 | freelist(temp); | |
2068 | if (both == NULL) | |
2069 | return NULL; | |
2070 | } | |
2071 | } | |
2072 | return both; | |
2073 | } | |
2074 | ||
2075 | /* | |
2076 | typedef struct { | |
2077 | char ** in; | |
2078 | char * left; | |
2079 | char * right; | |
2080 | char * is; | |
2081 | } must; | |
2082 | */ | |
2083 | static void | |
2084 | resetmust(mp) | |
2085 | register must * mp; | |
2086 | { | |
2087 | mp->left[0] = mp->right[0] = mp->is[0] = '\0'; | |
2088 | freelist(mp->in); | |
2089 | } | |
2090 | ||
2091 | static void | |
2092 | regmust(r) | |
2093 | register struct regexp * r; | |
2094 | { | |
2095 | register must * musts; | |
2096 | register must * mp; | |
2097 | register char * result = ""; | |
2098 | register int ri; | |
2099 | register int i; | |
2100 | register _token t; | |
2101 | static must must0; | |
2102 | ||
2103 | reg->mustn = 0; | |
2104 | reg->must[0] = '\0'; | |
2105 | musts = (must *) malloc((reg->tindex + 1) * sizeof *musts); | |
2106 | if (musts == NULL) | |
2107 | return; | |
2108 | mp = musts; | |
2109 | for (i = 0; i <= reg->tindex; ++i) | |
2110 | mp[i] = must0; | |
2111 | for (i = 0; i <= reg->tindex; ++i) { | |
2112 | mp[i].in = (char **) malloc(sizeof *mp[i].in); | |
2113 | mp[i].left = malloc(2); | |
2114 | mp[i].right = malloc(2); | |
2115 | mp[i].is = malloc(2); | |
2116 | if (mp[i].in == NULL || mp[i].left == NULL || | |
2117 | mp[i].right == NULL || mp[i].is == NULL) | |
2118 | goto done; | |
2119 | mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; | |
2120 | mp[i].in[0] = NULL; | |
2121 | } | |
2122 | for (ri = 0; ri < reg->tindex; ++ri) { | |
2123 | switch (t = reg->tokens[ri]) { | |
2124 | case _ALLBEGLINE: | |
2125 | case _ALLENDLINE: | |
2126 | case _LPAREN: | |
2127 | case _RPAREN: | |
2128 | goto done; /* "cannot happen" */ | |
2129 | case _EMPTY: | |
2130 | case _BEGLINE: | |
2131 | case _ENDLINE: | |
2132 | case _BEGWORD: | |
2133 | case _ENDWORD: | |
2134 | case _LIMWORD: | |
2135 | case _NOTLIMWORD: | |
2136 | case _BACKREF: | |
2137 | resetmust(mp); | |
2138 | break; | |
2139 | case _STAR: | |
2140 | case _QMARK: | |
2141 | if (mp <= musts) | |
2142 | goto done; /* "cannot happen" */ | |
2143 | --mp; | |
2144 | resetmust(mp); | |
2145 | break; | |
2146 | case _OR: | |
2147 | if (mp < &musts[2]) | |
2148 | goto done; /* "cannot happen" */ | |
2149 | { | |
2150 | register char ** new; | |
2151 | register must * lmp; | |
2152 | register must * rmp; | |
2153 | register int j, ln, rn, n; | |
2154 | ||
2155 | rmp = --mp; | |
2156 | lmp = --mp; | |
2157 | /* Guaranteed to be. Unlikely, but. . . */ | |
2158 | if (strcmp(lmp->is, rmp->is) != 0) | |
2159 | lmp->is[0] = '\0'; | |
2160 | /* Left side--easy */ | |
2161 | i = 0; | |
2162 | while (lmp->left[i] != '\0' && | |
2163 | lmp->left[i] == rmp->left[i]) | |
2164 | ++i; | |
2165 | lmp->left[i] = '\0'; | |
2166 | /* Right side */ | |
2167 | ln = strlen(lmp->right); | |
2168 | rn = strlen(rmp->right); | |
2169 | n = ln; | |
2170 | if (n > rn) | |
2171 | n = rn; | |
2172 | for (i = 0; i < n; ++i) | |
2173 | if (lmp->right[ln - i - 1] != | |
2174 | rmp->right[rn - i - 1]) | |
2175 | break; | |
2176 | for (j = 0; j < i; ++j) | |
2177 | lmp->right[j] = | |
2178 | lmp->right[(ln - i) + j]; | |
2179 | lmp->right[j] = '\0'; | |
2180 | new = inboth(lmp->in, rmp->in); | |
2181 | if (new == NULL) | |
2182 | goto done; | |
2183 | freelist(lmp->in); | |
2184 | free((char *) lmp->in); | |
2185 | lmp->in = new; | |
2186 | } | |
2187 | break; | |
2188 | case _PLUS: | |
2189 | if (mp <= musts) | |
2190 | goto done; /* "cannot happen" */ | |
2191 | --mp; | |
2192 | mp->is[0] = '\0'; | |
2193 | break; | |
2194 | case _END: | |
2195 | if (mp != &musts[1]) | |
2196 | goto done; /* "cannot happen" */ | |
2197 | for (i = 0; musts[0].in[i] != NULL; ++i) | |
2198 | if (strlen(musts[0].in[i]) > strlen(result)) | |
2199 | result = musts[0].in[i]; | |
2200 | goto done; | |
2201 | case _CAT: | |
2202 | if (mp < &musts[2]) | |
2203 | goto done; /* "cannot happen" */ | |
2204 | { | |
2205 | register must * lmp; | |
2206 | register must * rmp; | |
2207 | ||
2208 | rmp = --mp; | |
2209 | lmp = --mp; | |
2210 | /* | |
2211 | ** In. Everything in left, plus everything in | |
2212 | ** right, plus catenation of | |
2213 | ** left's right and right's left. | |
2214 | */ | |
2215 | lmp->in = addlists(lmp->in, rmp->in); | |
2216 | if (lmp->in == NULL) | |
2217 | goto done; | |
2218 | if (lmp->right[0] != '\0' && | |
2219 | rmp->left[0] != '\0') { | |
2220 | register char * tp; | |
2221 | ||
2222 | tp = icpyalloc(lmp->right); | |
2223 | if (tp == NULL) | |
2224 | goto done; | |
2225 | tp = icatalloc(tp, rmp->left); | |
2226 | if (tp == NULL) | |
2227 | goto done; | |
2228 | lmp->in = enlist(lmp->in, tp, | |
2229 | strlen(tp)); | |
2230 | free(tp); | |
2231 | if (lmp->in == NULL) | |
2232 | goto done; | |
2233 | } | |
2234 | /* Left-hand */ | |
2235 | if (lmp->is[0] != '\0') { | |
2236 | lmp->left = icatalloc(lmp->left, | |
2237 | rmp->left); | |
2238 | if (lmp->left == NULL) | |
2239 | goto done; | |
2240 | } | |
2241 | /* Right-hand */ | |
2242 | if (rmp->is[0] == '\0') | |
2243 | lmp->right[0] = '\0'; | |
2244 | lmp->right = icatalloc(lmp->right, rmp->right); | |
2245 | if (lmp->right == NULL) | |
2246 | goto done; | |
2247 | /* Guaranteed to be */ | |
2248 | if (lmp->is[0] != '\0' && rmp->is[0] != '\0') { | |
2249 | lmp->is = icatalloc(lmp->is, rmp->is); | |
2250 | if (lmp->is == NULL) | |
2251 | goto done; | |
2252 | } | |
2253 | } | |
2254 | break; | |
2255 | default: | |
2256 | if (t < _END) { | |
2257 | /* "cannot happen" */ | |
2258 | goto done; | |
2259 | } else if (t == '\0') { | |
2260 | /* not on *my* shift */ | |
2261 | goto done; | |
2262 | } else if (t >= _SET) { | |
2263 | /* easy enough */ | |
2264 | resetmust(mp); | |
2265 | } else { | |
2266 | /* plain character */ | |
2267 | resetmust(mp); | |
2268 | mp->is[0] = mp->left[0] = mp->right[0] = t; | |
2269 | mp->is[1] = mp->left[1] = mp->right[1] = '\0'; | |
2270 | mp->in = enlist(mp->in, mp->is, 1); | |
2271 | if (mp->in == NULL) | |
2272 | goto done; | |
2273 | } | |
2274 | break; | |
2275 | } | |
2276 | ++mp; | |
2277 | } | |
2278 | done: | |
2279 | (void) strncpy(reg->must, result, MUST_MAX - 1); | |
2280 | reg->must[MUST_MAX - 1] = '\0'; | |
2281 | reg->mustn = strlen(reg->must); | |
2282 | mp = musts; | |
2283 | for (i = 0; i <= reg->tindex; ++i) { | |
2284 | freelist(mp[i].in); | |
2285 | ifree((char *) mp[i].in); | |
2286 | ifree(mp[i].left); | |
2287 | ifree(mp[i].right); | |
2288 | ifree(mp[i].is); | |
2289 | } | |
2290 | free((char *) mp); | |
2291 | } |