Commit | Line | Data |
---|---|---|
42d6e430 BJ |
1 | #include "awk.def" |
2 | #include "stdio.h" | |
3 | #include "awk.h" | |
4 | ||
5 | extern node *op2(); | |
6 | extern struct fa *cgotofn(); | |
7 | #define MAXLIN 256 | |
8 | #define NCHARS 128 | |
9 | #define NSTATES 256 | |
10 | ||
11 | #define type(v) v->nobj | |
12 | #define left(v) v->narg[0] | |
13 | #define right(v) v->narg[1] | |
14 | #define parent(v) v->nnext | |
15 | ||
16 | #define LEAF case CCL: case NCCL: case CHAR: case DOT: | |
17 | #define UNARY case FINAL: case STAR: case PLUS: case QUEST: | |
18 | ||
19 | /* encoding in tree nodes: | |
20 | leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value | |
21 | unary (FINAL, STAR, PLUS, QUEST): left is child, right is null | |
22 | binary (CAT, OR): left and right are children | |
23 | parent contains pointer to parent | |
24 | */ | |
25 | ||
26 | struct fa { | |
27 | int cch; | |
28 | struct fa *st; | |
29 | }; | |
30 | ||
31 | int *state[NSTATES]; | |
32 | int *foll[MAXLIN]; | |
33 | char chars[MAXLIN]; | |
34 | int setvec[MAXLIN]; | |
35 | node *point[MAXLIN]; | |
36 | ||
37 | int setcnt; | |
38 | int line; | |
39 | ||
40 | ||
41 | struct fa *makedfa(p) /* returns dfa for tree pointed to by p */ | |
42 | node *p; | |
43 | { | |
44 | node *p1; | |
45 | struct fa *fap; | |
46 | p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p); | |
47 | /* put DOT STAR in front of reg. exp. */ | |
48 | p1 = op2(FINAL, p1, (node *) 0); /* install FINAL node */ | |
49 | ||
50 | line = 0; | |
51 | penter(p1); /* enter parent pointers and leaf indices */ | |
52 | point[line] = p1; /* FINAL node */ | |
53 | setvec[0] = 1; /* for initial DOT STAR */ | |
54 | cfoll(p1); /* set up follow sets */ | |
55 | fap = cgotofn(); | |
56 | freetr(p1); /* add this when alloc works */ | |
57 | return(fap); | |
58 | } | |
59 | ||
60 | penter(p) /* set up parent pointers and leaf indices */ | |
61 | node *p; | |
62 | { | |
63 | switch(type(p)) { | |
64 | LEAF | |
65 | left(p) = (node *) line; | |
66 | point[line++] = p; | |
67 | break; | |
68 | UNARY | |
69 | penter(left(p)); | |
70 | parent(left(p)) = p; | |
71 | break; | |
72 | case CAT: | |
73 | case OR: | |
74 | penter(left(p)); | |
75 | penter(right(p)); | |
76 | parent(left(p)) = p; | |
77 | parent(right(p)) = p; | |
78 | break; | |
79 | default: | |
80 | error(FATAL, "unknown type %d in penter\n", type(p)); | |
81 | break; | |
82 | } | |
83 | } | |
84 | ||
85 | freetr(p) /* free parse tree and follow sets */ | |
86 | node *p; | |
87 | { | |
88 | switch(type(p)) { | |
89 | LEAF | |
90 | xfree(foll[(int) left(p)]); | |
91 | xfree(p); | |
92 | break; | |
93 | UNARY | |
94 | freetr(left(p)); | |
95 | xfree(p); | |
96 | break; | |
97 | case CAT: | |
98 | case OR: | |
99 | freetr(left(p)); | |
100 | freetr(right(p)); | |
101 | xfree(p); | |
102 | break; | |
103 | default: | |
104 | error(FATAL, "unknown type %d in freetr", type(p)); | |
105 | break; | |
106 | } | |
107 | } | |
108 | char *cclenter(p) | |
109 | register char *p; | |
110 | { | |
111 | register i, c; | |
112 | char *op; | |
113 | ||
114 | op = p; | |
115 | i = 0; | |
116 | while ((c = *p++) != 0) { | |
117 | if (c == '-' && i > 0 && chars[i-1] != 0) { | |
118 | if (*p != 0) { | |
119 | c = chars[i-1]; | |
120 | while (c < *p) { | |
121 | if (i >= MAXLIN) | |
122 | overflo(); | |
123 | chars[i++] = ++c; | |
124 | } | |
125 | p++; | |
126 | continue; | |
127 | } | |
128 | } | |
129 | if (i >= MAXLIN) | |
130 | overflo(); | |
131 | chars[i++] = c; | |
132 | } | |
133 | chars[i++] = '\0'; | |
134 | dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL); | |
135 | xfree(op); | |
136 | return(tostring(chars)); | |
137 | } | |
138 | ||
139 | overflo() | |
140 | { | |
141 | error(FATAL, "regular expression too long\n"); | |
142 | } | |
143 | ||
144 | cfoll(v) /* enter follow set of each leaf of vertex v into foll[leaf] */ | |
145 | register node *v; | |
146 | { | |
147 | register i; | |
148 | int prev; | |
149 | int *add(); | |
150 | ||
151 | switch(type(v)) { | |
152 | LEAF | |
153 | setcnt = 0; | |
154 | for (i=1; i<=line; i++) | |
155 | setvec[i] = 0; | |
156 | follow(v); | |
157 | if (notin(foll, ( (int) left(v))-1, &prev)) { | |
158 | foll[(int) left(v)] = add(setcnt); | |
159 | } | |
160 | else | |
161 | foll[ (int) left(v)] = foll[prev]; | |
162 | break; | |
163 | UNARY | |
164 | cfoll(left(v)); | |
165 | break; | |
166 | case CAT: | |
167 | case OR: | |
168 | cfoll(left(v)); | |
169 | cfoll(right(v)); | |
170 | break; | |
171 | default: | |
172 | error(FATAL, "unknown type %d in cfoll", type(v)); | |
173 | } | |
174 | } | |
175 | ||
176 | first(p) /* collects initially active leaves of p into setvec */ | |
177 | register node *p; /* returns 0 or 1 depending on whether p matches empty string */ | |
178 | { | |
179 | register b; | |
180 | ||
181 | switch(type(p)) { | |
182 | LEAF | |
183 | if (setvec[(int) left(p)] != 1) { | |
184 | setvec[(int) left(p)] = 1; | |
185 | setcnt++; | |
186 | } | |
187 | if (type(p) == CCL && (*(char *) right(p)) == '\0') | |
188 | return(0); /* empty CCL */ | |
189 | else return(1); | |
190 | case FINAL: | |
191 | case PLUS: | |
192 | if (first(left(p)) == 0) return(0); | |
193 | return(1); | |
194 | case STAR: | |
195 | case QUEST: | |
196 | first(left(p)); | |
197 | return(0); | |
198 | case CAT: | |
199 | if (first(left(p)) == 0 && first(right(p)) == 0) return(0); | |
200 | return(1); | |
201 | case OR: | |
202 | b = first(right(p)); | |
203 | if (first(left(p)) == 0 || b == 0) return(0); | |
204 | return(1); | |
205 | } | |
206 | error(FATAL, "unknown type %d in first\n", type(p)); | |
207 | return(-1); | |
208 | } | |
209 | ||
210 | follow(v) | |
211 | node *v; /* collects leaves that can follow v into setvec */ | |
212 | { | |
213 | node *p; | |
214 | ||
215 | if (type(v) == FINAL) | |
216 | return; | |
217 | p = parent(v); | |
218 | switch (type(p)) { | |
219 | case STAR: | |
220 | case PLUS: first(v); | |
221 | follow(p); | |
222 | return; | |
223 | ||
224 | case OR: | |
225 | case QUEST: follow(p); | |
226 | return; | |
227 | ||
228 | case CAT: if (v == left(p)) { /* v is left child of p */ | |
229 | if (first(right(p)) == 0) { | |
230 | follow(p); | |
231 | return; | |
232 | } | |
233 | } | |
234 | else /* v is right child */ | |
235 | follow(p); | |
236 | return; | |
237 | case FINAL: if (setvec[line] != 1) { | |
238 | setvec[line] = 1; | |
239 | setcnt++; | |
240 | } | |
241 | return; | |
242 | } | |
243 | } | |
244 | ||
245 | member(c, s) /* is c in s? */ | |
246 | register char c, *s; | |
247 | { | |
248 | while (*s) | |
249 | if (c == *s++) | |
250 | return(1); | |
251 | return(0); | |
252 | } | |
253 | ||
254 | notin(array, n, prev) /* is setvec in array[0] thru array[n]? */ | |
255 | int **array; | |
256 | int *prev; { | |
257 | register i, j; | |
258 | int *ptr; | |
259 | for (i=0; i<=n; i++) { | |
260 | ptr = array[i]; | |
261 | if (*ptr == setcnt) { | |
262 | for (j=0; j < setcnt; j++) | |
263 | if (setvec[*(++ptr)] != 1) goto nxt; | |
264 | *prev = i; | |
265 | return(0); | |
266 | } | |
267 | nxt: ; | |
268 | } | |
269 | return(1); | |
270 | } | |
271 | ||
272 | int *add(n) { /* remember setvec */ | |
273 | int *ptr, *p; | |
274 | register i; | |
275 | if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL) | |
276 | overflo(); | |
277 | *ptr = n; | |
278 | dprintf("add(%d)\n", n, NULL, NULL); | |
279 | for (i=1; i <= line; i++) | |
280 | if (setvec[i] == 1) { | |
281 | *(++ptr) = i; | |
282 | dprintf(" ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i); | |
283 | } | |
284 | dprintf("\n", NULL, NULL, NULL); | |
285 | return(p); | |
286 | } | |
287 | ||
288 | struct fa *cgotofn() | |
289 | { | |
290 | register i, k; | |
291 | register int *ptr; | |
292 | char c; | |
293 | char *p; | |
294 | node *cp; | |
295 | int j, n, s, ind, numtrans; | |
296 | int finflg; | |
297 | int curpos, num, prev; | |
298 | struct fa *where[NSTATES]; | |
299 | ||
300 | int fatab[257]; | |
301 | struct fa *pfa; | |
302 | ||
303 | char index[MAXLIN]; | |
304 | char iposns[MAXLIN]; | |
305 | int sposns[MAXLIN]; | |
306 | int spmax, spinit; | |
307 | ||
308 | char symbol[NCHARS]; | |
309 | char isyms[NCHARS]; | |
310 | char ssyms[NCHARS]; | |
311 | int ssmax, ssinit; | |
312 | ||
313 | for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0; | |
314 | for (i=0; i<NCHARS; i++) isyms[i] = symbol[i] = 0; | |
315 | setcnt = 0; | |
316 | state[0] = add(0); | |
317 | /* compute initial positions and symbols of state 0 */ | |
318 | ssmax = 0; | |
319 | ptr = foll[0]; | |
320 | spinit = *ptr; | |
321 | for (i=0; i<spinit; i++) { | |
322 | curpos = *(++ptr); | |
323 | sposns[i] = curpos; | |
324 | iposns[curpos] = 1; | |
325 | cp = point[curpos]; | |
326 | dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos); | |
327 | switch (type(cp)) { | |
328 | case CHAR: | |
329 | k = (int) right(cp); | |
330 | if (isyms[k] != 1) { | |
331 | isyms[k] = 1; | |
332 | ssyms[ssmax++] = k; | |
333 | } | |
334 | break; | |
335 | case DOT: | |
336 | for (k=1; k<NCHARS; k++) { | |
337 | if (k != HAT) { | |
338 | if (isyms[k] != 1) { | |
339 | isyms[k] = 1; | |
340 | ssyms[ssmax++] = k; | |
341 | } | |
342 | } | |
343 | } | |
344 | break; | |
345 | case CCL: | |
346 | for (p = (char *) right(cp); *p; p++) { | |
347 | if (*p != HAT) { | |
348 | if (isyms[*p] != 1) { | |
349 | isyms[*p] = 1; | |
350 | ssyms[ssmax++] = *p; | |
351 | } | |
352 | } | |
353 | } | |
354 | break; | |
355 | case NCCL: | |
356 | for (k=1; k<NCHARS; k++) { | |
357 | if (k != HAT && !member(k, (char *) right(cp))) { | |
358 | if (isyms[k] != 1) { | |
359 | isyms[k] = 1; | |
360 | ssyms[ssmax++] = k; | |
361 | } | |
362 | } | |
363 | } | |
364 | } | |
365 | } | |
366 | ssinit = ssmax; | |
367 | n = 0; | |
368 | for (s=0; s<=n; s++) { | |
369 | dprintf("s = %d\n", s, NULL, NULL); | |
370 | ind = 0; | |
371 | numtrans = 0; | |
372 | finflg = 0; | |
373 | if (*(state[s] + *state[s]) == line) { /* s final? */ | |
374 | finflg = 1; | |
375 | goto tenter; | |
376 | } | |
377 | spmax = spinit; | |
378 | ssmax = ssinit; | |
379 | ptr = state[s]; | |
380 | num = *ptr; | |
381 | for (i=0; i<num; i++) { | |
382 | curpos = *(++ptr); | |
383 | if (iposns[curpos] != 1 && index[curpos] != 1) { | |
384 | index[curpos] = 1; | |
385 | sposns[spmax++] = curpos; | |
386 | } | |
387 | cp = point[curpos]; | |
388 | switch (type(cp)) { | |
389 | case CHAR: | |
390 | k = (int) right(cp); | |
391 | if (isyms[k] == 0 && symbol[k] == 0) { | |
392 | symbol[k] = 1; | |
393 | ssyms[ssmax++] = k; | |
394 | } | |
395 | break; | |
396 | case DOT: | |
397 | for (k=1; k<NCHARS; k++) { | |
398 | if (k != HAT) { | |
399 | if (isyms[k] == 0 && symbol[k] == 0) { | |
400 | symbol[k] = 1; | |
401 | ssyms[ssmax++] = k; | |
402 | } | |
403 | } | |
404 | } | |
405 | break; | |
406 | case CCL: | |
407 | for (p = (char *) right(cp); *p; p++) { | |
408 | if (*p != HAT) { | |
409 | if (isyms[*p] == 0 && symbol[*p] == 0) { | |
410 | symbol[*p] = 1; | |
411 | ssyms[ssmax++] = *p; | |
412 | } | |
413 | } | |
414 | } | |
415 | break; | |
416 | case NCCL: | |
417 | for (k=1; k<NCHARS; k++) { | |
418 | if (k != HAT && !member(k, (char *) right(cp))) { | |
419 | if (isyms[k] == 0 && symbol[k] == 0) { | |
420 | symbol[k] = 1; | |
421 | ssyms[ssmax++] = k; | |
422 | } | |
423 | } | |
424 | } | |
425 | } | |
426 | } | |
427 | for (j=0; j<ssmax; j++) { /* nextstate(s, ssyms[j]) */ | |
428 | c = ssyms[j]; | |
429 | symbol[c] = 0; | |
430 | setcnt = 0; | |
431 | for (k=0; k<=line; k++) setvec[k] = 0; | |
432 | for (i=0; i<spmax; i++) { | |
433 | index[sposns[i]] = 0; | |
434 | cp = point[sposns[i]]; | |
435 | if ((k = type(cp)) != FINAL) | |
436 | if (k == CHAR && c == (int) right(cp) | |
437 | || k == DOT | |
438 | || k == CCL && member(c, (char *) right(cp)) | |
439 | || k == NCCL && !member(c, (char *) right(cp))) { | |
440 | ptr = foll[sposns[i]]; | |
441 | num = *ptr; | |
442 | for (k=0; k<num; k++) { | |
443 | if (setvec[*(++ptr)] != 1 | |
444 | && iposns[*ptr] != 1) { | |
445 | setvec[*ptr] = 1; | |
446 | setcnt++; | |
447 | } | |
448 | } | |
449 | } | |
450 | } /* end nextstate */ | |
451 | if (notin(state, n, &prev)) { | |
452 | if (n >= NSTATES) { | |
453 | dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL); | |
454 | overflo(); | |
455 | } | |
456 | state[++n] = add(setcnt); | |
457 | dprintf(" delta(%d,%o) = %d", s,c,n); | |
458 | dprintf(", ind = %d\n", ind+1, NULL, NULL); | |
459 | fatab[++ind] = c; | |
460 | fatab[++ind] = n; | |
461 | numtrans++; | |
462 | } | |
463 | else { | |
464 | if (prev != 0) { | |
465 | dprintf(" delta(%d,%o) = %d", s,c,prev); | |
466 | dprintf(", ind = %d\n", ind+1, NULL, NULL); | |
467 | fatab[++ind] = c; | |
468 | fatab[++ind] = prev; | |
469 | numtrans++; | |
470 | } | |
471 | } | |
472 | } | |
473 | tenter: | |
474 | if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL) | |
475 | overflo(); | |
476 | where[s] = pfa; | |
477 | if (finflg) | |
478 | pfa->cch = -1; /* s is a final state */ | |
479 | else | |
480 | pfa->cch = numtrans; | |
481 | pfa->st = 0; | |
482 | for (i=1, pfa += 1; i<=numtrans; i++, pfa++) { | |
483 | pfa->cch = fatab[2*i-1]; | |
484 | pfa->st = (struct fa *) fatab[2*i]; | |
485 | } | |
486 | } | |
487 | for (i=0; i<=n; i++) { | |
488 | xfree(state[i]); /* free state[i] */ | |
489 | pfa = where[i]; | |
490 | pfa->st = where[0]; | |
491 | dprintf("state %d: (%o)\n", i, pfa, NULL); | |
492 | dprintf(" numtrans = %d, default = %o\n", pfa->cch, pfa->st, NULL); | |
493 | for (k=1; k<=pfa->cch; k++) { | |
494 | (pfa+k)->st = where[ (int) (pfa+k)->st]; | |
495 | dprintf(" char = %o, nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL); | |
496 | } | |
497 | } | |
498 | pfa = where[0]; | |
499 | if ((num = pfa->cch) < 0) | |
500 | return(where[0]); | |
501 | for (pfa += num; num; num--, pfa--) | |
502 | if (pfa->cch == HAT) { | |
503 | return(pfa->st); | |
504 | } | |
505 | return(where[0]); | |
506 | } | |
507 | ||
508 | match(pfa, p) | |
509 | register struct fa *pfa; | |
510 | register char *p; | |
511 | { | |
512 | register count; | |
513 | char c; | |
514 | if (p == 0) return(0); | |
515 | if (pfa->cch == 1) { /* fast test for first character, if possible */ | |
516 | c = (++pfa)->cch; | |
517 | do | |
518 | if (c == *p) { | |
519 | p++; | |
520 | pfa = pfa->st; | |
521 | goto adv; | |
522 | } | |
523 | while (*p++ != 0); | |
524 | return(0); | |
525 | } | |
526 | adv: if ((count = pfa->cch) < 0) return(1); | |
527 | do { | |
528 | for (pfa += count; count; count--, pfa--) | |
529 | if (pfa->cch == *p) { | |
530 | break; | |
531 | } | |
532 | pfa = pfa->st; | |
533 | if ((count = pfa->cch) < 0) return(1); | |
534 | } while(*p++ != 0); | |
535 | return(0); | |
536 | } |