| 1 | /* b.c 4.1 82/05/07 */ |
| 2 | |
| 3 | #include "awk.def" |
| 4 | #include "stdio.h" |
| 5 | #include "awk.h" |
| 6 | |
| 7 | extern node *op2(); |
| 8 | extern struct fa *cgotofn(); |
| 9 | #define MAXLIN 256 |
| 10 | #define NCHARS 128 |
| 11 | #define NSTATES 256 |
| 12 | |
| 13 | #define type(v) v->nobj |
| 14 | #define left(v) v->narg[0] |
| 15 | #define right(v) v->narg[1] |
| 16 | #define parent(v) v->nnext |
| 17 | |
| 18 | #define LEAF case CCL: case NCCL: case CHAR: case DOT: |
| 19 | #define UNARY case FINAL: case STAR: case PLUS: case QUEST: |
| 20 | |
| 21 | /* encoding in tree nodes: |
| 22 | leaf (CCL, NCCL, CHAR, DOT): left is index, right contains value or pointer to value |
| 23 | unary (FINAL, STAR, PLUS, QUEST): left is child, right is null |
| 24 | binary (CAT, OR): left and right are children |
| 25 | parent contains pointer to parent |
| 26 | */ |
| 27 | |
| 28 | struct fa { |
| 29 | int cch; |
| 30 | struct fa *st; |
| 31 | }; |
| 32 | |
| 33 | int *state[NSTATES]; |
| 34 | int *foll[MAXLIN]; |
| 35 | char chars[MAXLIN]; |
| 36 | int setvec[MAXLIN]; |
| 37 | node *point[MAXLIN]; |
| 38 | |
| 39 | int setcnt; |
| 40 | int line; |
| 41 | |
| 42 | |
| 43 | struct fa *makedfa(p) /* returns dfa for tree pointed to by p */ |
| 44 | node *p; |
| 45 | { |
| 46 | node *p1; |
| 47 | struct fa *fap; |
| 48 | p1 = op2(CAT, op2(STAR, op2(DOT, (node *) 0, (node *) 0), (node *) 0), p); |
| 49 | /* put DOT STAR in front of reg. exp. */ |
| 50 | p1 = op2(FINAL, p1, (node *) 0); /* install FINAL node */ |
| 51 | |
| 52 | line = 0; |
| 53 | penter(p1); /* enter parent pointers and leaf indices */ |
| 54 | point[line] = p1; /* FINAL node */ |
| 55 | setvec[0] = 1; /* for initial DOT STAR */ |
| 56 | cfoll(p1); /* set up follow sets */ |
| 57 | fap = cgotofn(); |
| 58 | freetr(p1); /* add this when alloc works */ |
| 59 | return(fap); |
| 60 | } |
| 61 | |
| 62 | penter(p) /* set up parent pointers and leaf indices */ |
| 63 | node *p; |
| 64 | { |
| 65 | switch(type(p)) { |
| 66 | LEAF |
| 67 | left(p) = (node *) line; |
| 68 | point[line++] = p; |
| 69 | break; |
| 70 | UNARY |
| 71 | penter(left(p)); |
| 72 | parent(left(p)) = p; |
| 73 | break; |
| 74 | case CAT: |
| 75 | case OR: |
| 76 | penter(left(p)); |
| 77 | penter(right(p)); |
| 78 | parent(left(p)) = p; |
| 79 | parent(right(p)) = p; |
| 80 | break; |
| 81 | default: |
| 82 | error(FATAL, "unknown type %d in penter\n", type(p)); |
| 83 | break; |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | freetr(p) /* free parse tree and follow sets */ |
| 88 | node *p; |
| 89 | { |
| 90 | switch(type(p)) { |
| 91 | LEAF |
| 92 | xfree(foll[(int) left(p)]); |
| 93 | xfree(p); |
| 94 | break; |
| 95 | UNARY |
| 96 | freetr(left(p)); |
| 97 | xfree(p); |
| 98 | break; |
| 99 | case CAT: |
| 100 | case OR: |
| 101 | freetr(left(p)); |
| 102 | freetr(right(p)); |
| 103 | xfree(p); |
| 104 | break; |
| 105 | default: |
| 106 | error(FATAL, "unknown type %d in freetr", type(p)); |
| 107 | break; |
| 108 | } |
| 109 | } |
| 110 | char *cclenter(p) |
| 111 | register char *p; |
| 112 | { |
| 113 | register i, c; |
| 114 | char *op; |
| 115 | |
| 116 | op = p; |
| 117 | i = 0; |
| 118 | while ((c = *p++) != 0) { |
| 119 | if (c == '-' && i > 0 && chars[i-1] != 0) { |
| 120 | if (*p != 0) { |
| 121 | c = chars[i-1]; |
| 122 | while (c < *p) { |
| 123 | if (i >= MAXLIN) |
| 124 | overflo(); |
| 125 | chars[i++] = ++c; |
| 126 | } |
| 127 | p++; |
| 128 | continue; |
| 129 | } |
| 130 | } |
| 131 | if (i >= MAXLIN) |
| 132 | overflo(); |
| 133 | chars[i++] = c; |
| 134 | } |
| 135 | chars[i++] = '\0'; |
| 136 | dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL); |
| 137 | xfree(op); |
| 138 | return(tostring(chars)); |
| 139 | } |
| 140 | |
| 141 | overflo() |
| 142 | { |
| 143 | error(FATAL, "regular expression too long\n"); |
| 144 | } |
| 145 | |
| 146 | cfoll(v) /* enter follow set of each leaf of vertex v into foll[leaf] */ |
| 147 | register node *v; |
| 148 | { |
| 149 | register i; |
| 150 | int prev; |
| 151 | int *add(); |
| 152 | |
| 153 | switch(type(v)) { |
| 154 | LEAF |
| 155 | setcnt = 0; |
| 156 | for (i=1; i<=line; i++) |
| 157 | setvec[i] = 0; |
| 158 | follow(v); |
| 159 | if (notin(foll, ( (int) left(v))-1, &prev)) { |
| 160 | foll[(int) left(v)] = add(setcnt); |
| 161 | } |
| 162 | else |
| 163 | foll[ (int) left(v)] = foll[prev]; |
| 164 | break; |
| 165 | UNARY |
| 166 | cfoll(left(v)); |
| 167 | break; |
| 168 | case CAT: |
| 169 | case OR: |
| 170 | cfoll(left(v)); |
| 171 | cfoll(right(v)); |
| 172 | break; |
| 173 | default: |
| 174 | error(FATAL, "unknown type %d in cfoll", type(v)); |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | first(p) /* collects initially active leaves of p into setvec */ |
| 179 | register node *p; /* returns 0 or 1 depending on whether p matches empty string */ |
| 180 | { |
| 181 | register b; |
| 182 | |
| 183 | switch(type(p)) { |
| 184 | LEAF |
| 185 | if (setvec[(int) left(p)] != 1) { |
| 186 | setvec[(int) left(p)] = 1; |
| 187 | setcnt++; |
| 188 | } |
| 189 | if (type(p) == CCL && (*(char *) right(p)) == '\0') |
| 190 | return(0); /* empty CCL */ |
| 191 | else return(1); |
| 192 | case FINAL: |
| 193 | case PLUS: |
| 194 | if (first(left(p)) == 0) return(0); |
| 195 | return(1); |
| 196 | case STAR: |
| 197 | case QUEST: |
| 198 | first(left(p)); |
| 199 | return(0); |
| 200 | case CAT: |
| 201 | if (first(left(p)) == 0 && first(right(p)) == 0) return(0); |
| 202 | return(1); |
| 203 | case OR: |
| 204 | b = first(right(p)); |
| 205 | if (first(left(p)) == 0 || b == 0) return(0); |
| 206 | return(1); |
| 207 | } |
| 208 | error(FATAL, "unknown type %d in first\n", type(p)); |
| 209 | return(-1); |
| 210 | } |
| 211 | |
| 212 | follow(v) |
| 213 | node *v; /* collects leaves that can follow v into setvec */ |
| 214 | { |
| 215 | node *p; |
| 216 | |
| 217 | if (type(v) == FINAL) |
| 218 | return; |
| 219 | p = parent(v); |
| 220 | switch (type(p)) { |
| 221 | case STAR: |
| 222 | case PLUS: first(v); |
| 223 | follow(p); |
| 224 | return; |
| 225 | |
| 226 | case OR: |
| 227 | case QUEST: follow(p); |
| 228 | return; |
| 229 | |
| 230 | case CAT: if (v == left(p)) { /* v is left child of p */ |
| 231 | if (first(right(p)) == 0) { |
| 232 | follow(p); |
| 233 | return; |
| 234 | } |
| 235 | } |
| 236 | else /* v is right child */ |
| 237 | follow(p); |
| 238 | return; |
| 239 | case FINAL: if (setvec[line] != 1) { |
| 240 | setvec[line] = 1; |
| 241 | setcnt++; |
| 242 | } |
| 243 | return; |
| 244 | } |
| 245 | } |
| 246 | |
| 247 | member(c, s) /* is c in s? */ |
| 248 | register char c, *s; |
| 249 | { |
| 250 | while (*s) |
| 251 | if (c == *s++) |
| 252 | return(1); |
| 253 | return(0); |
| 254 | } |
| 255 | |
| 256 | notin(array, n, prev) /* is setvec in array[0] thru array[n]? */ |
| 257 | int **array; |
| 258 | int *prev; { |
| 259 | register i, j; |
| 260 | int *ptr; |
| 261 | for (i=0; i<=n; i++) { |
| 262 | ptr = array[i]; |
| 263 | if (*ptr == setcnt) { |
| 264 | for (j=0; j < setcnt; j++) |
| 265 | if (setvec[*(++ptr)] != 1) goto nxt; |
| 266 | *prev = i; |
| 267 | return(0); |
| 268 | } |
| 269 | nxt: ; |
| 270 | } |
| 271 | return(1); |
| 272 | } |
| 273 | |
| 274 | int *add(n) { /* remember setvec */ |
| 275 | int *ptr, *p; |
| 276 | register i; |
| 277 | if ((p = ptr = (int *) malloc((n+1)*sizeof(int))) == NULL) |
| 278 | overflo(); |
| 279 | *ptr = n; |
| 280 | dprintf("add(%d)\n", n, NULL, NULL); |
| 281 | for (i=1; i <= line; i++) |
| 282 | if (setvec[i] == 1) { |
| 283 | *(++ptr) = i; |
| 284 | dprintf(" ptr = %o, *ptr = %d, i = %d\n", ptr, *ptr, i); |
| 285 | } |
| 286 | dprintf("\n", NULL, NULL, NULL); |
| 287 | return(p); |
| 288 | } |
| 289 | |
| 290 | struct fa *cgotofn() |
| 291 | { |
| 292 | register i, k; |
| 293 | register int *ptr; |
| 294 | char c; |
| 295 | char *p; |
| 296 | node *cp; |
| 297 | int j, n, s, ind, numtrans; |
| 298 | int finflg; |
| 299 | int curpos, num, prev; |
| 300 | struct fa *where[NSTATES]; |
| 301 | |
| 302 | int fatab[257]; |
| 303 | struct fa *pfa; |
| 304 | |
| 305 | char index[MAXLIN]; |
| 306 | char iposns[MAXLIN]; |
| 307 | int sposns[MAXLIN]; |
| 308 | int spmax, spinit; |
| 309 | |
| 310 | char symbol[NCHARS]; |
| 311 | char isyms[NCHARS]; |
| 312 | char ssyms[NCHARS]; |
| 313 | int ssmax, ssinit; |
| 314 | |
| 315 | for (i=0; i<=line; i++) index[i] = iposns[i] = setvec[i] = 0; |
| 316 | for (i=0; i<NCHARS; i++) isyms[i] = symbol[i] = 0; |
| 317 | setcnt = 0; |
| 318 | /* compute initial positions and symbols of state 0 */ |
| 319 | ssmax = 0; |
| 320 | ptr = state[0] = foll[0]; |
| 321 | spinit = *ptr; |
| 322 | for (i=0; i<spinit; i++) { |
| 323 | curpos = *(++ptr); |
| 324 | sposns[i] = curpos; |
| 325 | iposns[curpos] = 1; |
| 326 | cp = point[curpos]; |
| 327 | dprintf("i = %d, spinit = %d, curpos = %d\n", i, spinit, curpos); |
| 328 | switch (type(cp)) { |
| 329 | case CHAR: |
| 330 | k = (int) right(cp); |
| 331 | if (isyms[k] != 1) { |
| 332 | isyms[k] = 1; |
| 333 | ssyms[ssmax++] = k; |
| 334 | } |
| 335 | break; |
| 336 | case DOT: |
| 337 | for (k=1; k<NCHARS; k++) { |
| 338 | if (k != HAT) { |
| 339 | if (isyms[k] != 1) { |
| 340 | isyms[k] = 1; |
| 341 | ssyms[ssmax++] = k; |
| 342 | } |
| 343 | } |
| 344 | } |
| 345 | break; |
| 346 | case CCL: |
| 347 | for (p = (char *) right(cp); *p; p++) { |
| 348 | if (*p != HAT) { |
| 349 | if (isyms[*p] != 1) { |
| 350 | isyms[*p] = 1; |
| 351 | ssyms[ssmax++] = *p; |
| 352 | } |
| 353 | } |
| 354 | } |
| 355 | break; |
| 356 | case NCCL: |
| 357 | for (k=1; k<NCHARS; k++) { |
| 358 | if (k != HAT && !member(k, (char *) right(cp))) { |
| 359 | if (isyms[k] != 1) { |
| 360 | isyms[k] = 1; |
| 361 | ssyms[ssmax++] = k; |
| 362 | } |
| 363 | } |
| 364 | } |
| 365 | } |
| 366 | } |
| 367 | ssinit = ssmax; |
| 368 | n = 0; |
| 369 | for (s=0; s<=n; s++) { |
| 370 | dprintf("s = %d\n", s, NULL, NULL); |
| 371 | ind = 0; |
| 372 | numtrans = 0; |
| 373 | finflg = 0; |
| 374 | if (*(state[s] + *state[s]) == line) { /* s final? */ |
| 375 | finflg = 1; |
| 376 | goto tenter; |
| 377 | } |
| 378 | spmax = spinit; |
| 379 | ssmax = ssinit; |
| 380 | ptr = state[s]; |
| 381 | num = *ptr; |
| 382 | for (i=0; i<num; i++) { |
| 383 | curpos = *(++ptr); |
| 384 | if (iposns[curpos] != 1 && index[curpos] != 1) { |
| 385 | index[curpos] = 1; |
| 386 | sposns[spmax++] = curpos; |
| 387 | } |
| 388 | cp = point[curpos]; |
| 389 | switch (type(cp)) { |
| 390 | case CHAR: |
| 391 | k = (int) right(cp); |
| 392 | if (isyms[k] == 0 && symbol[k] == 0) { |
| 393 | symbol[k] = 1; |
| 394 | ssyms[ssmax++] = k; |
| 395 | } |
| 396 | break; |
| 397 | case DOT: |
| 398 | for (k=1; k<NCHARS; k++) { |
| 399 | if (k != HAT) { |
| 400 | if (isyms[k] == 0 && symbol[k] == 0) { |
| 401 | symbol[k] = 1; |
| 402 | ssyms[ssmax++] = k; |
| 403 | } |
| 404 | } |
| 405 | } |
| 406 | break; |
| 407 | case CCL: |
| 408 | for (p = (char *) right(cp); *p; p++) { |
| 409 | if (*p != HAT) { |
| 410 | if (isyms[*p] == 0 && symbol[*p] == 0) { |
| 411 | symbol[*p] = 1; |
| 412 | ssyms[ssmax++] = *p; |
| 413 | } |
| 414 | } |
| 415 | } |
| 416 | break; |
| 417 | case NCCL: |
| 418 | for (k=1; k<NCHARS; k++) { |
| 419 | if (k != HAT && !member(k, (char *) right(cp))) { |
| 420 | if (isyms[k] == 0 && symbol[k] == 0) { |
| 421 | symbol[k] = 1; |
| 422 | ssyms[ssmax++] = k; |
| 423 | } |
| 424 | } |
| 425 | } |
| 426 | } |
| 427 | } |
| 428 | for (j=0; j<ssmax; j++) { /* nextstate(s, ssyms[j]) */ |
| 429 | c = ssyms[j]; |
| 430 | symbol[c] = 0; |
| 431 | setcnt = 0; |
| 432 | for (k=0; k<=line; k++) setvec[k] = 0; |
| 433 | for (i=0; i<spmax; i++) { |
| 434 | index[sposns[i]] = 0; |
| 435 | cp = point[sposns[i]]; |
| 436 | if ((k = type(cp)) != FINAL) |
| 437 | if (k == CHAR && c == (int) right(cp) |
| 438 | || k == DOT |
| 439 | || k == CCL && member(c, (char *) right(cp)) |
| 440 | || k == NCCL && !member(c, (char *) right(cp))) { |
| 441 | ptr = foll[sposns[i]]; |
| 442 | num = *ptr; |
| 443 | for (k=0; k<num; k++) { |
| 444 | if (setvec[*(++ptr)] != 1 |
| 445 | && iposns[*ptr] != 1) { |
| 446 | setvec[*ptr] = 1; |
| 447 | setcnt++; |
| 448 | } |
| 449 | } |
| 450 | } |
| 451 | } /* end nextstate */ |
| 452 | if (notin(state, n, &prev)) { |
| 453 | if (n >= NSTATES) { |
| 454 | dprintf("cgotofn: notin; state = %d, n = %d\n", state, n, NULL); |
| 455 | overflo(); |
| 456 | } |
| 457 | state[++n] = add(setcnt); |
| 458 | dprintf(" delta(%d,%o) = %d", s,c,n); |
| 459 | dprintf(", ind = %d\n", ind+1, NULL, NULL); |
| 460 | fatab[++ind] = c; |
| 461 | fatab[++ind] = n; |
| 462 | numtrans++; |
| 463 | } |
| 464 | else { |
| 465 | if (prev != 0) { |
| 466 | dprintf(" delta(%d,%o) = %d", s,c,prev); |
| 467 | dprintf(", ind = %d\n", ind+1, NULL, NULL); |
| 468 | fatab[++ind] = c; |
| 469 | fatab[++ind] = prev; |
| 470 | numtrans++; |
| 471 | } |
| 472 | } |
| 473 | } |
| 474 | tenter: |
| 475 | if ((pfa = (struct fa *) malloc((numtrans + 1) * sizeof(struct fa))) == NULL) |
| 476 | overflo(); |
| 477 | where[s] = pfa; |
| 478 | if (finflg) |
| 479 | pfa->cch = -1; /* s is a final state */ |
| 480 | else |
| 481 | pfa->cch = numtrans; |
| 482 | pfa->st = 0; |
| 483 | for (i=1, pfa += 1; i<=numtrans; i++, pfa++) { |
| 484 | pfa->cch = fatab[2*i-1]; |
| 485 | pfa->st = (struct fa *) fatab[2*i]; |
| 486 | } |
| 487 | } |
| 488 | for (i=0; i<=n; i++) { |
| 489 | xfree(state[i]); /* free state[i] */ |
| 490 | pfa = where[i]; |
| 491 | pfa->st = where[0]; |
| 492 | dprintf("state %d: (%o)\n", i, pfa, NULL); |
| 493 | dprintf(" numtrans = %d, default = %o\n", pfa->cch, pfa->st, NULL); |
| 494 | for (k=1; k<=pfa->cch; k++) { |
| 495 | (pfa+k)->st = where[ (int) (pfa+k)->st]; |
| 496 | dprintf(" char = %o, nextstate = %o\n",(pfa+k)->cch, (pfa+k)->st, NULL); |
| 497 | } |
| 498 | } |
| 499 | pfa = where[0]; |
| 500 | if ((num = pfa->cch) < 0) |
| 501 | return(where[0]); |
| 502 | for (pfa += num; num; num--, pfa--) |
| 503 | if (pfa->cch == HAT) { |
| 504 | return(pfa->st); |
| 505 | } |
| 506 | return(where[0]); |
| 507 | } |
| 508 | |
| 509 | match(pfa, p) |
| 510 | register struct fa *pfa; |
| 511 | register char *p; |
| 512 | { |
| 513 | register count; |
| 514 | char c; |
| 515 | if (p == 0) return(0); |
| 516 | if (pfa->cch == 1) { /* fast test for first character, if possible */ |
| 517 | c = (++pfa)->cch; |
| 518 | do |
| 519 | if (c == *p) { |
| 520 | p++; |
| 521 | pfa = pfa->st; |
| 522 | goto adv; |
| 523 | } |
| 524 | while (*p++ != 0); |
| 525 | return(0); |
| 526 | } |
| 527 | adv: if ((count = pfa->cch) < 0) return(1); |
| 528 | do { |
| 529 | for (pfa += count; count; count--, pfa--) |
| 530 | if (pfa->cch == *p) { |
| 531 | break; |
| 532 | } |
| 533 | pfa = pfa->st; |
| 534 | if ((count = pfa->cch) < 0) return(1); |
| 535 | } while(*p++ != 0); |
| 536 | return(0); |
| 537 | } |