Commit | Line | Data |
---|---|---|
7d684d5e NW |
1 | /* dfa - DFA construction routines */ |
2 | ||
15637ed4 RG |
3 | /*- |
4 | * Copyright (c) 1990 The Regents of the University of California. | |
5 | * All rights reserved. | |
6 | * | |
7 | * This code is derived from software contributed to Berkeley by | |
7d684d5e | 8 | * Vern Paxson. |
15637ed4 RG |
9 | * |
10 | * The United States Government has rights in this work pursuant | |
11 | * to contract no. DE-AC03-76SF00098 between the United States | |
12 | * Department of Energy and the University of California. | |
13 | * | |
7d684d5e NW |
14 | * Redistribution and use in source and binary forms are permitted provided |
15 | * that: (1) source distributions retain this entire copyright notice and | |
16 | * comment, and (2) distributions including binaries display the following | |
17 | * acknowledgement: ``This product includes software developed by the | |
18 | * University of California, Berkeley and its contributors'' in the | |
19 | * documentation or other materials provided with the distribution and in | |
20 | * all advertising materials mentioning features or use of this software. | |
21 | * Neither the name of the University nor the names of its contributors may | |
22 | * be used to endorse or promote products derived from this software without | |
23 | * specific prior written permission. | |
24 | * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED | |
25 | * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF | |
26 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | |
15637ed4 RG |
27 | */ |
28 | ||
29 | #ifndef lint | |
7d684d5e NW |
30 | static char rcsid[] = |
31 | "@(#) $Header: /usr/fsys/odin/a/vern/flex/RCS/dfa.c,v 2.7 90/06/27 23:48:15 vern Exp $ (LBL)"; | |
32 | #endif | |
15637ed4 RG |
33 | |
34 | #include "flexdef.h" | |
35 | ||
7d684d5e | 36 | |
15637ed4 RG |
37 | /* declare functions that have forward references */ |
38 | ||
39 | void dump_associated_rules PROTO((FILE*, int)); | |
40 | void dump_transitions PROTO((FILE*, int[])); | |
41 | void sympartition PROTO((int[], int, int[], int[])); | |
42 | int symfollowset PROTO((int[], int, int, int[])); | |
43 | ||
44 | ||
45 | /* check_for_backtracking - check a DFA state for backtracking | |
46 | * | |
47 | * synopsis | |
48 | * int ds, state[numecs]; | |
49 | * check_for_backtracking( ds, state ); | |
50 | * | |
51 | * ds is the number of the state to check and state[] is its out-transitions, | |
52 | * indexed by equivalence class, and state_rules[] is the set of rules | |
53 | * associated with this state | |
54 | */ | |
55 | ||
56 | void check_for_backtracking( ds, state ) | |
57 | int ds; | |
58 | int state[]; | |
59 | ||
60 | { | |
61 | if ( (reject && ! dfaacc[ds].dfaacc_set) || ! dfaacc[ds].dfaacc_state ) | |
62 | { /* state is non-accepting */ | |
63 | ++num_backtracking; | |
64 | ||
65 | if ( backtrack_report ) | |
66 | { | |
67 | fprintf( backtrack_file, "State #%d is non-accepting -\n", ds ); | |
68 | ||
69 | /* identify the state */ | |
70 | dump_associated_rules( backtrack_file, ds ); | |
71 | ||
72 | /* now identify it further using the out- and jam-transitions */ | |
73 | dump_transitions( backtrack_file, state ); | |
74 | ||
75 | putc( '\n', backtrack_file ); | |
76 | } | |
77 | } | |
78 | } | |
79 | ||
80 | ||
81 | /* check_trailing_context - check to see if NFA state set constitutes | |
82 | * "dangerous" trailing context | |
83 | * | |
84 | * synopsis | |
85 | * int nfa_states[num_states+1], num_states; | |
86 | * int accset[nacc+1], nacc; | |
87 | * check_trailing_context( nfa_states, num_states, accset, nacc ); | |
88 | * | |
89 | * NOTES | |
90 | * Trailing context is "dangerous" if both the head and the trailing | |
91 | * part are of variable size \and/ there's a DFA state which contains | |
92 | * both an accepting state for the head part of the rule and NFA states | |
93 | * which occur after the beginning of the trailing context. | |
94 | * When such a rule is matched, it's impossible to tell if having been | |
95 | * in the DFA state indicates the beginning of the trailing context | |
96 | * or further-along scanning of the pattern. In these cases, a warning | |
97 | * message is issued. | |
98 | * | |
99 | * nfa_states[1 .. num_states] is the list of NFA states in the DFA. | |
100 | * accset[1 .. nacc] is the list of accepting numbers for the DFA state. | |
101 | */ | |
102 | ||
103 | void check_trailing_context( nfa_states, num_states, accset, nacc ) | |
104 | int *nfa_states, num_states; | |
105 | int *accset; | |
106 | register int nacc; | |
107 | ||
108 | { | |
109 | register int i, j; | |
110 | ||
111 | for ( i = 1; i <= num_states; ++i ) | |
112 | { | |
113 | int ns = nfa_states[i]; | |
114 | register int type = state_type[ns]; | |
115 | register int ar = assoc_rule[ns]; | |
116 | ||
117 | if ( type == STATE_NORMAL || rule_type[ar] != RULE_VARIABLE ) | |
118 | { /* do nothing */ | |
119 | } | |
120 | ||
121 | else if ( type == STATE_TRAILING_CONTEXT ) | |
122 | { | |
123 | /* potential trouble. Scan set of accepting numbers for | |
124 | * the one marking the end of the "head". We assume that | |
125 | * this looping will be fairly cheap since it's rare that | |
126 | * an accepting number set is large. | |
127 | */ | |
128 | for ( j = 1; j <= nacc; ++j ) | |
129 | if ( accset[j] & YY_TRAILING_HEAD_MASK ) | |
130 | { | |
131 | fprintf( stderr, | |
132 | "%s: Dangerous trailing context in rule at line %d\n", | |
133 | program_name, rule_linenum[ar] ); | |
134 | return; | |
135 | } | |
136 | } | |
137 | } | |
138 | } | |
139 | ||
140 | ||
141 | /* dump_associated_rules - list the rules associated with a DFA state | |
142 | * | |
143 | * synopisis | |
144 | * int ds; | |
145 | * FILE *file; | |
146 | * dump_associated_rules( file, ds ); | |
147 | * | |
148 | * goes through the set of NFA states associated with the DFA and | |
149 | * extracts the first MAX_ASSOC_RULES unique rules, sorts them, | |
150 | * and writes a report to the given file | |
151 | */ | |
152 | ||
153 | void dump_associated_rules( file, ds ) | |
154 | FILE *file; | |
155 | int ds; | |
156 | ||
157 | { | |
158 | register int i, j; | |
159 | register int num_associated_rules = 0; | |
160 | int rule_set[MAX_ASSOC_RULES + 1]; | |
161 | int *dset = dss[ds]; | |
162 | int size = dfasiz[ds]; | |
163 | ||
164 | for ( i = 1; i <= size; ++i ) | |
165 | { | |
166 | register rule_num = rule_linenum[assoc_rule[dset[i]]]; | |
167 | ||
168 | for ( j = 1; j <= num_associated_rules; ++j ) | |
169 | if ( rule_num == rule_set[j] ) | |
170 | break; | |
171 | ||
172 | if ( j > num_associated_rules ) | |
173 | { /* new rule */ | |
174 | if ( num_associated_rules < MAX_ASSOC_RULES ) | |
175 | rule_set[++num_associated_rules] = rule_num; | |
176 | } | |
177 | } | |
178 | ||
179 | bubble( rule_set, num_associated_rules ); | |
180 | ||
181 | fprintf( file, " associated rule line numbers:" ); | |
182 | ||
183 | for ( i = 1; i <= num_associated_rules; ++i ) | |
184 | { | |
185 | if ( i % 8 == 1 ) | |
186 | putc( '\n', file ); | |
187 | ||
188 | fprintf( file, "\t%d", rule_set[i] ); | |
189 | } | |
190 | ||
191 | putc( '\n', file ); | |
192 | } | |
193 | ||
194 | ||
195 | /* dump_transitions - list the transitions associated with a DFA state | |
196 | * | |
197 | * synopisis | |
198 | * int state[numecs]; | |
199 | * FILE *file; | |
200 | * dump_transitions( file, state ); | |
201 | * | |
202 | * goes through the set of out-transitions and lists them in human-readable | |
203 | * form (i.e., not as equivalence classes); also lists jam transitions | |
204 | * (i.e., all those which are not out-transitions, plus EOF). The dump | |
205 | * is done to the given file. | |
206 | */ | |
207 | ||
208 | void dump_transitions( file, state ) | |
209 | FILE *file; | |
210 | int state[]; | |
211 | ||
212 | { | |
213 | register int i, ec; | |
214 | int out_char_set[CSIZE]; | |
215 | ||
216 | for ( i = 0; i < csize; ++i ) | |
217 | { | |
218 | ec = abs( ecgroup[i] ); | |
219 | out_char_set[i] = state[ec]; | |
220 | } | |
221 | ||
222 | fprintf( file, " out-transitions: " ); | |
223 | ||
224 | list_character_set( file, out_char_set ); | |
225 | ||
226 | /* now invert the members of the set to get the jam transitions */ | |
227 | for ( i = 0; i < csize; ++i ) | |
228 | out_char_set[i] = ! out_char_set[i]; | |
229 | ||
230 | fprintf( file, "\n jam-transitions: EOF " ); | |
231 | ||
232 | list_character_set( file, out_char_set ); | |
233 | ||
234 | putc( '\n', file ); | |
235 | } | |
236 | ||
237 | ||
238 | /* epsclosure - construct the epsilon closure of a set of ndfa states | |
239 | * | |
240 | * synopsis | |
241 | * int t[current_max_dfa_size], numstates, accset[num_rules + 1], nacc; | |
242 | * int hashval; | |
243 | * int *epsclosure(); | |
244 | * t = epsclosure( t, &numstates, accset, &nacc, &hashval ); | |
245 | * | |
246 | * NOTES | |
247 | * the epsilon closure is the set of all states reachable by an arbitrary | |
248 | * number of epsilon transitions which themselves do not have epsilon | |
249 | * transitions going out, unioned with the set of states which have non-null | |
250 | * accepting numbers. t is an array of size numstates of nfa state numbers. | |
251 | * Upon return, t holds the epsilon closure and numstates is updated. accset | |
252 | * holds a list of the accepting numbers, and the size of accset is given | |
253 | * by nacc. t may be subjected to reallocation if it is not large enough | |
254 | * to hold the epsilon closure. | |
255 | * | |
256 | * hashval is the hash value for the dfa corresponding to the state set | |
257 | */ | |
258 | ||
259 | int *epsclosure( t, ns_addr, accset, nacc_addr, hv_addr ) | |
260 | int *t, *ns_addr, accset[], *nacc_addr, *hv_addr; | |
261 | ||
262 | { | |
263 | register int stkpos, ns, tsp; | |
264 | int numstates = *ns_addr, nacc, hashval, transsym, nfaccnum; | |
265 | int stkend, nstate; | |
266 | static int did_stk_init = false, *stk; | |
267 | ||
268 | #define MARK_STATE(state) \ | |
269 | trans1[state] = trans1[state] - MARKER_DIFFERENCE; | |
270 | ||
271 | #define IS_MARKED(state) (trans1[state] < 0) | |
272 | ||
273 | #define UNMARK_STATE(state) \ | |
274 | trans1[state] = trans1[state] + MARKER_DIFFERENCE; | |
275 | ||
276 | #define CHECK_ACCEPT(state) \ | |
277 | { \ | |
278 | nfaccnum = accptnum[state]; \ | |
279 | if ( nfaccnum != NIL ) \ | |
280 | accset[++nacc] = nfaccnum; \ | |
281 | } | |
282 | ||
283 | #define DO_REALLOCATION \ | |
284 | { \ | |
285 | current_max_dfa_size += MAX_DFA_SIZE_INCREMENT; \ | |
286 | ++num_reallocs; \ | |
287 | t = reallocate_integer_array( t, current_max_dfa_size ); \ | |
288 | stk = reallocate_integer_array( stk, current_max_dfa_size ); \ | |
289 | } \ | |
290 | ||
291 | #define PUT_ON_STACK(state) \ | |
292 | { \ | |
293 | if ( ++stkend >= current_max_dfa_size ) \ | |
294 | DO_REALLOCATION \ | |
295 | stk[stkend] = state; \ | |
296 | MARK_STATE(state) \ | |
297 | } | |
298 | ||
299 | #define ADD_STATE(state) \ | |
300 | { \ | |
301 | if ( ++numstates >= current_max_dfa_size ) \ | |
302 | DO_REALLOCATION \ | |
303 | t[numstates] = state; \ | |
304 | hashval = hashval + state; \ | |
305 | } | |
306 | ||
307 | #define STACK_STATE(state) \ | |
308 | { \ | |
309 | PUT_ON_STACK(state) \ | |
310 | CHECK_ACCEPT(state) \ | |
311 | if ( nfaccnum != NIL || transchar[state] != SYM_EPSILON ) \ | |
312 | ADD_STATE(state) \ | |
313 | } | |
314 | ||
315 | if ( ! did_stk_init ) | |
316 | { | |
317 | stk = allocate_integer_array( current_max_dfa_size ); | |
318 | did_stk_init = true; | |
319 | } | |
320 | ||
321 | nacc = stkend = hashval = 0; | |
322 | ||
323 | for ( nstate = 1; nstate <= numstates; ++nstate ) | |
324 | { | |
325 | ns = t[nstate]; | |
326 | ||
327 | /* the state could be marked if we've already pushed it onto | |
328 | * the stack | |
329 | */ | |
330 | if ( ! IS_MARKED(ns) ) | |
331 | PUT_ON_STACK(ns) | |
332 | ||
333 | CHECK_ACCEPT(ns) | |
334 | hashval = hashval + ns; | |
335 | } | |
336 | ||
337 | for ( stkpos = 1; stkpos <= stkend; ++stkpos ) | |
338 | { | |
339 | ns = stk[stkpos]; | |
340 | transsym = transchar[ns]; | |
341 | ||
342 | if ( transsym == SYM_EPSILON ) | |
343 | { | |
344 | tsp = trans1[ns] + MARKER_DIFFERENCE; | |
345 | ||
346 | if ( tsp != NO_TRANSITION ) | |
347 | { | |
348 | if ( ! IS_MARKED(tsp) ) | |
349 | STACK_STATE(tsp) | |
350 | ||
351 | tsp = trans2[ns]; | |
352 | ||
353 | if ( tsp != NO_TRANSITION ) | |
354 | if ( ! IS_MARKED(tsp) ) | |
355 | STACK_STATE(tsp) | |
356 | } | |
357 | } | |
358 | } | |
359 | ||
360 | /* clear out "visit" markers */ | |
361 | ||
362 | for ( stkpos = 1; stkpos <= stkend; ++stkpos ) | |
363 | { | |
364 | if ( IS_MARKED(stk[stkpos]) ) | |
365 | { | |
366 | UNMARK_STATE(stk[stkpos]) | |
367 | } | |
368 | else | |
369 | flexfatal( "consistency check failed in epsclosure()" ); | |
370 | } | |
371 | ||
372 | *ns_addr = numstates; | |
373 | *hv_addr = hashval; | |
374 | *nacc_addr = nacc; | |
375 | ||
376 | return ( t ); | |
377 | } | |
378 | ||
379 | ||
380 | /* increase_max_dfas - increase the maximum number of DFAs */ | |
381 | ||
382 | void increase_max_dfas() | |
383 | ||
384 | { | |
385 | current_max_dfas += MAX_DFAS_INCREMENT; | |
386 | ||
387 | ++num_reallocs; | |
388 | ||
389 | base = reallocate_integer_array( base, current_max_dfas ); | |
390 | def = reallocate_integer_array( def, current_max_dfas ); | |
391 | dfasiz = reallocate_integer_array( dfasiz, current_max_dfas ); | |
392 | accsiz = reallocate_integer_array( accsiz, current_max_dfas ); | |
393 | dhash = reallocate_integer_array( dhash, current_max_dfas ); | |
394 | dss = reallocate_int_ptr_array( dss, current_max_dfas ); | |
395 | dfaacc = reallocate_dfaacc_union( dfaacc, current_max_dfas ); | |
396 | ||
397 | if ( nultrans ) | |
398 | nultrans = reallocate_integer_array( nultrans, current_max_dfas ); | |
399 | } | |
400 | ||
401 | ||
402 | /* ntod - convert an ndfa to a dfa | |
403 | * | |
404 | * synopsis | |
405 | * ntod(); | |
406 | * | |
407 | * creates the dfa corresponding to the ndfa we've constructed. the | |
408 | * dfa starts out in state #1. | |
409 | */ | |
410 | ||
411 | void ntod() | |
412 | ||
413 | { | |
414 | int *accset, ds, nacc, newds; | |
415 | int sym, hashval, numstates, dsize; | |
416 | int num_full_table_rows; /* used only for -f */ | |
417 | int *nset, *dset; | |
418 | int targptr, totaltrans, i, comstate, comfreq, targ; | |
419 | int *epsclosure(), snstods(), symlist[CSIZE + 1]; | |
420 | int num_start_states; | |
421 | int todo_head, todo_next; | |
422 | ||
423 | /* note that the following are indexed by *equivalence classes* | |
424 | * and not by characters. Since equivalence classes are indexed | |
425 | * beginning with 1, even if the scanner accepts NUL's, this | |
426 | * means that (since every character is potentially in its own | |
427 | * equivalence class) these arrays must have room for indices | |
428 | * from 1 to CSIZE, so their size must be CSIZE + 1. | |
429 | */ | |
430 | int duplist[CSIZE + 1], state[CSIZE + 1]; | |
431 | int targfreq[CSIZE + 1], targstate[CSIZE + 1]; | |
432 | ||
433 | /* this is so find_table_space(...) will know where to start looking in | |
434 | * chk/nxt for unused records for space to put in the state | |
435 | */ | |
436 | if ( fullspd ) | |
437 | firstfree = 0; | |
438 | ||
439 | accset = allocate_integer_array( num_rules + 1 ); | |
440 | nset = allocate_integer_array( current_max_dfa_size ); | |
441 | ||
442 | /* the "todo" queue is represented by the head, which is the DFA | |
443 | * state currently being processed, and the "next", which is the | |
444 | * next DFA state number available (not in use). We depend on the | |
445 | * fact that snstods() returns DFA's \in increasing order/, and thus | |
446 | * need only know the bounds of the dfas to be processed. | |
447 | */ | |
448 | todo_head = todo_next = 0; | |
449 | ||
450 | for ( i = 0; i <= csize; ++i ) | |
451 | { | |
452 | duplist[i] = NIL; | |
453 | symlist[i] = false; | |
454 | } | |
455 | ||
456 | for ( i = 0; i <= num_rules; ++i ) | |
457 | accset[i] = NIL; | |
458 | ||
459 | if ( trace ) | |
460 | { | |
461 | dumpnfa( scset[1] ); | |
462 | fputs( "\n\nDFA Dump:\n\n", stderr ); | |
463 | } | |
464 | ||
465 | inittbl(); | |
466 | ||
467 | /* check to see whether we should build a separate table for transitions | |
468 | * on NUL characters. We don't do this for full-speed (-F) scanners, | |
469 | * since for them we don't have a simple state number lying around with | |
470 | * which to index the table. We also don't bother doing it for scanners | |
471 | * unless (1) NUL is in its own equivalence class (indicated by a | |
472 | * positive value of ecgroup[NUL]), (2) NUL's equilvalence class is | |
473 | * the last equivalence class, and (3) the number of equivalence classes | |
474 | * is the same as the number of characters. This latter case comes about | |
475 | * when useecs is false or when its true but every character still | |
476 | * manages to land in its own class (unlikely, but it's cheap to check | |
477 | * for). If all these things are true then the character code needed | |
478 | * to represent NUL's equivalence class for indexing the tables is | |
479 | * going to take one more bit than the number of characters, and therefore | |
480 | * we won't be assured of being able to fit it into a YY_CHAR variable. | |
481 | * This rules out storing the transitions in a compressed table, since | |
482 | * the code for interpreting them uses a YY_CHAR variable (perhaps it | |
483 | * should just use an integer, though; this is worth pondering ... ###). | |
484 | * | |
485 | * Finally, for full tables, we want the number of entries in the | |
486 | * table to be a power of two so the array references go fast (it | |
487 | * will just take a shift to compute the major index). If encoding | |
488 | * NUL's transitions in the table will spoil this, we give it its | |
489 | * own table (note that this will be the case if we're not using | |
490 | * equivalence classes). | |
491 | */ | |
492 | ||
493 | /* note that the test for ecgroup[0] == numecs below accomplishes | |
494 | * both (1) and (2) above | |
495 | */ | |
496 | if ( ! fullspd && ecgroup[0] == numecs ) | |
497 | { /* NUL is alone in its equivalence class, which is the last one */ | |
498 | int use_NUL_table = (numecs == csize); | |
499 | ||
500 | if ( fulltbl && ! use_NUL_table ) | |
501 | { /* we still may want to use the table if numecs is a power of 2 */ | |
502 | int power_of_two; | |
503 | ||
504 | for ( power_of_two = 1; power_of_two <= csize; power_of_two *= 2 ) | |
505 | if ( numecs == power_of_two ) | |
506 | { | |
507 | use_NUL_table = true; | |
508 | break; | |
509 | } | |
510 | } | |
511 | ||
512 | if ( use_NUL_table ) | |
513 | nultrans = allocate_integer_array( current_max_dfas ); | |
514 | /* from now on, nultrans != nil indicates that we're | |
515 | * saving null transitions for later, separate encoding | |
516 | */ | |
517 | } | |
518 | ||
519 | ||
520 | if ( fullspd ) | |
521 | { | |
522 | for ( i = 0; i <= numecs; ++i ) | |
523 | state[i] = 0; | |
524 | place_state( state, 0, 0 ); | |
525 | } | |
526 | ||
527 | else if ( fulltbl ) | |
528 | { | |
529 | if ( nultrans ) | |
530 | /* we won't be including NUL's transitions in the table, | |
531 | * so build it for entries from 0 .. numecs - 1 | |
532 | */ | |
533 | num_full_table_rows = numecs; | |
534 | ||
535 | else | |
536 | /* take into account the fact that we'll be including | |
537 | * the NUL entries in the transition table. Build it | |
538 | * from 0 .. numecs. | |
539 | */ | |
540 | num_full_table_rows = numecs + 1; | |
541 | ||
542 | /* declare it "short" because it's a real long-shot that that | |
543 | * won't be large enough. | |
544 | */ | |
545 | printf( "static short int yy_nxt[][%d] =\n {\n", | |
546 | /* '}' so vi doesn't get too confused */ | |
547 | num_full_table_rows ); | |
548 | ||
549 | /* generate 0 entries for state #0 */ | |
550 | for ( i = 0; i < num_full_table_rows; ++i ) | |
551 | mk2data( 0 ); | |
552 | ||
553 | /* force ',' and dataflush() next call to mk2data */ | |
554 | datapos = NUMDATAITEMS; | |
555 | ||
556 | /* force extra blank line next dataflush() */ | |
557 | dataline = NUMDATALINES; | |
558 | } | |
559 | ||
560 | /* create the first states */ | |
561 | ||
562 | num_start_states = lastsc * 2; | |
563 | ||
564 | for ( i = 1; i <= num_start_states; ++i ) | |
565 | { | |
566 | numstates = 1; | |
567 | ||
568 | /* for each start condition, make one state for the case when | |
569 | * we're at the beginning of the line (the '%' operator) and | |
570 | * one for the case when we're not | |
571 | */ | |
572 | if ( i % 2 == 1 ) | |
573 | nset[numstates] = scset[(i / 2) + 1]; | |
574 | else | |
575 | nset[numstates] = mkbranch( scbol[i / 2], scset[i / 2] ); | |
576 | ||
577 | nset = epsclosure( nset, &numstates, accset, &nacc, &hashval ); | |
578 | ||
579 | if ( snstods( nset, numstates, accset, nacc, hashval, &ds ) ) | |
580 | { | |
581 | numas += nacc; | |
582 | totnst += numstates; | |
583 | ++todo_next; | |
584 | ||
585 | if ( variable_trailing_context_rules && nacc > 0 ) | |
586 | check_trailing_context( nset, numstates, accset, nacc ); | |
587 | } | |
588 | } | |
589 | ||
590 | if ( ! fullspd ) | |
591 | { | |
592 | if ( ! snstods( nset, 0, accset, 0, 0, &end_of_buffer_state ) ) | |
593 | flexfatal( "could not create unique end-of-buffer state" ); | |
594 | ||
595 | ++numas; | |
596 | ++num_start_states; | |
597 | ++todo_next; | |
598 | } | |
599 | ||
600 | while ( todo_head < todo_next ) | |
601 | { | |
602 | targptr = 0; | |
603 | totaltrans = 0; | |
604 | ||
605 | for ( i = 1; i <= numecs; ++i ) | |
606 | state[i] = 0; | |
607 | ||
608 | ds = ++todo_head; | |
609 | ||
610 | dset = dss[ds]; | |
611 | dsize = dfasiz[ds]; | |
612 | ||
613 | if ( trace ) | |
614 | fprintf( stderr, "state # %d:\n", ds ); | |
615 | ||
616 | sympartition( dset, dsize, symlist, duplist ); | |
617 | ||
618 | for ( sym = 1; sym <= numecs; ++sym ) | |
619 | { | |
620 | if ( symlist[sym] ) | |
621 | { | |
622 | symlist[sym] = 0; | |
623 | ||
624 | if ( duplist[sym] == NIL ) | |
625 | { /* symbol has unique out-transitions */ | |
626 | numstates = symfollowset( dset, dsize, sym, nset ); | |
627 | nset = epsclosure( nset, &numstates, accset, | |
628 | &nacc, &hashval ); | |
629 | ||
630 | if ( snstods( nset, numstates, accset, | |
631 | nacc, hashval, &newds ) ) | |
632 | { | |
633 | totnst = totnst + numstates; | |
634 | ++todo_next; | |
635 | numas += nacc; | |
636 | ||
637 | if ( variable_trailing_context_rules && nacc > 0 ) | |
638 | check_trailing_context( nset, numstates, | |
639 | accset, nacc ); | |
640 | } | |
641 | ||
642 | state[sym] = newds; | |
643 | ||
644 | if ( trace ) | |
645 | fprintf( stderr, "\t%d\t%d\n", sym, newds ); | |
646 | ||
647 | targfreq[++targptr] = 1; | |
648 | targstate[targptr] = newds; | |
649 | ++numuniq; | |
650 | } | |
651 | ||
652 | else | |
653 | { | |
654 | /* sym's equivalence class has the same transitions | |
655 | * as duplist(sym)'s equivalence class | |
656 | */ | |
657 | targ = state[duplist[sym]]; | |
658 | state[sym] = targ; | |
659 | ||
660 | if ( trace ) | |
661 | fprintf( stderr, "\t%d\t%d\n", sym, targ ); | |
662 | ||
663 | /* update frequency count for destination state */ | |
664 | ||
665 | i = 0; | |
666 | while ( targstate[++i] != targ ) | |
667 | ; | |
668 | ||
669 | ++targfreq[i]; | |
670 | ++numdup; | |
671 | } | |
672 | ||
673 | ++totaltrans; | |
674 | duplist[sym] = NIL; | |
675 | } | |
676 | } | |
677 | ||
678 | numsnpairs = numsnpairs + totaltrans; | |
679 | ||
680 | if ( caseins && ! useecs ) | |
681 | { | |
682 | register int j; | |
683 | ||
684 | for ( i = 'A', j = 'a'; i <= 'Z'; ++i, ++j ) | |
685 | state[i] = state[j]; | |
686 | } | |
687 | ||
688 | if ( ds > num_start_states ) | |
689 | check_for_backtracking( ds, state ); | |
690 | ||
691 | if ( nultrans ) | |
692 | { | |
693 | nultrans[ds] = state[NUL_ec]; | |
694 | state[NUL_ec] = 0; /* remove transition */ | |
695 | } | |
696 | ||
697 | if ( fulltbl ) | |
698 | { | |
699 | /* supply array's 0-element */ | |
700 | if ( ds == end_of_buffer_state ) | |
701 | mk2data( -end_of_buffer_state ); | |
702 | else | |
703 | mk2data( end_of_buffer_state ); | |
704 | ||
705 | for ( i = 1; i < num_full_table_rows; ++i ) | |
706 | /* jams are marked by negative of state number */ | |
707 | mk2data( state[i] ? state[i] : -ds ); | |
708 | ||
709 | /* force ',' and dataflush() next call to mk2data */ | |
710 | datapos = NUMDATAITEMS; | |
711 | ||
712 | /* force extra blank line next dataflush() */ | |
713 | dataline = NUMDATALINES; | |
714 | } | |
715 | ||
716 | else if ( fullspd ) | |
717 | place_state( state, ds, totaltrans ); | |
718 | ||
719 | else if ( ds == end_of_buffer_state ) | |
720 | /* special case this state to make sure it does what it's | |
721 | * supposed to, i.e., jam on end-of-buffer | |
722 | */ | |
723 | stack1( ds, 0, 0, JAMSTATE ); | |
724 | ||
725 | else /* normal, compressed state */ | |
726 | { | |
727 | /* determine which destination state is the most common, and | |
728 | * how many transitions to it there are | |
729 | */ | |
730 | ||
731 | comfreq = 0; | |
732 | comstate = 0; | |
733 | ||
734 | for ( i = 1; i <= targptr; ++i ) | |
735 | if ( targfreq[i] > comfreq ) | |
736 | { | |
737 | comfreq = targfreq[i]; | |
738 | comstate = targstate[i]; | |
739 | } | |
740 | ||
741 | bldtbl( state, ds, totaltrans, comstate, comfreq ); | |
742 | } | |
743 | } | |
744 | ||
745 | if ( fulltbl ) | |
746 | dataend(); | |
747 | ||
748 | else if ( ! fullspd ) | |
749 | { | |
750 | cmptmps(); /* create compressed template entries */ | |
751 | ||
752 | /* create tables for all the states with only one out-transition */ | |
753 | while ( onesp > 0 ) | |
754 | { | |
755 | mk1tbl( onestate[onesp], onesym[onesp], onenext[onesp], | |
756 | onedef[onesp] ); | |
757 | --onesp; | |
758 | } | |
759 | ||
760 | mkdeftbl(); | |
761 | } | |
762 | } | |
763 | ||
764 | ||
765 | /* snstods - converts a set of ndfa states into a dfa state | |
766 | * | |
767 | * synopsis | |
768 | * int sns[numstates], numstates, newds, accset[num_rules + 1], nacc, hashval; | |
769 | * int snstods(); | |
770 | * is_new_state = snstods( sns, numstates, accset, nacc, hashval, &newds ); | |
771 | * | |
772 | * on return, the dfa state number is in newds. | |
773 | */ | |
774 | ||
775 | int snstods( sns, numstates, accset, nacc, hashval, newds_addr ) | |
776 | int sns[], numstates, accset[], nacc, hashval, *newds_addr; | |
777 | ||
778 | { | |
779 | int didsort = 0; | |
780 | register int i, j; | |
781 | int newds, *oldsns; | |
782 | ||
783 | for ( i = 1; i <= lastdfa; ++i ) | |
784 | if ( hashval == dhash[i] ) | |
785 | { | |
786 | if ( numstates == dfasiz[i] ) | |
787 | { | |
788 | oldsns = dss[i]; | |
789 | ||
790 | if ( ! didsort ) | |
791 | { | |
792 | /* we sort the states in sns so we can compare it to | |
793 | * oldsns quickly. we use bubble because there probably | |
794 | * aren't very many states | |
795 | */ | |
796 | bubble( sns, numstates ); | |
797 | didsort = 1; | |
798 | } | |
799 | ||
800 | for ( j = 1; j <= numstates; ++j ) | |
801 | if ( sns[j] != oldsns[j] ) | |
802 | break; | |
803 | ||
804 | if ( j > numstates ) | |
805 | { | |
806 | ++dfaeql; | |
807 | *newds_addr = i; | |
808 | return ( 0 ); | |
809 | } | |
810 | ||
811 | ++hshcol; | |
812 | } | |
813 | ||
814 | else | |
815 | ++hshsave; | |
816 | } | |
817 | ||
818 | /* make a new dfa */ | |
819 | ||
820 | if ( ++lastdfa >= current_max_dfas ) | |
821 | increase_max_dfas(); | |
822 | ||
823 | newds = lastdfa; | |
824 | ||
825 | dss[newds] = (int *) malloc( (unsigned) ((numstates + 1) * sizeof( int )) ); | |
826 | ||
827 | if ( ! dss[newds] ) | |
828 | flexfatal( "dynamic memory failure in snstods()" ); | |
829 | ||
830 | /* if we haven't already sorted the states in sns, we do so now, so that | |
831 | * future comparisons with it can be made quickly | |
832 | */ | |
833 | ||
834 | if ( ! didsort ) | |
835 | bubble( sns, numstates ); | |
836 | ||
837 | for ( i = 1; i <= numstates; ++i ) | |
838 | dss[newds][i] = sns[i]; | |
839 | ||
840 | dfasiz[newds] = numstates; | |
841 | dhash[newds] = hashval; | |
842 | ||
843 | if ( nacc == 0 ) | |
844 | { | |
845 | if ( reject ) | |
846 | dfaacc[newds].dfaacc_set = (int *) 0; | |
847 | else | |
848 | dfaacc[newds].dfaacc_state = 0; | |
849 | ||
850 | accsiz[newds] = 0; | |
851 | } | |
852 | ||
853 | else if ( reject ) | |
854 | { | |
855 | /* we sort the accepting set in increasing order so the disambiguating | |
856 | * rule that the first rule listed is considered match in the event of | |
857 | * ties will work. We use a bubble sort since the list is probably | |
858 | * quite small. | |
859 | */ | |
860 | ||
861 | bubble( accset, nacc ); | |
862 | ||
863 | dfaacc[newds].dfaacc_set = | |
864 | (int *) malloc( (unsigned) ((nacc + 1) * sizeof( int )) ); | |
865 | ||
866 | if ( ! dfaacc[newds].dfaacc_set ) | |
867 | flexfatal( "dynamic memory failure in snstods()" ); | |
868 | ||
869 | /* save the accepting set for later */ | |
870 | for ( i = 1; i <= nacc; ++i ) | |
871 | dfaacc[newds].dfaacc_set[i] = accset[i]; | |
872 | ||
873 | accsiz[newds] = nacc; | |
874 | } | |
875 | ||
876 | else | |
877 | { /* find lowest numbered rule so the disambiguating rule will work */ | |
878 | j = num_rules + 1; | |
879 | ||
880 | for ( i = 1; i <= nacc; ++i ) | |
881 | if ( accset[i] < j ) | |
882 | j = accset[i]; | |
883 | ||
884 | dfaacc[newds].dfaacc_state = j; | |
885 | } | |
886 | ||
887 | *newds_addr = newds; | |
888 | ||
889 | return ( 1 ); | |
890 | } | |
891 | ||
892 | ||
893 | /* symfollowset - follow the symbol transitions one step | |
894 | * | |
895 | * synopsis | |
896 | * int ds[current_max_dfa_size], dsize, transsym; | |
897 | * int nset[current_max_dfa_size], numstates; | |
898 | * numstates = symfollowset( ds, dsize, transsym, nset ); | |
899 | */ | |
900 | ||
901 | int symfollowset( ds, dsize, transsym, nset ) | |
902 | int ds[], dsize, transsym, nset[]; | |
903 | ||
904 | { | |
905 | int ns, tsp, sym, i, j, lenccl, ch, numstates; | |
906 | int ccllist; | |
907 | ||
908 | numstates = 0; | |
909 | ||
910 | for ( i = 1; i <= dsize; ++i ) | |
911 | { /* for each nfa state ns in the state set of ds */ | |
912 | ns = ds[i]; | |
913 | sym = transchar[ns]; | |
914 | tsp = trans1[ns]; | |
915 | ||
916 | if ( sym < 0 ) | |
917 | { /* it's a character class */ | |
918 | sym = -sym; | |
919 | ccllist = cclmap[sym]; | |
920 | lenccl = ccllen[sym]; | |
921 | ||
922 | if ( cclng[sym] ) | |
923 | { | |
924 | for ( j = 0; j < lenccl; ++j ) | |
925 | { /* loop through negated character class */ | |
926 | ch = ccltbl[ccllist + j]; | |
927 | ||
928 | if ( ch == 0 ) | |
929 | ch = NUL_ec; | |
930 | ||
931 | if ( ch > transsym ) | |
932 | break; /* transsym isn't in negated ccl */ | |
933 | ||
934 | else if ( ch == transsym ) | |
935 | /* next 2 */ goto bottom; | |
936 | } | |
937 | ||
938 | /* didn't find transsym in ccl */ | |
939 | nset[++numstates] = tsp; | |
940 | } | |
941 | ||
942 | else | |
943 | for ( j = 0; j < lenccl; ++j ) | |
944 | { | |
945 | ch = ccltbl[ccllist + j]; | |
946 | ||
947 | if ( ch == 0 ) | |
948 | ch = NUL_ec; | |
949 | ||
950 | if ( ch > transsym ) | |
951 | break; | |
952 | ||
953 | else if ( ch == transsym ) | |
954 | { | |
955 | nset[++numstates] = tsp; | |
956 | break; | |
957 | } | |
958 | } | |
959 | } | |
960 | ||
961 | else if ( sym >= 'A' && sym <= 'Z' && caseins ) | |
962 | flexfatal( "consistency check failed in symfollowset" ); | |
963 | ||
964 | else if ( sym == SYM_EPSILON ) | |
965 | { /* do nothing */ | |
966 | } | |
967 | ||
968 | else if ( abs( ecgroup[sym] ) == transsym ) | |
969 | nset[++numstates] = tsp; | |
970 | ||
971 | bottom: | |
972 | ; | |
973 | } | |
974 | ||
975 | return ( numstates ); | |
976 | } | |
977 | ||
978 | ||
979 | /* sympartition - partition characters with same out-transitions | |
980 | * | |
981 | * synopsis | |
982 | * integer ds[current_max_dfa_size], numstates, duplist[numecs]; | |
983 | * symlist[numecs]; | |
984 | * sympartition( ds, numstates, symlist, duplist ); | |
985 | */ | |
986 | ||
987 | void sympartition( ds, numstates, symlist, duplist ) | |
988 | int ds[], numstates, duplist[]; | |
989 | int symlist[]; | |
990 | ||
991 | { | |
992 | int tch, i, j, k, ns, dupfwd[CSIZE + 1], lenccl, cclp, ich; | |
993 | ||
994 | /* partitioning is done by creating equivalence classes for those | |
995 | * characters which have out-transitions from the given state. Thus | |
996 | * we are really creating equivalence classes of equivalence classes. | |
997 | */ | |
998 | ||
999 | for ( i = 1; i <= numecs; ++i ) | |
1000 | { /* initialize equivalence class list */ | |
1001 | duplist[i] = i - 1; | |
1002 | dupfwd[i] = i + 1; | |
1003 | } | |
1004 | ||
1005 | duplist[1] = NIL; | |
1006 | dupfwd[numecs] = NIL; | |
1007 | ||
1008 | for ( i = 1; i <= numstates; ++i ) | |
1009 | { | |
1010 | ns = ds[i]; | |
1011 | tch = transchar[ns]; | |
1012 | ||
1013 | if ( tch != SYM_EPSILON ) | |
1014 | { | |
7d684d5e | 1015 | if ( tch < -lastccl || tch >= csize ) |
15637ed4 | 1016 | { |
7d684d5e | 1017 | if ( tch >= csize && tch <= CSIZE ) |
15637ed4 RG |
1018 | flexerror( "scanner requires -8 flag" ); |
1019 | ||
1020 | else | |
1021 | flexfatal( | |
1022 | "bad transition character detected in sympartition()" ); | |
1023 | } | |
1024 | ||
1025 | if ( tch >= 0 ) | |
1026 | { /* character transition */ | |
1027 | /* abs() needed for fake %t ec's */ | |
1028 | int ec = abs( ecgroup[tch] ); | |
1029 | ||
1030 | mkechar( ec, dupfwd, duplist ); | |
1031 | symlist[ec] = 1; | |
1032 | } | |
1033 | ||
1034 | else | |
1035 | { /* character class */ | |
1036 | tch = -tch; | |
1037 | ||
1038 | lenccl = ccllen[tch]; | |
1039 | cclp = cclmap[tch]; | |
1040 | mkeccl( ccltbl + cclp, lenccl, dupfwd, duplist, numecs, | |
1041 | NUL_ec ); | |
1042 | ||
1043 | if ( cclng[tch] ) | |
1044 | { | |
1045 | j = 0; | |
1046 | ||
1047 | for ( k = 0; k < lenccl; ++k ) | |
1048 | { | |
1049 | ich = ccltbl[cclp + k]; | |
1050 | ||
1051 | if ( ich == 0 ) | |
1052 | ich = NUL_ec; | |
1053 | ||
1054 | for ( ++j; j < ich; ++j ) | |
1055 | symlist[j] = 1; | |
1056 | } | |
1057 | ||
1058 | for ( ++j; j <= numecs; ++j ) | |
1059 | symlist[j] = 1; | |
1060 | } | |
1061 | ||
1062 | else | |
1063 | for ( k = 0; k < lenccl; ++k ) | |
1064 | { | |
1065 | ich = ccltbl[cclp + k]; | |
1066 | ||
1067 | if ( ich == 0 ) | |
1068 | ich = NUL_ec; | |
1069 | ||
1070 | symlist[ich] = 1; | |
1071 | } | |
1072 | } | |
1073 | } | |
1074 | } | |
1075 | } |