Initial import, 0.1 + pk 0.2.4-B1
[unix-history] / usr.bin / lex / nfa.c
CommitLineData
15637ed4
RG
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Vern Paxson of Lawrence Berkeley Laboratory.
7 *
8 * The United States Government has rights in this work pursuant
9 * to contract no. DE-AC03-76SF00098 between the United States
10 * Department of Energy and the University of California.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41#ifndef lint
42static char sccsid[] = "@(#)nfa.c 5.2 (Berkeley) 6/18/90";
43#endif /* not lint */
44
45/* nfa - NFA construction routines */
46
47#include "flexdef.h"
48
49/* declare functions that have forward references */
50
51int dupmachine PROTO((int));
52void mkxtion PROTO((int, int));
53
54
55/* add_accept - add an accepting state to a machine
56 *
57 * synopsis
58 *
59 * add_accept( mach, accepting_number );
60 *
61 * accepting_number becomes mach's accepting number.
62 */
63
64void add_accept( mach, accepting_number )
65int mach, accepting_number;
66
67 {
68 /* hang the accepting number off an epsilon state. if it is associated
69 * with a state that has a non-epsilon out-transition, then the state
70 * will accept BEFORE it makes that transition, i.e., one character
71 * too soon
72 */
73
74 if ( transchar[finalst[mach]] == SYM_EPSILON )
75 accptnum[finalst[mach]] = accepting_number;
76
77 else
78 {
79 int astate = mkstate( SYM_EPSILON );
80 accptnum[astate] = accepting_number;
81 mach = link_machines( mach, astate );
82 }
83 }
84
85
86/* copysingl - make a given number of copies of a singleton machine
87 *
88 * synopsis
89 *
90 * newsng = copysingl( singl, num );
91 *
92 * newsng - a new singleton composed of num copies of singl
93 * singl - a singleton machine
94 * num - the number of copies of singl to be present in newsng
95 */
96
97int copysingl( singl, num )
98int singl, num;
99
100 {
101 int copy, i;
102
103 copy = mkstate( SYM_EPSILON );
104
105 for ( i = 1; i <= num; ++i )
106 copy = link_machines( copy, dupmachine( singl ) );
107
108 return ( copy );
109 }
110
111
112/* dumpnfa - debugging routine to write out an nfa
113 *
114 * synopsis
115 * int state1;
116 * dumpnfa( state1 );
117 */
118
119void dumpnfa( state1 )
120int state1;
121
122 {
123 int sym, tsp1, tsp2, anum, ns;
124
125 fprintf( stderr, "\n\n********** beginning dump of nfa with start state %d\n",
126 state1 );
127
128 /* we probably should loop starting at firstst[state1] and going to
129 * lastst[state1], but they're not maintained properly when we "or"
130 * all of the rules together. So we use our knowledge that the machine
131 * starts at state 1 and ends at lastnfa.
132 */
133
134 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
135 for ( ns = 1; ns <= lastnfa; ++ns )
136 {
137 fprintf( stderr, "state # %4d\t", ns );
138
139 sym = transchar[ns];
140 tsp1 = trans1[ns];
141 tsp2 = trans2[ns];
142 anum = accptnum[ns];
143
144 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
145
146 if ( anum != NIL )
147 fprintf( stderr, " [%d]", anum );
148
149 fprintf( stderr, "\n" );
150 }
151
152 fprintf( stderr, "********** end of dump\n" );
153 }
154
155
156/* dupmachine - make a duplicate of a given machine
157 *
158 * synopsis
159 *
160 * copy = dupmachine( mach );
161 *
162 * copy - holds duplicate of mach
163 * mach - machine to be duplicated
164 *
165 * note that the copy of mach is NOT an exact duplicate; rather, all the
166 * transition states values are adjusted so that the copy is self-contained,
167 * as the original should have been.
168 *
169 * also note that the original MUST be contiguous, with its low and high
170 * states accessible by the arrays firstst and lastst
171 */
172
173int dupmachine( mach )
174int mach;
175
176 {
177 int i, init, state_offset;
178 int state = 0;
179 int last = lastst[mach];
180
181 for ( i = firstst[mach]; i <= last; ++i )
182 {
183 state = mkstate( transchar[i] );
184
185 if ( trans1[i] != NO_TRANSITION )
186 {
187 mkxtion( finalst[state], trans1[i] + state - i );
188
189 if ( transchar[i] == SYM_EPSILON && trans2[i] != NO_TRANSITION )
190 mkxtion( finalst[state], trans2[i] + state - i );
191 }
192
193 accptnum[state] = accptnum[i];
194 }
195
196 if ( state == 0 )
197 flexfatal( "empty machine in dupmachine()" );
198
199 state_offset = state - i + 1;
200
201 init = mach + state_offset;
202 firstst[init] = firstst[mach] + state_offset;
203 finalst[init] = finalst[mach] + state_offset;
204 lastst[init] = lastst[mach] + state_offset;
205
206 return ( init );
207 }
208
209
210/* finish_rule - finish up the processing for a rule
211 *
212 * synopsis
213 *
214 * finish_rule( mach, variable_trail_rule, headcnt, trailcnt );
215 *
216 * An accepting number is added to the given machine. If variable_trail_rule
217 * is true then the rule has trailing context and both the head and trail
218 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
219 * the machine recognizes a pattern with trailing context and headcnt is
220 * the number of characters in the matched part of the pattern, or zero
221 * if the matched part has variable length. trailcnt is the number of
222 * trailing context characters in the pattern, or zero if the trailing
223 * context has variable length.
224 */
225
226void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
227int mach, variable_trail_rule, headcnt, trailcnt;
228
229 {
230 add_accept( mach, num_rules );
231
232 /* we did this in new_rule(), but it often gets the wrong
233 * number because we do it before we start parsing the current rule
234 */
235 rule_linenum[num_rules] = linenum;
236
237 /* if this is a continued action, then the line-number has
238 * already been updated, giving us the wrong number
239 */
240 if ( continued_action )
241 --rule_linenum[num_rules];
242
243 fprintf( temp_action_file, "case %d:\n", num_rules );
244
245 if ( variable_trail_rule )
246 {
247 rule_type[num_rules] = RULE_VARIABLE;
248
249 if ( performance_report )
250 fprintf( stderr, "Variable trailing context rule at line %d\n",
251 rule_linenum[num_rules] );
252
253 variable_trailing_context_rules = true;
254 }
255
256 else
257 {
258 rule_type[num_rules] = RULE_NORMAL;
259
260 if ( headcnt > 0 || trailcnt > 0 )
261 {
262 /* do trailing context magic to not match the trailing characters */
263 char *scanner_cp = "yy_c_buf_p = yy_cp";
264 char *scanner_bp = "yy_bp";
265
266 fprintf( temp_action_file,
267 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
268
269 if ( headcnt > 0 )
270 fprintf( temp_action_file, "%s = %s + %d;\n",
271 scanner_cp, scanner_bp, headcnt );
272
273 else
274 fprintf( temp_action_file,
275 "%s -= %d;\n", scanner_cp, trailcnt );
276
277 fprintf( temp_action_file,
278 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
279 }
280 }
281
282 line_directive_out( temp_action_file );
283 }
284
285
286/* link_machines - connect two machines together
287 *
288 * synopsis
289 *
290 * new = link_machines( first, last );
291 *
292 * new - a machine constructed by connecting first to last
293 * first - the machine whose successor is to be last
294 * last - the machine whose predecessor is to be first
295 *
296 * note: this routine concatenates the machine first with the machine
297 * last to produce a machine new which will pattern-match first first
298 * and then last, and will fail if either of the sub-patterns fails.
299 * FIRST is set to new by the operation. last is unmolested.
300 */
301
302int link_machines( first, last )
303int first, last;
304
305 {
306 if ( first == NIL )
307 return ( last );
308
309 else if ( last == NIL )
310 return ( first );
311
312 else
313 {
314 mkxtion( finalst[first], last );
315 finalst[first] = finalst[last];
316 lastst[first] = max( lastst[first], lastst[last] );
317 firstst[first] = min( firstst[first], firstst[last] );
318
319 return ( first );
320 }
321 }
322
323
324/* mark_beginning_as_normal - mark each "beginning" state in a machine
325 * as being a "normal" (i.e., not trailing context-
326 * associated) states
327 *
328 * synopsis
329 *
330 * mark_beginning_as_normal( mach )
331 *
332 * mach - machine to mark
333 *
334 * The "beginning" states are the epsilon closure of the first state
335 */
336
337void mark_beginning_as_normal( mach )
338register int mach;
339
340 {
341 switch ( state_type[mach] )
342 {
343 case STATE_NORMAL:
344 /* oh, we've already visited here */
345 return;
346
347 case STATE_TRAILING_CONTEXT:
348 state_type[mach] = STATE_NORMAL;
349
350 if ( transchar[mach] == SYM_EPSILON )
351 {
352 if ( trans1[mach] != NO_TRANSITION )
353 mark_beginning_as_normal( trans1[mach] );
354
355 if ( trans2[mach] != NO_TRANSITION )
356 mark_beginning_as_normal( trans2[mach] );
357 }
358 break;
359
360 default:
361 flexerror( "bad state type in mark_beginning_as_normal()" );
362 break;
363 }
364 }
365
366
367/* mkbranch - make a machine that branches to two machines
368 *
369 * synopsis
370 *
371 * branch = mkbranch( first, second );
372 *
373 * branch - a machine which matches either first's pattern or second's
374 * first, second - machines whose patterns are to be or'ed (the | operator)
375 *
376 * note that first and second are NEITHER destroyed by the operation. Also,
377 * the resulting machine CANNOT be used with any other "mk" operation except
378 * more mkbranch's. Compare with mkor()
379 */
380
381int mkbranch( first, second )
382int first, second;
383
384 {
385 int eps;
386
387 if ( first == NO_TRANSITION )
388 return ( second );
389
390 else if ( second == NO_TRANSITION )
391 return ( first );
392
393 eps = mkstate( SYM_EPSILON );
394
395 mkxtion( eps, first );
396 mkxtion( eps, second );
397
398 return ( eps );
399 }
400
401
402/* mkclos - convert a machine into a closure
403 *
404 * synopsis
405 * new = mkclos( state );
406 *
407 * new - a new state which matches the closure of "state"
408 */
409
410int mkclos( state )
411int state;
412
413 {
414 return ( mkopt( mkposcl( state ) ) );
415 }
416
417
418/* mkopt - make a machine optional
419 *
420 * synopsis
421 *
422 * new = mkopt( mach );
423 *
424 * new - a machine which optionally matches whatever mach matched
425 * mach - the machine to make optional
426 *
427 * notes:
428 * 1. mach must be the last machine created
429 * 2. mach is destroyed by the call
430 */
431
432int mkopt( mach )
433int mach;
434
435 {
436 int eps;
437
438 if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
439 {
440 eps = mkstate( SYM_EPSILON );
441 mach = link_machines( mach, eps );
442 }
443
444 /* can't skimp on the following if FREE_EPSILON(mach) is true because
445 * some state interior to "mach" might point back to the beginning
446 * for a closure
447 */
448 eps = mkstate( SYM_EPSILON );
449 mach = link_machines( eps, mach );
450
451 mkxtion( mach, finalst[mach] );
452
453 return ( mach );
454 }
455
456
457/* mkor - make a machine that matches either one of two machines
458 *
459 * synopsis
460 *
461 * new = mkor( first, second );
462 *
463 * new - a machine which matches either first's pattern or second's
464 * first, second - machines whose patterns are to be or'ed (the | operator)
465 *
466 * note that first and second are both destroyed by the operation
467 * the code is rather convoluted because an attempt is made to minimize
468 * the number of epsilon states needed
469 */
470
471int mkor( first, second )
472int first, second;
473
474 {
475 int eps, orend;
476
477 if ( first == NIL )
478 return ( second );
479
480 else if ( second == NIL )
481 return ( first );
482
483 else
484 {
485 /* see comment in mkopt() about why we can't use the first state
486 * of "first" or "second" if they satisfy "FREE_EPSILON"
487 */
488 eps = mkstate( SYM_EPSILON );
489
490 first = link_machines( eps, first );
491
492 mkxtion( first, second );
493
494 if ( SUPER_FREE_EPSILON(finalst[first]) &&
495 accptnum[finalst[first]] == NIL )
496 {
497 orend = finalst[first];
498 mkxtion( finalst[second], orend );
499 }
500
501 else if ( SUPER_FREE_EPSILON(finalst[second]) &&
502 accptnum[finalst[second]] == NIL )
503 {
504 orend = finalst[second];
505 mkxtion( finalst[first], orend );
506 }
507
508 else
509 {
510 eps = mkstate( SYM_EPSILON );
511
512 first = link_machines( first, eps );
513 orend = finalst[first];
514
515 mkxtion( finalst[second], orend );
516 }
517 }
518
519 finalst[first] = orend;
520 return ( first );
521 }
522
523
524/* mkposcl - convert a machine into a positive closure
525 *
526 * synopsis
527 * new = mkposcl( state );
528 *
529 * new - a machine matching the positive closure of "state"
530 */
531
532int mkposcl( state )
533int state;
534
535 {
536 int eps;
537
538 if ( SUPER_FREE_EPSILON(finalst[state]) )
539 {
540 mkxtion( finalst[state], state );
541 return ( state );
542 }
543
544 else
545 {
546 eps = mkstate( SYM_EPSILON );
547 mkxtion( eps, state );
548 return ( link_machines( state, eps ) );
549 }
550 }
551
552
553/* mkrep - make a replicated machine
554 *
555 * synopsis
556 * new = mkrep( mach, lb, ub );
557 *
558 * new - a machine that matches whatever "mach" matched from "lb"
559 * number of times to "ub" number of times
560 *
561 * note
562 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
563 */
564
565int mkrep( mach, lb, ub )
566int mach, lb, ub;
567
568 {
569 int base_mach, tail, copy, i;
570
571 base_mach = copysingl( mach, lb - 1 );
572
573 if ( ub == INFINITY )
574 {
575 copy = dupmachine( mach );
576 mach = link_machines( mach,
577 link_machines( base_mach, mkclos( copy ) ) );
578 }
579
580 else
581 {
582 tail = mkstate( SYM_EPSILON );
583
584 for ( i = lb; i < ub; ++i )
585 {
586 copy = dupmachine( mach );
587 tail = mkopt( link_machines( copy, tail ) );
588 }
589
590 mach = link_machines( mach, link_machines( base_mach, tail ) );
591 }
592
593 return ( mach );
594 }
595
596
597/* mkstate - create a state with a transition on a given symbol
598 *
599 * synopsis
600 *
601 * state = mkstate( sym );
602 *
603 * state - a new state matching sym
604 * sym - the symbol the new state is to have an out-transition on
605 *
606 * note that this routine makes new states in ascending order through the
607 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
608 * relies on machines being made in ascending order and that they are
609 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
610 * that it admittedly is)
611 */
612
613int mkstate( sym )
614int sym;
615
616 {
617 if ( ++lastnfa >= current_mns )
618 {
619 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
620 lerrif( "input rules are too complicated (>= %d NFA states)",
621 current_mns );
622
623 ++num_reallocs;
624
625 firstst = reallocate_integer_array( firstst, current_mns );
626 lastst = reallocate_integer_array( lastst, current_mns );
627 finalst = reallocate_integer_array( finalst, current_mns );
628 transchar = reallocate_integer_array( transchar, current_mns );
629 trans1 = reallocate_integer_array( trans1, current_mns );
630 trans2 = reallocate_integer_array( trans2, current_mns );
631 accptnum = reallocate_integer_array( accptnum, current_mns );
632 assoc_rule = reallocate_integer_array( assoc_rule, current_mns );
633 state_type = reallocate_integer_array( state_type, current_mns );
634 }
635
636 firstst[lastnfa] = lastnfa;
637 finalst[lastnfa] = lastnfa;
638 lastst[lastnfa] = lastnfa;
639 transchar[lastnfa] = sym;
640 trans1[lastnfa] = NO_TRANSITION;
641 trans2[lastnfa] = NO_TRANSITION;
642 accptnum[lastnfa] = NIL;
643 assoc_rule[lastnfa] = num_rules;
644 state_type[lastnfa] = current_state_type;
645
646 /* fix up equivalence classes base on this transition. Note that any
647 * character which has its own transition gets its own equivalence class.
648 * Thus only characters which are only in character classes have a chance
649 * at being in the same equivalence class. E.g. "a|b" puts 'a' and 'b'
650 * into two different equivalence classes. "[ab]" puts them in the same
651 * equivalence class (barring other differences elsewhere in the input).
652 */
653
654 if ( sym < 0 )
655 {
656 /* we don't have to update the equivalence classes since that was
657 * already done when the ccl was created for the first time
658 */
659 }
660
661 else if ( sym == SYM_EPSILON )
662 ++numeps;
663
664 else
665 {
666 if ( useecs )
667 /* map NUL's to csize */
668 mkechar( sym ? sym : csize, nextecm, ecgroup );
669 }
670
671 return ( lastnfa );
672 }
673
674
675/* mkxtion - make a transition from one state to another
676 *
677 * synopsis
678 *
679 * mkxtion( statefrom, stateto );
680 *
681 * statefrom - the state from which the transition is to be made
682 * stateto - the state to which the transition is to be made
683 */
684
685void mkxtion( statefrom, stateto )
686int statefrom, stateto;
687
688 {
689 if ( trans1[statefrom] == NO_TRANSITION )
690 trans1[statefrom] = stateto;
691
692 else if ( (transchar[statefrom] != SYM_EPSILON) ||
693 (trans2[statefrom] != NO_TRANSITION) )
694 flexfatal( "found too many transitions in mkxtion()" );
695
696 else
697 { /* second out-transition for an epsilon state */
698 ++eps2;
699 trans2[statefrom] = stateto;
700 }
701 }
702
703/* new_rule - initialize for a new rule
704 *
705 * synopsis
706 *
707 * new_rule();
708 *
709 * the global num_rules is incremented and the any corresponding dynamic
710 * arrays (such as rule_type[]) are grown as needed.
711 */
712
713void new_rule()
714
715 {
716 if ( ++num_rules >= current_max_rules )
717 {
718 ++num_reallocs;
719 current_max_rules += MAX_RULES_INCREMENT;
720 rule_type = reallocate_integer_array( rule_type, current_max_rules );
721 rule_linenum =
722 reallocate_integer_array( rule_linenum, current_max_rules );
723 }
724
725 if ( num_rules > MAX_RULE )
726 lerrif( "too many rules (> %d)!", MAX_RULE );
727
728 rule_linenum[num_rules] = linenum;
729 }