This commit was manufactured by cvs2svn to create tag 'FreeBSD-release/1.0'.
[unix-history] / usr.bin / lex / nfa.c
CommitLineData
78ed81a3 1/* nfa - NFA construction routines */
2
15637ed4
RG
3/*-
4 * Copyright (c) 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
78ed81a3 8 * Vern Paxson.
15637ed4
RG
9 *
10 * The United States Government has rights in this work pursuant
11 * to contract no. DE-AC03-76SF00098 between the United States
12 * Department of Energy and the University of California.
13 *
78ed81a3 14 * Redistribution and use in source and binary forms are permitted provided
15 * that: (1) source distributions retain this entire copyright notice and
16 * comment, and (2) distributions including binaries display the following
17 * acknowledgement: ``This product includes software developed by the
18 * University of California, Berkeley and its contributors'' in the
19 * documentation or other materials provided with the distribution and in
20 * all advertising materials mentioning features or use of this software.
21 * Neither the name of the University nor the names of its contributors may
22 * be used to endorse or promote products derived from this software without
23 * specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
25 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
26 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
15637ed4
RG
27 */
28
29#ifndef lint
78ed81a3 30static char rcsid[] =
31 "@(#) $Header: /usr/fsys/odin/a/vern/flex/RCS/nfa.c,v 2.6 90/06/27 23:48:29 vern Exp $ (LBL)";
32#endif
15637ed4
RG
33
34#include "flexdef.h"
35
78ed81a3 36
15637ed4
RG
37/* declare functions that have forward references */
38
39int dupmachine PROTO((int));
40void mkxtion PROTO((int, int));
41
42
43/* add_accept - add an accepting state to a machine
44 *
45 * synopsis
46 *
47 * add_accept( mach, accepting_number );
48 *
49 * accepting_number becomes mach's accepting number.
50 */
51
52void add_accept( mach, accepting_number )
53int mach, accepting_number;
54
55 {
56 /* hang the accepting number off an epsilon state. if it is associated
57 * with a state that has a non-epsilon out-transition, then the state
58 * will accept BEFORE it makes that transition, i.e., one character
59 * too soon
60 */
61
62 if ( transchar[finalst[mach]] == SYM_EPSILON )
63 accptnum[finalst[mach]] = accepting_number;
64
65 else
66 {
67 int astate = mkstate( SYM_EPSILON );
68 accptnum[astate] = accepting_number;
69 mach = link_machines( mach, astate );
70 }
71 }
72
73
74/* copysingl - make a given number of copies of a singleton machine
75 *
76 * synopsis
77 *
78 * newsng = copysingl( singl, num );
79 *
80 * newsng - a new singleton composed of num copies of singl
81 * singl - a singleton machine
82 * num - the number of copies of singl to be present in newsng
83 */
84
85int copysingl( singl, num )
86int singl, num;
87
88 {
89 int copy, i;
90
91 copy = mkstate( SYM_EPSILON );
92
93 for ( i = 1; i <= num; ++i )
94 copy = link_machines( copy, dupmachine( singl ) );
95
96 return ( copy );
97 }
98
99
100/* dumpnfa - debugging routine to write out an nfa
101 *
102 * synopsis
103 * int state1;
104 * dumpnfa( state1 );
105 */
106
107void dumpnfa( state1 )
108int state1;
109
110 {
111 int sym, tsp1, tsp2, anum, ns;
112
113 fprintf( stderr, "\n\n********** beginning dump of nfa with start state %d\n",
114 state1 );
115
116 /* we probably should loop starting at firstst[state1] and going to
117 * lastst[state1], but they're not maintained properly when we "or"
118 * all of the rules together. So we use our knowledge that the machine
119 * starts at state 1 and ends at lastnfa.
120 */
121
122 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
123 for ( ns = 1; ns <= lastnfa; ++ns )
124 {
125 fprintf( stderr, "state # %4d\t", ns );
126
127 sym = transchar[ns];
128 tsp1 = trans1[ns];
129 tsp2 = trans2[ns];
130 anum = accptnum[ns];
131
132 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
133
134 if ( anum != NIL )
135 fprintf( stderr, " [%d]", anum );
136
137 fprintf( stderr, "\n" );
138 }
139
140 fprintf( stderr, "********** end of dump\n" );
141 }
142
143
144/* dupmachine - make a duplicate of a given machine
145 *
146 * synopsis
147 *
148 * copy = dupmachine( mach );
149 *
150 * copy - holds duplicate of mach
151 * mach - machine to be duplicated
152 *
153 * note that the copy of mach is NOT an exact duplicate; rather, all the
154 * transition states values are adjusted so that the copy is self-contained,
155 * as the original should have been.
156 *
157 * also note that the original MUST be contiguous, with its low and high
158 * states accessible by the arrays firstst and lastst
159 */
160
161int dupmachine( mach )
162int mach;
163
164 {
165 int i, init, state_offset;
166 int state = 0;
167 int last = lastst[mach];
168
169 for ( i = firstst[mach]; i <= last; ++i )
170 {
171 state = mkstate( transchar[i] );
172
173 if ( trans1[i] != NO_TRANSITION )
174 {
175 mkxtion( finalst[state], trans1[i] + state - i );
176
177 if ( transchar[i] == SYM_EPSILON && trans2[i] != NO_TRANSITION )
178 mkxtion( finalst[state], trans2[i] + state - i );
179 }
180
181 accptnum[state] = accptnum[i];
182 }
183
184 if ( state == 0 )
185 flexfatal( "empty machine in dupmachine()" );
186
187 state_offset = state - i + 1;
188
189 init = mach + state_offset;
190 firstst[init] = firstst[mach] + state_offset;
191 finalst[init] = finalst[mach] + state_offset;
192 lastst[init] = lastst[mach] + state_offset;
193
194 return ( init );
195 }
196
197
198/* finish_rule - finish up the processing for a rule
199 *
200 * synopsis
201 *
202 * finish_rule( mach, variable_trail_rule, headcnt, trailcnt );
203 *
204 * An accepting number is added to the given machine. If variable_trail_rule
205 * is true then the rule has trailing context and both the head and trail
206 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
207 * the machine recognizes a pattern with trailing context and headcnt is
208 * the number of characters in the matched part of the pattern, or zero
209 * if the matched part has variable length. trailcnt is the number of
210 * trailing context characters in the pattern, or zero if the trailing
211 * context has variable length.
212 */
213
214void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
215int mach, variable_trail_rule, headcnt, trailcnt;
216
217 {
218 add_accept( mach, num_rules );
219
220 /* we did this in new_rule(), but it often gets the wrong
221 * number because we do it before we start parsing the current rule
222 */
223 rule_linenum[num_rules] = linenum;
224
225 /* if this is a continued action, then the line-number has
226 * already been updated, giving us the wrong number
227 */
228 if ( continued_action )
229 --rule_linenum[num_rules];
230
231 fprintf( temp_action_file, "case %d:\n", num_rules );
232
233 if ( variable_trail_rule )
234 {
235 rule_type[num_rules] = RULE_VARIABLE;
236
237 if ( performance_report )
238 fprintf( stderr, "Variable trailing context rule at line %d\n",
239 rule_linenum[num_rules] );
240
241 variable_trailing_context_rules = true;
242 }
243
244 else
245 {
246 rule_type[num_rules] = RULE_NORMAL;
247
248 if ( headcnt > 0 || trailcnt > 0 )
249 {
250 /* do trailing context magic to not match the trailing characters */
251 char *scanner_cp = "yy_c_buf_p = yy_cp";
252 char *scanner_bp = "yy_bp";
253
254 fprintf( temp_action_file,
255 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
256
257 if ( headcnt > 0 )
258 fprintf( temp_action_file, "%s = %s + %d;\n",
259 scanner_cp, scanner_bp, headcnt );
260
261 else
262 fprintf( temp_action_file,
263 "%s -= %d;\n", scanner_cp, trailcnt );
264
265 fprintf( temp_action_file,
266 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
267 }
268 }
269
270 line_directive_out( temp_action_file );
271 }
272
273
274/* link_machines - connect two machines together
275 *
276 * synopsis
277 *
278 * new = link_machines( first, last );
279 *
280 * new - a machine constructed by connecting first to last
281 * first - the machine whose successor is to be last
282 * last - the machine whose predecessor is to be first
283 *
284 * note: this routine concatenates the machine first with the machine
285 * last to produce a machine new which will pattern-match first first
286 * and then last, and will fail if either of the sub-patterns fails.
287 * FIRST is set to new by the operation. last is unmolested.
288 */
289
290int link_machines( first, last )
291int first, last;
292
293 {
294 if ( first == NIL )
295 return ( last );
296
297 else if ( last == NIL )
298 return ( first );
299
300 else
301 {
302 mkxtion( finalst[first], last );
303 finalst[first] = finalst[last];
304 lastst[first] = max( lastst[first], lastst[last] );
305 firstst[first] = min( firstst[first], firstst[last] );
306
307 return ( first );
308 }
309 }
310
311
312/* mark_beginning_as_normal - mark each "beginning" state in a machine
313 * as being a "normal" (i.e., not trailing context-
314 * associated) states
315 *
316 * synopsis
317 *
318 * mark_beginning_as_normal( mach )
319 *
320 * mach - machine to mark
321 *
322 * The "beginning" states are the epsilon closure of the first state
323 */
324
325void mark_beginning_as_normal( mach )
326register int mach;
327
328 {
329 switch ( state_type[mach] )
330 {
331 case STATE_NORMAL:
332 /* oh, we've already visited here */
333 return;
334
335 case STATE_TRAILING_CONTEXT:
336 state_type[mach] = STATE_NORMAL;
337
338 if ( transchar[mach] == SYM_EPSILON )
339 {
340 if ( trans1[mach] != NO_TRANSITION )
341 mark_beginning_as_normal( trans1[mach] );
342
343 if ( trans2[mach] != NO_TRANSITION )
344 mark_beginning_as_normal( trans2[mach] );
345 }
346 break;
347
348 default:
349 flexerror( "bad state type in mark_beginning_as_normal()" );
350 break;
351 }
352 }
353
354
355/* mkbranch - make a machine that branches to two machines
356 *
357 * synopsis
358 *
359 * branch = mkbranch( first, second );
360 *
361 * branch - a machine which matches either first's pattern or second's
362 * first, second - machines whose patterns are to be or'ed (the | operator)
363 *
364 * note that first and second are NEITHER destroyed by the operation. Also,
365 * the resulting machine CANNOT be used with any other "mk" operation except
366 * more mkbranch's. Compare with mkor()
367 */
368
369int mkbranch( first, second )
370int first, second;
371
372 {
373 int eps;
374
375 if ( first == NO_TRANSITION )
376 return ( second );
377
378 else if ( second == NO_TRANSITION )
379 return ( first );
380
381 eps = mkstate( SYM_EPSILON );
382
383 mkxtion( eps, first );
384 mkxtion( eps, second );
385
386 return ( eps );
387 }
388
389
390/* mkclos - convert a machine into a closure
391 *
392 * synopsis
393 * new = mkclos( state );
394 *
395 * new - a new state which matches the closure of "state"
396 */
397
398int mkclos( state )
399int state;
400
401 {
402 return ( mkopt( mkposcl( state ) ) );
403 }
404
405
406/* mkopt - make a machine optional
407 *
408 * synopsis
409 *
410 * new = mkopt( mach );
411 *
412 * new - a machine which optionally matches whatever mach matched
413 * mach - the machine to make optional
414 *
415 * notes:
416 * 1. mach must be the last machine created
417 * 2. mach is destroyed by the call
418 */
419
420int mkopt( mach )
421int mach;
422
423 {
424 int eps;
425
426 if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
427 {
428 eps = mkstate( SYM_EPSILON );
429 mach = link_machines( mach, eps );
430 }
431
432 /* can't skimp on the following if FREE_EPSILON(mach) is true because
433 * some state interior to "mach" might point back to the beginning
434 * for a closure
435 */
436 eps = mkstate( SYM_EPSILON );
437 mach = link_machines( eps, mach );
438
439 mkxtion( mach, finalst[mach] );
440
441 return ( mach );
442 }
443
444
445/* mkor - make a machine that matches either one of two machines
446 *
447 * synopsis
448 *
449 * new = mkor( first, second );
450 *
451 * new - a machine which matches either first's pattern or second's
452 * first, second - machines whose patterns are to be or'ed (the | operator)
453 *
454 * note that first and second are both destroyed by the operation
455 * the code is rather convoluted because an attempt is made to minimize
456 * the number of epsilon states needed
457 */
458
459int mkor( first, second )
460int first, second;
461
462 {
463 int eps, orend;
464
465 if ( first == NIL )
466 return ( second );
467
468 else if ( second == NIL )
469 return ( first );
470
471 else
472 {
473 /* see comment in mkopt() about why we can't use the first state
474 * of "first" or "second" if they satisfy "FREE_EPSILON"
475 */
476 eps = mkstate( SYM_EPSILON );
477
478 first = link_machines( eps, first );
479
480 mkxtion( first, second );
481
482 if ( SUPER_FREE_EPSILON(finalst[first]) &&
483 accptnum[finalst[first]] == NIL )
484 {
485 orend = finalst[first];
486 mkxtion( finalst[second], orend );
487 }
488
489 else if ( SUPER_FREE_EPSILON(finalst[second]) &&
490 accptnum[finalst[second]] == NIL )
491 {
492 orend = finalst[second];
493 mkxtion( finalst[first], orend );
494 }
495
496 else
497 {
498 eps = mkstate( SYM_EPSILON );
499
500 first = link_machines( first, eps );
501 orend = finalst[first];
502
503 mkxtion( finalst[second], orend );
504 }
505 }
506
507 finalst[first] = orend;
508 return ( first );
509 }
510
511
512/* mkposcl - convert a machine into a positive closure
513 *
514 * synopsis
515 * new = mkposcl( state );
516 *
517 * new - a machine matching the positive closure of "state"
518 */
519
520int mkposcl( state )
521int state;
522
523 {
524 int eps;
525
526 if ( SUPER_FREE_EPSILON(finalst[state]) )
527 {
528 mkxtion( finalst[state], state );
529 return ( state );
530 }
531
532 else
533 {
534 eps = mkstate( SYM_EPSILON );
535 mkxtion( eps, state );
536 return ( link_machines( state, eps ) );
537 }
538 }
539
540
541/* mkrep - make a replicated machine
542 *
543 * synopsis
544 * new = mkrep( mach, lb, ub );
545 *
546 * new - a machine that matches whatever "mach" matched from "lb"
547 * number of times to "ub" number of times
548 *
549 * note
550 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
551 */
552
553int mkrep( mach, lb, ub )
554int mach, lb, ub;
555
556 {
557 int base_mach, tail, copy, i;
558
559 base_mach = copysingl( mach, lb - 1 );
560
561 if ( ub == INFINITY )
562 {
563 copy = dupmachine( mach );
564 mach = link_machines( mach,
565 link_machines( base_mach, mkclos( copy ) ) );
566 }
567
568 else
569 {
570 tail = mkstate( SYM_EPSILON );
571
572 for ( i = lb; i < ub; ++i )
573 {
574 copy = dupmachine( mach );
575 tail = mkopt( link_machines( copy, tail ) );
576 }
577
578 mach = link_machines( mach, link_machines( base_mach, tail ) );
579 }
580
581 return ( mach );
582 }
583
584
585/* mkstate - create a state with a transition on a given symbol
586 *
587 * synopsis
588 *
589 * state = mkstate( sym );
590 *
591 * state - a new state matching sym
592 * sym - the symbol the new state is to have an out-transition on
593 *
594 * note that this routine makes new states in ascending order through the
595 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
596 * relies on machines being made in ascending order and that they are
597 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
598 * that it admittedly is)
599 */
600
601int mkstate( sym )
602int sym;
603
604 {
605 if ( ++lastnfa >= current_mns )
606 {
607 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
608 lerrif( "input rules are too complicated (>= %d NFA states)",
609 current_mns );
610
611 ++num_reallocs;
612
613 firstst = reallocate_integer_array( firstst, current_mns );
614 lastst = reallocate_integer_array( lastst, current_mns );
615 finalst = reallocate_integer_array( finalst, current_mns );
616 transchar = reallocate_integer_array( transchar, current_mns );
617 trans1 = reallocate_integer_array( trans1, current_mns );
618 trans2 = reallocate_integer_array( trans2, current_mns );
619 accptnum = reallocate_integer_array( accptnum, current_mns );
620 assoc_rule = reallocate_integer_array( assoc_rule, current_mns );
621 state_type = reallocate_integer_array( state_type, current_mns );
622 }
623
624 firstst[lastnfa] = lastnfa;
625 finalst[lastnfa] = lastnfa;
626 lastst[lastnfa] = lastnfa;
627 transchar[lastnfa] = sym;
628 trans1[lastnfa] = NO_TRANSITION;
629 trans2[lastnfa] = NO_TRANSITION;
630 accptnum[lastnfa] = NIL;
631 assoc_rule[lastnfa] = num_rules;
632 state_type[lastnfa] = current_state_type;
633
634 /* fix up equivalence classes base on this transition. Note that any
635 * character which has its own transition gets its own equivalence class.
636 * Thus only characters which are only in character classes have a chance
637 * at being in the same equivalence class. E.g. "a|b" puts 'a' and 'b'
638 * into two different equivalence classes. "[ab]" puts them in the same
639 * equivalence class (barring other differences elsewhere in the input).
640 */
641
642 if ( sym < 0 )
643 {
644 /* we don't have to update the equivalence classes since that was
645 * already done when the ccl was created for the first time
646 */
647 }
648
649 else if ( sym == SYM_EPSILON )
650 ++numeps;
651
652 else
653 {
654 if ( useecs )
655 /* map NUL's to csize */
656 mkechar( sym ? sym : csize, nextecm, ecgroup );
657 }
658
659 return ( lastnfa );
660 }
661
662
663/* mkxtion - make a transition from one state to another
664 *
665 * synopsis
666 *
667 * mkxtion( statefrom, stateto );
668 *
669 * statefrom - the state from which the transition is to be made
670 * stateto - the state to which the transition is to be made
671 */
672
673void mkxtion( statefrom, stateto )
674int statefrom, stateto;
675
676 {
677 if ( trans1[statefrom] == NO_TRANSITION )
678 trans1[statefrom] = stateto;
679
680 else if ( (transchar[statefrom] != SYM_EPSILON) ||
681 (trans2[statefrom] != NO_TRANSITION) )
682 flexfatal( "found too many transitions in mkxtion()" );
683
684 else
685 { /* second out-transition for an epsilon state */
686 ++eps2;
687 trans2[statefrom] = stateto;
688 }
689 }
690
691/* new_rule - initialize for a new rule
692 *
693 * synopsis
694 *
695 * new_rule();
696 *
697 * the global num_rules is incremented and the any corresponding dynamic
698 * arrays (such as rule_type[]) are grown as needed.
699 */
700
701void new_rule()
702
703 {
704 if ( ++num_rules >= current_max_rules )
705 {
706 ++num_reallocs;
707 current_max_rules += MAX_RULES_INCREMENT;
708 rule_type = reallocate_integer_array( rule_type, current_max_rules );
709 rule_linenum =
710 reallocate_integer_array( rule_linenum, current_max_rules );
711 }
712
713 if ( num_rules > MAX_RULE )
714 lerrif( "too many rules (> %d)!", MAX_RULE );
715
716 rule_linenum[num_rules] = linenum;
717 }