From 810a4854f69fc4cb2afb45719cc4bf0b37e4fb4e Mon Sep 17 00:00:00 2001
From: CSRG <csrg@ucbvax.Berkeley.EDU>
Date: Mon, 18 Jun 1990 06:04:29 -0800
Subject: [PATCH] BSD 4_3_Reno development Work on file
 usr/src/pgrm/lex/flexdoc.1

Synthesized-from: CSRG/cd2/4.3reno
---
 usr/src/pgrm/lex/flexdoc.1 | 2382 ++++++++++++++++++++++++++++++++++++
 1 file changed, 2382 insertions(+)
 create mode 100644 usr/src/pgrm/lex/flexdoc.1

diff --git a/usr/src/pgrm/lex/flexdoc.1 b/usr/src/pgrm/lex/flexdoc.1
new file mode 100644
index 0000000000..ffa589a189
--- /dev/null
+++ b/usr/src/pgrm/lex/flexdoc.1
@@ -0,0 +1,2382 @@
+.TH FLEX 1 "26 May 1990" "Version 2.3"
+.SH NAME
+flex - fast lexical analyzer generator
+.SH SYNOPSIS
+.B flex
+.B [-bcdfinpstvFILT8 -C[efmF] -Sskeleton]
+.I [filename ...]
+.SH DESCRIPTION
+.I flex
+is a tool for generating
+.I scanners:
+programs which recognize lexical patterns in text.
+.I flex
+reads
+the given input files, or its standard input if no file names are given,
+for a description of a scanner to generate.  The description is in
+the form of pairs
+of regular expressions and C code, called
+.I rules.  flex
+generates as output a C source file,
+.B lex.yy.c,
+which defines a routine
+.B yylex().
+This file is compiled and linked with the
+.B -lfl
+library to produce an executable.  When the executable is run,
+it analyzes its input for occurrences
+of the regular expressions.  Whenever it finds one, it executes
+the corresponding C code.
+.SH SOME SIMPLE EXAMPLES
+.LP
+First some simple examples to get the flavor of how one uses
+.I flex.
+The following
+.I flex
+input specifies a scanner which whenever it encounters the string
+"username" will replace it with the user's login name:
+.nf
+
+    %%
+    username    printf( "%s", getlogin() );
+
+.fi
+By default, any text not matched by a
+.I flex
+scanner
+is copied to the output, so the net effect of this scanner is
+to copy its input file to its output with each occurrence
+of "username" expanded.
+In this input, there is just one rule.  "username" is the
+.I pattern
+and the "printf" is the
+.I action.
+The "%%" marks the beginning of the rules.
+.LP
+Here's another simple example:
+.nf
+
+        int num_lines = 0, num_chars = 0;
+
+    %%
+    \\n    ++num_lines; ++num_chars;
+    .     ++num_chars;
+
+    %%
+    main()
+        {
+        yylex();
+        printf( "# of lines = %d, # of chars = %d\\n",
+                num_lines, num_chars );
+        }
+
+.fi
+This scanner counts the number of characters and the number
+of lines in its input (it produces no output other than the
+final report on the counts).  The first line
+declares two globals, "num_lines" and "num_chars", which are accessible
+both inside
+.B yylex()
+and in the
+.B main()
+routine declared after the second "%%".  There are two rules, one
+which matches a newline ("\\n") and increments both the line count and
+the character count, and one which matches any character other than
+a newline (indicated by the "." regular expression).
+.LP
+A somewhat more complicated example:
+.nf
+
+    /* scanner for a toy Pascal-like language */
+
+    %{
+    /* need this for the call to atof() below */
+    #include <math.h>
+    %}
+
+    DIGIT    [0-9]
+    ID       [a-z][a-z0-9]*
+
+    %%
+
+    {DIGIT}+    {
+                printf( "An integer: %s (%d)\\n", yytext,
+                        atoi( yytext ) );
+                }
+
+    {DIGIT}+"."{DIGIT}*        {
+                printf( "A float: %s (%g)\\n", yytext,
+                        atof( yytext ) );
+                }
+
+    if|then|begin|end|procedure|function        {
+                printf( "A keyword: %s\\n", yytext );
+                }
+
+    {ID}        printf( "An identifier: %s\\n", yytext );
+
+    "+"|"-"|"*"|"/"   printf( "An operator: %s\\n", yytext );
+
+    "{"[^}\\n]*"}"     /* eat up one-line comments */
+
+    [ \\t\\n]+          /* eat up whitespace */
+
+    .           printf( "Unrecognized character: %s\\n", yytext );
+
+    %%
+
+    main( argc, argv )
+    int argc;
+    char **argv;
+        {
+        ++argv, --argc;  /* skip over program name */
+        if ( argc > 0 )
+                yyin = fopen( argv[0], "r" );
+        else
+                yyin = stdin;
+        
+        yylex();
+        }
+
+.fi
+This is the beginnings of a simple scanner for a language like
+Pascal.  It identifies different types of
+.I tokens
+and reports on what it has seen.
+.LP
+The details of this example will be explained in the following
+sections.
+.SH FORMAT OF THE INPUT FILE
+The
+.I flex
+input file consists of three sections, separated by a line with just
+.B %%
+in it:
+.nf
+
+    definitions
+    %%
+    rules
+    %%
+    user code
+
+.fi
+The
+.I definitions
+section contains declarations of simple
+.I name
+definitions to simplify the scanner specification, and declarations of
+.I start conditions,
+which are explained in a later section.
+.LP
+Name definitions have the form:
+.nf
+
+    name definition
+
+.fi
+The "name" is a word beginning with a letter or an underscore ('_')
+followed by zero or more letters, digits, '_', or '-' (dash).
+The definition is taken to begin at the first non-white-space character
+following the name and continuing to the end of the line.
+The definition can subsequently be referred to using "{name}", which
+will expand to "(definition)".  For example,
+.nf
+
+    DIGIT    [0-9]
+    ID       [a-z][a-z0-9]*
+
+.fi
+defines "DIGIT" to be a regular expression which matches a
+single digit, and
+"ID" to be a regular expression which matches a letter
+followed by zero-or-more letters-or-digits.
+A subsequent reference to
+.nf
+
+    {DIGIT}+"."{DIGIT}*
+
+.fi
+is identical to
+.nf
+
+    ([0-9])+"."([0-9])*
+
+.fi
+and matches one-or-more digits followed by a '.' followed
+by zero-or-more digits.
+.LP
+The
+.I rules
+section of the
+.I flex
+input contains a series of rules of the form:
+.nf
+
+    pattern   action
+
+.fi
+where the pattern must be unindented and the action must begin
+on the same line.
+.LP
+See below for a further description of patterns and actions.
+.LP
+Finally, the user code section is simply copied to
+.B lex.yy.c
+verbatim.
+It is used for companion routines which call or are called
+by the scanner.  The presence of this section is optional;
+if it is missing, the second
+.B %%
+in the input file may be skipped, too.
+.LP
+In the definitions and rules sections, any
+.I indented
+text or text enclosed in
+.B %{
+and
+.B %}
+is copied verbatim to the output (with the %{}'s removed).
+The %{}'s must appear unindented on lines by themselves.
+.LP
+In the rules section,
+any indented or %{} text appearing before the
+first rule may be used to declare variables
+which are local to the scanning routine and (after the declarations)
+code which is to be executed whenever the scanning routine is entered.
+Other indented or %{} text in the rule section is still copied to the output,
+but its meaning is not well-defined and it may well cause compile-time
+errors (this feature is present for
+.I POSIX
+compliance; see below for other such features).
+.LP
+In the definitions section, an unindented comment (i.e., a line
+beginning with "/*") is also copied verbatim to the output up
+to the next "*/".  Also, any line in the definitions section
+beginning with '#' is ignored, though this style of comment is
+deprecated and may go away in the future.
+.SH PATTERNS
+The patterns in the input are written using an extended set of regular
+expressions.  These are:
+.nf
+
+    x          match the character 'x'
+    .          any character except newline
+    [xyz]      a "character class"; in this case, the pattern
+                 matches either an 'x', a 'y', or a 'z'
+    [abj-oZ]   a "character class" with a range in it; matches
+                 an 'a', a 'b', any letter from 'j' through 'o',
+                 or a 'Z'
+    [^A-Z]     a "negated character class", i.e., any character
+                 but those in the class.  In this case, any
+                 character EXCEPT an uppercase letter.
+    [^A-Z\\n]   any character EXCEPT an uppercase letter or
+                 a newline
+    r*         zero or more r's, where r is any regular expression
+    r+         one or more r's
+    r?         zero or one r's (that is, "an optional r")
+    r{2,5}     anywhere from two to five r's
+    r{2,}      two or more r's
+    r{4}       exactly 4 r's
+    {name}     the expansion of the "name" definition
+               (see above)
+    "[xyz]\\"foo"
+               the literal string: [xyz]"foo
+    \\X         if X is an 'a', 'b', 'f', 'n', 'r', 't', or 'v',
+                 then the ANSI-C interpretation of \\x.
+                 Otherwise, a literal 'X' (used to escape
+                 operators such as '*')
+    \\123       the character with octal value 123
+    \\x2a       the character with hexadecimal value 2a
+    (r)        match an r; parentheses are used to override
+                 precedence (see below)
+
+
+    rs         the regular expression r followed by the
+                 regular expression s; called "concatenation"
+
+
+    r|s        either an r or an s
+
+
+    r/s        an r but only if it is followed by an s.  The
+                 s is not part of the matched text.  This type
+                 of pattern is called "trailing context".
+    ^r         an r, but only at the beginning of a line
+    r$         an r, but only at the end of a line.  Equivalent
+                 to "r/\\n".
+
+
+    <s>r       an r, but only in start condition s (see
+               below for discussion of start conditions)
+    <s1,s2,s3>r
+               same, but in any of start conditions s1,
+               s2, or s3
+
+
+    <<EOF>>    an end-of-file
+    <s1,s2><<EOF>>
+               an end-of-file when in start condition s1 or s2
+
+.fi
+The regular expressions listed above are grouped according to
+precedence, from highest precedence at the top to lowest at the bottom.
+Those grouped together have equal precedence.  For example,
+.nf
+
+    foo|bar*
+
+.fi
+is the same as
+.nf
+
+    (foo)|(ba(r*))
+
+.fi
+since the '*' operator has higher precedence than concatenation,
+and concatenation higher than alternation ('|').  This pattern
+therefore matches
+.I either
+the string "foo"
+.I or
+the string "ba" followed by zero-or-more r's.
+To match "foo" or zero-or-more "bar"'s, use:
+.nf
+
+    foo|(bar)*
+
+.fi
+and to match zero-or-more "foo"'s-or-"bar"'s:
+.nf
+
+    (foo|bar)*
+
+.fi
+.LP
+Some notes on patterns:
+.IP -
+A negated character class such as the example "[^A-Z]"
+above
+.I will match a newline
+unless "\\n" (or an equivalent escape sequence) is one of the
+characters explicitly present in the negated character class
+(e.g., "[^A-Z\\n]").  This is unlike how many other regular
+expression tools treat negated character classes, but unfortunately
+the inconsistency is historically entrenched.
+Matching newlines means that a pattern like [^"]* can match an entire
+input (overflowing the scanner's input buffer) unless there's another
+quote in the input.
+.IP -
+A rule can have at most one instance of trailing context (the '/' operator
+or the '$' operator).  The start condition, '^', and "<<EOF>>" patterns
+can only occur at the beginning of a pattern and, like '/' and '$',
+cannot be grouped inside parentheses.  A '^' which does not occur at
+the beginning of a rule or a '$' which does not occur at the end of
+a rule loses its special properties and is treated as a normal character.
+.IP
+The following are illegal:
+.nf
+
+    foo/bar$
+    <sc1>foo<sc2>bar
+
+.fi
+Note that the first of these can be written "foo/bar\\n".
+.IP
+The following will result in '$' or '^' being treated as a normal character:
+.nf
+
+    foo|(bar$)
+    foo|^bar
+
+.fi
+If what's wanted is a "foo" or a bar-followed-by-a-newline, the following
+could be used (the special '|' action is explained below):
+.nf
+
+    foo      |
+    bar$     /* action goes here */
+
+.fi
+A similar trick will work for matching a foo or a
+bar-at-the-beginning-of-a-line.
+.SH HOW THE INPUT IS MATCHED
+When the generated scanner is run, it analyzes its input looking
+for strings which match any of its patterns.  If it finds more than
+one match, it takes the one matching the most text (for trailing
+context rules, this includes the length of the trailing part, even
+though it will then be returned to the input).  If it finds two
+or more matches of the same length, the
+rule listed first in the
+.I flex
+input file is chosen.
+.LP
+Once the match is determined, the text corresponding to the match
+(called the
+.I token)
+is made available in the global character pointer
+.B yytext,
+and its length in the global integer
+.B yyleng.
+The
+.I action
+corresponding to the matched pattern is then executed (a more
+detailed description of actions follows), and then the remaining
+input is scanned for another match.
+.LP
+If no match is found, then the
+.I default rule
+is executed: the next character in the input is considered matched and
+copied to the standard output.  Thus, the simplest legal
+.I flex
+input is:
+.nf
+
+    %%
+
+.fi
+which generates a scanner that simply copies its input (one character
+at a time) to its output.
+.SH ACTIONS
+Each pattern in a rule has a corresponding action, which can be any
+arbitrary C statement.  The pattern ends at the first non-escaped
+whitespace character; the remainder of the line is its action.  If the
+action is empty, then when the pattern is matched the input token
+is simply discarded.  For example, here is the specification for a program
+which deletes all occurrences of "zap me" from its input:
+.nf
+
+    %%
+    "zap me"
+
+.fi
+(It will copy all other characters in the input to the output since
+they will be matched by the default rule.)
+.LP
+Here is a program which compresses multiple blanks and tabs down to
+a single blank, and throws away whitespace found at the end of a line:
+.nf
+
+    %%
+    [ \\t]+        putchar( ' ' );
+    [ \\t]+$       /* ignore this token */
+
+.fi
+.LP
+If the action contains a '{', then the action spans till the balancing '}'
+is found, and the action may cross multiple lines.
+.I flex 
+knows about C strings and comments and won't be fooled by braces found
+within them, but also allows actions to begin with
+.B %{
+and will consider the action to be all the text up to the next
+.B %}
+(regardless of ordinary braces inside the action).
+.LP
+An action consisting solely of a vertical bar ('|') means "same as
+the action for the next rule."  See below for an illustration.
+.LP
+Actions can include arbitrary C code, including
+.B return
+statements to return a value to whatever routine called
+.B yylex().
+Each time
+.B yylex()
+is called it continues processing tokens from where it last left
+off until it either reaches
+the end of the file or executes a return.  Once it reaches an end-of-file,
+however, then any subsequent call to
+.B yylex()
+will simply immediately return, unless
+.B yyrestart()
+is first called (see below).
+.LP
+Actions are not allowed to modify yytext or yyleng.
+.LP
+There are a number of special directives which can be included within
+an action:
+.IP -
+.B ECHO
+copies yytext to the scanner's output.
+.IP -
+.B BEGIN
+followed by the name of a start condition places the scanner in the
+corresponding start condition (see below).
+.IP -
+.B REJECT
+directs the scanner to proceed on to the "second best" rule which matched the
+input (or a prefix of the input).  The rule is chosen as described
+above in "How the Input is Matched", and
+.B yytext
+and
+.B yyleng
+set up appropriately.
+It may either be one which matched as much text
+as the originally chosen rule but came later in the
+.I flex
+input file, or one which matched less text.
+For example, the following will both count the
+words in the input and call the routine special() whenever "frob" is seen:
+.nf
+
+            int word_count = 0;
+    %%
+
+    frob        special(); REJECT;
+    [^ \\t\\n]+   ++word_count;
+
+.fi
+Without the
+.B REJECT,
+any "frob"'s in the input would not be counted as words, since the
+scanner normally executes only one action per token.
+Multiple
+.B REJECT's
+are allowed, each one finding the next best choice to the currently
+active rule.  For example, when the following scanner scans the token
+"abcd", it will write "abcdabcaba" to the output:
+.nf
+
+    %%
+    a        |
+    ab       |
+    abc      |
+    abcd     ECHO; REJECT;
+    .|\\n     /* eat up any unmatched character */
+
+.fi
+(The first three rules share the fourth's action since they use
+the special '|' action.)
+.B REJECT
+is a particularly expensive feature in terms of scanner performance;
+if it is used in
+.I any
+of the scanner's actions it will slow down
+.I all
+of the scanner's matching.  Furthermore,
+.B REJECT
+cannot be used with the
+.I -f
+or
+.I -F
+options (see below).
+.IP
+Note also that unlike the other special actions,
+.B REJECT
+is a
+.I branch;
+code immediately following it in the action will
+.I not
+be executed.
+.IP -
+.B yymore()
+tells the scanner that the next time it matches a rule, the corresponding
+token should be
+.I appended
+onto the current value of
+.B yytext
+rather than replacing it.  For example, given the input "mega-kludge"
+the following will write "mega-mega-kludge" to the output:
+.nf
+
+    %%
+    mega-    ECHO; yymore();
+    kludge   ECHO;
+
+.fi
+First "mega-" is matched and echoed to the output.  Then "kludge"
+is matched, but the previous "mega-" is still hanging around at the
+beginning of
+.B yytext
+so the
+.B ECHO
+for the "kludge" rule will actually write "mega-kludge".
+The presence of
+.B yymore()
+in the scanner's action entails a minor performance penalty in the
+scanner's matching speed.
+.IP -
+.B yyless(n)
+returns all but the first
+.I n
+characters of the current token back to the input stream, where they
+will be rescanned when the scanner looks for the next match.
+.B yytext
+and
+.B yyleng
+are adjusted appropriately (e.g.,
+.B yyleng
+will now be equal to
+.I n
+).  For example, on the input "foobar" the following will write out
+"foobarbar":
+.nf
+
+    %%
+    foobar    ECHO; yyless(3);
+    [a-z]+    ECHO;
+
+.fi
+An argument of 0 to
+.B yyless
+will cause the entire current input string to be scanned again.  Unless you've
+changed how the scanner will subsequently process its input (using
+.B BEGIN,
+for example), this will result in an endless loop.
+.IP -
+.B unput(c)
+puts the character
+.I c
+back onto the input stream.  It will be the next character scanned.
+The following action will take the current token and cause it
+to be rescanned enclosed in parentheses.
+.nf
+
+    {
+    int i;
+    unput( ')' );
+    for ( i = yyleng - 1; i >= 0; --i )
+        unput( yytext[i] );
+    unput( '(' );
+    }
+
+.fi
+Note that since each
+.B unput()
+puts the given character back at the
+.I beginning
+of the input stream, pushing back strings must be done back-to-front.
+.IP -
+.B input()
+reads the next character from the input stream.  For example,
+the following is one way to eat up C comments:
+.nf
+
+    %%
+    "/*"        {
+                register int c;
+
+                for ( ; ; )
+                    {
+                    while ( (c = input()) != '*' &&
+                            c != EOF )
+                        ;    /* eat up text of comment */
+
+                    if ( c == '*' )
+                        {
+                        while ( (c = input()) == '*' )
+                            ;
+                        if ( c == '/' )
+                            break;    /* found the end */
+                        }
+
+                    if ( c == EOF )
+                        {
+                        error( "EOF in comment" );
+                        break;
+                        }
+                    }
+                }
+
+.fi
+(Note that if the scanner is compiled using
+.B C++,
+then
+.B input()
+is instead referred to as
+.B yyinput(),
+in order to avoid a name clash with the
+.B C++
+stream by the name of
+.I input.)
+.IP -
+.B yyterminate()
+can be used in lieu of a return statement in an action.  It terminates
+the scanner and returns a 0 to the scanner's caller, indicating "all done".
+Subsequent calls to the scanner will immediately return unless preceded
+by a call to
+.B yyrestart()
+(see below).
+By default,
+.B yyterminate()
+is also called when an end-of-file is encountered.  It is a macro and
+may be redefined.
+.SH THE GENERATED SCANNER
+The output of
+.I flex
+is the file
+.B lex.yy.c,
+which contains the scanning routine
+.B yylex(),
+a number of tables used by it for matching tokens, and a number
+of auxiliary routines and macros.  By default,
+.B yylex()
+is declared as follows:
+.nf
+
+    int yylex()
+        {
+        ... various definitions and the actions in here ...
+        }
+
+.fi
+(If your environment supports function prototypes, then it will
+be "int yylex( void )".)  This definition may be changed by redefining
+the "YY_DECL" macro.  For example, you could use:
+.nf
+
+    #undef YY_DECL
+    #define YY_DECL float lexscan( a, b ) float a, b;
+
+.fi
+to give the scanning routine the name
+.I lexscan,
+returning a float, and taking two floats as arguments.  Note that
+if you give arguments to the scanning routine using a
+K&R-style/non-prototyped function declaration, you must terminate
+the definition with a semi-colon (;).
+.LP
+Whenever
+.B yylex()
+is called, it scans tokens from the global input file
+.I yyin
+(which defaults to stdin).  It continues until it either reaches
+an end-of-file (at which point it returns the value 0) or
+one of its actions executes a
+.I return
+statement.
+In the former case, when called again the scanner will immediately
+return unless
+.B yyrestart()
+is called to point
+.I yyin
+at the new input file.  (
+.B yyrestart()
+takes one argument, a
+.B FILE *
+pointer.)
+In the latter case (i.e., when an action
+executes a return), the scanner may then be called again and it
+will resume scanning where it left off.
+.LP
+By default (and for purposes of efficiency), the scanner uses
+block-reads rather than simple
+.I getc()
+calls to read characters from
+.I yyin.
+The nature of how it gets its input can be controlled by redefining the
+.B YY_INPUT
+macro.
+YY_INPUT's calling sequence is "YY_INPUT(buf,result,max_size)".  Its
+action is to place up to
+.I max_size
+characters in the character array
+.I buf
+and return in the integer variable
+.I result
+either the
+number of characters read or the constant YY_NULL (0 on Unix systems)
+to indicate EOF.  The default YY_INPUT reads from the
+global file-pointer "yyin".
+.LP
+A sample redefinition of YY_INPUT (in the definitions
+section of the input file):
+.nf
+
+    %{
+    #undef YY_INPUT
+    #define YY_INPUT(buf,result,max_size) \\
+        result = ((buf[0] = getchar()) == EOF) ? YY_NULL : 1;
+    %}
+
+.fi
+This definition will change the input processing to occur
+one character at a time.
+.LP
+You also can add in things like keeping track of the
+input line number this way; but don't expect your scanner to
+go very fast.
+.LP
+When the scanner receives an end-of-file indication from YY_INPUT,
+it then checks the
+.B yywrap()
+function.  If
+.B yywrap()
+returns false (zero), then it is assumed that the
+function has gone ahead and set up
+.I yyin
+to point to another input file, and scanning continues.  If it returns
+true (non-zero), then the scanner terminates, returning 0 to its
+caller.
+.LP
+The default
+.B yywrap()
+always returns 1.  Presently, to redefine it you must first
+"#undef yywrap", as it is currently implemented as a macro.  As indicated
+by the hedging in the previous sentence, it may be changed to
+a true function in the near future.
+.LP
+The scanner writes its
+.B ECHO
+output to the
+.I yyout
+global (default, stdout), which may be redefined by the user simply
+by assigning it to some other
+.B FILE
+pointer.
+.SH START CONDITIONS
+.I flex
+provides a mechanism for conditionally activating rules.  Any rule
+whose pattern is prefixed with "<sc>" will only be active when
+the scanner is in the start condition named "sc".  For example,
+.nf
+
+    <STRING>[^"]*        { /* eat up the string body ... */
+                ...
+                }
+
+.fi
+will be active only when the scanner is in the "STRING" start
+condition, and
+.nf
+
+    <INITIAL,STRING,QUOTE>\\.        { /* handle an escape ... */
+                ...
+                }
+
+.fi
+will be active only when the current start condition is
+either "INITIAL", "STRING", or "QUOTE".
+.LP
+Start conditions
+are declared in the definitions (first) section of the input
+using unindented lines beginning with either
+.B %s
+or
+.B %x
+followed by a list of names.
+The former declares
+.I inclusive
+start conditions, the latter
+.I exclusive
+start conditions.  A start condition is activated using the
+.B BEGIN
+action.  Until the next
+.B BEGIN
+action is executed, rules with the given start
+condition will be active and
+rules with other start conditions will be inactive.
+If the start condition is
+.I inclusive,
+then rules with no start conditions at all will also be active.
+If it is
+.I exclusive,
+then
+.I only
+rules qualified with the start condition will be active.
+A set of rules contingent on the same exclusive start condition
+describe a scanner which is independent of any of the other rules in the
+.I flex
+input.  Because of this,
+exclusive start conditions make it easy to specify "mini-scanners"
+which scan portions of the input that are syntactically different
+from the rest (e.g., comments).
+.LP
+If the distinction between inclusive and exclusive start conditions
+is still a little vague, here's a simple example illustrating the
+connection between the two.  The set of rules:
+.nf
+
+    %s example
+    %%
+    foo                    /* do something */
+
+.fi
+is equivalent to
+.nf
+
+    %x example
+    %%
+    <INITIAL,example>foo   /* do something */
+
+.fi
+.LP
+The default rule (to
+.B ECHO
+any unmatched character) remains active in start conditions.
+.LP
+.B BEGIN(0)
+returns to the original state where only the rules with
+no start conditions are active.  This state can also be
+referred to as the start-condition "INITIAL", so
+.B BEGIN(INITIAL)
+is equivalent to
+.B BEGIN(0).
+(The parentheses around the start condition name are not required but
+are considered good style.)
+.LP
+.B BEGIN
+actions can also be given as indented code at the beginning
+of the rules section.  For example, the following will cause
+the scanner to enter the "SPECIAL" start condition whenever
+.I yylex()
+is called and the global variable
+.I enter_special
+is true:
+.nf
+
+            int enter_special;
+
+    %x SPECIAL
+    %%
+            if ( enter_special )
+                BEGIN(SPECIAL);
+
+    <SPECIAL>blahblahblah
+    ...more rules follow...
+
+.fi
+.LP
+To illustrate the uses of start conditions,
+here is a scanner which provides two different interpretations
+of a string like "123.456".  By default it will treat it as
+three tokens, the integer "123", a dot ('.'), and the integer "456".
+But if the string is preceded earlier in the line by the string
+"expect-floats"
+it will treat it as a single token, the floating-point number
+123.456:
+.nf
+
+    %{
+    #include <math.h>
+    %}
+    %s expect
+
+    %%
+    expect-floats        BEGIN(expect);
+
+    <expect>[0-9]+"."[0-9]+      {
+                printf( "found a float, = %f\\n",
+                        atof( yytext ) );
+                }
+    <expect>\\n           {
+                /* that's the end of the line, so
+                 * we need another "expect-floats"
+                 * before we'll recognize any more
+                 * numbers
+                 */
+                BEGIN(INITIAL);
+                }
+
+    [0-9]+      {
+                printf( "found an integer, = %d\\n",
+                        atoi( yytext ) );
+                }
+
+    "."         printf( "found a dot\\n" );
+
+.fi
+Here is a scanner which recognizes (and discards) C comments while
+maintaining a count of the current input line.
+.nf
+
+    %x comment
+    %%
+            int line_num = 1;
+
+    "/*"         BEGIN(comment);
+
+    <comment>[^*\\n]*        /* eat anything that's not a '*' */
+    <comment>"*"+[^*/\\n]*   /* eat up '*'s not followed by '/'s */
+    <comment>\\n             ++line_num;
+    <comment>"*"+"/"        BEGIN(INITIAL);
+
+.fi
+Note that start-condition names are really integer values and
+can be stored as such.  Thus, the above could be extended in the
+following fashion:
+.nf
+
+    %x comment foo
+    %%
+            int line_num = 1;
+            int comment_caller;
+
+    "/*"         {
+                 comment_caller = INITIAL;
+                 BEGIN(comment);
+                 }
+
+    ...
+
+    <foo>"/*"    {
+                 comment_caller = foo;
+                 BEGIN(comment);
+                 }
+
+    <comment>[^*\\n]*        /* eat anything that's not a '*' */
+    <comment>"*"+[^*/\\n]*   /* eat up '*'s not followed by '/'s */
+    <comment>\\n             ++line_num;
+    <comment>"*"+"/"        BEGIN(comment_caller);
+
+.fi
+One can then implement a "stack" of start conditions using an
+array of integers.  (It is likely that such stacks will become
+a full-fledged
+.I flex
+feature in the future.)  Note, though, that
+start conditions do not have their own name-space; %s's and %x's
+declare names in the same fashion as #define's.
+.SH MULTIPLE INPUT BUFFERS
+Some scanners (such as those which support "include" files)
+require reading from several input streams.  As
+.I flex
+scanners do a large amount of buffering, one cannot control
+where the next input will be read from by simply writing a
+.B YY_INPUT
+which is sensitive to the scanning context.
+.B YY_INPUT
+is only called when the scanner reaches the end of its buffer, which
+may be a long time after scanning a statement such as an "include"
+which requires switching the input source.
+.LP
+To negotiate these sorts of problems,
+.I flex
+provides a mechanism for creating and switching between multiple
+input buffers.  An input buffer is created by using:
+.nf
+
+    YY_BUFFER_STATE yy_create_buffer( FILE *file, int size )
+
+.fi
+which takes a
+.I FILE
+pointer and a size and creates a buffer associated with the given
+file and large enough to hold
+.I size
+characters (when in doubt, use
+.B YY_BUF_SIZE
+for the size).  It returns a
+.B YY_BUFFER_STATE
+handle, which may then be passed to other routines:
+.nf
+
+    void yy_switch_to_buffer( YY_BUFFER_STATE new_buffer )
+
+.fi
+switches the scanner's input buffer so subsequent tokens will
+come from
+.I new_buffer.
+Note that
+.B yy_switch_to_buffer()
+may be used by yywrap() to set things up for continued scanning, instead
+of opening a new file and pointing
+.I yyin
+at it.
+.nf
+
+    void yy_delete_buffer( YY_BUFFER_STATE buffer )
+
+.fi
+is used to reclaim the storage associated with a buffer.
+.LP
+.B yy_new_buffer()
+is an alias for
+.B yy_create_buffer(),
+provided for compatibility with the C++ use of
+.I new
+and
+.I delete
+for creating and destroying dynamic objects.
+.LP
+Finally, the
+.B YY_CURRENT_BUFFER
+macro returns a
+.B YY_BUFFER_STATE
+handle to the current buffer.
+.LP
+Here is an example of using these features for writing a scanner
+which expands include files (the
+.B <<EOF>>
+feature is discussed below):
+.nf
+
+    /* the "incl" state is used for picking up the name
+     * of an include file
+     */
+    %x incl
+
+    %{
+    #define MAX_INCLUDE_DEPTH 10
+    YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH];
+    int include_stack_ptr = 0;
+    %}
+
+    %%
+    include             BEGIN(incl);
+
+    [a-z]+              ECHO;
+    [^a-z\\n]*\\n?        ECHO;
+
+    <incl>[ \\t]*      /* eat the whitespace */
+    <incl>[^ \\t\\n]+   { /* got the include file name */
+            if ( include_stack_ptr >= MAX_INCLUDE_DEPTH )
+                {
+                fprintf( stderr, "Includes nested too deeply" );
+                exit( 1 );
+                }
+
+            include_stack[include_stack_ptr++] =
+                YY_CURRENT_BUFFER;
+
+            yyin = fopen( yytext, "r" );
+
+            if ( ! yyin )
+                error( ... );
+
+            yy_switch_to_buffer(
+                yy_create_buffer( yyin, YY_BUF_SIZE ) );
+
+            BEGIN(INITIAL);
+            }
+
+    <<EOF>> {
+            if ( --include_stack_ptr < 0 )
+                {
+                yyterminate();
+                }
+
+            else
+                yy_switch_to_buffer(
+                     include_stack[include_stack_ptr] );
+            }
+
+.fi
+.SH END-OF-FILE RULES
+The special rule "<<EOF>>" indicates
+actions which are to be taken when an end-of-file is
+encountered and yywrap() returns non-zero (i.e., indicates
+no further files to process).  The action must finish
+by doing one of four things:
+.IP -
+the special
+.B YY_NEW_FILE
+action, if
+.I yyin
+has been pointed at a new file to process;
+.IP -
+a
+.I return
+statement;
+.IP -
+the special
+.B yyterminate()
+action;
+.IP -
+or, switching to a new buffer using
+.B yy_switch_to_buffer()
+as shown in the example above.
+.LP
+<<EOF>> rules may not be used with other
+patterns; they may only be qualified with a list of start
+conditions.  If an unqualified <<EOF>> rule is given, it
+applies to
+.I all
+start conditions which do not already have <<EOF>> actions.  To
+specify an <<EOF>> rule for only the initial start condition, use
+.nf
+
+    <INITIAL><<EOF>>
+
+.fi
+.LP
+These rules are useful for catching things like unclosed comments.
+An example:
+.nf
+
+    %x quote
+    %%
+
+    ...other rules for dealing with quotes...
+
+    <quote><<EOF>>   {
+             error( "unterminated quote" );
+             yyterminate();
+             }
+    <<EOF>>  {
+             if ( *++filelist )
+                 {
+                 yyin = fopen( *filelist, "r" );
+                 YY_NEW_FILE;
+                 }
+             else
+                yyterminate();
+             }
+
+.fi
+.SH MISCELLANEOUS MACROS
+The macro
+.B
+YY_USER_ACTION
+can be redefined to provide an action
+which is always executed prior to the matched rule's action.  For example,
+it could be #define'd to call a routine to convert yytext to lower-case.
+.LP
+The macro
+.B YY_USER_INIT
+may be redefined to provide an action which is always executed before
+the first scan (and before the scanner's internal initializations are done).
+For example, it could be used to call a routine to read
+in a data table or open a logging file.
+.LP
+In the generated scanner, the actions are all gathered in one large
+switch statement and separated using
+.B YY_BREAK,
+which may be redefined.  By default, it is simply a "break", to separate
+each rule's action from the following rule's.
+Redefining
+.B YY_BREAK
+allows, for example, C++ users to
+#define YY_BREAK to do nothing (while being very careful that every
+rule ends with a "break" or a "return"!) to avoid suffering from
+unreachable statement warnings where, because a rule's action ends with
+"return", the
+.B YY_BREAK
+is inaccessible.
+.SH INTERFACING WITH YACC
+One of the main uses of
+.I flex
+is as a companion to the
+.I yacc
+parser-generator.
+.I yacc
+parsers expect to call a routine named
+.B yylex()
+to find the next input token.  The routine is supposed to
+return the type of the next token as well as putting any associated
+value in the global
+.B yylval.
+To use
+.I flex
+with
+.I yacc,
+one specifies the
+.B -d
+option to
+.I yacc
+to instruct it to generate the file
+.B y.tab.h
+containing definitions of all the
+.B %tokens
+appearing in the
+.I yacc
+input.  This file is then included in the
+.I flex
+scanner.  For example, if one of the tokens is "TOK_NUMBER",
+part of the scanner might look like:
+.nf
+
+    %{
+    #include "y.tab.h"
+    %}
+
+    %%
+
+    [0-9]+        yylval = atoi( yytext ); return TOK_NUMBER;
+
+.fi
+.SH TRANSLATION TABLE
+In the name of POSIX compliance,
+.I flex
+supports a
+.I translation table
+for mapping input characters into groups.
+The table is specified in the first section, and its format looks like:
+.nf
+
+    %t
+    1        abcd
+    2        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+    52       0123456789
+    6        \\t\\ \\n
+    %t
+
+.fi
+This example specifies that the characters 'a', 'b', 'c', and 'd'
+are to all be lumped into group #1, upper-case letters
+in group #2, digits in group #52, tabs, blanks, and newlines into
+group #6, and
+.I
+no other characters will appear in the patterns.
+The group numbers are actually disregarded by
+.I flex;
+.B %t
+serves, though, to lump characters together.  Given the above
+table, for example, the pattern "a(AA)*5" is equivalent to "d(ZQ)*0".
+They both say, "match any character in group #1, followed by
+zero-or-more pairs of characters
+from group #2, followed by a character from group #52."  Thus
+.B %t
+provides a crude way for introducing equivalence classes into
+the scanner specification.
+.LP
+Note that the
+.B -i
+option (see below) coupled with the equivalence classes which
+.I flex
+automatically generates take care of virtually all the instances
+when one might consider using
+.B %t.
+But what the hell, it's there if you want it.
+.SH OPTIONS
+.I flex
+has the following options:
+.TP
+.B -b
+Generate backtracking information to
+.I lex.backtrack.
+This is a list of scanner states which require backtracking
+and the input characters on which they do so.  By adding rules one
+can remove backtracking states.  If all backtracking states
+are eliminated and
+.B -f
+or
+.B -F
+is used, the generated scanner will run faster (see the
+.B -p
+flag).  Only users who wish to squeeze every last cycle out of their
+scanners need worry about this option.  (See the section on PERFORMANCE
+CONSIDERATIONS below.)
+.TP
+.B -c
+is a do-nothing, deprecated option included for POSIX compliance.
+.IP
+.B NOTE:
+in previous releases of
+.I flex
+.B -c
+specified table-compression options.  This functionality is
+now given by the
+.B -C
+flag.  To ease the impact of this change, when
+.I flex
+encounters
+.B -c,
+it currently issues a warning message and assumes that
+.B -C
+was desired instead.  In the future this "promotion" of
+.B -c
+to
+.B -C
+will go away in the name of full POSIX compliance (unless
+the POSIX meaning is removed first).
+.TP
+.B -d
+makes the generated scanner run in
+.I debug
+mode.  Whenever a pattern is recognized and the global
+.B yy_flex_debug
+is non-zero (which is the default),
+the scanner will write to
+.I stderr
+a line of the form:
+.nf
+
+    --accepting rule at line 53 ("the matched text")
+
+.fi
+The line number refers to the location of the rule in the file
+defining the scanner (i.e., the file that was fed to flex).  Messages
+are also generated when the scanner backtracks, accepts the
+default rule, reaches the end of its input buffer (or encounters
+a NUL; at this point, the two look the same as far as the scanner's concerned),
+or reaches an end-of-file.
+.TP
+.B -f
+specifies (take your pick)
+.I full table
+or
+.I fast scanner.
+No table compression is done.  The result is large but fast.
+This option is equivalent to
+.B -Cf
+(see below).
+.TP
+.B -i
+instructs
+.I flex
+to generate a
+.I case-insensitive
+scanner.  The case of letters given in the
+.I flex
+input patterns will
+be ignored, and tokens in the input will be matched regardless of case.  The
+matched text given in
+.I yytext
+will have the preserved case (i.e., it will not be folded).
+.TP
+.B -n
+is another do-nothing, deprecated option included only for
+POSIX compliance.
+.TP
+.B -p
+generates a performance report to stderr.  The report
+consists of comments regarding features of the
+.I flex
+input file which will cause a loss of performance in the resulting scanner.
+Note that the use of
+.I REJECT
+and variable trailing context (see the BUGS section in flex(1))
+entails a substantial performance penalty; use of
+.I yymore(),
+the
+.B ^
+operator,
+and the
+.B -I
+flag entail minor performance penalties.
+.TP
+.B -s
+causes the
+.I default rule
+(that unmatched scanner input is echoed to
+.I stdout)
+to be suppressed.  If the scanner encounters input that does not
+match any of its rules, it aborts with an error.  This option is
+useful for finding holes in a scanner's rule set.
+.TP
+.B -t
+instructs
+.I flex
+to write the scanner it generates to standard output instead
+of
+.B lex.yy.c.
+.TP
+.B -v
+specifies that
+.I flex
+should write to
+.I stderr
+a summary of statistics regarding the scanner it generates.
+Most of the statistics are meaningless to the casual
+.I flex
+user, but the
+first line identifies the version of
+.I flex,
+which is useful for figuring
+out where you stand with respect to patches and new releases,
+and the next two lines give the date when the scanner was created
+and a summary of the flags which were in effect.
+.TP
+.B -F
+specifies that the
+.ul
+fast
+scanner table representation should be used.  This representation is
+about as fast as the full table representation
+.ul
+(-f),
+and for some sets of patterns will be considerably smaller (and for
+others, larger).  In general, if the pattern set contains both "keywords"
+and a catch-all, "identifier" rule, such as in the set:
+.nf
+
+    "case"    return TOK_CASE;
+    "switch"  return TOK_SWITCH;
+    ...
+    "default" return TOK_DEFAULT;
+    [a-z]+    return TOK_ID;
+
+.fi
+then you're better off using the full table representation.  If only
+the "identifier" rule is present and you then use a hash table or some such
+to detect the keywords, you're better off using
+.ul
+-F.
+.IP
+This option is equivalent to
+.B -CF
+(see below).
+.TP
+.B -I
+instructs
+.I flex
+to generate an
+.I interactive
+scanner.  Normally, scanners generated by
+.I flex
+always look ahead one
+character before deciding that a rule has been matched.  At the cost of
+some scanning overhead,
+.I flex
+will generate a scanner which only looks ahead
+when needed.  Such scanners are called
+.I interactive
+because if you want to write a scanner for an interactive system such as a
+command shell, you will probably want the user's input to be terminated
+with a newline, and without
+.B -I
+the user will have to type a character in addition to the newline in order
+to have the newline recognized.  This leads to dreadful interactive
+performance.
+.IP
+If all this seems too confusing, here's the general rule: if a human will
+be typing in input to your scanner, use
+.B -I,
+otherwise don't; if you don't care about squeezing the utmost performance
+from your scanner and you
+don't want to make any assumptions about the input to your scanner,
+use
+.B -I.
+.IP
+Note,
+.B -I
+cannot be used in conjunction with
+.I full
+or
+.I fast tables,
+i.e., the
+.B -f, -F, -Cf,
+or
+.B -CF
+flags.
+.TP
+.B -L
+instructs
+.I flex
+not to generate
+.B #line
+directives.  Without this option,
+.I flex
+peppers the generated scanner
+with #line directives so error messages in the actions will be correctly
+located with respect to the original
+.I flex
+input file, and not to
+the fairly meaningless line numbers of
+.B lex.yy.c.
+(Unfortunately
+.I flex
+does not presently generate the necessary directives
+to "retarget" the line numbers for those parts of
+.B lex.yy.c
+which it generated.  So if there is an error in the generated code,
+a meaningless line number is reported.)
+.TP
+.B -T
+makes
+.I flex
+run in
+.I trace
+mode.  It will generate a lot of messages to
+.I stdout
+concerning
+the form of the input and the resultant non-deterministic and deterministic
+finite automata.  This option is mostly for use in maintaining
+.I flex.
+.TP
+.B -8
+instructs
+.I flex
+to generate an 8-bit scanner, i.e., one which can recognize 8-bit
+characters.  On some sites,
+.I flex
+is installed with this option as the default.  On others, the default
+is 7-bit characters.  To see which is the case, check the verbose
+.B (-v)
+output for "equivalence classes created".  If the denominator of
+the number shown is 128, then by default
+.I flex
+is generating 7-bit characters.  If it is 256, then the default is
+8-bit characters and the
+.B -8
+flag is not required (but may be a good idea to keep the scanner
+specification portable).  Feeding a 7-bit scanner 8-bit characters
+will result in infinite loops, bus errors, or other such fireworks,
+so when in doubt, use the flag.  Note that if equivalence classes
+are used, 8-bit scanners take only slightly more table space than
+7-bit scanners (128 bytes, to be exact); if equivalence classes are
+not used, however, then the tables may grow up to twice their
+7-bit size.
+.TP 
+.B -C[efmF]
+controls the degree of table compression.
+.IP
+.B -Ce
+directs
+.I flex
+to construct
+.I equivalence classes,
+i.e., sets of characters
+which have identical lexical properties (for example, if the only
+appearance of digits in the
+.I flex
+input is in the character class
+"[0-9]" then the digits '0', '1', ..., '9' will all be put
+in the same equivalence class).  Equivalence classes usually give
+dramatic reductions in the final table/object file sizes (typically
+a factor of 2-5) and are pretty cheap performance-wise (one array
+look-up per character scanned).
+.IP
+.B -Cf
+specifies that the
+.I full
+scanner tables should be generated -
+.I flex
+should not compress the
+tables by taking advantage of similar transition functions for
+different states.
+.IP
+.B -CF
+specifies that the alternate fast scanner representation (described
+above under the
+.B -F
+flag)
+should be used.
+.IP
+.B -Cm
+directs
+.I flex
+to construct
+.I meta-equivalence classes,
+which are sets of equivalence classes (or characters, if equivalence
+classes are not being used) that are commonly used together.  Meta-equivalence
+classes are often a big win when using compressed tables, but they
+have a moderate performance impact (one or two "if" tests and one
+array look-up per character scanned).
+.IP
+A lone
+.B -C
+specifies that the scanner tables should be compressed but neither
+equivalence classes nor meta-equivalence classes should be used.
+.IP
+The options
+.B -Cf
+or
+.B -CF
+and
+.B -Cm
+do not make sense together - there is no opportunity for meta-equivalence
+classes if the table is not being compressed.  Otherwise the options
+may be freely mixed.
+.IP
+The default setting is
+.B -Cem,
+which specifies that
+.I flex
+should generate equivalence classes
+and meta-equivalence classes.  This setting provides the highest
+degree of table compression.  You can trade off
+faster-executing scanners at the cost of larger tables with
+the following generally being true:
+.nf
+
+    slowest & smallest
+          -Cem
+          -Cm
+          -Ce
+          -C
+          -C{f,F}e
+          -C{f,F}
+    fastest & largest
+
+.fi
+Note that scanners with the smallest tables are usually generated and
+compiled the quickest, so
+during development you will usually want to use the default, maximal
+compression.
+.IP
+.B -Cfe
+is often a good compromise between speed and size for production
+scanners.
+.IP
+.B -C
+options are not cumulative; whenever the flag is encountered, the
+previous -C settings are forgotten.
+.TP
+.B -Sskeleton_file
+overrides the default skeleton file from which
+.I flex
+constructs its scanners.  You'll never need this option unless you are doing
+.I flex
+maintenance or development.
+.SH PERFORMANCE CONSIDERATIONS
+The main design goal of
+.I flex
+is that it generate high-performance scanners.  It has been optimized
+for dealing well with large sets of rules.  Aside from the effects
+of table compression on scanner speed outlined above,
+there are a number of options/actions which degrade performance.  These
+are, from most expensive to least:
+.nf
+
+    REJECT
+
+    pattern sets that require backtracking
+    arbitrary trailing context
+
+    '^' beginning-of-line operator
+    yymore()
+
+.fi
+with the first three all being quite expensive and the last two
+being quite cheap.
+.LP
+.B REJECT
+should be avoided at all costs when performance is important.
+It is a particularly expensive option.
+.LP
+Getting rid of backtracking is messy and often may be an enormous
+amount of work for a complicated scanner.  In principle, one begins
+by using the
+.B -b 
+flag to generate a
+.I lex.backtrack
+file.  For example, on the input
+.nf
+
+    %%
+    foo        return TOK_KEYWORD;
+    foobar     return TOK_KEYWORD;
+
+.fi
+the file looks like:
+.nf
+
+    State #6 is non-accepting -
+     associated rule line numbers:
+           2       3
+     out-transitions: [ o ]
+     jam-transitions: EOF [ \\001-n  p-\\177 ]
+
+    State #8 is non-accepting -
+     associated rule line numbers:
+           3
+     out-transitions: [ a ]
+     jam-transitions: EOF [ \\001-`  b-\\177 ]
+
+    State #9 is non-accepting -
+     associated rule line numbers:
+           3
+     out-transitions: [ r ]
+     jam-transitions: EOF [ \\001-q  s-\\177 ]
+
+    Compressed tables always backtrack.
+
+.fi
+The first few lines tell us that there's a scanner state in
+which it can make a transition on an 'o' but not on any other
+character, and that in that state the currently scanned text does not match
+any rule.  The state occurs when trying to match the rules found
+at lines 2 and 3 in the input file.
+If the scanner is in that state and then reads
+something other than an 'o', it will have to backtrack to find
+a rule which is matched.  With
+a bit of headscratching one can see that this must be the
+state it's in when it has seen "fo".  When this has happened,
+if anything other than another 'o' is seen, the scanner will
+have to back up to simply match the 'f' (by the default rule).
+.LP
+The comment regarding State #8 indicates there's a problem
+when "foob" has been scanned.  Indeed, on any character other
+than an 'a', the scanner will have to back up to accept "foo".
+Similarly, the comment for State #9 concerns when "fooba" has
+been scanned.
+.LP
+The final comment reminds us that there's no point going to
+all the trouble of removing backtracking from the rules unless
+we're using
+.B -f
+or
+.B -F,
+since there's no performance gain doing so with compressed scanners.
+.LP
+The way to remove the backtracking is to add "error" rules:
+.nf
+
+    %%
+    foo         return TOK_KEYWORD;
+    foobar      return TOK_KEYWORD;
+
+    fooba       |
+    foob        |
+    fo          {
+                /* false alarm, not really a keyword */
+                return TOK_ID;
+                }
+
+.fi
+.LP
+Eliminating backtracking among a list of keywords can also be
+done using a "catch-all" rule:
+.nf
+
+    %%
+    foo         return TOK_KEYWORD;
+    foobar      return TOK_KEYWORD;
+
+    [a-z]+      return TOK_ID;
+
+.fi
+This is usually the best solution when appropriate.
+.LP
+Backtracking messages tend to cascade.
+With a complicated set of rules it's not uncommon to get hundreds
+of messages.  If one can decipher them, though, it often
+only takes a dozen or so rules to eliminate the backtracking (though
+it's easy to make a mistake and have an error rule accidentally match
+a valid token.  A possible future
+.I flex
+feature will be to automatically add rules to eliminate backtracking).
+.LP
+.I Variable
+trailing context (where both the leading and trailing parts do not have
+a fixed length) entails almost the same performance loss as
+.I REJECT
+(i.e., substantial).  So when possible a rule like:
+.nf
+
+    %%
+    mouse|rat/(cat|dog)   run();
+
+.fi
+is better written:
+.nf
+
+    %%
+    mouse/cat|dog         run();
+    rat/cat|dog           run();
+
+.fi
+or as
+.nf
+
+    %%
+    mouse|rat/cat         run();
+    mouse|rat/dog         run();
+
+.fi
+Note that here the special '|' action does
+.I not
+provide any savings, and can even make things worse (see
+.B BUGS
+in flex(1)).
+.LP
+Another area where the user can increase a scanner's performance
+(and one that's easier to implement) arises from the fact that
+the longer the tokens matched, the faster the scanner will run.
+This is because with long tokens the processing of most input
+characters takes place in the (short) inner scanning loop, and
+does not often have to go through the additional work of setting up
+the scanning environment (e.g.,
+.B yytext)
+for the action.  Recall the scanner for C comments:
+.nf
+
+    %x comment
+    %%
+            int line_num = 1;
+
+    "/*"         BEGIN(comment);
+
+    <comment>[^*\\n]*
+    <comment>"*"+[^*/\\n]*
+    <comment>\\n             ++line_num;
+    <comment>"*"+"/"        BEGIN(INITIAL);
+
+.fi
+This could be sped up by writing it as:
+.nf
+
+    %x comment
+    %%
+            int line_num = 1;
+
+    "/*"         BEGIN(comment);
+
+    <comment>[^*\\n]*
+    <comment>[^*\\n]*\\n      ++line_num;
+    <comment>"*"+[^*/\\n]*
+    <comment>"*"+[^*/\\n]*\\n ++line_num;
+    <comment>"*"+"/"        BEGIN(INITIAL);
+
+.fi
+Now instead of each newline requiring the processing of another
+action, recognizing the newlines is "distributed" over the other rules
+to keep the matched text as long as possible.  Note that
+.I adding
+rules does
+.I not
+slow down the scanner!  The speed of the scanner is independent
+of the number of rules or (modulo the considerations given at the
+beginning of this section) how complicated the rules are with
+regard to operators such as '*' and '|'.
+.LP
+A final example in speeding up a scanner: suppose you want to scan
+through a file containing identifiers and keywords, one per line
+and with no other extraneous characters, and recognize all the
+keywords.  A natural first approach is:
+.nf
+
+    %%
+    asm      |
+    auto     |
+    break    |
+    ... etc ...
+    volatile |
+    while    /* it's a keyword */
+
+    .|\\n     /* it's not a keyword */
+
+.fi
+To eliminate the backtracking, introduce a catch-all rule:
+.nf
+
+    %%
+    asm      |
+    auto     |
+    break    |
+    ... etc ...
+    volatile |
+    while    /* it's a keyword */
+
+    [a-z]+   |
+    .|\\n     /* it's not a keyword */
+
+.fi
+Now, if it's guaranteed that there's exactly one word per line,
+then we can reduce the total number of matches by a half by
+merging in the recognition of newlines with that of the other
+tokens:
+.nf
+
+    %%
+    asm\\n    |
+    auto\\n   |
+    break\\n  |
+    ... etc ...
+    volatile\\n |
+    while\\n  /* it's a keyword */
+
+    [a-z]+\\n |
+    .|\\n     /* it's not a keyword */
+
+.fi
+One has to be careful here, as we have now reintroduced backtracking
+into the scanner.  In particular, while
+.I we
+know that there will never be any characters in the input stream
+other than letters or newlines,
+.I flex
+can't figure this out, and it will plan for possibly needing backtracking
+when it has scanned a token like "auto" and then the next character
+is something other than a newline or a letter.  Previously it would
+then just match the "auto" rule and be done, but now it has no "auto"
+rule, only an "auto\\n" rule.  To eliminate the possibility of backtracking,
+we could either duplicate all rules but without final newlines, or,
+since we never expect to encounter such an input and therefore don't
+care how it's classified, we can introduce one more catch-all rule, this
+one which doesn't include a newline:
+.nf
+
+    %%
+    asm\\n    |
+    auto\\n   |
+    break\\n  |
+    ... etc ...
+    volatile\\n |
+    while\\n  /* it's a keyword */
+
+    [a-z]+\\n |
+    [a-z]+   |
+    .|\\n     /* it's not a keyword */
+
+.fi
+Compiled with
+.B -Cf,
+this is about as fast as one can get a
+.I flex 
+scanner to go for this particular problem.
+.LP
+A final note:
+.I flex
+is slow when matching NUL's, particularly when a token contains
+multiple NUL's.
+It's best to write rules which match
+.I short
+amounts of text if it's anticipated that the text will often include NUL's.
+.SH INCOMPATIBILITIES WITH LEX AND POSIX
+.I flex
+is a rewrite of the Unix
+.I lex
+tool (the two implementations do not share any code, though),
+with some extensions and incompatibilities, both of which
+are of concern to those who wish to write scanners acceptable
+to either implementation.  At present, the POSIX
+.I lex
+draft is
+very close to the original
+.I lex
+implementation, so some of these
+incompatibilities are also in conflict with the POSIX draft.  But
+the intent is that except as noted below,
+.I flex
+as it presently stands will
+ultimately be POSIX conformant (i.e., that those areas of conflict with
+the POSIX draft will be resolved in
+.I flex's
+favor).  Please bear in
+mind that all the comments which follow are with regard to the POSIX
+.I draft
+standard of Summer 1989, and not the final document (or subsequent
+drafts); they are included so
+.I flex
+users can be aware of the standardization issues and those areas where
+.I flex
+may in the near future undergo changes incompatible with
+its current definition.
+.LP
+.I flex
+is fully compatible with
+.I lex
+with the following exceptions:
+.IP -
+.I lex
+does not support exclusive start conditions (%x), though they
+are in the current POSIX draft.
+.IP -
+When definitions are expanded,
+.I flex
+encloses them in parentheses.
+With lex, the following:
+.nf
+
+    NAME    [A-Z][A-Z0-9]*
+    %%
+    foo{NAME}?      printf( "Found it\\n" );
+    %%
+
+.fi
+will not match the string "foo" because when the macro
+is expanded the rule is equivalent to "foo[A-Z][A-Z0-9]*?"
+and the precedence is such that the '?' is associated with
+"[A-Z0-9]*".  With
+.I flex,
+the rule will be expanded to
+"foo([A-Z][A-Z0-9]*)?" and so the string "foo" will match.
+Note that because of this, the
+.B ^, $, <s>, /,
+and
+.B <<EOF>>
+operators cannot be used in a
+.I flex
+definition.
+.IP
+The POSIX draft interpretation is the same as
+.I flex's.
+.IP -
+To specify a character class which matches anything but a right bracket (']'),
+in
+.I lex
+one can use "[^]]" but with
+.I flex
+one must use "[^\\]]".  The latter works with
+.I lex,
+too.
+.IP -
+The undocumented
+.I lex
+scanner internal variable
+.B yylineno
+is not supported.  (The variable is not part of the POSIX draft.)
+.IP -
+The
+.B input()
+routine is not redefinable, though it may be called to read characters
+following whatever has been matched by a rule.  If
+.B input()
+encounters an end-of-file the normal
+.B yywrap()
+processing is done.  A ``real'' end-of-file is returned by
+.B input()
+as
+.I EOF.
+.IP
+Input is instead controlled by redefining the
+.B YY_INPUT
+macro.
+.IP
+The
+.I flex
+restriction that
+.B input()
+cannot be redefined is in accordance with the POSIX draft, but
+.B YY_INPUT
+has not yet been accepted into the draft.
+.IP -
+.B output()
+is not supported.
+Output from the
+.B ECHO
+macro is done to the file-pointer
+.I yyout
+(default
+.I stdout).
+.IP
+The POSIX draft mentions that an
+.B output()
+routine exists but currently gives no details as to what it does.
+.IP -
+The
+.I lex
+.B %r
+(generate a Ratfor scanner) option is not supported.  It is not part
+of the POSIX draft.
+.IP -
+If you are providing your own yywrap() routine, you must include a
+"#undef yywrap" in the definitions section (section 1).  Note that
+the "#undef" will have to be enclosed in %{}'s.
+.IP
+The POSIX draft
+specifies that yywrap() is a function and this is unlikely to change; so
+.I flex users are warned
+that
+.B yywrap()
+is likely to be changed to a function in the near future.
+.IP -
+After a call to
+.B unput(),
+.I yytext
+and
+.I yyleng
+are undefined until the next token is matched.  This is not the case with
+.I lex
+or the present POSIX draft.
+.IP -
+The precedence of the
+.B {}
+(numeric range) operator is different.
+.I lex
+interprets "abc{1,3}" as "match one, two, or
+three occurrences of 'abc'", whereas
+.I flex
+interprets it as "match 'ab'
+followed by one, two, or three occurrences of 'c'".  The latter is
+in agreement with the current POSIX draft.
+.IP -
+The precedence of the
+.B ^
+operator is different.
+.I lex
+interprets "^foo|bar" as "match either 'foo' at the beginning of a line,
+or 'bar' anywhere", whereas
+.I flex
+interprets it as "match either 'foo' or 'bar' if they come at the beginning
+of a line".  The latter is in agreement with the current POSIX draft.
+.IP -
+To refer to yytext outside of the scanner source file,
+the correct definition with
+.I flex
+is "extern char *yytext" rather than "extern char yytext[]".
+This is contrary to the current POSIX draft but a point on which
+.I flex
+will not be changing, as the array representation entails a
+serious performance penalty.  It is hoped that the POSIX draft will
+be emended to support the
+.I flex
+variety of declaration (as this is a fairly painless change to
+require of
+.I lex
+users).
+.IP -
+.I yyin
+is
+.I initialized
+by
+.I lex
+to be
+.I stdin;
+.I flex,
+on the other hand,
+initializes
+.I yyin
+to NULL
+and then
+.I assigns
+it to
+.I stdin
+the first time the scanner is called, providing
+.I yyin
+has not already been assigned to a non-NULL value.  The difference is
+subtle, but the net effect is that with
+.I flex
+scanners,
+.I yyin
+does not have a valid value until the scanner has been called.
+.IP -
+The special table-size declarations such as
+.B %a
+supported by
+.I lex
+are not required by
+.I flex
+scanners;
+.I flex
+ignores them.
+.IP -
+The name
+.B
+FLEX_SCANNER
+is #define'd so scanners may be written for use with either
+.I flex
+or
+.I lex.
+.LP
+The following
+.I flex
+features are not included in
+.I lex
+or the POSIX draft standard:
+.nf
+
+    yyterminate()
+    <<EOF>>
+    YY_DECL
+    #line directives
+    %{}'s around actions
+    yyrestart()
+    comments beginning with '#' (deprecated)
+    multiple actions on a line
+
+.fi
+This last feature refers to the fact that with
+.I flex
+you can put multiple actions on the same line, separated with
+semi-colons, while with
+.I lex,
+the following
+.nf
+
+    foo    handle_foo(); ++num_foos_seen;
+
+.fi
+is (rather surprisingly) truncated to
+.nf
+
+    foo    handle_foo();
+
+.fi
+.I flex
+does not truncate the action.  Actions that are not enclosed in
+braces are simply terminated at the end of the line.
+.SH DIAGNOSTICS
+.I reject_used_but_not_detected undefined
+or
+.I yymore_used_but_not_detected undefined -
+These errors can occur at compile time.  They indicate that the
+scanner uses
+.B REJECT
+or
+.B yymore()
+but that
+.I flex
+failed to notice the fact, meaning that
+.I flex
+scanned the first two sections looking for occurrences of these actions
+and failed to find any, but somehow you snuck some in (via a #include
+file, for example).  Make an explicit reference to the action in your
+.I flex
+input file.  (Note that previously
+.I flex
+supported a
+.B %used/%unused
+mechanism for dealing with this problem; this feature is still supported
+but now deprecated, and will go away soon unless the author hears from
+people who can argue compellingly that they need it.)
+.LP
+.I flex scanner jammed -
+a scanner compiled with
+.B -s
+has encountered an input string which wasn't matched by
+any of its rules.
+.LP
+.I flex input buffer overflowed -
+a scanner rule matched a string long enough to overflow the
+scanner's internal input buffer (16K bytes by default - controlled by
+.B YY_BUF_SIZE
+in "flex.skel".  Note that to redefine this macro, you must first
+.B #undef
+it).
+.LP
+.I scanner requires -8 flag -
+Your scanner specification includes recognizing 8-bit characters and
+you did not specify the -8 flag (and your site has not installed flex
+with -8 as the default).
+.LP
+.I too many %t classes! -
+You managed to put every single character into its own %t class.
+.I flex
+requires that at least one of the classes share characters.
+.SH DEFICIENCIES / BUGS
+See flex(1).
+.SH "SEE ALSO"
+.LP
+flex(1), lex(1), yacc(1), sed(1), awk(1).
+.LP
+M. E. Lesk and E. Schmidt,
+.I LEX - Lexical Analyzer Generator
+.SH AUTHOR
+Vern Paxson, with the help of many ideas and much inspiration from
+Van Jacobson.  Original version by Jef Poskanzer.  The fast table
+representation is a partial implementation of a design done by Van
+Jacobson.  The implementation was done by Kevin Gong and Vern Paxson.
+.LP
+Thanks to the many
+.I flex
+beta-testers, feedbackers, and contributors, especially Casey
+Leedom, benson@odi.com,
+Frederic Brehm, Nick Christopher, Jason Coughlin,
+Scott David Daniels, Leo Eskin,
+Chris Faylor, Eric Goldman, Eric
+Hughes, Jeffrey R. Jones, Kevin B. Kenny, Ronald Lamprecht,
+Greg Lee, Craig Leres, Mohamed el Lozy, Jim Meyering, Marc Nozell, Esmond Pitt,
+Jef Poskanzer, Jim Roskind,
+Dave Tallman, Frank Whaley, Ken Yap, and those whose names
+have slipped my marginal mail-archiving skills but whose contributions
+are appreciated all the same.
+.LP
+Thanks to Keith Bostic, John Gilmore, Craig Leres, Bob
+Mulcahy, Rich Salz, and Richard Stallman for help with various distribution
+headaches.
+.LP
+Thanks to Esmond Pitt and Earle Horton for 8-bit character support;
+to Benson Margulies and Fred
+Burke for C++ support; to Ove Ewerlid for the basics of support for
+NUL's; and to Eric Hughes for the basics of support for multiple buffers.
+.LP
+Work is being done on extending
+.I flex
+to generate scanners in which the
+state machine is directly represented in C code rather than tables.
+These scanners may well be substantially faster than those generated
+using -f or -F.  If you are working in this area and are interested
+in comparing notes and seeing whether redundant work can be avoided,
+contact Ove Ewerlid (ewerlid@mizar.DoCS.UU.SE).
+.LP
+This work was primarily done when I was at the Real Time Systems Group
+at the Lawrence Berkeley Laboratory in Berkeley, CA.  Many thanks to all there
+for the support I received.
+.LP
+Send comments to:
+.nf
+
+     Vern Paxson
+     Computer Science Department
+     4126 Upson Hall
+     Cornell University
+     Ithaca, NY 14853-7501
+
+     vern@cs.cornell.edu
+     decvax!cornell!vern
+
+.fi
-- 
2.20.1