/* lex.c: rc's lexical analyzer */
Special characters (i.e., "non-word") in rc:
\t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \
The lexical analyzer is fairly straightforward. The only really
unclean part concerns backslash continuation and "double
backslashes". A backslash followed by a newline is treated as a
space, otherwise backslash is not a special character (i.e.,
it can be part of a word). This introduces a host of unwanted
special cases. In our case, \ cannot be a word character, since
we wish to read in all word characters in a tight loop.
Note: to save the trouble of declaring these arrays with TRUEs
and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is
/* Initial size of the lexer's token buffer; bufsize starts at this value. */
#define BUFSIZE ((size_t) 1000) /* malloc hates power of 2 buffers? */
#define BUFMAX (8 * BUFSIZE) /* How big the buffer can get before we re-allocate the
space at BUFSIZE again. Premature optimization? Maybe.
typedef enum wordstates
{
NW
, RW
, KW
/* "nonword", "realword", "keyword" */
/*
 * Forward declaration. getpair() scans the bracketed file-descriptor pair of a
 * redirection such as >[2=1] and stores the result in fd_left and fd_right.
 */
static void getpair(int);
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
/* Current size of the token buffer; starts at BUFSIZE and is doubled when the buffer is grown with erealloc(). */
static size_t bufsize = BUFSIZE;

/* The token buffer itself; NULL until it is first allocated with ealloc(). */
static char *realbuf = NULL;

/* Lexer state flags, all initially FALSE. */
static bool newline = FALSE;
static bool errset = FALSE;

/* Tested by yyerror(): lets a more informative scanerror() message replace the generic "syntax error". */
static bool prerror = FALSE;

/* Word state (NW, RW or KW); consulted by checkfreecaret and by the SUB check. */
static wordstates w = NW;

/* File-descriptor pair filled in by getpair(); both are reset to UNSET at its start. */
static int fd_left;
static int fd_right;
/*
 * checkfreecaret: when w is not NW (i.e. it is RW or KW), reset w to NW, hand the
 * current character c to ugchar() and make the ENCLOSING function return '^'.
 * The expansion is a bare brace block containing a return statement, so it may only
 * be used inside a function that has c in scope and can return '^'.
 * NOTE(review): ugchar() is defined elsewhere; presumably it pushes c back onto the
 * input so that it is read again on the next call -- confirm.
 */
#define checkfreecaret {if (w != NW) { w = NW; ugchar(c); return '^'; }}
/* When set, the scanner uses the dnw table instead of nw as its metacharacter table. */
static bool dollar = FALSE;
size_t i
; /* The purpose of all these local assignments is to */
const char *meta
; /* allow optimizing compilers like gcc to load these */
char *buf
= realbuf
; /* values into registers. On a sparc this is a */
YYSTYPE
*y
= &yylval
; /* win, in code size *and* execution time */
/* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */
meta
= (dollar
? dnw
: nw
);
--lineno
; /* slight space optimization; print_prompt2() always increments lineno */
top
: while ((c
= gchar()) == ' ' || c
== '\t')
if (!meta
[(unsigned char) c
]) { /* it's a word or keyword. */
if (c
== '?' || c
== '[' || c
== '*')
buf
= realbuf
= erealloc(buf
, bufsize
*= 2);
} while ((c
= gchar()) != EOF
&& !meta
[(unsigned char) c
]);
if ((c
= gchar()) == '\n') {
c
= ' '; /* Pretend a space was read */
bs
: if (meta
!= dnw
) { /* all words but varnames may have a bslash */
buf
= realbuf
= erealloc(buf
, bufsize
*= 2);
if (!meta
[(unsigned char) c
])
if (*buf
== 'i' && buf
[1] == 'f') return IF
;
if (*buf
== 'f' && buf
[1] == 'n') return FN
;
if (*buf
== 'i' && buf
[1] == 'n') return IN
;
if (streq(buf
, "for")) return FOR
;
if (streq(buf
, "else")) return ELSE
;
if (streq(buf
, "switch")) return SWITCH
;
if (streq(buf
, "while")) return WHILE
;
if (streq(buf
, "case")) return CASE
;
y
->word
.m
= nalloc(strlen(buf
) + 1);
for (r
= buf
, s
= y
->word
.m
; *r
!= '\0'; r
++, s
++)
*s
= (*r
== '?' || *r
== '[' || *r
== '*');
if (c
== '`' || c
== '!' || c
== '@' || c
== '~' || c
== '$' || c
== '\'') {
if (c
== '!' || c
== '@' || c
== '~')
pr_error("warning: null character ignored");
scanerror("eof in quoted string");
buf
= realbuf
= erealloc(buf
, bufsize
*= 2);
} while ((c
= gchar()) != '\'' || (c
= gchar()) == '\''); /* quote "'" thus: 'how''s it going?' */
if ((c
= gchar()) == '\n') {
goto top
; /* Pretend it was just another space. */
if (w
== RW
) /* SUB's happen only after real words, not keywords, so if () and while () work */
while ((c
= gchar()) != '\n') /* skip comment until newline */
if ((y
->pipe
.left
= fd_left
) == UNSET
)
y
->pipe
.left
= 1; /* default to fd 1 */
if ((y
->pipe
.right
= fd_right
) == UNSET
)
y
->pipe
.right
= 0; /* default to fd 0 */
if (y
->pipe
.right
== CLOSED
) {
scanerror("expected digit after '='"); /* can't close a pipe */
y
->redir
.type
= rHerestring
;
y
->redir
.type
= rHeredoc
;
if (fd_right
== UNSET
) { /* redirection, not dup */
return (y
->redir
.type
== rFrom
|| y
->redir
.type
== rCreate
) ? REDIR
: SREDIR
;
} else { /* dup; recast yylval */
y
->dup
.type
= y
->redir
.type
;
return c
; /* don't know what it is, let yacc barf on it */
extern void yyerror(const char *s
) {
if (prerror
) { /* don't print "syntax error" if there's a more informative scanerror */
tok
= nprint((last
< 32 || last
> 126) ? "(decimal %d)" : "'%c'", last
);
fprint(2, "line %d: %s near %s\n", lineno
- (last
== '\n'), s
, tok
);
extern void scanerror(char *s
) {
flushu(); /* flush upto newline */
/* return memory to the system if the buffer got too large */
if (bufsize
> BUFMAX
&& realbuf
!= NULL
) {
realbuf
= ealloc(bufsize
);
} else if (realbuf
== NULL
)
realbuf
= ealloc(bufsize
);
extern void print_prompt2() {
fprint(2, "%s", prompt2
);
Scan in a pair of integers for redirections like >[2=1]. CLOSED represents a closed file
descriptor (i.e., >[2=]) and UNSET represents an undesignated file descriptor (e.g.,
>[2] is represented as (2,UNSET)).
This function makes use of unsigned compares to make range tests in one compare operation.
static void getpair(int c
) {
fd_left
= fd_right
= UNSET
;
if ((unsigned int) (n
= gchar() - '0') > 9) {
scanerror("expected digit after '['");
while ((unsigned int) (c
= gchar() - '0') <= 9)
scanerror("expected '=' or ']' after digit");
if ((unsigned int) (n
= gchar() - '0') > 9) {
scanerror("expected digit or ']' after '='");
while ((unsigned int) (c
= gchar() - '0') <= 9)
scanerror("expected ']' after digit");