Commit | Line | Data |
---|---|---|
95fa4dd9 C |
1 | /* lex.c: rc's lexical analyzer */ |
2 | ||
#include <limits.h>

#include "rc.h"
#include "y.tab.h"
5 | ||
6 | /* | |
7 | Special characters (i.e., "non-word") in rc: | |
8 | \t \n # ; & | ^ $ = ~ ` ' { } @ ! ( ) < > \ | |
9 | ||
10 | The lexical analyzer is fairly straightforward. The only really | |
11 | unclean part concerns backslash continuation and "double | |
12 | backslashes". A backslash followed by a newline is treated as a | |
13 | space, otherwise backslash is not a special character (i.e., | |
14 | it can be part of a word). This introduces a host of unwanted | |
15 | special cases. In our case, \ cannot be a word character, since | |
16 | we wish to read in all word characters in a tight loop. | |
17 | ||
18 | Note: to save the trouble of declaring these arrays with TRUEs | |
19 | and FALSEs, I am assuming that FALSE = 0, TRUE = 1. (and so is | |
20 | it declared in rc.h) | |
21 | */ | |
22 | ||
/* Initial size, in bytes, of the word buffer. (malloc hates power of 2 buffers?) */
#define BUFSIZE ((size_t) 1000)

/*
   How big the buffer can get before we re-allocate the space at BUFSIZE
   again. Premature optimization? Maybe.
*/
#define BUFMAX (8 * BUFSIZE)
27 | ||
/* What kind of token the scanner produced most recently. */
typedef enum wordstates {
	NW,	/* "nonword" */
	RW,	/* "realword" */
	KW	/* "keyword" */
} wordstates;
31 | ||
/* Current input line number; bumped by yylex() on '\n' and by print_prompt2(). */
int lineno;

/* Parses the optional "[n]", "[n=]" or "[n=m]" suffix of a redirection or pipe. */
static void getpair(int c);
35 | ||
/*
   nw[c] is nonzero when byte c can NOT be part of an ordinary word:
   NUL, tab, newline, space and  ! # $ & ' ( ) ; < = > @ \ ^ ` { | } ~
   Indexed by (unsigned char); sixteen entries per row.
*/
const char nw[] = {
	/* 0x00 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
	/* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
46 | ||
/*
   dnw[c] is the stricter table used while scanning a variable name after '$':
   only letters, digits, '*' and '_' are zero (allowed); every other byte is
   nonzero. Indexed by (unsigned char); sixteen entries per row.
*/
const char dnw[] = {
	/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
	/* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
57 | ||
58 | static size_t bufsize = BUFSIZE; | |
59 | static char *realbuf = NULL; | |
60 | static bool newline = FALSE; | |
61 | static bool errset = FALSE; | |
62 | static bool prerror = FALSE; | |
63 | static wordstates w = NW; | |
64 | static int fd_left, fd_right; | |
65 | ||
/*
   If the previous token was a word or keyword and another word-like token
   starts right here, push the character back and return an implicit '^'
   first. Only usable inside yylex(): it reads the local `c` and returns
   from the enclosing function.

   Wrapped in do { } while (0) so that "checkfreecaret;" is one statement
   and stays correct inside an unbraced if/else.
*/
#define checkfreecaret do { if (w != NW) { w = NW; ugchar(c); return '^'; } } while (0)
67 | ||
/* Sentinel values stored in fd_left / fd_right by getpair(). */
enum filedescriptors {
	UNSET = -9,	/* no descriptor was given */
	CLOSED = -1	/* ">[n=]": the descriptor is to be closed */
};
71 | ||
72 | extern int yylex() { | |
73 | static bool dollar = FALSE; | |
74 | bool saw_meta = FALSE; | |
75 | int c; | |
76 | size_t i; /* The purpose of all these local assignments is to */ | |
77 | const char *meta; /* allow optimizing compilers like gcc to load these */ | |
78 | char *buf = realbuf; /* values into registers. On a sparc this is a */ | |
79 | YYSTYPE *y = &yylval; /* win, in code size *and* execution time */ | |
80 | if (errset) { | |
81 | errset = FALSE; | |
82 | return '\n'; | |
83 | } | |
84 | /* rc variable-names may contain only alnum, '*' and '_', so use dnw if we are scanning one. */ | |
85 | meta = (dollar ? dnw : nw); | |
86 | dollar = FALSE; | |
87 | if (newline) { | |
88 | --lineno; /* slight space optimization; print_prompt2() always increments lineno */ | |
89 | print_prompt2(); | |
90 | newline = FALSE; | |
91 | } | |
92 | top: while ((c = gchar()) == ' ' || c == '\t') | |
93 | w = NW; | |
94 | if (c == EOF) | |
95 | return END; | |
96 | if (!meta[(unsigned char) c]) { /* it's a word or keyword. */ | |
97 | checkfreecaret; | |
98 | w = RW; | |
99 | i = 0; | |
100 | read: do { | |
101 | buf[i++] = c; | |
102 | if (c == '?' || c == '[' || c == '*') | |
103 | saw_meta = TRUE; | |
104 | if (i >= bufsize) | |
105 | buf = realbuf = erealloc(buf, bufsize *= 2); | |
106 | } while ((c = gchar()) != EOF && !meta[(unsigned char) c]); | |
107 | while (c == '\\') { | |
108 | if ((c = gchar()) == '\n') { | |
109 | print_prompt2(); | |
110 | c = ' '; /* Pretend a space was read */ | |
111 | break; | |
112 | } else { | |
113 | bs: if (meta != dnw) { /* all words but varnames may have a bslash */ | |
114 | buf[i++] = '\\'; | |
115 | if (i >= bufsize) | |
116 | buf = realbuf = erealloc(buf, bufsize *= 2); | |
117 | if (!meta[(unsigned char) c]) | |
118 | goto read; | |
119 | } else { | |
120 | ugchar(c); | |
121 | c = '\\'; | |
122 | break; | |
123 | } | |
124 | } | |
125 | } | |
126 | ugchar(c); | |
127 | buf[i] = '\0'; | |
128 | w = KW; | |
129 | if (i == 2) { | |
130 | if (*buf == 'i' && buf[1] == 'f') return IF; | |
131 | if (*buf == 'f' && buf[1] == 'n') return FN; | |
132 | if (*buf == 'i' && buf[1] == 'n') return IN; | |
133 | } | |
134 | if (streq(buf, "for")) return FOR; | |
135 | if (streq(buf, "else")) return ELSE; | |
136 | if (streq(buf, "switch")) return SWITCH; | |
137 | if (streq(buf, "while")) return WHILE; | |
138 | if (streq(buf, "case")) return CASE; | |
139 | w = RW; | |
140 | y->word.w = ncpy(buf); | |
141 | if (saw_meta) { | |
142 | char *r, *s; | |
143 | ||
144 | y->word.m = nalloc(strlen(buf) + 1); | |
145 | for (r = buf, s = y->word.m; *r != '\0'; r++, s++) | |
146 | *s = (*r == '?' || *r == '[' || *r == '*'); | |
147 | } else { | |
148 | y->word.m = NULL; | |
149 | } | |
150 | return WORD; | |
151 | } | |
152 | if (c == '`' || c == '!' || c == '@' || c == '~' || c == '$' || c == '\'') { | |
153 | checkfreecaret; | |
154 | if (c == '!' || c == '@' || c == '~') | |
155 | w = KW; | |
156 | } | |
157 | switch (c) { | |
158 | case '\0': | |
159 | pr_error("warning: null character ignored"); | |
160 | goto top; | |
161 | case '!': | |
162 | return BANG; | |
163 | case '@': | |
164 | return SUBSHELL; | |
165 | case '~': | |
166 | return TWIDDLE; | |
167 | case '`': | |
168 | c = gchar(); | |
169 | if (c == '`') | |
170 | return BACKBACK; | |
171 | ugchar(c); | |
172 | return '`'; | |
173 | case '$': | |
174 | dollar = TRUE; | |
175 | c = gchar(); | |
176 | if (c == '#') | |
177 | return COUNT; | |
178 | if (c == '^') | |
179 | return FLAT; | |
180 | ugchar(c); | |
181 | return '$'; | |
182 | case '\'': | |
183 | w = RW; | |
184 | i = 0; | |
185 | do { | |
186 | buf[i++] = c; | |
187 | if (c == '\n') | |
188 | print_prompt2(); | |
189 | if (c == EOF) { | |
190 | w = NW; | |
191 | scanerror("eof in quoted string"); | |
192 | return HUH; | |
193 | } | |
194 | if (i >= bufsize) | |
195 | buf = realbuf = erealloc(buf, bufsize *= 2); | |
196 | } while ((c = gchar()) != '\'' || (c = gchar()) == '\''); /* quote "'" thus: 'how''s it going?' */ | |
197 | ugchar(c); | |
198 | buf[i] = '\0'; | |
199 | y->word.w = ncpy(buf); | |
200 | y->word.m = NULL; | |
201 | return WORD; | |
202 | case '\\': | |
203 | if ((c = gchar()) == '\n') { | |
204 | print_prompt2(); | |
205 | goto top; /* Pretend it was just another space. */ | |
206 | } | |
207 | ugchar(c); | |
208 | c = '\\'; | |
209 | checkfreecaret; | |
210 | c = gchar(); | |
211 | i = 0; | |
212 | goto bs; | |
213 | case '(': | |
214 | if (w == RW) /* SUB's happen only after real words, not keyowrds, so if () and while () work */ | |
215 | c = SUB; | |
216 | w = NW; | |
217 | return c; | |
218 | case '#': | |
219 | while ((c = gchar()) != '\n') /* skip comment until newline */ | |
220 | if (c == EOF) | |
221 | return END; | |
222 | /* FALLTHROUGH */ | |
223 | case '\n': | |
224 | lineno++; | |
225 | newline = TRUE; | |
226 | /* FALLTHROUGH */ | |
227 | case ';': | |
228 | case '^': | |
229 | case ')': | |
230 | case '=': | |
231 | case '{': case '}': | |
232 | w = NW; | |
233 | return c; | |
234 | case '&': | |
235 | w = NW; | |
236 | c = gchar(); | |
237 | if (c == '&') | |
238 | return ANDAND; | |
239 | ugchar(c); | |
240 | return '&'; | |
241 | case '|': | |
242 | w = NW; | |
243 | c = gchar(); | |
244 | if (c == '|') | |
245 | return OROR; | |
246 | getpair(c); | |
247 | if (errset) | |
248 | return HUH; | |
249 | if ((y->pipe.left = fd_left) == UNSET) | |
250 | y->pipe.left = 1; /* default to fd 1 */ | |
251 | if ((y->pipe.right = fd_right) == UNSET) | |
252 | y->pipe.right = 0; /* default to fd 0 */ | |
253 | if (y->pipe.right == CLOSED) { | |
254 | scanerror("expected digit after '='"); /* can't close a pipe */ | |
255 | return HUH; | |
256 | } | |
257 | return PIPE; | |
258 | case '>': | |
259 | c = gchar(); | |
260 | if (c == '>') { | |
261 | c = gchar(); | |
262 | y->redir.type = rAppend; | |
263 | } else | |
264 | y->redir.type = rCreate; | |
265 | y->redir.fd = 1; | |
266 | goto common; | |
267 | case '<': | |
268 | c = gchar(); | |
269 | if (c == '<') { | |
270 | c = gchar(); | |
271 | if (c == '<') { | |
272 | c = gchar(); | |
273 | y->redir.type = rHerestring; | |
274 | } else { | |
275 | y->redir.type = rHeredoc; | |
276 | } | |
277 | } else | |
278 | y->redir.type = rFrom; | |
279 | y->redir.fd = 0; | |
280 | common: | |
281 | w = NW; | |
282 | getpair(c); | |
283 | if (errset) | |
284 | return HUH; | |
285 | if (fd_right == UNSET) { /* redirection, not dup */ | |
286 | if (fd_left != UNSET) { | |
287 | y->redir.fd = fd_left; | |
288 | return SREDIR; | |
289 | } | |
290 | return (y->redir.type == rFrom || y->redir.type == rCreate) ? REDIR : SREDIR; | |
291 | } else { /* dup; recast yylval */ | |
292 | y->dup.type = y->redir.type; | |
293 | y->dup.left = fd_left; | |
294 | y->dup.right = fd_right; | |
295 | return DUP; | |
296 | } | |
297 | default: | |
298 | w = NW; | |
299 | return c; /* don't know what it is, let yacc barf on it */ | |
300 | } | |
301 | } | |
302 | ||
303 | extern void yyerror(const char *s) { | |
304 | char *tok; | |
305 | if (prerror) { /* don't print "syntax error" if there's a more informative scanerror */ | |
306 | prerror = FALSE; | |
307 | return; | |
308 | } | |
309 | if (!interactive) { | |
310 | if (w != NW) | |
311 | tok = realbuf; | |
312 | else if (last == EOF) | |
313 | tok = "eof"; | |
314 | else if (last == '\n') | |
315 | tok = "end of line"; | |
316 | else | |
317 | tok = nprint((last < 32 || last > 126) ? "(decimal %d)" : "'%c'", last); | |
318 | fprint(2, "line %d: %s near %s\n", lineno - (last == '\n'), s, tok); | |
319 | } else | |
320 | fprint(2, "%s\n", s); | |
321 | } | |
322 | ||
323 | extern void scanerror(char *s) { | |
324 | flushu(); /* flush upto newline */ | |
325 | yyerror(s); | |
326 | errset = prerror = TRUE; | |
327 | } | |
328 | ||
329 | extern void inityy() { | |
330 | newline = FALSE; | |
331 | w = NW; | |
332 | hq = NULL; | |
333 | /* return memory to the system if the buffer got too large */ | |
334 | if (bufsize > BUFMAX && realbuf != NULL) { | |
335 | efree(realbuf); | |
336 | bufsize = BUFSIZE; | |
337 | realbuf = ealloc(bufsize); | |
338 | } else if (realbuf == NULL) | |
339 | realbuf = ealloc(bufsize); | |
340 | } | |
341 | ||
342 | extern void print_prompt2() { | |
343 | lineno++; | |
344 | if (interactive) | |
345 | fprint(2, "%s", prompt2); | |
346 | } | |
347 | ||
348 | /* | |
349 | Scan in a pair of integers for redirections like >[2=1]. CLOSED represents a closed file | |
350 | descriptor (i.e., >[2=]) and UNSET represents an undesignated file descriptor (e.g., | |
351 | >[2] is represented as (2,UNSET). | |
352 | ||
353 | This function makes use of unsigned compares to make range tests in one compare operation. | |
354 | */ | |
355 | ||
356 | static void getpair(int c) { | |
357 | int n; | |
358 | fd_left = fd_right = UNSET; | |
359 | if (c != '[') { | |
360 | ugchar(c); | |
361 | return; | |
362 | } | |
363 | if ((unsigned int) (n = gchar() - '0') > 9) { | |
364 | scanerror("expected digit after '['"); | |
365 | return; | |
366 | } | |
367 | while ((unsigned int) (c = gchar() - '0') <= 9) | |
368 | n = n * 10 + c; | |
369 | fd_left = n; | |
370 | c += '0'; | |
371 | switch (c) { | |
372 | default: | |
373 | scanerror("expected '=' or ']' after digit"); | |
374 | return; | |
375 | case ']': | |
376 | return; | |
377 | case '=': | |
378 | if ((unsigned int) (n = gchar() - '0') > 9) { | |
379 | if (n != ']' - '0') { | |
380 | scanerror("expected digit or ']' after '='"); | |
381 | return; | |
382 | } | |
383 | fd_right = CLOSED; | |
384 | } else { | |
385 | while ((unsigned int) (c = gchar() - '0') <= 9) | |
386 | n = n * 10 + c; | |
387 | if (c != ']' - '0') { | |
388 | scanerror("expected ']' after digit"); | |
389 | return; | |
390 | } | |
391 | fd_right = n; | |
392 | } | |
393 | } | |
394 | } |