Commit | Line | Data |
---|---|---|
c0bc4ef7 DF |
1 | /* |
2 | * Copyright (c) 1980 Regents of the University of California. | |
3 | * All rights reserved. The Berkeley software License Agreement | |
4 | * specifies the terms and conditions for redistribution. | |
5 | */ | |
6 | ||
/* SCCS identification string; left out when the file is compiled with "lint" defined. */
#ifndef lint
static char sccsid[] = "@(#)lexi.c 5.5 (Berkeley) %G%";
#endif /* not lint */
4b365fcd | 10 | |
1009bf5e KM |
11 | /*- |
12 | * | |
13 | * Copyright (C) 1976 | |
14 | * by the | |
15 | * Board of Trustees | |
16 | * of the | |
17 | * University of Illinois | |
18 | * | |
19 | * All rights reserved | |
20 | * | |
21 | * | |
22 | * NAME: | |
23 | * lexi | |
24 | * | |
25 | * FUNCTION: | |
26 | * This is the token scanner for indent | |
27 | * | |
28 | * ALGORITHM: | |
29 | * 1) Strip off intervening blanks and/or tabs. | |
30 | * 2) If it is an alphanumeric token, move it to the token buffer "token". | |
31 | * Check if it is a special reserved word that indent will want to | |
32 | * know about. | |
33 | * 3) Non-alphanumeric tokens are handled with a big switch statement. A | |
34 | * flag is kept to remember if the last token was a "unary delimiter", | |
35 | * which forces a following operator to be unary as opposed to binary. | |
36 | * | |
37 | * PARAMETERS: | |
38 | * None | |
39 | * | |
40 | * RETURNS: | |
41 | * An integer code indicating the type of token scanned. | |
42 | * | |
43 | * GLOBALS: | |
44 | * buf_ptr = | |
45 | * had_eof | |
46 | * ps.last_u_d = Set to true iff this token is a "unary delimiter" | |
47 | * | |
48 | * CALLS: | |
49 | * fill_buffer | |
50 | * printf (lib) | |
51 | * | |
52 | * CALLED BY: | |
53 | * main | |
54 | * | |
55 | * NOTES: | |
56 | * Start of comment is passed back so that the comment can be scanned by | |
57 | * pr_comment. | |
58 | * | |
59 | * Strings and character literals are returned just like identifiers. | |
60 | * | |
61 | * HISTORY: | |
62 | * initial coding November 1976 D A Willcox of CAC | |
63 | * 1/7/77 D A Willcox of CAC Fix to provide proper handling | |
64 | * of "int a -1;" | |
65 | * | |
66 | */\f | |
4b365fcd | 67 | |
1009bf5e KM |
68 | /* |
69 | * Here we have the token scanner for indent. It scans off one token and | |
70 | * puts it in the global variable "token". It returns a code, indicating | |
71 | * the type of token scanned. | |
72 | */ | |
4b365fcd | 73 | |
1d7a34f4 KB |
74 | #include "indent_globs.h" |
75 | #include "indent_codes.h" | |
1009bf5e | 76 | #include "ctype.h" |
4b365fcd KM |
77 | |
78 | #define alphanum 1 | |
79 | #define opchar 3 | |
80 | ||
/*
 * One entry of the reserved-word table searched by lexi().
 */
struct templ {
    char       *rwd;		/* spelling of the word; a null pointer ends
				 * the table */
    int         rwcode;		/* class code that lexi() switches on */
};

/*
 * Reserved words known to the scanner.  The class codes are interpreted by
 * the switch in lexi():
 *	0  no special treatment (returned as an ordinary identifier)
 *	1  switch
 *	2  case / default
 *	3  struct, union, enum
 *	4  declaration keywords
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 * The table has room for 100 entries including the terminator; addkey()
 * appends further words after the built-in ones.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never
				 * matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}			/* terminator */
};
118 | ||
1009bf5e KM |
/*
 * Character classification table, indexed by a 7-bit ASCII code.
 *	0  neither of the classes below
 *	1  may appear in an identifier or number (letters, digits, '_')
 *	3  operator character
 * Each row below covers sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
140 | ||
1009bf5e KM |
141 | |
142 | ||
143 | ||
144 | int | |
145 | lexi() | |
146 | { | |
147 | register char *tok; /* local pointer to next char in token */ | |
148 | int unary_delim; /* this is set to 1 if the current token | |
149 | * | |
150 | * forces a following operator to be unary */ | |
151 | static int last_code; /* the last token type returned */ | |
152 | static int l_struct; /* set to 1 if the last token was 'struct' */ | |
153 | int code; /* internal code to be returned */ | |
154 | char qchar; /* the delimiter character for a string */ | |
155 | ||
156 | tok = token; /* point to start of place to save token */ | |
4b365fcd | 157 | unary_delim = false; |
1009bf5e KM |
158 | ps.col_1 = ps.last_nl; /* tell world that this token started in |
159 | * column 1 iff the last thing scanned was | |
160 | * nl */ | |
161 | ps.last_nl = false; | |
162 | ||
163 | while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ | |
164 | ps.col_1 = false; /* leading blanks imply token is not in | |
165 | * column 1 */ | |
4b365fcd | 166 | if (++buf_ptr >= buf_end) |
1009bf5e | 167 | fill_buffer(); |
4b365fcd KM |
168 | } |
169 | ||
c93d6f87 KM |
170 | /* Scan an alphanumeric token. Note that we must also handle |
171 | * stuff like "1.0e+03" and "7e-6". */ | |
1009bf5e KM |
172 | if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character |
173 | * or number */ | |
174 | register char *j; /* used for searching thru list of | |
1009bf5e KM |
175 | * reserved words */ |
176 | register struct templ *p; | |
c93d6f87 | 177 | register int c; |
4b365fcd | 178 | |
c93d6f87 | 179 | do { /* copy it over */ |
4b365fcd KM |
180 | *tok++ = *buf_ptr++; |
181 | if (buf_ptr >= buf_end) | |
1009bf5e | 182 | fill_buffer(); |
c93d6f87 KM |
183 | } while (chartype[c = *buf_ptr & 0177] == alphanum || |
184 | isdigit(token[0]) && (c == '+' || c == '-') && | |
185 | (tok[-1] == 'e' || tok[-1] == 'E')); | |
4b365fcd | 186 | *tok++ = '\0'; |
1009bf5e KM |
187 | while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ |
188 | if (++buf_ptr >= buf_end) | |
189 | fill_buffer(); | |
190 | } | |
191 | ps.its_a_keyword = false; | |
192 | ps.sizeof_keyword = false; | |
193 | if (l_struct) { /* if last token was 'struct', then this | |
194 | * token should be treated as a | |
195 | * declaration */ | |
4b365fcd KM |
196 | l_struct = false; |
197 | last_code = ident; | |
1009bf5e | 198 | ps.last_u_d = true; |
4b365fcd KM |
199 | return (decl); |
200 | } | |
1009bf5e KM |
201 | ps.last_u_d = false; /* Operator after indentifier is binary */ |
202 | last_code = ident; /* Remember that this is the code we will | |
203 | * return */ | |
204 | ||
205 | /* | |
206 | * This loop will check if the token is a keyword. | |
207 | */ | |
208 | for (p = specials; (j = p->rwd) != 0; p++) { | |
209 | tok = token; /* point at scanned token */ | |
210 | if (*j++ != *tok++ || *j++ != *tok++) | |
211 | continue; /* This test depends on the fact that | |
212 | * identifiers are always at least 1 | |
213 | * character long (ie. the first two bytes | |
214 | * of the identifier are always | |
215 | * meaningful) */ | |
216 | if (tok[-1] == 0) | |
217 | break; /* If its a one-character identifier */ | |
218 | while (*tok++ == *j) | |
219 | if (*j++ == 0) | |
220 | goto found_keyword; /* I wish that C had a multi-level | |
221 | * break... */ | |
222 | } | |
223 | if (p->rwd) { /* we have a keyword */ | |
224 | found_keyword: | |
225 | ps.its_a_keyword = true; | |
226 | ps.last_u_d = true; | |
227 | switch (p->rwcode) { | |
228 | case 1: /* it is a switch */ | |
229 | return (swstmt); | |
230 | case 2: /* a case or default */ | |
231 | return (casestmt); | |
232 | ||
233 | case 3: /* a "struct" */ | |
234 | if (ps.p_l_follow) | |
235 | break; /* inside parens: cast */ | |
236 | l_struct = true; | |
237 | ||
238 | /* | |
239 | * Next time around, we will want to know that we have | |
240 | * had a 'struct' | |
241 | */ | |
242 | case 4: /* one of the declaration keywords */ | |
243 | if (ps.p_l_follow) { | |
244 | ps.cast_mask |= 1 << ps.p_l_follow; | |
245 | break; /* inside parens: cast */ | |
246 | } | |
247 | last_code = decl; | |
248 | return (decl); | |
249 | ||
250 | case 5: /* if, while, for */ | |
251 | return (sp_paren); | |
252 | ||
253 | case 6: /* do, else */ | |
254 | return (sp_nparen); | |
255 | ||
256 | case 7: | |
257 | ps.sizeof_keyword = true; | |
258 | default: /* all others are treated like any other | |
259 | * identifier */ | |
260 | return (ident); | |
261 | } /* end of switch */ | |
262 | } /* end of if (found_it) */ | |
263 | if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 | |
264 | && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) { | |
265 | strncpy(ps.procname, token, sizeof ps.procname - 1); | |
266 | ps.in_parameter_declaration = 1; | |
4b365fcd KM |
267 | } |
268 | ||
1009bf5e KM |
269 | /* |
270 | * The following hack attempts to guess whether or not the current | |
271 | * token is in fact a declaration keyword -- one that has been | |
272 | * typedefd | |
273 | */ | |
274 | if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr)) | |
275 | && !ps.p_l_follow | |
276 | && (ps.last_token == rparen || ps.last_token == semicolon || | |
277 | ps.last_token == decl || | |
278 | ps.last_token == lbrace || ps.last_token == rbrace)) { | |
279 | ps.its_a_keyword = true; | |
280 | ps.last_u_d = true; | |
281 | last_code = decl; | |
282 | return decl; | |
283 | } | |
284 | if (last_code == decl) /* if this is a declared variable, then | |
285 | * following sign is unary */ | |
286 | ps.last_u_d = true; /* will make "int a -1" work */ | |
4b365fcd | 287 | last_code = ident; |
1009bf5e KM |
288 | return (ident); /* the ident is not in the list */ |
289 | } /* end of procesing for alpanum character */ | |
c93d6f87 | 290 | /* Scan a non-alphanumeric token */ |
4b365fcd | 291 | |
1009bf5e KM |
292 | *tok++ = *buf_ptr; /* if it is only a one-character token, it |
293 | * is moved here */ | |
4b365fcd KM |
294 | *tok = '\0'; |
295 | if (++buf_ptr >= buf_end) | |
1009bf5e | 296 | fill_buffer(); |
4b365fcd KM |
297 | |
298 | switch (*token) { | |
1009bf5e KM |
299 | case '\n': |
300 | unary_delim = ps.last_u_d; | |
301 | ps.last_nl = true; /* remember that we just had a newline */ | |
4b365fcd | 302 | code = (had_eof ? 0 : newline); |
4b365fcd | 303 | |
1009bf5e KM |
304 | /* |
305 | * if data has been exausted, the newline is a dummy, and we | |
306 | * should return code to stop | |
307 | */ | |
308 | break; | |
4b365fcd | 309 | |
1009bf5e KM |
310 | case '\'': /* start of quoted character */ |
311 | case '"': /* start of string */ | |
312 | qchar = *token; | |
313 | if (troff) { | |
314 | tok[-1] = '`'; | |
315 | if (qchar == '"') | |
316 | *tok++ = '`'; | |
317 | *tok++ = BACKSLASH; | |
318 | *tok++ = 'f'; | |
319 | *tok++ = 'L'; | |
320 | } | |
321 | do { /* copy the string */ | |
322 | while (1) { /* move one character or [/<char>]<char> */ | |
4b365fcd | 323 | if (*buf_ptr == '\n') { |
1009bf5e | 324 | printf("%d: Unterminated literal\n", line_no); |
4b365fcd | 325 | goto stop_lit; |
4b365fcd | 326 | } |
4b365fcd KM |
327 | *tok = *buf_ptr++; |
328 | if (buf_ptr >= buf_end) | |
1009bf5e | 329 | fill_buffer(); |
4b365fcd | 330 | if (had_eof || ((tok - token) > (bufsize - 2))) { |
1009bf5e | 331 | printf("Unterminated literal\n"); |
4b365fcd KM |
332 | ++tok; |
333 | goto stop_lit; | |
1009bf5e | 334 | /* get outof literal copying loop */ |
4b365fcd | 335 | } |
1009bf5e KM |
336 | if (*tok == BACKSLASH) { /* if escape, copy extra |
337 | * char */ | |
338 | if (*buf_ptr == '\n') /* check for escaped | |
339 | * newline */ | |
4b365fcd | 340 | ++line_no; |
1009bf5e KM |
341 | if (troff) { |
342 | *++tok = BACKSLASH; | |
343 | if (*buf_ptr == BACKSLASH) | |
344 | *++tok = BACKSLASH; | |
345 | } | |
346 | *++tok = *buf_ptr++; | |
347 | ++tok; /* we must increment this again because we | |
348 | * copied two chars */ | |
4b365fcd | 349 | if (buf_ptr >= buf_end) |
1009bf5e | 350 | fill_buffer(); |
4b365fcd KM |
351 | } |
352 | else | |
1009bf5e KM |
353 | break; /* we copied one character */ |
354 | } /* end of while (1) */ | |
4b365fcd | 355 | } while (*tok++ != qchar); |
1009bf5e KM |
356 | if (troff) { |
357 | tok[-1] = BACKSLASH; | |
358 | *tok++ = 'f'; | |
359 | *tok++ = 'R'; | |
360 | *tok++ = '\''; | |
361 | if (qchar == '"') | |
362 | *tok++ = '\''; | |
363 | } | |
364 | stop_lit: | |
4b365fcd KM |
365 | code = ident; |
366 | break; | |
367 | ||
1009bf5e KM |
368 | case ('('): |
369 | case ('['): | |
4b365fcd KM |
370 | unary_delim = true; |
371 | code = lparen; | |
372 | break; | |
373 | ||
1009bf5e KM |
374 | case (')'): |
375 | case (']'): | |
4b365fcd KM |
376 | code = rparen; |
377 | break; | |
378 | ||
1009bf5e KM |
379 | case '#': |
380 | unary_delim = ps.last_u_d; | |
4b365fcd KM |
381 | code = preesc; |
382 | break; | |
383 | ||
1009bf5e | 384 | case '?': |
4b365fcd KM |
385 | unary_delim = true; |
386 | code = question; | |
387 | break; | |
388 | ||
1009bf5e | 389 | case (':'): |
4b365fcd KM |
390 | code = colon; |
391 | unary_delim = true; | |
392 | break; | |
393 | ||
1009bf5e | 394 | case (';'): |
4b365fcd KM |
395 | unary_delim = true; |
396 | code = semicolon; | |
397 | break; | |
398 | ||
1009bf5e | 399 | case ('{'): |
4b365fcd | 400 | unary_delim = true; |
1009bf5e KM |
401 | |
402 | /* | |
403 | * if (ps.in_or_st) ps.block_init = 1; | |
404 | */ | |
405 | code = ps.block_init ? lparen : lbrace; | |
4b365fcd KM |
406 | break; |
407 | ||
1009bf5e | 408 | case ('}'): |
4b365fcd | 409 | unary_delim = true; |
1009bf5e | 410 | code = ps.block_init ? rparen : rbrace; |
4b365fcd KM |
411 | break; |
412 | ||
1009bf5e KM |
413 | case 014: /* a form feed */ |
414 | unary_delim = ps.last_u_d; | |
415 | ps.last_nl = true; /* remember this so we can set 'ps.col_1' | |
416 | * right */ | |
4b365fcd KM |
417 | code = form_feed; |
418 | break; | |
419 | ||
1009bf5e | 420 | case (','): |
4b365fcd KM |
421 | unary_delim = true; |
422 | code = comma; | |
423 | break; | |
424 | ||
1009bf5e | 425 | case '.': |
4b365fcd KM |
426 | unary_delim = false; |
427 | code = period; | |
428 | break; | |
429 | ||
1009bf5e KM |
430 | case '-': |
431 | case '+': /* check for -, +, --, ++ */ | |
432 | code = (ps.last_u_d ? unary_op : binary_op); | |
4b365fcd KM |
433 | unary_delim = true; |
434 | ||
435 | if (*buf_ptr == token[0]) { | |
1009bf5e | 436 | /* check for doubled character */ |
4b365fcd | 437 | *tok++ = *buf_ptr++; |
1009bf5e | 438 | /* buffer overflow will be checked at end of loop */ |
4b365fcd | 439 | if (last_code == ident || last_code == rparen) { |
1009bf5e KM |
440 | code = (ps.last_u_d ? unary_op : postop); |
441 | /* check for following ++ or -- */ | |
4b365fcd KM |
442 | unary_delim = false; |
443 | } | |
444 | } | |
1009bf5e KM |
445 | else if (*buf_ptr == '=') |
446 | /* check for operator += */ | |
447 | *tok++ = *buf_ptr++; | |
5c6e73ac | 448 | else if (token[0] == '-' && *buf_ptr == '>') { |
1009bf5e KM |
449 | /* check for operator -> */ |
450 | *tok++ = *buf_ptr++; | |
5c6e73ac KM |
451 | if (!pointer_as_binop) { |
452 | code = unary_op; | |
453 | unary_delim = false; | |
454 | ps.want_blank = false; | |
455 | } | |
1009bf5e KM |
456 | } |
457 | /* buffer overflow will be checked at end of switch */ | |
4b365fcd KM |
458 | |
459 | break; | |
460 | ||
1009bf5e KM |
461 | case '=': |
462 | if (ps.in_or_st) | |
463 | ps.block_init = 1; | |
464 | if (chartype[*buf_ptr] == opchar) { /* we have two char | |
465 | * assignment */ | |
466 | tok[-1] = *buf_ptr++; | |
467 | if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr) | |
468 | *tok++ = *buf_ptr++; | |
469 | *tok++ = '='; /* Flip =+ to += */ | |
470 | *tok = 0; | |
4b365fcd | 471 | } |
4b365fcd KM |
472 | code = binary_op; |
473 | unary_delim = true; | |
1009bf5e KM |
474 | break; |
475 | /* can drop thru!!! */ | |
4b365fcd | 476 | |
1009bf5e KM |
477 | case '>': |
478 | case '<': | |
479 | case '!': /* ops like <, <<, <=, !=, etc */ | |
4b365fcd KM |
480 | if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { |
481 | *tok++ = *buf_ptr; | |
482 | if (++buf_ptr >= buf_end) | |
1009bf5e | 483 | fill_buffer(); |
4b365fcd | 484 | } |
4b365fcd | 485 | if (*buf_ptr == '=') |
1009bf5e KM |
486 | *tok++ = *buf_ptr++; |
487 | code = (ps.last_u_d ? unary_op : binary_op); | |
4b365fcd KM |
488 | unary_delim = true; |
489 | break; | |
490 | ||
1009bf5e | 491 | default: |
4b365fcd | 492 | if (token[0] == '/' && *buf_ptr == '*') { |
1009bf5e | 493 | /* it is start of comment */ |
4b365fcd KM |
494 | *tok++ = '*'; |
495 | ||
496 | if (++buf_ptr >= buf_end) | |
1009bf5e | 497 | fill_buffer(); |
4b365fcd KM |
498 | |
499 | code = comment; | |
1009bf5e | 500 | unary_delim = ps.last_u_d; |
4b365fcd KM |
501 | break; |
502 | } | |
1009bf5e KM |
503 | while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') { |
504 | /* handle ||, &&, etc, and also things as in int *****i */ | |
4b365fcd KM |
505 | *tok++ = *buf_ptr; |
506 | if (++buf_ptr >= buf_end) | |
1009bf5e | 507 | fill_buffer(); |
4b365fcd | 508 | } |
1009bf5e | 509 | code = (ps.last_u_d ? unary_op : binary_op); |
4b365fcd KM |
510 | unary_delim = true; |
511 | ||
512 | ||
1009bf5e | 513 | } /* end of switch */ |
4b365fcd KM |
514 | if (code != newline) { |
515 | l_struct = false; | |
516 | last_code = code; | |
517 | } | |
1009bf5e KM |
518 | if (buf_ptr >= buf_end) /* check for input buffer empty */ |
519 | fill_buffer(); | |
520 | ps.last_u_d = unary_delim; | |
521 | *tok = '\0'; /* null terminate the token */ | |
4b365fcd KM |
522 | return (code); |
523 | }; | |
1009bf5e KM |
524 | |
525 | /* Add the given keyword to the keyword table, using val as the keyword type | |
526 | */ | |
527 | addkey (key, val) | |
528 | char *key; | |
529 | { | |
530 | register struct templ *p = specials; | |
531 | while (p->rwd) | |
532 | if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) | |
533 | return; | |
534 | else | |
535 | p++; | |
536 | if (p >= specials + sizeof specials / sizeof specials[0]) | |
537 | return; /* For now, table overflows are silently | |
538 | ignored */ | |
539 | p->rwd = key; | |
540 | p->rwcode = val; | |
541 | p[1].rwd = 0; | |
542 | p[1].rwcode = 0; | |
543 | return; | |
544 | } |