include ultrix -> vaxuba
[unix-history] / usr / src / usr.bin / indent / lexi.c
CommitLineData
c0bc4ef7
DF
1/*
2 * Copyright (c) 1980 Regents of the University of California.
b0627149
KB
3 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms are permitted
7 * provided that this notice is preserved and that due credit is given
8 * to the University of California at Berkeley and the University of
9 * Illinois at Urbana. The name of either University may not be used
10 * to endorse or promote products derived from this software without
11 * specific prior written permission. This software is provided
12 * ``as is'' without express or implied warranty.
c0bc4ef7
DF
13 */
14
15#ifndef lint
720fc992 16static char sccsid[] = "@(#)lexi.c 5.7 (Berkeley) %G%";
b0627149 17#endif /* not lint */
4b365fcd 18
/*
 * NAME:
 *	lexi
 *
 * FUNCTION:
 *	This is the token scanner for indent.
 *
 * ALGORITHM:
 *	1) Strip off intervening blanks and/or tabs.
 *	2) If it is an alphanumeric token, move it to the token buffer "token".
 *	   Check if it is a special reserved word that indent will want to
 *	   know about.
 *	3) Non-alphanumeric tokens are handled with a big switch statement.  A
 *	   flag is kept to remember if the last token was a "unary delimiter",
 *	   which forces a following operator to be unary as opposed to binary.
 *
 * PARAMETERS:
 *	None
 *
 * RETURNS:
 *	An integer code indicating the type of token scanned.
 *
 * GLOBALS:
 *	buf_ptr      = Pointer to the next unscanned input character; it is
 *		       advanced past the token that was scanned
 *	had_eof      = Read to detect that the input has been exhausted
 *	ps.last_u_d  = Set to true iff this token is a "unary delimiter"
 *
 * CALLS:
 *	fill_buffer
 *	printf (lib)
 *
 * CALLED BY:
 *	main
 *
 * NOTES:
 *	Start of comment is passed back so that the comment can be scanned by
 *	pr_comment.
 *
 *	Strings and character literals are returned just like identifiers.
 *
 * HISTORY:
 *	initial coding	November 1976	D A Willcox of CAC
 *	1/7/77		D A Willcox of CAC	Fix to provide proper handling
 *						of "int a -1;"
 *
 */
4b365fcd 65
1009bf5e
KM
/*
 * Here we have the token scanner for indent.  It scans off one token and
 * puts it in the global variable "token".  It returns a code indicating
 * the type of token scanned.
 */
4b365fcd 71
1d7a34f4
KB
72#include "indent_globs.h"
73#include "indent_codes.h"
1009bf5e 74#include "ctype.h"
4b365fcd
KM
75
/* Character classes stored in chartype[] */
#define alphanum 1		/* may appear in an identifier or number */
#define opchar 3		/* may appear in an operator */

/* One entry of the reserved-word table */
struct templ {
    char       *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class of the word; selects the action taken
				 * by lexi() when the word is scanned */
};

/*
 * Table of reserved words that lexi() treats specially.  The table is
 * terminated by an entry whose rwd is 0; the unused slots after the
 * terminator are available to addkey().
 *
 * rwcode values, as interpreted by the switch in lexi():
 *	0  no special handling, returned as an ordinary identifier
 *	1  "switch"
 *	2  "case" and "default"
 *	3  struct-like keywords (the following name is part of a declaration)
 *	4  declaration keywords
 *	5  keywords followed by a parenthesized expression (if, while, for)
 *	6  keywords not followed by a parenthesized expression (do, else)
 *	7  "sizeof"
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword was never recognized */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}			/* end-of-table marker */
};
116
1009bf5e
KM
/*
 * Character classification table, indexed by a 7-bit character code.
 * An entry is 1 for characters that may appear in an identifier or number
 * (letters, digits, '_' and '$'), 3 for characters that may appear in an
 * operator, and 0 for everything else.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 060-077: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: at sign, upper case A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: upper case P-Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: grave accent, lower case a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: lower case p-z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
138
1009bf5e
KM
139
140
141
142int
143lexi()
144{
145 register char *tok; /* local pointer to next char in token */
146 int unary_delim; /* this is set to 1 if the current token
147 *
148 * forces a following operator to be unary */
149 static int last_code; /* the last token type returned */
150 static int l_struct; /* set to 1 if the last token was 'struct' */
151 int code; /* internal code to be returned */
152 char qchar; /* the delimiter character for a string */
153
154 tok = token; /* point to start of place to save token */
4b365fcd 155 unary_delim = false;
1009bf5e
KM
156 ps.col_1 = ps.last_nl; /* tell world that this token started in
157 * column 1 iff the last thing scanned was
158 * nl */
159 ps.last_nl = false;
160
161 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
162 ps.col_1 = false; /* leading blanks imply token is not in
163 * column 1 */
4b365fcd 164 if (++buf_ptr >= buf_end)
1009bf5e 165 fill_buffer();
4b365fcd
KM
166 }
167
c93d6f87
KM
168 /* Scan an alphanumeric token. Note that we must also handle
169 * stuff like "1.0e+03" and "7e-6". */
1009bf5e
KM
170 if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character
171 * or number */
172 register char *j; /* used for searching thru list of
1009bf5e
KM
173 * reserved words */
174 register struct templ *p;
c93d6f87 175 register int c;
4b365fcd 176
c93d6f87 177 do { /* copy it over */
4b365fcd
KM
178 *tok++ = *buf_ptr++;
179 if (buf_ptr >= buf_end)
1009bf5e 180 fill_buffer();
c93d6f87
KM
181 } while (chartype[c = *buf_ptr & 0177] == alphanum ||
182 isdigit(token[0]) && (c == '+' || c == '-') &&
183 (tok[-1] == 'e' || tok[-1] == 'E'));
4b365fcd 184 *tok++ = '\0';
1009bf5e
KM
185 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
186 if (++buf_ptr >= buf_end)
187 fill_buffer();
188 }
189 ps.its_a_keyword = false;
190 ps.sizeof_keyword = false;
191 if (l_struct) { /* if last token was 'struct', then this
192 * token should be treated as a
193 * declaration */
4b365fcd
KM
194 l_struct = false;
195 last_code = ident;
1009bf5e 196 ps.last_u_d = true;
4b365fcd
KM
197 return (decl);
198 }
1009bf5e
KM
199 ps.last_u_d = false; /* Operator after indentifier is binary */
200 last_code = ident; /* Remember that this is the code we will
201 * return */
202
203 /*
204 * This loop will check if the token is a keyword.
205 */
206 for (p = specials; (j = p->rwd) != 0; p++) {
207 tok = token; /* point at scanned token */
208 if (*j++ != *tok++ || *j++ != *tok++)
209 continue; /* This test depends on the fact that
210 * identifiers are always at least 1
211 * character long (ie. the first two bytes
212 * of the identifier are always
213 * meaningful) */
214 if (tok[-1] == 0)
215 break; /* If its a one-character identifier */
216 while (*tok++ == *j)
217 if (*j++ == 0)
218 goto found_keyword; /* I wish that C had a multi-level
219 * break... */
220 }
221 if (p->rwd) { /* we have a keyword */
222 found_keyword:
223 ps.its_a_keyword = true;
224 ps.last_u_d = true;
225 switch (p->rwcode) {
226 case 1: /* it is a switch */
227 return (swstmt);
228 case 2: /* a case or default */
229 return (casestmt);
230
231 case 3: /* a "struct" */
232 if (ps.p_l_follow)
233 break; /* inside parens: cast */
234 l_struct = true;
235
236 /*
237 * Next time around, we will want to know that we have
238 * had a 'struct'
239 */
240 case 4: /* one of the declaration keywords */
241 if (ps.p_l_follow) {
242 ps.cast_mask |= 1 << ps.p_l_follow;
243 break; /* inside parens: cast */
244 }
245 last_code = decl;
246 return (decl);
247
248 case 5: /* if, while, for */
249 return (sp_paren);
250
251 case 6: /* do, else */
252 return (sp_nparen);
253
254 case 7:
255 ps.sizeof_keyword = true;
256 default: /* all others are treated like any other
257 * identifier */
258 return (ident);
259 } /* end of switch */
260 } /* end of if (found_it) */
261 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0
262 && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) {
263 strncpy(ps.procname, token, sizeof ps.procname - 1);
264 ps.in_parameter_declaration = 1;
4b365fcd
KM
265 }
266
1009bf5e
KM
267 /*
268 * The following hack attempts to guess whether or not the current
269 * token is in fact a declaration keyword -- one that has been
270 * typedefd
271 */
272 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr))
273 && !ps.p_l_follow
274 && (ps.last_token == rparen || ps.last_token == semicolon ||
275 ps.last_token == decl ||
276 ps.last_token == lbrace || ps.last_token == rbrace)) {
277 ps.its_a_keyword = true;
278 ps.last_u_d = true;
279 last_code = decl;
280 return decl;
281 }
282 if (last_code == decl) /* if this is a declared variable, then
283 * following sign is unary */
284 ps.last_u_d = true; /* will make "int a -1" work */
4b365fcd 285 last_code = ident;
1009bf5e
KM
286 return (ident); /* the ident is not in the list */
287 } /* end of procesing for alpanum character */
c93d6f87 288 /* Scan a non-alphanumeric token */
4b365fcd 289
1009bf5e
KM
290 *tok++ = *buf_ptr; /* if it is only a one-character token, it
291 * is moved here */
4b365fcd
KM
292 *tok = '\0';
293 if (++buf_ptr >= buf_end)
1009bf5e 294 fill_buffer();
4b365fcd
KM
295
296 switch (*token) {
1009bf5e
KM
297 case '\n':
298 unary_delim = ps.last_u_d;
299 ps.last_nl = true; /* remember that we just had a newline */
4b365fcd 300 code = (had_eof ? 0 : newline);
4b365fcd 301
1009bf5e
KM
302 /*
303 * if data has been exausted, the newline is a dummy, and we
304 * should return code to stop
305 */
306 break;
4b365fcd 307
1009bf5e
KM
308 case '\'': /* start of quoted character */
309 case '"': /* start of string */
310 qchar = *token;
311 if (troff) {
312 tok[-1] = '`';
313 if (qchar == '"')
314 *tok++ = '`';
315 *tok++ = BACKSLASH;
316 *tok++ = 'f';
317 *tok++ = 'L';
318 }
319 do { /* copy the string */
320 while (1) { /* move one character or [/<char>]<char> */
4b365fcd 321 if (*buf_ptr == '\n') {
1009bf5e 322 printf("%d: Unterminated literal\n", line_no);
4b365fcd 323 goto stop_lit;
4b365fcd 324 }
4b365fcd
KM
325 *tok = *buf_ptr++;
326 if (buf_ptr >= buf_end)
1009bf5e 327 fill_buffer();
4b365fcd 328 if (had_eof || ((tok - token) > (bufsize - 2))) {
1009bf5e 329 printf("Unterminated literal\n");
4b365fcd
KM
330 ++tok;
331 goto stop_lit;
1009bf5e 332 /* get outof literal copying loop */
4b365fcd 333 }
1009bf5e
KM
334 if (*tok == BACKSLASH) { /* if escape, copy extra
335 * char */
336 if (*buf_ptr == '\n') /* check for escaped
337 * newline */
4b365fcd 338 ++line_no;
1009bf5e
KM
339 if (troff) {
340 *++tok = BACKSLASH;
341 if (*buf_ptr == BACKSLASH)
342 *++tok = BACKSLASH;
343 }
344 *++tok = *buf_ptr++;
345 ++tok; /* we must increment this again because we
346 * copied two chars */
4b365fcd 347 if (buf_ptr >= buf_end)
1009bf5e 348 fill_buffer();
4b365fcd
KM
349 }
350 else
1009bf5e
KM
351 break; /* we copied one character */
352 } /* end of while (1) */
4b365fcd 353 } while (*tok++ != qchar);
1009bf5e
KM
354 if (troff) {
355 tok[-1] = BACKSLASH;
356 *tok++ = 'f';
357 *tok++ = 'R';
358 *tok++ = '\'';
359 if (qchar == '"')
360 *tok++ = '\'';
361 }
362 stop_lit:
4b365fcd
KM
363 code = ident;
364 break;
365
1009bf5e
KM
366 case ('('):
367 case ('['):
4b365fcd
KM
368 unary_delim = true;
369 code = lparen;
370 break;
371
1009bf5e
KM
372 case (')'):
373 case (']'):
4b365fcd
KM
374 code = rparen;
375 break;
376
1009bf5e
KM
377 case '#':
378 unary_delim = ps.last_u_d;
4b365fcd
KM
379 code = preesc;
380 break;
381
1009bf5e 382 case '?':
4b365fcd
KM
383 unary_delim = true;
384 code = question;
385 break;
386
1009bf5e 387 case (':'):
4b365fcd
KM
388 code = colon;
389 unary_delim = true;
390 break;
391
1009bf5e 392 case (';'):
4b365fcd
KM
393 unary_delim = true;
394 code = semicolon;
395 break;
396
1009bf5e 397 case ('{'):
4b365fcd 398 unary_delim = true;
1009bf5e
KM
399
400 /*
401 * if (ps.in_or_st) ps.block_init = 1;
402 */
403 code = ps.block_init ? lparen : lbrace;
4b365fcd
KM
404 break;
405
1009bf5e 406 case ('}'):
4b365fcd 407 unary_delim = true;
1009bf5e 408 code = ps.block_init ? rparen : rbrace;
4b365fcd
KM
409 break;
410
1009bf5e
KM
411 case 014: /* a form feed */
412 unary_delim = ps.last_u_d;
413 ps.last_nl = true; /* remember this so we can set 'ps.col_1'
414 * right */
4b365fcd
KM
415 code = form_feed;
416 break;
417
1009bf5e 418 case (','):
4b365fcd
KM
419 unary_delim = true;
420 code = comma;
421 break;
422
1009bf5e 423 case '.':
4b365fcd
KM
424 unary_delim = false;
425 code = period;
426 break;
427
1009bf5e
KM
428 case '-':
429 case '+': /* check for -, +, --, ++ */
430 code = (ps.last_u_d ? unary_op : binary_op);
4b365fcd
KM
431 unary_delim = true;
432
433 if (*buf_ptr == token[0]) {
1009bf5e 434 /* check for doubled character */
4b365fcd 435 *tok++ = *buf_ptr++;
1009bf5e 436 /* buffer overflow will be checked at end of loop */
4b365fcd 437 if (last_code == ident || last_code == rparen) {
1009bf5e
KM
438 code = (ps.last_u_d ? unary_op : postop);
439 /* check for following ++ or -- */
4b365fcd
KM
440 unary_delim = false;
441 }
442 }
1009bf5e
KM
443 else if (*buf_ptr == '=')
444 /* check for operator += */
445 *tok++ = *buf_ptr++;
5c6e73ac 446 else if (token[0] == '-' && *buf_ptr == '>') {
1009bf5e
KM
447 /* check for operator -> */
448 *tok++ = *buf_ptr++;
5c6e73ac
KM
449 if (!pointer_as_binop) {
450 code = unary_op;
451 unary_delim = false;
452 ps.want_blank = false;
453 }
1009bf5e
KM
454 }
455 /* buffer overflow will be checked at end of switch */
4b365fcd
KM
456
457 break;
458
1009bf5e
KM
459 case '=':
460 if (ps.in_or_st)
461 ps.block_init = 1;
462 if (chartype[*buf_ptr] == opchar) { /* we have two char
463 * assignment */
464 tok[-1] = *buf_ptr++;
465 if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr)
466 *tok++ = *buf_ptr++;
467 *tok++ = '='; /* Flip =+ to += */
468 *tok = 0;
4b365fcd 469 }
4b365fcd
KM
470 code = binary_op;
471 unary_delim = true;
1009bf5e
KM
472 break;
473 /* can drop thru!!! */
4b365fcd 474
1009bf5e
KM
475 case '>':
476 case '<':
477 case '!': /* ops like <, <<, <=, !=, etc */
4b365fcd
KM
478 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
479 *tok++ = *buf_ptr;
480 if (++buf_ptr >= buf_end)
1009bf5e 481 fill_buffer();
4b365fcd 482 }
4b365fcd 483 if (*buf_ptr == '=')
1009bf5e
KM
484 *tok++ = *buf_ptr++;
485 code = (ps.last_u_d ? unary_op : binary_op);
4b365fcd
KM
486 unary_delim = true;
487 break;
488
1009bf5e 489 default:
4b365fcd 490 if (token[0] == '/' && *buf_ptr == '*') {
1009bf5e 491 /* it is start of comment */
4b365fcd
KM
492 *tok++ = '*';
493
494 if (++buf_ptr >= buf_end)
1009bf5e 495 fill_buffer();
4b365fcd
KM
496
497 code = comment;
1009bf5e 498 unary_delim = ps.last_u_d;
4b365fcd
KM
499 break;
500 }
1009bf5e
KM
501 while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
502 /* handle ||, &&, etc, and also things as in int *****i */
4b365fcd
KM
503 *tok++ = *buf_ptr;
504 if (++buf_ptr >= buf_end)
1009bf5e 505 fill_buffer();
4b365fcd 506 }
1009bf5e 507 code = (ps.last_u_d ? unary_op : binary_op);
4b365fcd
KM
508 unary_delim = true;
509
510
1009bf5e 511 } /* end of switch */
4b365fcd
KM
512 if (code != newline) {
513 l_struct = false;
514 last_code = code;
515 }
1009bf5e
KM
516 if (buf_ptr >= buf_end) /* check for input buffer empty */
517 fill_buffer();
518 ps.last_u_d = unary_delim;
519 *tok = '\0'; /* null terminate the token */
4b365fcd
KM
520 return (code);
521};
1009bf5e
KM
522
523/* Add the given keyword to the keyword table, using val as the keyword type
524 */
525addkey (key, val)
526char *key;
527{
528 register struct templ *p = specials;
529 while (p->rwd)
530 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
531 return;
532 else
533 p++;
534 if (p >= specials + sizeof specials / sizeof specials[0])
535 return; /* For now, table overflows are silently
536 ignored */
537 p->rwd = key;
538 p->rwcode = val;
539 p[1].rwd = 0;
540 p[1].rwcode = 0;
541 return;
542}