new version from James Gosling including various bug fixes
[unix-history] / usr / src / usr.bin / indent / lexi.c
CommitLineData
c0bc4ef7 1/*
30f48914
KB
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980 The Regents of the University of California.
b0627149
KB
4 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms are permitted
b36fc510
KB
8 * provided that the above copyright notice and this paragraph are
9 * duplicated in all such forms and that any documentation,
10 * advertising materials, and other materials related to such
11 * distribution and use acknowledge that the software was developed
30f48914
KB
12 * by the University of California, Berkeley, the University of Illinois,
13 * Urbana, and Sun Microsystems, Inc. The name of either University
14 * or Sun Microsystems may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
b36fc510
KB
16 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
c0bc4ef7
DF
19 */
20
21#ifndef lint
0c8ee79d 22static char sccsid[] = "@(#)lexi.c 5.13 (Berkeley) %G%";
b0627149 23#endif /* not lint */
4b365fcd 24
b0627149 25/*
30f48914
KB
26 * Here we have the token scanner for indent. It scans off one token and puts
27 * it in the global variable "token". It returns a code, indicating the type
28 * of token scanned.
1009bf5e 29 */
4b365fcd 30
8540f0fa
KB
31#include "indent_globs.h"
32#include "indent_codes.h"
1a43f1ba 33#include <ctype.h>
4b365fcd
KM
34
35#define alphanum 1
36#define opchar 3
37
/*
 * One entry of the reserved-word table.
 *
 * rwd    - the keyword text; a null pointer marks the end of the table
 * rwcode - the keyword class that lexi() switches on:
 *              0  no special treatment (returned as an ordinary identifier)
 *              1  switch
 *              2  case, default
 *              3  struct, union, enum
 *              4  declaration keywords
 *              5  if, while, for
 *              6  else, do
 *              7  sizeof
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * The reserved-word table.  It is sized at 100 entries so that addkey() can
 * append further keywords after the built-in ones; the entry whose rwd is
 * null terminates the list.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
75
/*
 * Character classification table, indexed by 7-bit ASCII code.  It is used to
 * decide what type each character is:
 *     0  neither of the classes below
 *     1  alphanumeric (letters, digits, '_' and '$')
 *     3  operator character
 */
char chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* space ! " # $ % & '   ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0 1 2 3 4 5 6 7   8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* @ A B C D E F G   H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* P Q R S T U V W   X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* ` a b c d e f g   h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* p q r s t u v w   x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
97
1009bf5e
KM
98
99
100
30f48914 101int
1009bf5e
KM
102lexi()
103{
30f48914
KB
104 int unary_delim; /* this is set to 1 if the current token
105 *
1009bf5e
KM
106 * forces a following operator to be unary */
107 static int last_code; /* the last token type returned */
108 static int l_struct; /* set to 1 if the last token was 'struct' */
109 int code; /* internal code to be returned */
110 char qchar; /* the delimiter character for a string */
111
0c8ee79d 112 e_token = s_token; /* point to start of place to save token */
4b365fcd 113 unary_delim = false;
1009bf5e 114 ps.col_1 = ps.last_nl; /* tell world that this token started in
30f48914 115 * column 1 iff the last thing scanned was nl */
1009bf5e
KM
116 ps.last_nl = false;
117
118 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
30f48914
KB
119 ps.col_1 = false; /* leading blanks imply token is not in column
120 * 1 */
4b365fcd 121 if (++buf_ptr >= buf_end)
1009bf5e 122 fill_buffer();
4b365fcd
KM
123 }
124
30f48914
KB
125 /* Scan an alphanumeric token */
126 if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
127 /*
128 * we have a character or number
129 */
130 register char *j; /* used for searching thru list of
131 *
1009bf5e
KM
132 * reserved words */
133 register struct templ *p;
4b365fcd 134
30f48914
KB
135 if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
136 int seendot = 0,
137 seenexp = 0;
138 if (*buf_ptr == '0' &&
139 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
0c8ee79d
KB
140 *e_token++ = *buf_ptr++;
141 *e_token++ = *buf_ptr++;
142 while (isxdigit(*buf_ptr)) {
143 check_size(token);
144 *e_token++ = *buf_ptr++;
145 }
30f48914
KB
146 }
147 else
148 while (1) {
149 if (*buf_ptr == '.')
150 if (seendot)
151 break;
152 else
153 seendot++;
0c8ee79d
KB
154 check_size(token);
155 *e_token++ = *buf_ptr++;
30f48914
KB
156 if (!isdigit(*buf_ptr) && *buf_ptr != '.')
157 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
158 break;
159 else {
160 seenexp++;
161 seendot++;
0c8ee79d
KB
162 check_size(token);
163 *e_token++ = *buf_ptr++;
30f48914 164 if (*buf_ptr == '+' || *buf_ptr == '-')
0c8ee79d 165 *e_token++ = *buf_ptr++;
30f48914
KB
166 }
167 }
168 if (*buf_ptr == 'L' || *buf_ptr == 'l')
0c8ee79d 169 *e_token++ = *buf_ptr++;
30f48914
KB
170 }
171 else
172 while (chartype[*buf_ptr] == alphanum) { /* copy it over */
0c8ee79d
KB
173 check_size(token);
174 *e_token++ = *buf_ptr++;
30f48914
KB
175 if (buf_ptr >= buf_end)
176 fill_buffer();
177 }
0c8ee79d 178 *e_token++ = '\0';
1009bf5e
KM
179 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
180 if (++buf_ptr >= buf_end)
181 fill_buffer();
182 }
183 ps.its_a_keyword = false;
184 ps.sizeof_keyword = false;
30f48914
KB
185 if (l_struct) { /* if last token was 'struct', then this token
186 * should be treated as a declaration */
4b365fcd
KM
187 l_struct = false;
188 last_code = ident;
1009bf5e 189 ps.last_u_d = true;
4b365fcd
KM
190 return (decl);
191 }
1009bf5e
KM
192 ps.last_u_d = false; /* Operator after indentifier is binary */
193 last_code = ident; /* Remember that this is the code we will
194 * return */
195
196 /*
30f48914 197 * This loop will check if the token is a keyword.
1009bf5e
KM
198 */
199 for (p = specials; (j = p->rwd) != 0; p++) {
0c8ee79d
KB
200 register char *p = s_token; /* point at scanned token */
201 if (*j++ != *p++ || *j++ != *p++)
1009bf5e 202 continue; /* This test depends on the fact that
30f48914
KB
203 * identifiers are always at least 1 character
204 * long (ie. the first two bytes of the
205 * identifier are always meaningful) */
0c8ee79d 206 if (p[-1] == 0)
1009bf5e 207 break; /* If its a one-character identifier */
0c8ee79d 208 while (*p++ == *j)
1009bf5e
KM
209 if (*j++ == 0)
210 goto found_keyword; /* I wish that C had a multi-level
211 * break... */
212 }
213 if (p->rwd) { /* we have a keyword */
214 found_keyword:
215 ps.its_a_keyword = true;
216 ps.last_u_d = true;
217 switch (p->rwcode) {
30f48914
KB
218 case 1: /* it is a switch */
219 return (swstmt);
220 case 2: /* a case or default */
221 return (casestmt);
222
223 case 3: /* a "struct" */
224 if (ps.p_l_follow)
225 break; /* inside parens: cast */
226 l_struct = true;
227
228 /*
229 * Next time around, we will want to know that we have had a
230 * 'struct'
231 */
232 case 4: /* one of the declaration keywords */
233 if (ps.p_l_follow) {
234 ps.cast_mask |= 1 << ps.p_l_follow;
235 break; /* inside parens: cast */
236 }
237 last_code = decl;
238 return (decl);
1009bf5e 239
30f48914
KB
240 case 5: /* if, while, for */
241 return (sp_paren);
1009bf5e 242
30f48914
KB
243 case 6: /* do, else */
244 return (sp_nparen);
1009bf5e 245
30f48914
KB
246 case 7:
247 ps.sizeof_keyword = true;
248 default: /* all others are treated like any other
1009bf5e 249 * identifier */
30f48914 250 return (ident);
1009bf5e
KM
251 } /* end of switch */
252 } /* end of if (found_it) */
30f48914 253 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
c5b954f4
KB
254 register char *tp = buf_ptr;
255 while (tp < buf_end)
0c8ee79d 256 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
30f48914 257 goto not_proc;
1009bf5e
KM
258 strncpy(ps.procname, token, sizeof ps.procname - 1);
259 ps.in_parameter_declaration = 1;
0c8ee79d 260 rparen_count = 1;
30f48914 261 not_proc:;
4b365fcd 262 }
1009bf5e
KM
263 /*
264 * The following hack attempts to guess whether or not the current
265 * token is in fact a declaration keyword -- one that has been
30f48914 266 * typedefd
1009bf5e 267 */
30f48914
KB
268 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
269 && !ps.p_l_follow
270 && !ps.block_init
271 && (ps.last_token == rparen || ps.last_token == semicolon ||
272 ps.last_token == decl ||
273 ps.last_token == lbrace || ps.last_token == rbrace)) {
1009bf5e
KM
274 ps.its_a_keyword = true;
275 ps.last_u_d = true;
276 last_code = decl;
277 return decl;
278 }
279 if (last_code == decl) /* if this is a declared variable, then
280 * following sign is unary */
281 ps.last_u_d = true; /* will make "int a -1" work */
4b365fcd 282 last_code = ident;
1009bf5e
KM
283 return (ident); /* the ident is not in the list */
284 } /* end of procesing for alpanum character */
4b365fcd 285
0c8ee79d
KB
286 /* Scan a non-alphanumeric token */
287
288 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
30f48914 289 * moved here */
0c8ee79d 290 *e_token = '\0';
4b365fcd 291 if (++buf_ptr >= buf_end)
1009bf5e 292 fill_buffer();
4b365fcd
KM
293
294 switch (*token) {
30f48914
KB
295 case '\n':
296 unary_delim = ps.last_u_d;
297 ps.last_nl = true; /* remember that we just had a newline */
298 code = (had_eof ? 0 : newline);
4b365fcd 299
30f48914
KB
300 /*
301 * if data has been exausted, the newline is a dummy, and we should
302 * return code to stop
303 */
304 break;
305
306 case '\'': /* start of quoted character */
307 case '"': /* start of string */
308 qchar = *token;
309 if (troff) {
0c8ee79d 310 e_token[-1] = '`';
30f48914 311 if (qchar == '"')
0c8ee79d
KB
312 *e_token++ = '`';
313 e_token = chfont(&bodyf, &stringf, e_token);
30f48914
KB
314 }
315 do { /* copy the string */
316 while (1) { /* move one character or [/<char>]<char> */
317 if (*buf_ptr == '\n') {
318 printf("%d: Unterminated literal\n", line_no);
319 goto stop_lit;
320 }
0c8ee79d
KB
321 check_size(token); /* Only have to do this once in this loop,
322 * since check_size guarantees that there
323 * are at least 5 entries left */
324 *e_token = *buf_ptr++;
30f48914
KB
325 if (buf_ptr >= buf_end)
326 fill_buffer();
0c8ee79d 327 if (*e_token == BACKSLASH) { /* if escape, copy extra char */
30f48914
KB
328 if (*buf_ptr == '\n') /* check for escaped newline */
329 ++line_no;
330 if (troff) {
0c8ee79d 331 *++e_token = BACKSLASH;
30f48914 332 if (*buf_ptr == BACKSLASH)
0c8ee79d 333 *++e_token = BACKSLASH;
4b365fcd 334 }
0c8ee79d
KB
335 *++e_token = *buf_ptr++;
336 ++e_token; /* we must increment this again because we
30f48914 337 * copied two chars */
4b365fcd 338 if (buf_ptr >= buf_end)
1009bf5e 339 fill_buffer();
30f48914
KB
340 }
341 else
342 break; /* we copied one character */
343 } /* end of while (1) */
0c8ee79d 344 } while (*e_token++ != qchar);
30f48914 345 if (troff) {
0c8ee79d 346 e_token = chfont(&stringf, &bodyf, e_token - 1);
30f48914 347 if (qchar == '"')
0c8ee79d 348 *e_token++ = '\'';
30f48914
KB
349 }
350stop_lit:
351 code = ident;
352 break;
353
354 case ('('):
355 case ('['):
356 unary_delim = true;
357 code = lparen;
358 break;
359
360 case (')'):
361 case (']'):
362 code = rparen;
363 break;
364
365 case '#':
366 unary_delim = ps.last_u_d;
367 code = preesc;
368 break;
369
370 case '?':
371 unary_delim = true;
372 code = question;
373 break;
374
375 case (':'):
376 code = colon;
377 unary_delim = true;
378 break;
379
380 case (';'):
381 unary_delim = true;
382 code = semicolon;
383 break;
384
385 case ('{'):
386 unary_delim = true;
4b365fcd 387
30f48914
KB
388 /*
389 * if (ps.in_or_st) ps.block_init = 1;
390 */
391 /* ? code = ps.block_init ? lparen : lbrace; */
392 code = lbrace;
393 break;
394
395 case ('}'):
396 unary_delim = true;
397 /* ? code = ps.block_init ? rparen : rbrace; */
398 code = rbrace;
399 break;
400
401 case 014: /* a form feed */
402 unary_delim = ps.last_u_d;
403 ps.last_nl = true; /* remember this so we can set 'ps.col_1'
1009bf5e 404 * right */
30f48914
KB
405 code = form_feed;
406 break;
407
408 case (','):
409 unary_delim = true;
410 code = comma;
411 break;
412
413 case '.':
414 unary_delim = false;
415 code = period;
416 break;
417
418 case '-':
419 case '+': /* check for -, +, --, ++ */
420 code = (ps.last_u_d ? unary_op : binary_op);
421 unary_delim = true;
422
423 if (*buf_ptr == token[0]) {
424 /* check for doubled character */
0c8ee79d 425 *e_token++ = *buf_ptr++;
30f48914
KB
426 /* buffer overflow will be checked at end of loop */
427 if (last_code == ident || last_code == rparen) {
428 code = (ps.last_u_d ? unary_op : postop);
429 /* check for following ++ or -- */
430 unary_delim = false;
4b365fcd 431 }
30f48914
KB
432 }
433 else if (*buf_ptr == '=')
434 /* check for operator += */
0c8ee79d 435 *e_token++ = *buf_ptr++;
30f48914
KB
436 else if (*buf_ptr == '>') {
437 /* check for operator -> */
0c8ee79d 438 *e_token++ = *buf_ptr++;
30f48914
KB
439 if (!pointer_as_binop) {
440 unary_delim = false;
441 code = unary_op;
442 ps.want_blank = false;
4b365fcd 443 }
30f48914
KB
444 }
445 break; /* buffer overflow will be checked at end of
446 * switch */
447
448 case '=':
449 if (ps.in_or_st)
450 ps.block_init = 1;
451#ifdef undef
452 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
0c8ee79d
KB
453 e_token[-1] = *buf_ptr++;
454 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
455 *e_token++ = *buf_ptr++;
456 *e_token++ = '='; /* Flip =+ to += */
457 *e_token = 0;
30f48914
KB
458 }
459#else
460 if (*buf_ptr == '=') {/* == */
0c8ee79d 461 *e_token++ = '='; /* Flip =+ to += */
30f48914 462 buf_ptr++;
0c8ee79d 463 *e_token = 0;
30f48914
KB
464 }
465#endif
466 code = binary_op;
467 unary_delim = true;
468 break;
469 /* can drop thru!!! */
470
471 case '>':
472 case '<':
473 case '!': /* ops like <, <<, <=, !=, etc */
474 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
0c8ee79d 475 *e_token++ = *buf_ptr;
30f48914
KB
476 if (++buf_ptr >= buf_end)
477 fill_buffer();
478 }
479 if (*buf_ptr == '=')
0c8ee79d 480 *e_token++ = *buf_ptr++;
30f48914
KB
481 code = (ps.last_u_d ? unary_op : binary_op);
482 unary_delim = true;
483 break;
4b365fcd 484
30f48914
KB
485 default:
486 if (token[0] == '/' && *buf_ptr == '*') {
487 /* it is start of comment */
0c8ee79d 488 *e_token++ = '*';
4b365fcd 489
30f48914
KB
490 if (++buf_ptr >= buf_end)
491 fill_buffer();
4b365fcd 492
30f48914
KB
493 code = comment;
494 unary_delim = ps.last_u_d;
495 break;
496 }
0c8ee79d 497 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
30f48914
KB
498 /*
499 * handle ||, &&, etc, and also things as in int *****i
500 */
0c8ee79d 501 *e_token++ = *buf_ptr;
30f48914
KB
502 if (++buf_ptr >= buf_end)
503 fill_buffer();
504 }
505 code = (ps.last_u_d ? unary_op : binary_op);
506 unary_delim = true;
4b365fcd
KM
507
508
1009bf5e 509 } /* end of switch */
4b365fcd
KM
510 if (code != newline) {
511 l_struct = false;
512 last_code = code;
513 }
1009bf5e
KM
514 if (buf_ptr >= buf_end) /* check for input buffer empty */
515 fill_buffer();
516 ps.last_u_d = unary_delim;
0c8ee79d 517 *e_token = '\0'; /* null terminate the token */
4b365fcd 518 return (code);
1a43f1ba 519}
1009bf5e 520
30f48914
KB
521/*
522 * Add the given keyword to the keyword table, using val as the keyword type
523 */
524addkey(key, val)
525 char *key;
1009bf5e
KM
526{
527 register struct templ *p = specials;
528 while (p->rwd)
529 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
530 return;
531 else
532 p++;
533 if (p >= specials + sizeof specials / sizeof specials[0])
534 return; /* For now, table overflows are silently
30f48914 535 * ignored */
1009bf5e
KM
536 p->rwd = key;
537 p->rwcode = val;
538 p[1].rwd = 0;
539 p[1].rwcode = 0;
540 return;
541}