new version from James Gosling including various bug fixes
[unix-history] / usr / src / usr.bin / indent / lexi.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980 The Regents of the University of California.
4 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms are permitted
8 * provided that the above copyright notice and this paragraph are
9 * duplicated in all such forms and that any documentation,
10 * advertising materials, and other materials related to such
11 * distribution and use acknowledge that the software was developed
12 * by the University of California, Berkeley, the University of Illinois,
13 * Urbana, and Sun Microsystems, Inc. The name of either University
14 * or Sun Microsystems may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19 */
20
/*
 * SCCS identification string, embedded in the object file so the version
 * can be recovered from a binary.  It is left out when the file is being
 * run through lint, which would otherwise complain that it is unused.
 */
#if !defined(lint)
static char sccsid[] =
    "@(#)lexi.c 5.13 (Berkeley) %G%";
#endif /* not lint */
24
25/*
26 * Here we have the token scanner for indent. It scans off one token and puts
27 * it in the global variable "token". It returns a code, indicating the type
28 * of token scanned.
29 */
30
31#include "indent_globs.h"
32#include "indent_codes.h"
33#include <ctype.h>
34
/*
 * Character classes stored in chartype[].  A class of 0 means the character
 * is neither of these.
 */
#define alphanum    1       /* may appear in an identifier or a number */
#define opchar      3       /* may appear in an operator */
37
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognised.
 */
struct templ {
    char       *rwd;            /* keyword spelling; 0 marks end of table */
    int         rwcode;         /* keyword class, see the list below */
};

/*
 * Reserved-word table, terminated by an entry whose rwd is 0.  Only the
 * leading entries are initialised; the remaining slots (up to 100 in all)
 * are zero and are available to addkey() for keywords added at run time.
 *
 * Keyword classes, as interpreted by lexi():
 *   0  no special treatment (returned as an ordinary identifier)
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords (types and storage classes)
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},             /* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
75
76char chartype[128] =
77{ /* this is used to facilitate the decision of
78 * what type (alphanumeric, operator) each
79 * character is */
80 0, 0, 0, 0, 0, 0, 0, 0,
81 0, 0, 0, 0, 0, 0, 0, 0,
82 0, 0, 0, 0, 0, 0, 0, 0,
83 0, 0, 0, 0, 0, 0, 0, 0,
84 0, 3, 0, 0, 1, 3, 3, 0,
85 0, 0, 3, 3, 0, 3, 0, 3,
86 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 0, 0, 3, 3, 3, 3,
88 0, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 0, 0, 0, 3, 1,
92 0, 1, 1, 1, 1, 1, 1, 1,
93 1, 1, 1, 1, 1, 1, 1, 1,
94 1, 1, 1, 1, 1, 1, 1, 1,
95 1, 1, 1, 0, 3, 0, 3, 0
96};
97
98
99
100
101int
102lexi()
103{
104 int unary_delim; /* this is set to 1 if the current token
105 *
106 * forces a following operator to be unary */
107 static int last_code; /* the last token type returned */
108 static int l_struct; /* set to 1 if the last token was 'struct' */
109 int code; /* internal code to be returned */
110 char qchar; /* the delimiter character for a string */
111
112 e_token = s_token; /* point to start of place to save token */
113 unary_delim = false;
114 ps.col_1 = ps.last_nl; /* tell world that this token started in
115 * column 1 iff the last thing scanned was nl */
116 ps.last_nl = false;
117
118 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
119 ps.col_1 = false; /* leading blanks imply token is not in column
120 * 1 */
121 if (++buf_ptr >= buf_end)
122 fill_buffer();
123 }
124
125 /* Scan an alphanumeric token */
126 if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
127 /*
128 * we have a character or number
129 */
130 register char *j; /* used for searching thru list of
131 *
132 * reserved words */
133 register struct templ *p;
134
135 if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
136 int seendot = 0,
137 seenexp = 0;
138 if (*buf_ptr == '0' &&
139 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
140 *e_token++ = *buf_ptr++;
141 *e_token++ = *buf_ptr++;
142 while (isxdigit(*buf_ptr)) {
143 check_size(token);
144 *e_token++ = *buf_ptr++;
145 }
146 }
147 else
148 while (1) {
149 if (*buf_ptr == '.')
150 if (seendot)
151 break;
152 else
153 seendot++;
154 check_size(token);
155 *e_token++ = *buf_ptr++;
156 if (!isdigit(*buf_ptr) && *buf_ptr != '.')
157 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
158 break;
159 else {
160 seenexp++;
161 seendot++;
162 check_size(token);
163 *e_token++ = *buf_ptr++;
164 if (*buf_ptr == '+' || *buf_ptr == '-')
165 *e_token++ = *buf_ptr++;
166 }
167 }
168 if (*buf_ptr == 'L' || *buf_ptr == 'l')
169 *e_token++ = *buf_ptr++;
170 }
171 else
172 while (chartype[*buf_ptr] == alphanum) { /* copy it over */
173 check_size(token);
174 *e_token++ = *buf_ptr++;
175 if (buf_ptr >= buf_end)
176 fill_buffer();
177 }
178 *e_token++ = '\0';
179 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
180 if (++buf_ptr >= buf_end)
181 fill_buffer();
182 }
183 ps.its_a_keyword = false;
184 ps.sizeof_keyword = false;
185 if (l_struct) { /* if last token was 'struct', then this token
186 * should be treated as a declaration */
187 l_struct = false;
188 last_code = ident;
189 ps.last_u_d = true;
190 return (decl);
191 }
192 ps.last_u_d = false; /* Operator after indentifier is binary */
193 last_code = ident; /* Remember that this is the code we will
194 * return */
195
196 /*
197 * This loop will check if the token is a keyword.
198 */
199 for (p = specials; (j = p->rwd) != 0; p++) {
200 register char *p = s_token; /* point at scanned token */
201 if (*j++ != *p++ || *j++ != *p++)
202 continue; /* This test depends on the fact that
203 * identifiers are always at least 1 character
204 * long (ie. the first two bytes of the
205 * identifier are always meaningful) */
206 if (p[-1] == 0)
207 break; /* If its a one-character identifier */
208 while (*p++ == *j)
209 if (*j++ == 0)
210 goto found_keyword; /* I wish that C had a multi-level
211 * break... */
212 }
213 if (p->rwd) { /* we have a keyword */
214 found_keyword:
215 ps.its_a_keyword = true;
216 ps.last_u_d = true;
217 switch (p->rwcode) {
218 case 1: /* it is a switch */
219 return (swstmt);
220 case 2: /* a case or default */
221 return (casestmt);
222
223 case 3: /* a "struct" */
224 if (ps.p_l_follow)
225 break; /* inside parens: cast */
226 l_struct = true;
227
228 /*
229 * Next time around, we will want to know that we have had a
230 * 'struct'
231 */
232 case 4: /* one of the declaration keywords */
233 if (ps.p_l_follow) {
234 ps.cast_mask |= 1 << ps.p_l_follow;
235 break; /* inside parens: cast */
236 }
237 last_code = decl;
238 return (decl);
239
240 case 5: /* if, while, for */
241 return (sp_paren);
242
243 case 6: /* do, else */
244 return (sp_nparen);
245
246 case 7:
247 ps.sizeof_keyword = true;
248 default: /* all others are treated like any other
249 * identifier */
250 return (ident);
251 } /* end of switch */
252 } /* end of if (found_it) */
253 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
254 register char *tp = buf_ptr;
255 while (tp < buf_end)
256 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
257 goto not_proc;
258 strncpy(ps.procname, token, sizeof ps.procname - 1);
259 ps.in_parameter_declaration = 1;
260 rparen_count = 1;
261 not_proc:;
262 }
263 /*
264 * The following hack attempts to guess whether or not the current
265 * token is in fact a declaration keyword -- one that has been
266 * typedefd
267 */
268 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
269 && !ps.p_l_follow
270 && !ps.block_init
271 && (ps.last_token == rparen || ps.last_token == semicolon ||
272 ps.last_token == decl ||
273 ps.last_token == lbrace || ps.last_token == rbrace)) {
274 ps.its_a_keyword = true;
275 ps.last_u_d = true;
276 last_code = decl;
277 return decl;
278 }
279 if (last_code == decl) /* if this is a declared variable, then
280 * following sign is unary */
281 ps.last_u_d = true; /* will make "int a -1" work */
282 last_code = ident;
283 return (ident); /* the ident is not in the list */
284 } /* end of procesing for alpanum character */
285
286 /* Scan a non-alphanumeric token */
287
288 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
289 * moved here */
290 *e_token = '\0';
291 if (++buf_ptr >= buf_end)
292 fill_buffer();
293
294 switch (*token) {
295 case '\n':
296 unary_delim = ps.last_u_d;
297 ps.last_nl = true; /* remember that we just had a newline */
298 code = (had_eof ? 0 : newline);
299
300 /*
301 * if data has been exausted, the newline is a dummy, and we should
302 * return code to stop
303 */
304 break;
305
306 case '\'': /* start of quoted character */
307 case '"': /* start of string */
308 qchar = *token;
309 if (troff) {
310 e_token[-1] = '`';
311 if (qchar == '"')
312 *e_token++ = '`';
313 e_token = chfont(&bodyf, &stringf, e_token);
314 }
315 do { /* copy the string */
316 while (1) { /* move one character or [/<char>]<char> */
317 if (*buf_ptr == '\n') {
318 printf("%d: Unterminated literal\n", line_no);
319 goto stop_lit;
320 }
321 check_size(token); /* Only have to do this once in this loop,
322 * since check_size guarantees that there
323 * are at least 5 entries left */
324 *e_token = *buf_ptr++;
325 if (buf_ptr >= buf_end)
326 fill_buffer();
327 if (*e_token == BACKSLASH) { /* if escape, copy extra char */
328 if (*buf_ptr == '\n') /* check for escaped newline */
329 ++line_no;
330 if (troff) {
331 *++e_token = BACKSLASH;
332 if (*buf_ptr == BACKSLASH)
333 *++e_token = BACKSLASH;
334 }
335 *++e_token = *buf_ptr++;
336 ++e_token; /* we must increment this again because we
337 * copied two chars */
338 if (buf_ptr >= buf_end)
339 fill_buffer();
340 }
341 else
342 break; /* we copied one character */
343 } /* end of while (1) */
344 } while (*e_token++ != qchar);
345 if (troff) {
346 e_token = chfont(&stringf, &bodyf, e_token - 1);
347 if (qchar == '"')
348 *e_token++ = '\'';
349 }
350stop_lit:
351 code = ident;
352 break;
353
354 case ('('):
355 case ('['):
356 unary_delim = true;
357 code = lparen;
358 break;
359
360 case (')'):
361 case (']'):
362 code = rparen;
363 break;
364
365 case '#':
366 unary_delim = ps.last_u_d;
367 code = preesc;
368 break;
369
370 case '?':
371 unary_delim = true;
372 code = question;
373 break;
374
375 case (':'):
376 code = colon;
377 unary_delim = true;
378 break;
379
380 case (';'):
381 unary_delim = true;
382 code = semicolon;
383 break;
384
385 case ('{'):
386 unary_delim = true;
387
388 /*
389 * if (ps.in_or_st) ps.block_init = 1;
390 */
391 /* ? code = ps.block_init ? lparen : lbrace; */
392 code = lbrace;
393 break;
394
395 case ('}'):
396 unary_delim = true;
397 /* ? code = ps.block_init ? rparen : rbrace; */
398 code = rbrace;
399 break;
400
401 case 014: /* a form feed */
402 unary_delim = ps.last_u_d;
403 ps.last_nl = true; /* remember this so we can set 'ps.col_1'
404 * right */
405 code = form_feed;
406 break;
407
408 case (','):
409 unary_delim = true;
410 code = comma;
411 break;
412
413 case '.':
414 unary_delim = false;
415 code = period;
416 break;
417
418 case '-':
419 case '+': /* check for -, +, --, ++ */
420 code = (ps.last_u_d ? unary_op : binary_op);
421 unary_delim = true;
422
423 if (*buf_ptr == token[0]) {
424 /* check for doubled character */
425 *e_token++ = *buf_ptr++;
426 /* buffer overflow will be checked at end of loop */
427 if (last_code == ident || last_code == rparen) {
428 code = (ps.last_u_d ? unary_op : postop);
429 /* check for following ++ or -- */
430 unary_delim = false;
431 }
432 }
433 else if (*buf_ptr == '=')
434 /* check for operator += */
435 *e_token++ = *buf_ptr++;
436 else if (*buf_ptr == '>') {
437 /* check for operator -> */
438 *e_token++ = *buf_ptr++;
439 if (!pointer_as_binop) {
440 unary_delim = false;
441 code = unary_op;
442 ps.want_blank = false;
443 }
444 }
445 break; /* buffer overflow will be checked at end of
446 * switch */
447
448 case '=':
449 if (ps.in_or_st)
450 ps.block_init = 1;
451#ifdef undef
452 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
453 e_token[-1] = *buf_ptr++;
454 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
455 *e_token++ = *buf_ptr++;
456 *e_token++ = '='; /* Flip =+ to += */
457 *e_token = 0;
458 }
459#else
460 if (*buf_ptr == '=') {/* == */
461 *e_token++ = '='; /* Flip =+ to += */
462 buf_ptr++;
463 *e_token = 0;
464 }
465#endif
466 code = binary_op;
467 unary_delim = true;
468 break;
469 /* can drop thru!!! */
470
471 case '>':
472 case '<':
473 case '!': /* ops like <, <<, <=, !=, etc */
474 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
475 *e_token++ = *buf_ptr;
476 if (++buf_ptr >= buf_end)
477 fill_buffer();
478 }
479 if (*buf_ptr == '=')
480 *e_token++ = *buf_ptr++;
481 code = (ps.last_u_d ? unary_op : binary_op);
482 unary_delim = true;
483 break;
484
485 default:
486 if (token[0] == '/' && *buf_ptr == '*') {
487 /* it is start of comment */
488 *e_token++ = '*';
489
490 if (++buf_ptr >= buf_end)
491 fill_buffer();
492
493 code = comment;
494 unary_delim = ps.last_u_d;
495 break;
496 }
497 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
498 /*
499 * handle ||, &&, etc, and also things as in int *****i
500 */
501 *e_token++ = *buf_ptr;
502 if (++buf_ptr >= buf_end)
503 fill_buffer();
504 }
505 code = (ps.last_u_d ? unary_op : binary_op);
506 unary_delim = true;
507
508
509 } /* end of switch */
510 if (code != newline) {
511 l_struct = false;
512 last_code = code;
513 }
514 if (buf_ptr >= buf_end) /* check for input buffer empty */
515 fill_buffer();
516 ps.last_u_d = unary_delim;
517 *e_token = '\0'; /* null terminate the token */
518 return (code);
519}
520
521/*
522 * Add the given keyword to the keyword table, using val as the keyword type
523 */
524addkey(key, val)
525 char *key;
526{
527 register struct templ *p = specials;
528 while (p->rwd)
529 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
530 return;
531 else
532 p++;
533 if (p >= specials + sizeof specials / sizeof specials[0])
534 return; /* For now, table overflows are silently
535 * ignored */
536 p->rwd = key;
537 p->rwcode = val;
538 p[1].rwd = 0;
539 p[1].rwcode = 0;
540 return;
541}