date and time created 82/10/21 23:58:29 by mckusick
[unix-history] / usr / src / usr.bin / indent / lexi.c
CommitLineData
4b365fcd
KM
/* Version identification string for this file ("@(#)" marker followed by
   the file name and revision); presumably filled in by SCCS -- it is not
   referenced by the code in this file. */
static char sccsid[] = "@(#)lexi.c 4.1 (Berkeley) %G%";
2
3/*
4
5 Copyright (C) 1976
6 by the
7 Board of Trustees
8 of the
9 University of Illinois
10
11 All rights reserved
12
13
14NAME:
15 lexi
16
17FUNCTION:
18 This is the token scanner for indent
19
20ALGORITHM:
21 1) Strip off intervening blanks and/or tabs.
22 2) If it is an alphanumeric token, move it to the token buffer "token".
23 Check if it is a special reserved word that indent will want to
24 know about.
25 3) Non-alphanumeric tokens are handled with a big switch statement. A
26 flag is kept to remember if the last token was a "unary delimiter",
27 which forces a following operator to be unary as opposed to binary.
28
29PARAMETERS:
30 None
31
32RETURNS:
33 An integer code indicating the type of token scanned.
34
35GLOBALS:
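	buf_ptr =	Advanced past the characters of the token scanned
	had_eof		Read only: once set, a newline returns code 0 and
			copying of a string or character literal stops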
38 last_u_d = Set to true iff this token is a "unary delimiter"
39
40CALLS:
41 fill_buffer
42 printf (lib)
43
44CALLED BY:
45 main
46
47NOTES:
48 Start of comment is passed back so that the comment can be scanned by
49 pr_comment.
50
51 Strings and character literals are returned just like identifiers.
52
53HISTORY:
54 initial coding November 1976 D A Willcox of CAC
55 1/7/77 D A Willcox of CAC Fix to provide proper handling
56 of "int a -1;"
57
*/
59
60/* Here we have the token scanner for indent. It scans off one token and
61 puts it in the global variable "token". It returns a code, indicating the
62 type of token scanned. */
63
64#include "indent_globs.h";
65#include "indent_codes.h";
66
67
68
69#define alphanum 1
70#define opchar 3
71
/*
 * One entry of the reserved-word table: the spelling of a keyword and the
 * class code that lexi() switches on when a scanned token matches it.
 */
struct templ {
    char *rwd;			/* keyword text; 0 marks the end of the table */
    int rwcode;			/* keyword class (see the list below) */
};

/*
 * Reserved words that the scanner treats specially.  The rwcode classes, as
 * interpreted by the switch in lexi():
 *
 *	1	switch
 *	2	case, default
 *	3	struct (the token after it is then reported as a declaration)
 *	4	declaration keyword (type or storage class)
 *	5	if, while, for
 *	6	else, do
 *	other	recognised, but returned as an ordinary identifier (sizeof)
 *
 * The table is terminated by an entry whose rwd is 0.
 */
struct templ specials[] =
{
    {"switch", 1},
    {"case", 2},
    {"struct", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 0},
    {0, 0}
};
103
/*
 * Character classification table, indexed by a 7-bit character code.
 * Each entry is 1 for a character that may appear in an alphanumeric token
 * (letters, digits, underscore), 3 for an operator character, and 0 for
 * everything else.  Each row below covers eight consecutive codes; the
 * comment gives the octal range and the characters in it.
 *
 * NOTE(review): '.' is classed as an operator character here, so the
 * scanner's two-character test after '=' accepts "=." as one token --
 * confirm that this is intended.
 */
char chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 000-007: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 010-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 020-027: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 030-037: control characters */
    0, 3, 0, 0, 0, 3, 3, 0,	/* 040-047: space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 3, 3,	/* 050-057: ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 060-067: 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 070-077: 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 100-107: @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 110-117: H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 120-127: P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 130-137: X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 140-147: ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 150-157: h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 160-167: p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 170-177: x y z { | } ~ DEL */
};
124
/* True when the last thing scanned was a newline or a form feed.  lexi()
   copies this into col_1 at the start of every call (so col_1 reports
   whether the token about to be scanned starts in column 1) and then clears
   it.  It starts out true, so the first token of the input counts as
   starting in column 1. */
int last_nl = true;
127
128
129
130int lexi () {
131 register char *tok;
132 /* local pointer to next char in token */
133 register int i;
134 /* local loop counter */
135 register char *j;
136 /* used for searching thru list of reserved words */
137 int unary_delim;
138 /* this is set to 1 if the current token forces a following operator to be
139 unary */
140 static int last_code;
141 /* the last token type returned */
142 static int l_struct;
143 /* set to 1 if the last token was 'struct' */
144 int found_it;
145 int code; /* internal code to be returned */
146 char qchar; /* the delimiter character for a string */
147
148 tok = token; /* point to start of place to save token */
149 unary_delim = false;
150 col_1 = last_nl; /* tell world that this token started in column
151 1 iff the last thing scanned was nl */
152 last_nl = false;
153
154 while (*buf_ptr == ' ' || *buf_ptr == '\t') {
155 /* get rid of blanks */
156 col_1 = false; /* leading blanks imply token is not in column 1
157 */
158 if (++buf_ptr >= buf_end)
159 fill_buffer ();
160 }
161
162/*----------------------------------------------------------*\
163| Scan an alphanumeric token
164\*----------------------------------------------------------*/
165
166 if (chartype[*buf_ptr & 0177] == alphanum) {
167 /* we have a character or number */
168 while (chartype[*buf_ptr & 0177] == alphanum) {
169 /* copy it over */
170 *tok++ = *buf_ptr++;
171 if (buf_ptr >= buf_end)
172 fill_buffer ();
173 }
174
175 *tok++ = '\0';
176
177 if (l_struct) { /* if last token was 'struct', then this token
178 should be treated as a declaration */
179 l_struct = false;
180 last_code = ident;
181 last_u_d = true;
182 return (decl);
183 }
184
185 last_u_d = false; /* operator after indentifier is binary */
186
187 for (i = 0; specials[i].rwd != 0; ++i) {
188 /* this loop will check if the token is a keyword. if so, a following
189 operator is unary */
190 last_code = ident; /* remember that this is the code we will return
191 */
192 j = specials[i].rwd;
193 /* point at ith reserved word */
194 tok = token; /* point at scanned toekn */
195 found_it = true; /* set to false if not found */
196 do {
197 if (*tok++ != *j) {
198 found_it = false;
199 break;
200 }
201 } while (*j++);
202
203 if (found_it) { /* we have a keyword */
204 last_u_d = true;
205 switch (specials[i].rwcode) {
206 case 1: /* it is a switch */
207 return (swstmt);
208 case 2: /* a case or default */
209 return (casestmt);
210
211 case 3: /* a "struct" */
212 l_struct = true;
213 /* Next time around, we will want to know that we have had
214 a 'struct' */
215 case 4: /* one of the declaration keywords */
216 if(p_l_follow) break; /* inside parens: cast */
217 last_code = decl;
218 return (decl);
219
220 case 5: /* if, while, for */
221 return (sp_paren);
222
223 case 6: /* do, else */
224 return (sp_nparen);
225
226 default: /* all others are treated like any other
227 identifier */
228 return (ident);
229 } /* end of switch */
230 } /* end of if (found_it) */
231
232 }
233
234 if (last_code == decl) /* if this is a declared variable, then
235 following sign is unary */
236 last_u_d = true; /* will make "int a -1" work */
237 last_code = ident;
238 return (ident); /* the ident is not in the list */
239 } /* end of procesing for alpanum character */
240
241
242
243/*----------------------------------------------------------*\
244| Scan a non-alphanumeric token
245\*----------------------------------------------------------*/
246
247 *tok++ = *buf_ptr; /* if it is only a one-character token, it is
248 moved here */
249 *tok = '\0';
250 if (++buf_ptr >= buf_end)
251 fill_buffer ();
252
253 switch (*token) {
254 case '\n':
255 unary_delim = last_u_d;
256 last_nl = true; /* remember that we just had a newline */
257 code = (had_eof ? 0 : newline);
258 /* if data has been exausted, the newline is a dummy, and we should
259 return code to stop */
260 break;
261
262 case '\'': /* start of quoted character */
263 qchar = '\''; /* remember final delimiter */
264 goto copy_lit; /* and go to common literal code */
265
266 case '"': /* start of string */
267 qchar = '"';
268
269 copy_lit:
270 do { /* copy the string */
271 while (1) { /* move one character or [/<char>]<char> */
272 if (*buf_ptr == '\n') {
273 /* check for unterminated literal */
274 printf ("%d: Unterminated literal\n", line_no);
275 goto stop_lit;
276 /* Don't copy any more */
277 }
278
279 *tok = *buf_ptr++;
280 if (buf_ptr >= buf_end)
281 fill_buffer ();
282 if (had_eof || ((tok - token) > (bufsize - 2))) {
283 printf ("Unterminated literal\n");
284 ++tok;
285 goto stop_lit;
286 /* get outof literal copying loop */
287 }
288
289 if (*tok == '\\') {
290 /* if escape, copy extra char */
291 if (*buf_ptr == '\n')
292 /* check for escaped newline */
293 ++line_no;
294 *(++tok) = *buf_ptr++;
295 ++tok; /* we must increment this again because we
296 copied two chars */
297 if (buf_ptr >= buf_end)
298 fill_buffer ();
299 }
300 else
301 break; /* we copied one character */
302 } /* end of while (1) */
303 } while (*tok++ != qchar);
304
305 stop_lit:
306 code = ident;
307 break;
308
309 case ('('):
310 case ('['):
311 unary_delim = true;
312 code = lparen;
313 break;
314
315 case (')'):
316 case (']'):
317 code = rparen;
318 break;
319
320 case '#':
321 unary_delim = last_u_d;
322 code = preesc;
323 break;
324
325 case '?':
326 unary_delim = true;
327 code = question;
328 break;
329
330 case (':'):
331 code = colon;
332 unary_delim = true;
333 break;
334
335 case (';'):
336 unary_delim = true;
337 code = semicolon;
338 break;
339
340 case ('{'):
341 unary_delim = true;
342 code = lbrace;
343 break;
344
345 case ('}'):
346 unary_delim = true;
347 code = rbrace;
348 break;
349
350 case 014: /* a form feed */
351 unary_delim = last_u_d;
352 last_nl = true; /* remember this so we can set 'col_1' right */
353 code = form_feed;
354 break;
355
356 case (','):
357 unary_delim = true;
358 code = comma;
359 break;
360
361 case '.':
362 unary_delim = false;
363 code = period;
364 break;
365
366 case '-':
367 case '+': /* check for -, +, --, ++ */
368 code = (last_u_d ? unary_op : binary_op);
369 unary_delim = true;
370
371 if (*buf_ptr == token[0]) {
372 /* check for doubled character */
373 *tok++ = *buf_ptr++;
374 /* buffer overflow will be checked at end of loop */
375 if (last_code == ident || last_code == rparen) {
376 code = (last_u_d ? unary_op : postop);
377 /* check for following ++ or -- */
378 unary_delim = false;
379 }
380 }
381 else
382 if (*buf_ptr == '>' || *buf_ptr == '=')
383 /* check for operator -> or += */
384 *tok++ = *buf_ptr++;
385 /* buffer overflow will be checked at end of switch */
386
387 break;
388
389 case '=':
390 if (chartype[*buf_ptr] == opchar) {
391 /* we have two char assignment */
392 *tok++ = *buf_ptr;
393 /* move second character */
394 if (++buf_ptr >= buf_end)
395 fill_buffer ();
396 }
397
398 code = binary_op;
399 unary_delim = true;
400 if (token[1] != '<' && token[1] != '>')
401 /* check for possible 3 char operator */
402 break;
403 /* can drop thru!!! */
404
405 case '>':
406 case '<':
407 case '!': /* ops like <, <<, <=, !=, etc */
408 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
409 *tok++ = *buf_ptr;
410 if (++buf_ptr >= buf_end)
411 fill_buffer ();
412 }
413
414 if (*buf_ptr == '=')
415 *tok++ = *buf_ptr++;
416 code = (last_u_d ? unary_op : binary_op);
417 unary_delim = true;
418 break;
419
420 default:
421 if (token[0] == '/' && *buf_ptr == '*') {
422 /* it is start of comment */
423 *tok++ = '*';
424
425 if (++buf_ptr >= buf_end)
426 fill_buffer ();
427
428 code = comment;
429 unary_delim = last_u_d;
430 break;
431 }
432
433 while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') {
434 /* handle ||, &&, etc, and also things as in int *****i */
435 *tok++ = *buf_ptr;
436 if (++buf_ptr >= buf_end)
437 fill_buffer ();
438 }
439
440
441 code = (last_u_d ? unary_op : binary_op);
442 unary_delim = true;
443
444
445 } /* end of switch */
446
447 if (code != newline) {
448 l_struct = false;
449 last_code = code;
450 }
451
452 if (buf_ptr >= buf_end) /* check for input buffer empty */
453 fill_buffer ();
454 last_u_d = unary_delim;
455 *tok = '\0'; /* null terminate the token */
456 return (code);
457};