[unix-history] / usr / src / usr.bin / indent / lexi.c

/* Version identification string for this file; the "@(#)" prefix and the
   %G% keyword look like SCCS markers -- NOTE(review): confirm against the
   revision-control setup in use. */
static char sccsid[] = "@(#)lexi.c	4.1	(Berkeley)	%G%";

/*

			  Copyright (C) 1976
				by the
			  Board of Trustees
				of the
			University of Illinois

			 All rights reserved


NAME:
	lexi

FUNCTION:
	This is the token scanner for indent

ALGORITHM:
	1) Strip off intervening blanks and/or tabs.
	2) If it is an alphanumeric token, move it to the token buffer "token".
	   Check if it is a special reserved word that indent will want to
	   know about.
	3) Non-alphanumeric tokens are handled with a big switch statement.  A
	   flag is kept to remember if the last token was a "unary delimiter",
	   which forces a following operator to be unary as opposed to binary.

PARAMETERS:
	None

RETURNS:
	An integer code indicating the type of token scanned.

GLOBALS:
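	buf_ptr =	Advanced past every input character that is scanned
	had_eof		Tested only; a newline scanned while it is set returns 0
	last_u_d =	Set to true iff this token is a "unary delimiter"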

CALLS:
	fill_buffer
	printf (lib)

CALLED BY:
	main

NOTES:
	Start of comment is passed back so that the comment can be scanned by
	pr_comment.

	Strings and character literals are returned just like identifiers.

HISTORY:
	initial coding 	November 1976	D A Willcox of CAC
	1/7/77		D A Willcox of CAC	Fix to provide proper handling
						of "int a -1;"

*/

/* Here we have the token scanner for indent.  It scans off one token and
   puts it in the global variable "token".  It returns a code, indicating the
   type of token scanned. */

/* Project headers.  A preprocessor directive ends at the newline, so the
   trailing semicolons that used to follow these two lines were stray tokens
   and have been removed. */
#include "indent_globs.h"
#include "indent_codes.h"


/* Character classes stored in chartype[] and tested by lexi(). */
#define alphanum 1		/* letters, digits and underscore */
#define opchar 3		/* characters that can form an operator */

/*
 * One entry of the reserved-word table: the spelling of a keyword and the
 * class code that lexi() switches on when the scanned token matches it.
 */
struct templ {
    char   *rwd;		/* spelling of the reserved word */
    int     rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved words that lexi() treats specially.  Class codes:
 *
 *	0	no special handling (returned like any other identifier)
 *	1	switch
 *	2	case and default
 *	3	struct
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *
 * The table is terminated by an entry whose rwd is 0.
 */
struct templ    specials[] =
{
    {"switch", 1},
    {"case", 2},
    {"struct", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				   keyword was never recognised */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 0},
    {0, 0}
};

/*
 * Character class lookup, indexed by a 7-bit character code:
 *	0  neither alphanumeric nor operator
 *	1  alphanumeric (letters, digits, underscore)
 *	3  operator character
 * Sixteen codes per row; the octal range of each row is noted on the right.
 */
char    chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-017 controls */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-037 controls */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,	/* 040-057 punctuation */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,	/* 060-077 digits etc. */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 100-117 at sign, A-O */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,	/* 120-137 P-Z etc. */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 140-157 grave, a-o */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0	/* 160-177 p-z etc. */
};

/* True when the last thing scanned was a newline (lexi also sets it after a
   form feed).  lexi copies it into col_1 at the start of every call and then
   clears it. */
int     last_nl = true;


/*
 * lexi -- scan one token from the input buffer.
 *
 * Leading blanks and tabs are skipped, the text of the token is copied into
 * the global "token" (NUL terminated) and a code describing the token is
 * returned.
 *
 * Globals used:
 *	buf_ptr, buf_end  input cursor and its limit; fill_buffer() is called
 *			  whenever the cursor reaches the limit
 *	token		  receives the text of the token
 *	col_1		  set to true iff the token starts in column 1
 *	last_nl		  set after a newline or a form feed has been scanned
 *	last_u_d	  set to true iff this token is a "unary delimiter",
 *			  i.e. an operator that follows it must be unary
 *	had_eof		  tested only; a newline scanned while it is set is
 *			  reported as 0 instead of newline
 *	p_l_follow	  tested only; when nonzero a declaration keyword is
 *			  returned as a plain identifier
 *	line_no		  printed in a diagnostic and incremented for an
 *			  escaped newline inside a literal
 *
 * Returns one of ident, decl, swstmt, casestmt, sp_paren, sp_nparen, newline,
 * lparen, rparen, preesc, question, colon, semicolon, lbrace, rbrace,
 * form_feed, comma, period, unary_op, binary_op, postop, comment, or 0.
 * Strings and character literals are returned as ident.
 *
 * NOTE(review): the alphanumeric scan does not check the length of the token
 * against the size of "token"; confirm that the buffer is large enough for
 * any identifier the caller can feed in.
 */
int     lexi () {
    register char  *tok;       /* next free position in the token buffer */
    register int    i;         /* index into specials[] */
    register char  *j;         /* walks the reserved word being compared */
    int     unary_delim;       /* 1 if this token forces a following
                                  operator to be unary */
    static int  last_code;     /* type of the last token returned */
    static int  l_struct;      /* 1 if the last token was 'struct' */
    int     found_it;          /* 1 while the token still matches a keyword */
    int     prev_code;         /* last_code as it was on entry to the
                                  keyword search */
    int     code;              /* token code to be returned */
    char    qchar;             /* closing delimiter of a literal */

    tok = token;
    unary_delim = false;
    col_1 = last_nl;           /* the token starts in column 1 only if the
                                  last thing scanned was a newline ... */
    last_nl = false;

    while (*buf_ptr == ' ' || *buf_ptr == '\t') {
        col_1 = false;         /* ... and no blanks come before it */
        if (++buf_ptr >= buf_end)
            fill_buffer ();
    }

    /*
     * Alphanumeric token: identifier, number or reserved word.
     */
    if (chartype[*buf_ptr & 0177] == alphanum) {
        while (chartype[*buf_ptr & 0177] == alphanum) {
            *tok++ = *buf_ptr++;
            if (buf_ptr >= buf_end)
                fill_buffer ();
        }

        *tok++ = '\0';

        if (l_struct) {
            /* the word right after 'struct' is the structure tag and is
               reported as part of the declaration */
            l_struct = false;
            last_code = ident;
            last_u_d = true;
            return (decl);
        }

        last_u_d = false;      /* an operator after an identifier is binary */

        /* Remember what came before this word.  The search below records
           ident in last_code, which used to happen before the "declared
           variable" test further down and so made that test always fail. */
        prev_code = last_code;
        last_code = ident;

        for (i = 0; specials[i].rwd != 0; ++i) {
            j = specials[i].rwd;
            tok = token;
            found_it = true;
            do {               /* compare up to and including the NUL */
                if (*tok++ != *j) {
                    found_it = false;
                    break;
                }
            } while (*j++);

            if (found_it) {
                last_u_d = true;   /* an operator after a keyword is unary */
                switch (specials[i].rwcode) {
                    case 1:    /* switch */
                        return (swstmt);

                    case 2:    /* case or default */
                        return (casestmt);

                    case 3:    /* struct: the next word is its tag */
                        l_struct = true;
                        /* FALLTHROUGH */

                    case 4:    /* a declaration keyword */
                        if (p_l_follow)
                            break;     /* inside parens: a cast, so it is
                                          returned as an identifier below */
                        last_code = decl;
                        return (decl);

                    case 5:    /* if, while, for */
                        return (sp_paren);

                    case 6:    /* do, else */
                        return (sp_nparen);

                    default:   /* everything else is an ordinary identifier */
                        return (ident);
                }
            }
        }

        if (prev_code == decl)
            last_u_d = true;   /* a declared variable: a following sign is
                                  unary, so that "int a -1" works */
        last_code = ident;
        return (ident);
    }

    /*
     * Non-alphanumeric token.  The first character is stored right away;
     * the cases below append to it where the token is longer.
     */
    *tok++ = *buf_ptr;
    *tok = '\0';
    if (++buf_ptr >= buf_end)
        fill_buffer ();

    switch (*token) {
        case '\n':
            unary_delim = last_u_d;
            last_nl = true;
            /* once the input is exhausted the newline is a dummy and the
               code returned tells the caller to stop */
            code = (had_eof ? 0 : newline);
            break;

        case '\'':             /* character literal */
            qchar = '\'';
            goto copy_lit;

        case '"':              /* string literal */
            qchar = '"';

    copy_lit:
            do {
                while (1) {    /* copy one character, or an escape together
                                  with the character that follows it */
                    if (*buf_ptr == '\n') {
                        printf ("%d: Unterminated literal\n", line_no);
                        goto stop_lit;
                    }

                    *tok = *buf_ptr++;
                    if (buf_ptr >= buf_end)
                        fill_buffer ();
                    if (had_eof || ((tok - token) > (bufsize - 2))) {
                        printf ("Unterminated literal\n");
                        ++tok;
                        goto stop_lit;
                    }

                    if (*tok == '\\') {
                        if (*buf_ptr == '\n')
                            ++line_no; /* escaped newline */
                        *(++tok) = *buf_ptr++;
                        ++tok; /* two characters were stored */
                        if (buf_ptr >= buf_end)
                            fill_buffer ();
                    }
                    else
                        break;
                }
            } while (*tok++ != qchar);

    stop_lit:
            code = ident;
            break;

        case ('('):
        case ('['):
            unary_delim = true;
            code = lparen;
            break;

        case (')'):
        case (']'):
            code = rparen;
            break;

        case '#':
            unary_delim = last_u_d;
            code = preesc;
            break;

        case '?':
            unary_delim = true;
            code = question;
            break;

        case (':'):
            code = colon;
            unary_delim = true;
            break;

        case (';'):
            unary_delim = true;
            code = semicolon;
            break;

        case ('{'):
            unary_delim = true;
            code = lbrace;
            break;

        case ('}'):
            unary_delim = true;
            code = rbrace;
            break;

        case 014:              /* form feed */
            unary_delim = last_u_d;
            last_nl = true;    /* so that col_1 comes out right next time */
            code = form_feed;
            break;

        case (','):
            unary_delim = true;
            code = comma;
            break;

        case '.':
            unary_delim = false;
            code = period;
            break;

        case '-':
        case '+':
            code = (last_u_d ? unary_op : binary_op);
            unary_delim = true;

            if (*buf_ptr == token[0]) {
                /* doubled sign; the input buffer is refilled after the
                   switch if this consumed its last character */
                *tok++ = *buf_ptr++;
                if (last_code == ident || last_code == rparen) {
                    code = (last_u_d ? unary_op : postop);
                    unary_delim = false;
                }
            }
            else if (*buf_ptr == '>' || *buf_ptr == '=') {
                /* a sign followed by '>' or '=' is one two-character
                   operator; refill is again left to the end of the switch */
                *tok++ = *buf_ptr++;
            }
            break;

        case '=':
            /* the index is masked like every other chartype[] lookup so a
               character outside 0..127 cannot index outside the table */
            if (chartype[*buf_ptr & 0177] == opchar) {
                *tok++ = *buf_ptr;
                if (++buf_ptr >= buf_end)
                    fill_buffer ();
            }

            code = binary_op;
            unary_delim = true;
            if (token[1] != '<' && token[1] != '>')
                break;
            /* FALLTHROUGH: a third character may still belong to it */

        case '>':
        case '<':
        case '!':
            if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
                *tok++ = *buf_ptr;
                if (++buf_ptr >= buf_end)
                    fill_buffer ();
            }

            if (*buf_ptr == '=')
                *tok++ = *buf_ptr++;
            code = (last_u_d ? unary_op : binary_op);
            unary_delim = true;
            break;

        default:
            if (token[0] == '/' && *buf_ptr == '*') {
                /* start of a comment; only the opening delimiter is scanned
                   here */
                *tok++ = '*';

                if (++buf_ptr >= buf_end)
                    fill_buffer ();

                code = comment;
                unary_delim = last_u_d;
                break;
            }

            /* absorb repeats of the previous character and any '=' so that
               doubled operators and runs of stars become one token */
            while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
                *tok++ = *buf_ptr;
                if (++buf_ptr >= buf_end)
                    fill_buffer ();
            }

            code = (last_u_d ? unary_op : binary_op);
            unary_delim = true;
    }

    if (code != newline) {
        l_struct = false;
        last_code = code;
    }

    if (buf_ptr >= buf_end)    /* input buffer used up */
        fill_buffer ();
    last_u_d = unary_delim;
    *tok = '\0';               /* terminate the token */
    return (code);
}
Commit	Line	Data
4b365fcd KM	1	static char sccsid[] = "@(#)lexi.c 4.1 (Berkeley) %G%";
	2
	3	/*
	4
	5	Copyright (C) 1976
	6	by the
	7	Board of Trustees
	8	of the
	9	University of Illinois
	10
	11	All rights reserved
	12
	13
	14	NAME:
	15	lexi
	16
	17	FUNCTION:
	18	This is the token scanner for indent
	19
	20	ALGORITHM:
	21	1) Strip off intervening blanks and/or tabs.
	22	2) If it is an alphanumeric token, move it to the token buffer "token".
	23	Check if it is a special reserved word that indent will want to
	24	know about.
	25	3) Non-alphanumeric tokens are handled with a big switch statement. A
	26	flag is kept to remember if the last token was a "unary delimiter",
	27	which forces a following operator to be unary as opposed to binary.
	28
	29	PARAMETERS:
	30	None
	31
	32	RETURNS:
	33	An integer code indicating the type of token scanned.
	34
	35	GLOBALS:
	36	buf_ptr =
	37	had_eof
	38	last_u_d = Set to true iff this token is a "unary delimiter"
	39
	40	CALLS:
	41	fill_buffer
	42	printf (lib)
	43
	44	CALLED BY:
	45	main
	46
	47	NOTES:
	48	Start of comment is passed back so that the comment can be scanned by
	49	pr_comment.
	50
	51	Strings and character literals are returned just like identifiers.
	52
	53	HISTORY:
	54	initial coding November 1976 D A Willcox of CAC
	55	1/7/77 D A Willcox of CAC Fix to provide proper handling
	56	of "int a -1;"
	57
	58	*/\f
	59
	60	/* Here we have the token scanner for indent. It scans off one token and
	61	puts it in the global variable "token". It returns a code, indicating the
	62	type of token scanned. */
	63
	64	#include "indent_globs.h";
65	#include "indent_codes.h";
66
67
68
69	#define alphanum 1
70	#define opchar 3
71
72	struct templ {
73	char *rwd;
74	int rwcode;
75	};
76
77	struct templ specials[] =
78	{
79	"switch", 1,
80	"case", 2,
81	"struct", 3,
82	"default", 2,
83	"int", 4,
84	"char", 4,
85	"float", 4,
86	"double", 4,
87	"long", 4,
88	"short", 4,
89	"typdef", 4,
90	"unsigned", 4,
91	"register", 4,
92	"static", 4,
93	"global", 4,
94	"extern", 4,
95	"if", 5,
96	"while", 5,
97	"for", 5,
98	"else", 6,
99	"do", 6,
100	"sizeof", 0,
101	0, 0
102	};
103
104	char chartype[128] =
105	{ /* this is used to facilitate the decision of what type
106	(alphanumeric, operator) each character is */
107	0, 0, 0, 0, 0, 0, 0, 0,
108	0, 0, 0, 0, 0, 0, 0, 0,
109	0, 0, 0, 0, 0, 0, 0, 0,
110	0, 0, 0, 0, 0, 0, 0, 0,
111	0, 3, 0, 0, 0, 3, 3, 0,
112	0, 0, 3, 3, 0, 3, 3, 3,
113	1, 1, 1, 1, 1, 1, 1, 1,
114	1, 1, 0, 0, 3, 3, 3, 3,
115	0, 1, 1, 1, 1, 1, 1, 1,
116	1, 1, 1, 1, 1, 1, 1, 1,
117	1, 1, 1, 1, 1, 1, 1, 1,
118	1, 1, 1, 0, 0, 0, 3, 1,
119	0, 1, 1, 1, 1, 1, 1, 1,
120	1, 1, 1, 1, 1, 1, 1, 1,
121	1, 1, 1, 1, 1, 1, 1, 1,
122	1, 1, 1, 0, 3, 0, 3, 0
123	};
124
125	int last_nl = true;
126	/* this is true if the last thing scanned was a newline */
127
128
129
130	int lexi () {
131	register char *tok;
132	/* local pointer to next char in token */
133	register int i;
134	/* local loop counter */
135	register char *j;
136	/* used for searching thru list of reserved words */
137	int unary_delim;
138	/* this is set to 1 if the current token forces a following operator to be
139	unary */
140	static int last_code;
141	/* the last token type returned */
142	static int l_struct;
143	/* set to 1 if the last token was 'struct' */
144	int found_it;
145	int code; /* internal code to be returned */
146	char qchar; /* the delimiter character for a string */
147
148	tok = token; /* point to start of place to save token */
149	unary_delim = false;
150	col_1 = last_nl; /* tell world that this token started in column
151	1 iff the last thing scanned was nl */
152	last_nl = false;
153
154	while (buf_ptr == ' ' \|\| buf_ptr == '\t') {
155	/* get rid of blanks */
156	col_1 = false; /* leading blanks imply token is not in column 1
157	*/
158	if (++buf_ptr >= buf_end)
159	fill_buffer ();
160	}
161
162	/----------------------------------------------------------\
163	\| Scan an alphanumeric token
164	\----------------------------------------------------------/
165
166	if (chartype[*buf_ptr & 0177] == alphanum) {
167	/* we have a character or number */
168	while (chartype[*buf_ptr & 0177] == alphanum) {
169	/* copy it over */
170	tok++ = buf_ptr++;
171	if (buf_ptr >= buf_end)
172	fill_buffer ();
173	}
174
175	*tok++ = '\0';
176
177	if (l_struct) { /* if last token was 'struct', then this token
178	should be treated as a declaration */
179	l_struct = false;
180	last_code = ident;
181	last_u_d = true;
182	return (decl);
183	}
184
185	last_u_d = false; /* operator after indentifier is binary */
186
187	for (i = 0; specials[i].rwd != 0; ++i) {
188	/* this loop will check if the token is a keyword. if so, a following
189	operator is unary */
190	last_code = ident; /* remember that this is the code we will return
191	*/
192	j = specials[i].rwd;
193	/* point at ith reserved word */
194	tok = token; /* point at scanned toekn */
195	found_it = true; /* set to false if not found */
196	do {
197	if (tok++ != j) {
198	found_it = false;
199	break;
200	}
201	} while (*j++);
202
203	if (found_it) { /* we have a keyword */
204	last_u_d = true;
205	switch (specials[i].rwcode) {
206	case 1: /* it is a switch */
207	return (swstmt);
208	case 2: /* a case or default */
209	return (casestmt);
210
211	case 3: /* a "struct" */
212	l_struct = true;
213	/* Next time around, we will want to know that we have had
214	a 'struct' */
215	case 4: /* one of the declaration keywords */
216	if(p_l_follow) break; /* inside parens: cast */
217	last_code = decl;
218	return (decl);
219
220	case 5: /* if, while, for */
221	return (sp_paren);
222
223	case 6: /* do, else */
224	return (sp_nparen);
225
226	default: /* all others are treated like any other
227	identifier */
228	return (ident);
229	} /* end of switch */
230	} /* end of if (found_it) */
231
232	}
233
234	if (last_code == decl) /* if this is a declared variable, then
235	following sign is unary */
236	last_u_d = true; /* will make "int a -1" work */
237	last_code = ident;
238	return (ident); /* the ident is not in the list */
239	} /* end of procesing for alpanum character */
240
241
242
243	/----------------------------------------------------------\
244	\| Scan a non-alphanumeric token
245	\----------------------------------------------------------/
246
247	tok++ = buf_ptr; /* if it is only a one-character token, it is
248	moved here */
249	*tok = '\0';
250	if (++buf_ptr >= buf_end)
251	fill_buffer ();
252
253	switch (*token) {
254	case '\n':
255	unary_delim = last_u_d;
256	last_nl = true; /* remember that we just had a newline */
257	code = (had_eof ? 0 : newline);
258	/* if data has been exausted, the newline is a dummy, and we should
259	return code to stop */
260	break;
261
262	case '\'': /* start of quoted character */
263	qchar = '\''; /* remember final delimiter */
264	goto copy_lit; /* and go to common literal code */
265
266	case '"': /* start of string */
267	qchar = '"';
268
269	copy_lit:
270	do { /* copy the string */
271	while (1) { /* move one character or [/<char>]<char> */
272	if (*buf_ptr == '\n') {
273	/* check for unterminated literal */
274	printf ("%d: Unterminated literal\n", line_no);
275	goto stop_lit;
276	/* Don't copy any more */
277	}
278
279	tok = buf_ptr++;
280	if (buf_ptr >= buf_end)
281	fill_buffer ();
282	if (had_eof \|\| ((tok - token) > (bufsize - 2))) {
283	printf ("Unterminated literal\n");
284	++tok;
285	goto stop_lit;
286	/* get outof literal copying loop */
287	}
288
289	if (*tok == '\\') {
290	/* if escape, copy extra char */
291	if (*buf_ptr == '\n')
292	/* check for escaped newline */
293	++line_no;
294	(++tok) = buf_ptr++;
295	++tok; /* we must increment this again because we
296	copied two chars */
297	if (buf_ptr >= buf_end)
298	fill_buffer ();
299	}
300	else
301	break; /* we copied one character */
302	} /* end of while (1) */
303	} while (*tok++ != qchar);
304
305	stop_lit:
306	code = ident;
307	break;
308
309	case ('('):
310	case ('['):
311	unary_delim = true;
312	code = lparen;
313	break;
314
315	case (')'):
316	case (']'):
317	code = rparen;
318	break;
319
320	case '#':
321	unary_delim = last_u_d;
322	code = preesc;
323	break;
324
325	case '?':
326	unary_delim = true;
327	code = question;
328	break;
329
330	case (':'):
331	code = colon;
332	unary_delim = true;
333	break;
334
335	case (';'):
336	unary_delim = true;
337	code = semicolon;
338	break;
339
340	case ('{'):
341	unary_delim = true;
342	code = lbrace;
343	break;
344
345	case ('}'):
346	unary_delim = true;
347	code = rbrace;
348	break;
349
350	case 014: /* a form feed */
351	unary_delim = last_u_d;
352	last_nl = true; /* remember this so we can set 'col_1' right */
353	code = form_feed;
354	break;
355
356	case (','):
357	unary_delim = true;
358	code = comma;
359	break;
360
361	case '.':
362	unary_delim = false;
363	code = period;
364	break;
365
366	case '-':
367	case '+': /* check for -, +, --, ++ */
368	code = (last_u_d ? unary_op : binary_op);
369	unary_delim = true;
370
371	if (*buf_ptr == token[0]) {
372	/* check for doubled character */
373	tok++ = buf_ptr++;
374	/* buffer overflow will be checked at end of loop */
375	if (last_code == ident \|\| last_code == rparen) {
376	code = (last_u_d ? unary_op : postop);
377	/* check for following ++ or -- */
378	unary_delim = false;
379	}
380	}
381	else
382	if (buf_ptr == '>' \|\| buf_ptr == '=')
383	/* check for operator -> or += */
384	tok++ = buf_ptr++;
385	/* buffer overflow will be checked at end of switch */
386
387	break;
388
389	case '=':
390	if (chartype[*buf_ptr] == opchar) {
391	/* we have two char assignment */
392	tok++ = buf_ptr;
393	/* move second character */
394	if (++buf_ptr >= buf_end)
395	fill_buffer ();
396	}
397
398	code = binary_op;
399	unary_delim = true;
400	if (token[1] != '<' && token[1] != '>')
401	/* check for possible 3 char operator */
402	break;
403	/* can drop thru!!! */
404
405	case '>':
406	case '<':
407	case '!': /* ops like <, <<, <=, !=, etc */
408	if (buf_ptr == '>' \|\| buf_ptr == '<' \|\| *buf_ptr == '=') {
409	tok++ = buf_ptr;
410	if (++buf_ptr >= buf_end)
411	fill_buffer ();
412	}
413
414	if (*buf_ptr == '=')
415	tok++ = buf_ptr++;
416	code = (last_u_d ? unary_op : binary_op);
417	unary_delim = true;
418	break;
419
420	default:
421	if (token[0] == '/' && buf_ptr == '') {
422	/* it is start of comment */
423	tok++ = '';
424
425	if (++buf_ptr >= buf_end)
426	fill_buffer ();
427
428	code = comment;
429	unary_delim = last_u_d;
430	break;
431	}
432
433	while ((tok - 1) == buf_ptr \|\| *buf_ptr=='=') {
434	/* handle \|\|, &&, etc, and also things as in int ****i /
435	tok++ = buf_ptr;
436	if (++buf_ptr >= buf_end)
437	fill_buffer ();
438	}
439
440
441	code = (last_u_d ? unary_op : binary_op);
442	unary_delim = true;
443
444
445	} /* end of switch */
446
447	if (code != newline) {
448	l_struct = false;
449	last_code = code;
450	}
451
452	if (buf_ptr >= buf_end) /* check for input buffer empty */
453	fill_buffer ();
454	last_u_d = unary_delim;
455	tok = '\0'; / null terminate the token */
456	return (code);
457	};