%{
/*
 * break out words, output cap + word(inverted)
 *
 * Tokens are written one per line, normally as "<tag>:<text>" (OUT1).
 * Some words that lookup() does not know are written back to front
 * with no tag (OUT).
 */

#ifndef lint
static char sccsid[] = "@(#)style1.l 4.2 (Berkeley) 82/11/06";
#endif /* not lint */

#include <stdio.h>
#include <ctype.h>

/* Print yytext reversed, then a newline.  Uses the global index i. */
#define OUT()	do { for(i=yyleng-1;i>=0; i--)putchar(yytext[i]); putchar('\n'); } while(0)
/* Print "<nam>:<yytext>" where nam is a one-character tag. */
#define OUT1(nam)	printf("%c:%s\n",nam,yytext)
/* Print a ready-made output line. */
#define OUTN(string)	printf("%s\n",string)

#include "names.h"
#include "nhash.c"
#include "dict.c"
#include "ydict.c"
#include "abbrev.c"

char nt[] = "D:n't";	/* line emitted for a stripped "n't" */
char qs[] = "c:'s";	/* line emitted for a stripped "'s" */
char fin[] = "E:.";	/* line emitted to close a sentence */
int NOCAPS = 0;		/* if set all caps are turned to lower case */
int i,j;		/* scratch index / lookup result shared by the actions */
int dot = 0;		/* set by rules whose token ended in a '.' */
int first = 1;		/* 1 until a word is seen after sentence-ending output */
int qflg,nflg;		/* qflg: toggled by the quote rule; nflg: an "n't" was stripped */
int cap = 0;		/* set when the current word had its capital folded */
%}
%p 3000
%a 3300
%o 4500

L	[a-z]
N	[0-9]
C	[A-Z]
A	[a-zA-Z]
P	[a-zA-Z0-9]

%%
^[.!].+[\n]	{
		/*
		 * A line that starts with '.' or '!': close a sentence left
		 * pending by a trailing dot, then echo the line after a ':'.
		 */
		if (dot) {
			OUTN(fin);
			dot = 0;
			first = 1;
		}
		printf(":%s", yytext);
	}
May	{
		/*
		 * "May" after the first word of a sentence is tagged NOUN;
		 * as the first word it is folded to lower case and handed
		 * to the ordinary word rule.
		 */
		if (first != 0) {
			first = 0;
			yytext[0] = tolower(yytext[0]);
			cap = 1;
			goto wd;
		}
		OUT1(NOUN);
	}
"U.S."	{
		OUT1(NOUN);
	}
{C}{L}*'[s]	{
		/* Capitalised word with 's: pos(1) handles the capital. */
		pos(1);
		if (first == 1) {
			first = 0;
		}
	}
{C}+['][s]	{
		/* All-capitals word with 's, tagged POS as a whole. */
		if (NOCAPS) {
			for (i = 0; i < yyleng; i++) {
				if (isupper(yytext[i])) {
					yytext[i] = tolower(yytext[i]);
				}
			}
		}
		OUT1(POS);
	}
{P}+([-]{P}+)+	{
		/* Hyphenated compound, tagged NOUN_ADJ as a whole. */
		if (NOCAPS) {
			for (i = 0; i < yyleng; i++) {
				if (isupper(yytext[i])) {
					yytext[i] = tolower(yytext[i]);
				}
			}
		}
		OUT1(NOUN_ADJ);
	}
{C}{C}+	{
		/*
		 * Two or more capitals.  If the next character is 's' it is
		 * appended and the token is tagged PNOUN; otherwise the
		 * character is pushed back and the lower-cased word goes to
		 * the ordinary word rule.
		 */
		if (NOCAPS) {
			for (i = 0; i < yyleng; i++) {
				yytext[i] = tolower(yytext[i]);
			}
		}
		i = input();
		if (i == 's') {
			yytext[yyleng++] = 's';
			yytext[yyleng] = '\0';
			OUT1(PNOUN);
		}
		else {
			unput(i);
			if (!NOCAPS) {
				for (i = 0; i < yyleng; i++) {
					yytext[i] = tolower(yytext[i]);
				}
			}
			goto wd;
		}
	}
[LD][']{C}{L}*	{
		/* L' or D' followed by a capitalised word, tagged NOUN_ADJ. */
		if (NOCAPS) {
			yytext[0] = tolower(yytext[0]);
			yytext[2] = tolower(yytext[2]);
		}
		OUT1(NOUN_ADJ);
	}
{C}{L}*	{
		/*
		 * Capitalised word.  As the first word of a sentence the
		 * capital is not remembered; elsewhere cap is set.  A lone
		 * "I" keeps its capital and clears cap.
		 */
		if (first == 1) {
			first = 0;
		}
		else {
			cap = 1;
		}
		if (yyleng == 1 && yytext[0] == 'I') {
			cap = 0;
		}
		else {
			yytext[0] = tolower(yytext[0]);
		}
		goto wd;
	}
{N}":"{N}{N}	{
		OUT1(NOUN_ADJ);
	}
({N}*[,])*({N}+".")+[ \t\n]+{C}	{
		/*
		 * Number ending in '.' followed by white space and a
		 * capital: push the capital back, cut the token at its last
		 * '.', tag the number and close the sentence.
		 */
		i = yyleng - 1;
		while (i > 0 && yytext[i] != '.') {
			i--;
		}
		unput(yytext[yyleng-1]);
		yytext[i] = '\0';
		OUT1(NOUN_ADJ);
		OUTN(fin);
		first = 1;
	}
([hH]e"/"[sS]he)|([sS]he"/"[hH]e)	{
		/* he/she, she/he: only the first letter is case-folded. */
		if (NOCAPS && isupper(yytext[0])) {
			yytext[0] = tolower(yytext[0]);
		}
		OUT1(PRONS);
	}
([hH]is"/"[hH]er)|([hH]er"/"[hH]is)	{
		/* his/her, her/his: only the first letter is case-folded. */
		if (NOCAPS && isupper(yytext[0])) {
			yytext[0] = tolower(yytext[0]);
		}
		OUT1(POS);
	}
[ \t`]*[a-zA-Z0-9.]*("\/"[a-zA-Z0-9.]+)+[']*	{
		/*
		 * Slash-separated compound.  When it ends in '.', ahead()
		 * decides whether more text is glued on; a zero return marks
		 * a pending sentence end.
		 */
		if (yytext[yyleng-1] == '.') {
			if (ahead() == 0) {
				dot = 1;
			}
		}
		if (NOCAPS) {
			for (i = 0; i < yyleng; i++) {
				if (isupper(yytext[i])) {
					yytext[i] = tolower(yytext[i]);
				}
			}
		}
		OUT1(NOUN_ADJ);
	}
{N}+([,]{N}+)*("."{N}+)*[']*[s]*	{
		/* Integer or decimal, optionally followed by ' and s. */
		OUT1(NOUN_ADJ);
	}
{N}*([,]{N}+)*("."{N}+)+[']*[s]*	{
		/* Decimal that may lack leading digits. */
		OUT1(NOUN_ADJ);
	}
{N}+([,]{N}+)*("."{N}*)*[']*[s]*	{
		/* Number that may end in a bare '.'; remember the dot. */
		if (yytext[yyleng-1] == '.') {
			dot = 1;
		}
		OUT1(NOUN_ADJ);
	}
({A}*{N}+{A}*)+	{
		/*
		 * Mixed letters and digits.  Peek at the next character: a
		 * '.' is handed to ahead(), which either glues on following
		 * text or pushes the dot back.  Any other character is
		 * pushed back so it is not lost (previously it was silently
		 * dropped).  Nothing is pushed back at end of input.
		 */
		i = input();
		if (i == '.') {
			ahead();
		}
		else if (i > 0) {
			unput(i);
		}
		if (NOCAPS) {
			for (i = 0; i < yyleng; i++) {
				if (isupper(yytext[i])) {
					yytext[i] = tolower(yytext[i]);
				}
			}
		}
		OUT1(NOUN_ADJ);
	}
{N}+[%]	{
		OUT1(NOUN_ADJ);
	}
"$"{N}+([,]{N}+)*("."{N}*)*	{
		/* Dollar amount; a trailing '.' is remembered in dot. */
		if (yytext[yyleng-1] == '.') {
			dot = 1;
		}
		OUT1(NOUN);
	}
[Aa]"."[ ]*[Mm]"."	{
		/* a.m. */
		OUT1(ADJ_ADV);
	}
[Pp]"."[ ]*[Mm]"."	{
		/* p.m. */
		OUT1(ADJ_ADV);
	}
"a."[ ]*"d."	{
		OUT1(ADJ_ADV);
	}
"b."[ ]*"c."	{
		OUT1(ADJ_ADV);
	}
"i."[ ]*"e."	{
		OUT1(PREP);
	}
"e."[ ]*"g."	{
		OUT1(PREP);
	}
"etc."[ \n]*[,)]*	{
		/*
		 * Print "etc." as a NOUN, then look at the last matched
		 * character: ',' or ')' yields a comma line, anything else
		 * closes the sentence.
		 */
		i = yytext[4];
		yytext[4] = '\0';
		OUT1(NOUN);
		yytext[4] = i;
		yytext[0] = yytext[yyleng-1];
		yytext[1] = '\0';
		switch (yytext[0]) {
		case ',':
		case ')':
			OUT1(',');
			break;
		default:
			OUTN(fin);
			first = 1;
			break;
		}
	}
"et al."	{
		OUT1(NOUN);
	}
in"."[ \n]*{C}	{
		/*
		 * "in." before a capital: push the capital back, print "in"
		 * as a PREP and close the sentence.
		 */
		unput(yytext[yyleng-1]);
		yytext[2] = '\0';
		OUT1(PREP);
		OUTN(fin);
		first = 1;
	}
Ph"."[ ]*[Dd]"."	{
		OUT1(ADJ);
	}
[A-Z]"."	{
		/* Single capital plus '.', tagged NOUN; remember the dot. */
		dot = 1;
		OUT1(NOUN);
	}
can't	{
		/* Cut down to "can", flag the n't, tag via the word rule. */
		nflg = 1;
		yytext[3] = '\0';
		yyleng -= 2;
		goto wd;
	}
won't	{
		OUT1('X');
	}
ain't	{
		OUT1('g');
	}
{L}+n't	{
		/* Strip the trailing n't and tag the stem via the word rule. */
		nflg = 1;
		yyleng -= 3;
		yytext[yyleng] = '\0';
		goto wd;
	}
[A-Z]{L}+n't	{
		/* Same as above after folding the leading capital. */
		yytext[0] = tolower(yytext[0]);
		nflg = 1;
		yyleng -= 3;
		yytext[yyleng] = '\0';
		goto wd;
	}
o'clock	{
		OUT1(ADV);
	}
{L}+'[s]	{
		/* Lower-case word with 's; pos(0) leaves the case alone. */
		pos(0);
	}
'll	{
		/* Clitics are tagged with whatever lookup() gives the full word. */
		OUT1(lookup("will",1,0));
	}
've	{
		OUT1(lookup("have",1,0));
	}
're	{
		OUT1(lookup("are",1,0));
	}
'd	{
		OUT1(lookup("had",1,0));
	}
'm	{
		OUT1(lookup("am",1,0));
	}
'ld	{
		OUT1(lookup("would",1,0));
	}
{L}+	{
wd:
		/*
		 * Ordinary word; also the jump target (wd) of every rule
		 * that has reduced its token to a lower-case stem.
		 *
		 * Known words are printed with the tag from lookup().
		 * Unknown words ending in 'y' are tried against the suffix
		 * tables through look(); other unknown words are printed as
		 * NOUN_ADJ if they were capitalised, or reversed and
		 * untagged otherwise.
		 *
		 * The suffix tests are guarded by yyleng so that one- and
		 * two-letter words no longer index before yytext[0].
		 */
		if ((j = lookup(yytext,1,0)) != 0) {
			first = 0;
			if (cap) {
				if (!NOCAPS)
					yytext[0] = toupper(yytext[0]);
				cap = 0;
				/* a capital after a pending dot starts a new sentence */
				if (dot)
					OUTN(fin);
			}
			dot = 0;
			OUT1(j);
		}
		else {
			first = dot = 0;
			if (yyleng >= 2 && yytext[yyleng-1] == 'y' && cap == 0) {
				switch (yytext[yyleng-2]) {
				case 'c':
					look(cy,yyleng-2,NOUN);
					break;
				case 'f':
					look(fy,yyleng-2,VERB);
					break;
				case 'l':
					look(ly,yyleng-2,ADV);
					break;
				case 'g':
					if (yyleng >= 3 && yytext[yyleng-3] == 'o') {
						OUT1(NOUN);
						break;
					}
					look(gy,yyleng-2,ADJ);
					break;
				case 'r':
					switch (yyleng >= 3 ? yytext[yyleng-3] : '\0') {
					case 'a':
						look(ary,yyleng-3,ADJ);
						break;
					case 'o':
						look(ory,yyleng-3,ADJ);
						break;
					case 'e':
						look(ery,yyleng-3,NOUN);
						break;
					default:
						look(ry,yyleng-2,NOUN);
					}
					break;
				case 't':
					if (yyleng >= 3 && yytext[yyleng-3] == 'i')
						look(ity,yyleng-3,NOUN);
					else
						look(ty,yyleng-2,ADJ);
					break;
				default:
					OUT();
				}
			}
			else {
				if (cap) {
					if (!NOCAPS)
						yytext[0] = toupper(yytext[0]);
					cap = 0;
					OUT1(NOUN_ADJ);
				}
				else {
					OUT();
				}
			}
		}
		/*
		 * Emit the stripped "n't" right after its own stem, whether
		 * or not the stem was known.  Previously an unknown stem
		 * left nflg set, and the "n't" line was attached to the next
		 * known word instead.
		 */
		if (nflg == 1) {
			nflg = 0;
			OUTN(nt);
		}
	}
[\n]	{ /* newlines produce no output */ ; }
[ ]+	{ /* blanks produce no output */ ; }
[\t]+	{ /* tabs produce no output */ ; }
";"	{
		OUT1(';');
		first = 1;
	}
(\"|`|')+	{
		/*
		 * Run of quote characters.  A pending dot closes the
		 * sentence first.  qflg alternates between 0 and 1; turning
		 * it on also marks the next word as first.
		 */
		if (dot) {
			OUTN(fin);
			dot = 0;
		}
		if (qflg == 1) {
			qflg = 0;
		}
		else {
			qflg = 1;
			first = 1;
		}
		OUT1('"');
	}
".\""	{
		/* Period followed by a double quote ends quote and sentence. */
		qflg = 0;
		first = 1;
		OUT1(END);
	}
"..."	{
		OUT1(',');
	}
"/."	{
		first = 1;
		OUT1(END);
	}
{A}{A}+"."	{
		/*
		 * Word of two or more letters followed by '.'.  The dot is
		 * overwritten with NUL and the stem looked up with abbrev().
		 * A hit restores the dot and prints the abbreviation with
		 * its tag.  Otherwise ahead() decides whether text follows
		 * the dot directly (return 1) or the dot was pushed back
		 * (return 0), in which case the stem goes to the word rule.
		 *
		 * NOTE(review): when ahead() returns 1 the NUL written over
		 * the dot is still in place, so OUT1 appears to print only
		 * the stem -- confirm whether that is intended.
		 */
		yytext[yyleng-1] = '\0';
		j = abbrev(yytext,1,0);
		if (j != 0) {
			if (isupper(yytext[0])) {
				if (NOCAPS) {
					yytext[0] = tolower(yytext[0]);
				}
				if (first == 1) {
					first = 0;
				}
			}
			yytext[yyleng-1] = '.';
			OUT1(j);
		}
		else {
			j = ahead();
			if (j == 0) {
				yyleng--;
			}
			for (i = 0; i < yyleng; i++) {
				if (isupper(yytext[i])) {
					yytext[i] = tolower(yytext[i]);
					/* cap reflects the last capital seen: only a leading one counts */
					cap = (i == 0);
				}
			}
			if (j == 0) {
				goto wd;
			}
			OUT1(NOUN_ADJ);
		}
	}
"."	{
		/* Sentence-ending punctuation marks the next word as first. */
		first = 1;
		OUT1(END);
	}
"!\""	{
		/* The quoted forms also clear the open-quote flag. */
		qflg = 0;
		first = 1;
		OUT1(END);
	}
"!"	{
		first = 1;
		OUT1(END);
	}
"?\""	{
		qflg = 0;
		first = 1;
		OUT1(END);
	}
"?"	{
		first = 1;
		OUT1(END);
	}
":"	{
		/* Colons and dashes print as commas but restart "first". */
		OUT1(',');
		first = 1;
	}
[-]+	{
		OUT1(',');
		first = 1;
	}
","	{
		OUT1(',');
	}
(\[|\(|\{|\]|\)|\})	{
		/* Brackets of every kind print as commas. */
		OUT1(',');
	}
.	{
		/* any other character is dropped silently */
		;
	}
%%
/*
 * look -- tag the current token with the help of a suffix table.
 *
 * yytext is cut at index n for the duration of the call (*f)(yytext,1,0)
 * and restored afterwards.  A non-zero result is printed as the tag;
 * otherwise the fallback tag cc is printed.
 */
look(f,n,cc)
char (*f)();
int n;
char cc;
{
	char held;
	int tag;

	held = yytext[n];
	yytext[n] = '\0';
	tag = (*f)(yytext,1,0);
	yytext[n] = held;

	if (tag == 0) {
		OUT1(cc);
	}
	else {
		OUT1(tag);
	}
}
/*
 * pos -- print a token that ends in an apostrophe suffix.
 *
 * flg == 1 means the token began with a capital, which is folded before
 * the lookup.  If the stem before the last apostrophe is known to
 * lookup(), the stem is printed with its tag followed by the qs line.
 * Otherwise the apostrophe is put back (and the capital too, unless
 * NOCAPS is set) and the whole token is printed with the POS tag.
 */
pos(flg){
	int cut, tag;

	if (flg == 1) {
		yytext[0] = tolower(yytext[0]);
	}

	/* find the last apostrophe; the calling rules guarantee one */
	cut = yyleng - 1;
	while (yytext[cut] != '\'') {
		cut--;
	}
	yytext[cut] = '\0';

	tag = lookup(yytext,1,0);
	if (tag == 0) {
		if (flg == 1 && !NOCAPS) {
			yytext[0] = toupper(yytext[0]);
		}
		yytext[cut] = '\'';
		OUT1(POS);
	}
	else {
		yyleng = cut;
		OUT1(tag);
		OUTN(qs);
	}
}
/* Name of the input being scanned; "-" until a file argument is opened. */
char *filename="-";

/*
 * main -- print the leading ":" line, run the three table set-up
 * routines, then scan standard input or each file argument in turn.
 * Every scanned input is followed by the fin line.  The return value is
 * the number of arguments that could not be opened.
 */
main(argc,argv)
int argc;
char *argv[];
{
	register int rc;

	rc = 0;
	fputs(":\n", stdout);
	getd();
	getab();
	ygetd();

	if (argc <= 1) {
		yylex();
		OUTN(fin);
		return(rc);
	}

	for (; argc > 1; argc--, argv++) {
		/* each file is attached to stdin so that yylex() reads it */
		if (freopen(argv[1],"r",stdin) != NULL) {
			filename = argv[1];
			yylex();
			OUTN(fin);
		}
		else {
			fprintf(stderr,"%s: cannot open\n", argv[1]);
			rc++;
		}
	}
	return(rc);
}
/*
 * ahead -- look past a '.' that the caller has just consumed.
 *
 * If the next character is alphanumeric, a '.' and the characters up to
 * (not including) the next white space are appended to yytext, the
 * white-space character is pushed back, and 1 is returned.  Otherwise
 * the character and a '.' are pushed back and 0 is returned.
 *
 * The copy loop stops at end of input (input() yielding 0 or a negative
 * value) instead of spinning forever, and, where lex defines YYLMAX,
 * stops before running off the end of yytext.  Nothing is pushed back
 * for an end-of-input value.
 *
 * NOTE(review): the alphanumeric character that triggers the join is
 * itself not copied into yytext -- confirm whether that is intended.
 */
ahead(){
	register int c;

	c = input();
	if (c > 0 && isalnum(c)) {
		yytext[yyleng++] = '.';
		while ((c = input()) > 0 && !isspace(c)) {
#ifdef YYLMAX
			/* leave room for the terminating NUL */
			if (yyleng >= YYLMAX - 1)
				break;
#endif
			yytext[yyleng++] = c;
		}
		yytext[yyleng] = '\0';
		if (c > 0)
			unput(c);
		return(1);
	}
	if (c > 0)
		unput(c);
	unput('.');
	return(0);
}