BSD 4_3 development
[unix-history] / usr / contrib / nntp / rrn / search.c
CommitLineData
25a197fc
C
1/* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $
2 *
3 * $Log: search.c,v $
4 * Revision 4.3 85/05/01 11:50:16 lwall
5 * Baseline for release with 4.3bsd.
6 *
7 */
8
9/* string search routines */
10
11/* Copyright (c) 1981,1980 James Gosling */
12
13/* Modified Aug. 12, 1981 by Tom London to include regular expressions
14 as in ed. RE stuff hacked over by jag to correct a few major problems,
15 mainly dealing with searching within the buffer rather than copying
16 each line to a separate array. Newlines can now appear in RE's */
17
18/* Ripped to shreds and glued back together to make a search package,
19 * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
20 * Changes include:
21 * Buffer, window, and mlisp stuff gone.
22 * Translation tables reduced to 1 table.
23 * Expression buffer is now dynamically allocated.
24 * Character classes now implemented with a bitmap.
25 */
26
27#include "EXTERN.h"
28#include "common.h"
29#include "util.h"
30#include "INTERN.h"
31#include "search.h"
32
33#ifndef BITSPERBYTE
34#define BITSPERBYTE 8
35#endif
36
37#define BMAPSIZ (127 / BITSPERBYTE + 1)
38
39/* meta characters in the "compiled" form of a regular expression */
40#define CBRA 2 /* \( -- begin bracket */
41#define CCHR 4 /* a vanilla character */
42#define CDOT 6 /* . -- match anything except a newline */
43#define CCL 8 /* [...] -- character class */
44#define NCCL 10 /* [^...] -- negated character class */
45#define CDOL 12 /* $ -- matches the end of a line */
46#define CEND 14 /* The end of the pattern */
47#define CKET 16 /* \) -- close bracket */
48#define CBACK 18 /* \N -- backreference to the Nth bracketed
49 string */
50#define CIRC 20 /* ^ matches the beginning of a line */
51
52#define WORD 32 /* matches word character \w */
53#define NWORD 34 /* matches non-word characer \W */
54#define WBOUND 36 /* matches word boundary \b */
55#define NWBOUND 38 /* matches non-(word boundary) \B */
56
57#define STAR 01 /* * -- Kleene star, repeats the previous
58 REas many times as possible; the value
59 ORs with the other operator types */
60
61#define ASCSIZ 0200
62typedef char TRANSTABLE[ASCSIZ];
63
64static TRANSTABLE trans = {
650000,0001,0002,0003,0004,0005,0006,0007,
660010,0011,0012,0013,0014,0015,0016,0017,
670020,0021,0022,0023,0024,0025,0026,0027,
680030,0031,0032,0033,0034,0035,0036,0037,
690040,0041,0042,0043,0044,0045,0046,0047,
700050,0051,0052,0053,0054,0055,0056,0057,
710060,0061,0062,0063,0064,0065,0066,0067,
720070,0071,0072,0073,0074,0075,0076,0077,
730100,0101,0102,0103,0104,0105,0106,0107,
740110,0111,0112,0113,0114,0115,0116,0117,
750120,0121,0122,0123,0124,0125,0126,0127,
760130,0131,0132,0133,0134,0135,0136,0137,
770140,0141,0142,0143,0144,0145,0146,0147,
780150,0151,0152,0153,0154,0155,0156,0157,
790160,0161,0162,0163,0164,0165,0166,0167,
800170,0171,0172,0173,0174,0175,0176,0177,
81};
82static bool folding = FALSE;
83
84static int err;
85static char *FirstCharacter;
86
87void
88search_init()
89{
90#ifdef UNDEF
91 register int i;
92
93 for (i = 0; i < ASCSIZ; i++)
94 trans[i] = i;
95#else
96 ;
97#endif
98}
99
100void
101init_compex(compex)
102register COMPEX *compex;
103{
104 /* the following must start off zeroed */
105
106 compex->eblen = 0;
107 compex->brastr = Nullch;
108}
109
110void
111free_compex(compex)
112register COMPEX *compex;
113{
114 if (compex->eblen) {
115 free(compex->expbuf);
116 compex->eblen = 0;
117 }
118 if (compex->brastr) {
119 free(compex->brastr);
120 compex->brastr = Nullch;
121 }
122}
123
124static char *gbr_str = Nullch;
125static int gbr_siz = 0;
126
127char *
128getbracket(compex,n)
129register COMPEX *compex;
130int n;
131{
132 int length = compex->braelist[n] - compex->braslist[n];
133
134 if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0)
135 return nullstr;
136 growstr(&gbr_str, &gbr_siz, length+1);
137 safecpy(gbr_str, compex->braslist[n], length+1);
138 return gbr_str;
139}
140
141void
142case_fold(which)
143int which;
144{
145 register int i;
146
147 if (which != folding) {
148 if (which) {
149 for (i = 'A'; i <= 'Z'; i++)
150 trans[i] = tolower(i);
151 }
152 else {
153 for (i = 'A'; i <= 'Z'; i++)
154 trans[i] = i;
155 }
156 folding = which;
157 }
158}
159
160/* Compile the given regular expression into a [secret] internal format */
161
162char *
163compile (compex, strp, RE, fold)
164register COMPEX *compex;
165register char *strp;
166int RE;
167int fold;
168{
169 register int c;
170 register char *ep;
171 char *lastep;
172 char bracket[NBRA],
173 *bracketp;
174 char **alt = compex->alternatives;
175 char *retmes = "Badly formed search string";
176
177 case_fold(compex->do_folding = fold);
178 if (!compex->eblen) {
179 compex->expbuf = safemalloc(84);
180 compex->eblen = 80;
181 }
182 ep = compex->expbuf; /* point at expression buffer */
183 *alt++ = ep; /* first alternative starts here */
184 bracketp = bracket; /* first bracket goes here */
185 if (*strp == 0) { /* nothing to compile? */
186 if (*ep == 0) /* nothing there yet? */
187 return "Null search string";
188 return Nullch; /* just keep old expression */
189 }
190 compex->nbra = 0; /* no brackets yet */
191 lastep = 0;
192 for (;;) {
193 if (ep - compex->expbuf >= compex->eblen)
194 grow_eb(compex);
195 c = *strp++; /* fetch next char of pattern */
196 if (c == 0) { /* end of pattern? */
197 if (bracketp != bracket) { /* balanced brackets? */
198#ifdef VERBOSE
199 retmes = "Unbalanced parens";
200#endif
201 goto cerror;
202 }
203 *ep++ = CEND; /* terminate expression */
204 *alt++ = 0; /* terminal alternative list */
205 /*
206 compex->eblen = ep - compex->expbuf + 1;
207 compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */
208 return Nullch; /* return success */
209 }
210 if (c != '*')
211 lastep = ep;
212 if (!RE) { /* just a normal search string? */
213 *ep++ = CCHR; /* everything is a normal char */
214 *ep++ = c;
215 }
216 else /* it is a regular expression */
217 switch (c) {
218
219 case '\\': /* meta something */
220 switch (c = *strp++) {
221 case '(':
222 if (compex->nbra >= NBRA) {
223#ifdef VERBOSE
224 retmes = "Too many parens";
225#endif
226 goto cerror;
227 }
228 *bracketp++ = ++compex->nbra;
229 *ep++ = CBRA;
230 *ep++ = compex->nbra;
231 break;
232 case '|':
233 if (bracketp>bracket) {
234#ifdef VERBOSE
235 retmes = "No \\| in parens"; /* Alas! */
236#endif
237 goto cerror;
238 }
239 *ep++ = CEND;
240 *alt++ = ep;
241 break;
242 case ')':
243 if (bracketp <= bracket) {
244#ifdef VERBOSE
245 retmes = "Unmatched right paren";
246#endif
247 goto cerror;
248 }
249 *ep++ = CKET;
250 *ep++ = *--bracketp;
251 break;
252 case 'w':
253 *ep++ = WORD;
254 break;
255 case 'W':
256 *ep++ = NWORD;
257 break;
258 case 'b':
259 *ep++ = WBOUND;
260 break;
261 case 'B':
262 *ep++ = NWBOUND;
263 break;
264 case '0': case '1': case '2': case '3': case '4':
265 case '5': case '6': case '7': case '8': case '9':
266 *ep++ = CBACK;
267 *ep++ = c - '0';
268 break;
269 default:
270 *ep++ = CCHR;
271 if (c == '\0')
272 goto cerror;
273 *ep++ = c;
274 break;
275 }
276 break;
277 case '.':
278 *ep++ = CDOT;
279 continue;
280
281 case '*':
282 if (lastep == 0 || *lastep == CBRA || *lastep == CKET
283 || *lastep == CIRC
284 || (*lastep&STAR)|| *lastep>NWORD)
285 goto defchar;
286 *lastep |= STAR;
287 continue;
288
289 case '^':
290 if (ep != compex->expbuf && ep[-1] != CEND)
291 goto defchar;
292 *ep++ = CIRC;
293 continue;
294
295 case '$':
296 if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
297 goto defchar;
298 *ep++ = CDOL;
299 continue;
300
301 case '[': { /* character class */
302 register int i;
303
304 if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
305 grow_eb(compex); /* reserve bitmap */
306 for (i = BMAPSIZ; i; --i)
307 ep[i] = 0;
308
309 if ((c = *strp++) == '^') {
310 c = *strp++;
311 *ep++ = NCCL; /* negated */
312 }
313 else
314 *ep++ = CCL; /* normal */
315
316 i = 0; /* remember oldchar */
317 do {
318 if (c == '\0') {
319#ifdef VERBOSE
320 retmes = "Missing ]";
321#endif
322 goto cerror;
323 }
324 if (*strp == '-' && *(++strp))
325 i = *strp++;
326 else
327 i = c;
328 while (c <= i) {
329 ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
330 if (fold && isalpha(c))
331 ep[(c ^ 32) / BITSPERBYTE] |=
332 1 << ((c ^ 32) % BITSPERBYTE);
333 /* set the other bit too */
334 c++;
335 }
336 } while ((c = *strp++) != ']');
337 ep += BMAPSIZ;
338 continue;
339 }
340
341 defchar:
342 default:
343 *ep++ = CCHR;
344 *ep++ = c;
345 }
346 }
347cerror:
348 compex->expbuf[0] = 0;
349 compex->nbra = 0;
350 return retmes;
351}
352
353void
354grow_eb(compex)
355register COMPEX *compex;
356{
357 compex->eblen += 80;
358 compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
359}
360
361char *
362execute (compex, addr)
363register COMPEX *compex;
364char *addr;
365{
366 register char *p1 = addr;
367 register char *trt = trans;
368 register int c;
369
370 if (addr == Nullch)
371 return Nullch;
372 if (compex->nbra) { /* any brackets? */
373 for (c = 0; c <= compex->nbra; c++)
374 compex->braslist[c] = compex->braelist[c] = Nullch;
375 if (compex->brastr)
376 free(compex->brastr);
377 compex->brastr = savestr(p1); /* in case p1 is not static */
378 p1 = compex->brastr; /* ! */
379 }
380 case_fold(compex->do_folding); /* make sure table is correct */
381 FirstCharacter = p1; /* for ^ tests */
382 if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
383 c = trt[compex->expbuf[1]]; /* fast check for first character */
384 do {
385 if (trt[*p1] == c && advance (compex, p1, compex->expbuf))
386 return p1;
387 p1++;
388 } while (*p1 && !err);
389 return Nullch;
390 }
391 else { /* regular algorithm */
392 do {
393 register char **alt = compex->alternatives;
394 while (*alt) {
395 if (advance (compex, p1, *alt++))
396 return p1;
397 }
398 p1++;
399 } while (*p1 && !err);
400 return Nullch;
401 }
402}
403
404/* advance the match of the regular expression starting at ep along the
405 string lp, simulates an NDFSA */
406bool
407advance (compex, lp, ep)
408register COMPEX *compex;
409register char *ep;
410register char *lp;
411{
412 register char *curlp;
413 register char *trt = trans;
414 register int i;
415
416 while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
417 switch (*ep++) {
418
419 case CCHR:
420 if (trt[*ep++] != trt[*lp]) return FALSE;
421 lp++;
422 continue;
423
424 case CDOT:
425 if (*lp == '\n') return FALSE;
426 lp++;
427 continue;
428
429 case CDOL:
430 if (!*lp || *lp == '\n')
431 continue;
432 return FALSE;
433
434 case CIRC:
435 if (lp == FirstCharacter || lp[-1]=='\n')
436 continue;
437 return FALSE;
438
439 case WORD:
440 if (isalnum(*lp)) {
441 lp++;
442 continue;
443 }
444 return FALSE;
445
446 case NWORD:
447 if (!isalnum(*lp)) {
448 lp++;
449 continue;
450 }
451 return FALSE;
452
453 case WBOUND:
454 if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
455 (!*lp || !isalnum(*lp)) )
456 continue;
457 return FALSE;
458
459 case NWBOUND:
460 if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
461 (!*lp || !isalnum(*lp)))
462 continue;
463 return FALSE;
464
465 case CEND:
466 return TRUE;
467
468 case CCL:
469 if (cclass (ep, *lp, 1)) {
470 ep += BMAPSIZ;
471 lp++;
472 continue;
473 }
474 return FALSE;
475
476 case NCCL:
477 if (cclass (ep, *lp, 0)) {
478 ep += BMAPSIZ;
479 lp++;
480 continue;
481 }
482 return FALSE;
483
484 case CBRA:
485 compex->braslist[*ep++] = lp;
486 continue;
487
488 case CKET:
489 i = *ep++;
490 compex->braelist[i] = lp;
491 compex->braelist[0] = lp;
492 compex->braslist[0] = compex->braslist[i];
493 continue;
494
495 case CBACK:
496 if (compex->braelist[i = *ep++] == 0) {
497 fputs("bad braces\n",stdout) FLUSH;
498 err = TRUE;
499 return FALSE;
500 }
501 if (backref (compex, i, lp)) {
502 lp += compex->braelist[i] - compex->braslist[i];
503 continue;
504 }
505 return FALSE;
506
507 case CBACK | STAR:
508 if (compex->braelist[i = *ep++] == 0) {
509 fputs("bad braces\n",stdout) FLUSH;
510 err = TRUE;
511 return FALSE;
512 }
513 curlp = lp;
514 while (backref (compex, i, lp)) {
515 lp += compex->braelist[i] - compex->braslist[i];
516 }
517 while (lp >= curlp) {
518 if (advance (compex, lp, ep))
519 return TRUE;
520 lp -= compex->braelist[i] - compex->braslist[i];
521 }
522 continue;
523
524 case CDOT | STAR:
525 curlp = lp;
526 while (*lp++ && lp[-1] != '\n');
527 goto star;
528
529 case WORD | STAR:
530 curlp = lp;
531 while (*lp++ && isalnum(lp[-1]));
532 goto star;
533
534 case NWORD | STAR:
535 curlp = lp;
536 while (*lp++ && !isalnum(lp[-1]));
537 goto star;
538
539 case CCHR | STAR:
540 curlp = lp;
541 while (*lp++ && trt[lp[-1]] == trt[*ep]);
542 ep++;
543 goto star;
544
545 case CCL | STAR:
546 case NCCL | STAR:
547 curlp = lp;
548 while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR)));
549 ep += BMAPSIZ;
550 goto star;
551
552 star:
553 do {
554 lp--;
555 if (advance (compex, lp, ep))
556 return TRUE;
557 } while (lp > curlp);
558 return FALSE;
559
560 default:
561 fputs("Badly compiled pattern\n",stdout) FLUSH;
562 err = TRUE;
563 return -1;
564 }
565 if (*ep == CEND || *ep == CDOL) {
566 return TRUE;
567 }
568 return FALSE;
569}
570
571bool
572backref (compex, i, lp)
573register COMPEX *compex;
574register int i;
575register char *lp;
576{
577 register char *bp;
578
579 bp = compex->braslist[i];
580 while (*lp && *bp == *lp) {
581 bp++;
582 lp++;
583 if (bp >= compex->braelist[i])
584 return TRUE;
585 }
586 return FALSE;
587}
588
589bool
590cclass (set, c, af)
591register char *set;
592register int c;
593{
594 c &= 0177;
595#if BITSPERBYTE == 8
596 if (set[c >> 3] & 1 << (c & 7))
597#else
598 if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
599#endif
600 return af;
601 return !af;
602}