Commit | Line | Data |
---|---|---|
25a197fc C |
1 | /* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $ |
2 | * | |
3 | * $Log: search.c,v $ | |
4 | * Revision 4.3 85/05/01 11:50:16 lwall | |
5 | * Baseline for release with 4.3bsd. | |
6 | * | |
7 | */ | |
8 | ||
9 | /* string search routines */ | |
10 | ||
11 | /* Copyright (c) 1981,1980 James Gosling */ | |
12 | ||
13 | /* Modified Aug. 12, 1981 by Tom London to include regular expressions | |
14 | as in ed. RE stuff hacked over by jag to correct a few major problems, | |
15 | mainly dealing with searching within the buffer rather than copying | |
16 | each line to a separate array. Newlines can now appear in RE's */ | |
17 | ||
18 | /* Ripped to shreds and glued back together to make a search package, | |
19 | * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.) | |
20 | * Changes include: | |
21 | * Buffer, window, and mlisp stuff gone. | |
22 | * Translation tables reduced to 1 table. | |
23 | * Expression buffer is now dynamically allocated. | |
24 | * Character classes now implemented with a bitmap. | |
25 | */ | |
26 | ||
27 | #include "EXTERN.h" | |
28 | #include "common.h" | |
29 | #include "util.h" | |
30 | #include "INTERN.h" | |
31 | #include "search.h" | |
32 | ||
33 | #ifndef BITSPERBYTE | |
34 | #define BITSPERBYTE 8 | |
35 | #endif | |
36 | ||
37 | #define BMAPSIZ (127 / BITSPERBYTE + 1) | |
38 | ||
39 | /* meta characters in the "compiled" form of a regular expression */ | |
40 | #define CBRA 2 /* \( -- begin bracket */ | |
41 | #define CCHR 4 /* a vanilla character */ | |
42 | #define CDOT 6 /* . -- match anything except a newline */ | |
43 | #define CCL 8 /* [...] -- character class */ | |
44 | #define NCCL 10 /* [^...] -- negated character class */ | |
45 | #define CDOL 12 /* $ -- matches the end of a line */ | |
46 | #define CEND 14 /* The end of the pattern */ | |
47 | #define CKET 16 /* \) -- close bracket */ | |
48 | #define CBACK 18 /* \N -- backreference to the Nth bracketed | |
49 | string */ | |
50 | #define CIRC 20 /* ^ matches the beginning of a line */ | |
51 | ||
52 | #define WORD 32 /* matches word character \w */ | |
53 | #define NWORD 34 /* matches non-word characer \W */ | |
54 | #define WBOUND 36 /* matches word boundary \b */ | |
55 | #define NWBOUND 38 /* matches non-(word boundary) \B */ | |
56 | ||
57 | #define STAR 01 /* * -- Kleene star, repeats the previous | |
58 | REas many times as possible; the value | |
59 | ORs with the other operator types */ | |
60 | ||
61 | #define ASCSIZ 0200 | |
62 | typedef char TRANSTABLE[ASCSIZ]; | |
63 | ||
64 | static TRANSTABLE trans = { | |
65 | 0000,0001,0002,0003,0004,0005,0006,0007, | |
66 | 0010,0011,0012,0013,0014,0015,0016,0017, | |
67 | 0020,0021,0022,0023,0024,0025,0026,0027, | |
68 | 0030,0031,0032,0033,0034,0035,0036,0037, | |
69 | 0040,0041,0042,0043,0044,0045,0046,0047, | |
70 | 0050,0051,0052,0053,0054,0055,0056,0057, | |
71 | 0060,0061,0062,0063,0064,0065,0066,0067, | |
72 | 0070,0071,0072,0073,0074,0075,0076,0077, | |
73 | 0100,0101,0102,0103,0104,0105,0106,0107, | |
74 | 0110,0111,0112,0113,0114,0115,0116,0117, | |
75 | 0120,0121,0122,0123,0124,0125,0126,0127, | |
76 | 0130,0131,0132,0133,0134,0135,0136,0137, | |
77 | 0140,0141,0142,0143,0144,0145,0146,0147, | |
78 | 0150,0151,0152,0153,0154,0155,0156,0157, | |
79 | 0160,0161,0162,0163,0164,0165,0166,0167, | |
80 | 0170,0171,0172,0173,0174,0175,0176,0177, | |
81 | }; | |
82 | static bool folding = FALSE; | |
83 | ||
84 | static int err; | |
85 | static char *FirstCharacter; | |
86 | ||
87 | void | |
88 | search_init() | |
89 | { | |
90 | #ifdef UNDEF | |
91 | register int i; | |
92 | ||
93 | for (i = 0; i < ASCSIZ; i++) | |
94 | trans[i] = i; | |
95 | #else | |
96 | ; | |
97 | #endif | |
98 | } | |
99 | ||
100 | void | |
101 | init_compex(compex) | |
102 | register COMPEX *compex; | |
103 | { | |
104 | /* the following must start off zeroed */ | |
105 | ||
106 | compex->eblen = 0; | |
107 | compex->brastr = Nullch; | |
108 | } | |
109 | ||
110 | void | |
111 | free_compex(compex) | |
112 | register COMPEX *compex; | |
113 | { | |
114 | if (compex->eblen) { | |
115 | free(compex->expbuf); | |
116 | compex->eblen = 0; | |
117 | } | |
118 | if (compex->brastr) { | |
119 | free(compex->brastr); | |
120 | compex->brastr = Nullch; | |
121 | } | |
122 | } | |
123 | ||
124 | static char *gbr_str = Nullch; | |
125 | static int gbr_siz = 0; | |
126 | ||
127 | char * | |
128 | getbracket(compex,n) | |
129 | register COMPEX *compex; | |
130 | int n; | |
131 | { | |
132 | int length = compex->braelist[n] - compex->braslist[n]; | |
133 | ||
134 | if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0) | |
135 | return nullstr; | |
136 | growstr(&gbr_str, &gbr_siz, length+1); | |
137 | safecpy(gbr_str, compex->braslist[n], length+1); | |
138 | return gbr_str; | |
139 | } | |
140 | ||
141 | void | |
142 | case_fold(which) | |
143 | int which; | |
144 | { | |
145 | register int i; | |
146 | ||
147 | if (which != folding) { | |
148 | if (which) { | |
149 | for (i = 'A'; i <= 'Z'; i++) | |
150 | trans[i] = tolower(i); | |
151 | } | |
152 | else { | |
153 | for (i = 'A'; i <= 'Z'; i++) | |
154 | trans[i] = i; | |
155 | } | |
156 | folding = which; | |
157 | } | |
158 | } | |
159 | ||
160 | /* Compile the given regular expression into a [secret] internal format */ | |
161 | ||
162 | char * | |
163 | compile (compex, strp, RE, fold) | |
164 | register COMPEX *compex; | |
165 | register char *strp; | |
166 | int RE; | |
167 | int fold; | |
168 | { | |
169 | register int c; | |
170 | register char *ep; | |
171 | char *lastep; | |
172 | char bracket[NBRA], | |
173 | *bracketp; | |
174 | char **alt = compex->alternatives; | |
175 | char *retmes = "Badly formed search string"; | |
176 | ||
177 | case_fold(compex->do_folding = fold); | |
178 | if (!compex->eblen) { | |
179 | compex->expbuf = safemalloc(84); | |
180 | compex->eblen = 80; | |
181 | } | |
182 | ep = compex->expbuf; /* point at expression buffer */ | |
183 | *alt++ = ep; /* first alternative starts here */ | |
184 | bracketp = bracket; /* first bracket goes here */ | |
185 | if (*strp == 0) { /* nothing to compile? */ | |
186 | if (*ep == 0) /* nothing there yet? */ | |
187 | return "Null search string"; | |
188 | return Nullch; /* just keep old expression */ | |
189 | } | |
190 | compex->nbra = 0; /* no brackets yet */ | |
191 | lastep = 0; | |
192 | for (;;) { | |
193 | if (ep - compex->expbuf >= compex->eblen) | |
194 | grow_eb(compex); | |
195 | c = *strp++; /* fetch next char of pattern */ | |
196 | if (c == 0) { /* end of pattern? */ | |
197 | if (bracketp != bracket) { /* balanced brackets? */ | |
198 | #ifdef VERBOSE | |
199 | retmes = "Unbalanced parens"; | |
200 | #endif | |
201 | goto cerror; | |
202 | } | |
203 | *ep++ = CEND; /* terminate expression */ | |
204 | *alt++ = 0; /* terminal alternative list */ | |
205 | /* | |
206 | compex->eblen = ep - compex->expbuf + 1; | |
207 | compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */ | |
208 | return Nullch; /* return success */ | |
209 | } | |
210 | if (c != '*') | |
211 | lastep = ep; | |
212 | if (!RE) { /* just a normal search string? */ | |
213 | *ep++ = CCHR; /* everything is a normal char */ | |
214 | *ep++ = c; | |
215 | } | |
216 | else /* it is a regular expression */ | |
217 | switch (c) { | |
218 | ||
219 | case '\\': /* meta something */ | |
220 | switch (c = *strp++) { | |
221 | case '(': | |
222 | if (compex->nbra >= NBRA) { | |
223 | #ifdef VERBOSE | |
224 | retmes = "Too many parens"; | |
225 | #endif | |
226 | goto cerror; | |
227 | } | |
228 | *bracketp++ = ++compex->nbra; | |
229 | *ep++ = CBRA; | |
230 | *ep++ = compex->nbra; | |
231 | break; | |
232 | case '|': | |
233 | if (bracketp>bracket) { | |
234 | #ifdef VERBOSE | |
235 | retmes = "No \\| in parens"; /* Alas! */ | |
236 | #endif | |
237 | goto cerror; | |
238 | } | |
239 | *ep++ = CEND; | |
240 | *alt++ = ep; | |
241 | break; | |
242 | case ')': | |
243 | if (bracketp <= bracket) { | |
244 | #ifdef VERBOSE | |
245 | retmes = "Unmatched right paren"; | |
246 | #endif | |
247 | goto cerror; | |
248 | } | |
249 | *ep++ = CKET; | |
250 | *ep++ = *--bracketp; | |
251 | break; | |
252 | case 'w': | |
253 | *ep++ = WORD; | |
254 | break; | |
255 | case 'W': | |
256 | *ep++ = NWORD; | |
257 | break; | |
258 | case 'b': | |
259 | *ep++ = WBOUND; | |
260 | break; | |
261 | case 'B': | |
262 | *ep++ = NWBOUND; | |
263 | break; | |
264 | case '0': case '1': case '2': case '3': case '4': | |
265 | case '5': case '6': case '7': case '8': case '9': | |
266 | *ep++ = CBACK; | |
267 | *ep++ = c - '0'; | |
268 | break; | |
269 | default: | |
270 | *ep++ = CCHR; | |
271 | if (c == '\0') | |
272 | goto cerror; | |
273 | *ep++ = c; | |
274 | break; | |
275 | } | |
276 | break; | |
277 | case '.': | |
278 | *ep++ = CDOT; | |
279 | continue; | |
280 | ||
281 | case '*': | |
282 | if (lastep == 0 || *lastep == CBRA || *lastep == CKET | |
283 | || *lastep == CIRC | |
284 | || (*lastep&STAR)|| *lastep>NWORD) | |
285 | goto defchar; | |
286 | *lastep |= STAR; | |
287 | continue; | |
288 | ||
289 | case '^': | |
290 | if (ep != compex->expbuf && ep[-1] != CEND) | |
291 | goto defchar; | |
292 | *ep++ = CIRC; | |
293 | continue; | |
294 | ||
295 | case '$': | |
296 | if (*strp != 0 && (*strp != '\\' || strp[1] != '|')) | |
297 | goto defchar; | |
298 | *ep++ = CDOL; | |
299 | continue; | |
300 | ||
301 | case '[': { /* character class */ | |
302 | register int i; | |
303 | ||
304 | if (ep - compex->expbuf >= compex->eblen - BMAPSIZ) | |
305 | grow_eb(compex); /* reserve bitmap */ | |
306 | for (i = BMAPSIZ; i; --i) | |
307 | ep[i] = 0; | |
308 | ||
309 | if ((c = *strp++) == '^') { | |
310 | c = *strp++; | |
311 | *ep++ = NCCL; /* negated */ | |
312 | } | |
313 | else | |
314 | *ep++ = CCL; /* normal */ | |
315 | ||
316 | i = 0; /* remember oldchar */ | |
317 | do { | |
318 | if (c == '\0') { | |
319 | #ifdef VERBOSE | |
320 | retmes = "Missing ]"; | |
321 | #endif | |
322 | goto cerror; | |
323 | } | |
324 | if (*strp == '-' && *(++strp)) | |
325 | i = *strp++; | |
326 | else | |
327 | i = c; | |
328 | while (c <= i) { | |
329 | ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE); | |
330 | if (fold && isalpha(c)) | |
331 | ep[(c ^ 32) / BITSPERBYTE] |= | |
332 | 1 << ((c ^ 32) % BITSPERBYTE); | |
333 | /* set the other bit too */ | |
334 | c++; | |
335 | } | |
336 | } while ((c = *strp++) != ']'); | |
337 | ep += BMAPSIZ; | |
338 | continue; | |
339 | } | |
340 | ||
341 | defchar: | |
342 | default: | |
343 | *ep++ = CCHR; | |
344 | *ep++ = c; | |
345 | } | |
346 | } | |
347 | cerror: | |
348 | compex->expbuf[0] = 0; | |
349 | compex->nbra = 0; | |
350 | return retmes; | |
351 | } | |
352 | ||
353 | void | |
354 | grow_eb(compex) | |
355 | register COMPEX *compex; | |
356 | { | |
357 | compex->eblen += 80; | |
358 | compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4); | |
359 | } | |
360 | ||
361 | char * | |
362 | execute (compex, addr) | |
363 | register COMPEX *compex; | |
364 | char *addr; | |
365 | { | |
366 | register char *p1 = addr; | |
367 | register char *trt = trans; | |
368 | register int c; | |
369 | ||
370 | if (addr == Nullch) | |
371 | return Nullch; | |
372 | if (compex->nbra) { /* any brackets? */ | |
373 | for (c = 0; c <= compex->nbra; c++) | |
374 | compex->braslist[c] = compex->braelist[c] = Nullch; | |
375 | if (compex->brastr) | |
376 | free(compex->brastr); | |
377 | compex->brastr = savestr(p1); /* in case p1 is not static */ | |
378 | p1 = compex->brastr; /* ! */ | |
379 | } | |
380 | case_fold(compex->do_folding); /* make sure table is correct */ | |
381 | FirstCharacter = p1; /* for ^ tests */ | |
382 | if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) { | |
383 | c = trt[compex->expbuf[1]]; /* fast check for first character */ | |
384 | do { | |
385 | if (trt[*p1] == c && advance (compex, p1, compex->expbuf)) | |
386 | return p1; | |
387 | p1++; | |
388 | } while (*p1 && !err); | |
389 | return Nullch; | |
390 | } | |
391 | else { /* regular algorithm */ | |
392 | do { | |
393 | register char **alt = compex->alternatives; | |
394 | while (*alt) { | |
395 | if (advance (compex, p1, *alt++)) | |
396 | return p1; | |
397 | } | |
398 | p1++; | |
399 | } while (*p1 && !err); | |
400 | return Nullch; | |
401 | } | |
402 | } | |
403 | ||
404 | /* advance the match of the regular expression starting at ep along the | |
405 | string lp, simulates an NDFSA */ | |
406 | bool | |
407 | advance (compex, lp, ep) | |
408 | register COMPEX *compex; | |
409 | register char *ep; | |
410 | register char *lp; | |
411 | { | |
412 | register char *curlp; | |
413 | register char *trt = trans; | |
414 | register int i; | |
415 | ||
416 | while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET) | |
417 | switch (*ep++) { | |
418 | ||
419 | case CCHR: | |
420 | if (trt[*ep++] != trt[*lp]) return FALSE; | |
421 | lp++; | |
422 | continue; | |
423 | ||
424 | case CDOT: | |
425 | if (*lp == '\n') return FALSE; | |
426 | lp++; | |
427 | continue; | |
428 | ||
429 | case CDOL: | |
430 | if (!*lp || *lp == '\n') | |
431 | continue; | |
432 | return FALSE; | |
433 | ||
434 | case CIRC: | |
435 | if (lp == FirstCharacter || lp[-1]=='\n') | |
436 | continue; | |
437 | return FALSE; | |
438 | ||
439 | case WORD: | |
440 | if (isalnum(*lp)) { | |
441 | lp++; | |
442 | continue; | |
443 | } | |
444 | return FALSE; | |
445 | ||
446 | case NWORD: | |
447 | if (!isalnum(*lp)) { | |
448 | lp++; | |
449 | continue; | |
450 | } | |
451 | return FALSE; | |
452 | ||
453 | case WBOUND: | |
454 | if ((lp == FirstCharacter || !isalnum(lp[-1])) != | |
455 | (!*lp || !isalnum(*lp)) ) | |
456 | continue; | |
457 | return FALSE; | |
458 | ||
459 | case NWBOUND: | |
460 | if ((lp == FirstCharacter || !isalnum(lp[-1])) == | |
461 | (!*lp || !isalnum(*lp))) | |
462 | continue; | |
463 | return FALSE; | |
464 | ||
465 | case CEND: | |
466 | return TRUE; | |
467 | ||
468 | case CCL: | |
469 | if (cclass (ep, *lp, 1)) { | |
470 | ep += BMAPSIZ; | |
471 | lp++; | |
472 | continue; | |
473 | } | |
474 | return FALSE; | |
475 | ||
476 | case NCCL: | |
477 | if (cclass (ep, *lp, 0)) { | |
478 | ep += BMAPSIZ; | |
479 | lp++; | |
480 | continue; | |
481 | } | |
482 | return FALSE; | |
483 | ||
484 | case CBRA: | |
485 | compex->braslist[*ep++] = lp; | |
486 | continue; | |
487 | ||
488 | case CKET: | |
489 | i = *ep++; | |
490 | compex->braelist[i] = lp; | |
491 | compex->braelist[0] = lp; | |
492 | compex->braslist[0] = compex->braslist[i]; | |
493 | continue; | |
494 | ||
495 | case CBACK: | |
496 | if (compex->braelist[i = *ep++] == 0) { | |
497 | fputs("bad braces\n",stdout) FLUSH; | |
498 | err = TRUE; | |
499 | return FALSE; | |
500 | } | |
501 | if (backref (compex, i, lp)) { | |
502 | lp += compex->braelist[i] - compex->braslist[i]; | |
503 | continue; | |
504 | } | |
505 | return FALSE; | |
506 | ||
507 | case CBACK | STAR: | |
508 | if (compex->braelist[i = *ep++] == 0) { | |
509 | fputs("bad braces\n",stdout) FLUSH; | |
510 | err = TRUE; | |
511 | return FALSE; | |
512 | } | |
513 | curlp = lp; | |
514 | while (backref (compex, i, lp)) { | |
515 | lp += compex->braelist[i] - compex->braslist[i]; | |
516 | } | |
517 | while (lp >= curlp) { | |
518 | if (advance (compex, lp, ep)) | |
519 | return TRUE; | |
520 | lp -= compex->braelist[i] - compex->braslist[i]; | |
521 | } | |
522 | continue; | |
523 | ||
524 | case CDOT | STAR: | |
525 | curlp = lp; | |
526 | while (*lp++ && lp[-1] != '\n'); | |
527 | goto star; | |
528 | ||
529 | case WORD | STAR: | |
530 | curlp = lp; | |
531 | while (*lp++ && isalnum(lp[-1])); | |
532 | goto star; | |
533 | ||
534 | case NWORD | STAR: | |
535 | curlp = lp; | |
536 | while (*lp++ && !isalnum(lp[-1])); | |
537 | goto star; | |
538 | ||
539 | case CCHR | STAR: | |
540 | curlp = lp; | |
541 | while (*lp++ && trt[lp[-1]] == trt[*ep]); | |
542 | ep++; | |
543 | goto star; | |
544 | ||
545 | case CCL | STAR: | |
546 | case NCCL | STAR: | |
547 | curlp = lp; | |
548 | while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR))); | |
549 | ep += BMAPSIZ; | |
550 | goto star; | |
551 | ||
552 | star: | |
553 | do { | |
554 | lp--; | |
555 | if (advance (compex, lp, ep)) | |
556 | return TRUE; | |
557 | } while (lp > curlp); | |
558 | return FALSE; | |
559 | ||
560 | default: | |
561 | fputs("Badly compiled pattern\n",stdout) FLUSH; | |
562 | err = TRUE; | |
563 | return -1; | |
564 | } | |
565 | if (*ep == CEND || *ep == CDOL) { | |
566 | return TRUE; | |
567 | } | |
568 | return FALSE; | |
569 | } | |
570 | ||
571 | bool | |
572 | backref (compex, i, lp) | |
573 | register COMPEX *compex; | |
574 | register int i; | |
575 | register char *lp; | |
576 | { | |
577 | register char *bp; | |
578 | ||
579 | bp = compex->braslist[i]; | |
580 | while (*lp && *bp == *lp) { | |
581 | bp++; | |
582 | lp++; | |
583 | if (bp >= compex->braelist[i]) | |
584 | return TRUE; | |
585 | } | |
586 | return FALSE; | |
587 | } | |
588 | ||
589 | bool | |
590 | cclass (set, c, af) | |
591 | register char *set; | |
592 | register int c; | |
593 | { | |
594 | c &= 0177; | |
595 | #if BITSPERBYTE == 8 | |
596 | if (set[c >> 3] & 1 << (c & 7)) | |
597 | #else | |
598 | if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE)) | |
599 | #endif | |
600 | return af; | |
601 | return !af; | |
602 | } |