Updated README: Equal sign not required with `--mode` flag.
[sgk-go] / patterns / dfa.c
CommitLineData
7eeb782e
AT
1/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\
2 * This is GNU Go, a Go program. Contact gnugo@gnu.org, or see *
3 * http://www.gnu.org/software/gnugo/ for more information. *
4 * *
5 * Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, *
6 * 2008 and 2009 by the Free Software Foundation. *
7 * *
8 * This program is free software; you can redistribute it and/or *
9 * modify it under the terms of the GNU General Public License as *
10 * published by the Free Software Foundation - version 3 or *
11 * (at your option) any later version. *
12 * *
13 * This program is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU General Public License in file COPYING for more details. *
17 * *
18 * You should have received a copy of the GNU General Public *
19 * License along with this program; if not, write to the Free *
20 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, *
21 * Boston, MA 02111, USA. *
22\* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
23
24/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
25 * * * * * * * * fast pattern matching with DFA version 2.9 * * *
26 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
27
28#include "liberty.h"
29#include "patterns.h"
30#include "dfa-mkpat.h"
31#include "random.h"
32
33#include <assert.h>
34#include <stdlib.h>
35
36#ifdef HAVE_CONFIG_H
37#include <config.h>
38#endif
39
40#ifdef HAVE_UNISTD_H
41#include <unistd.h>
42#endif
43
44
45/*********************
46 * Public data *
47 *********************/
48
49/* If > 0 more detailed information is given */
50int dfa_verbose = 0;
51
52
53/*********************
54 * Private data *
55 *********************/
56
57/* auxiliary dfa's for high level functions */
58#define DFA_BINS 33 /* Number of temporary bins used to store intermediate DFAs */
59static dfa_t aux_dfa[DFA_BINS]; /* used to store intermediate DFAs */
60static dfa_t aux_temp; /* used to store temporary DFAs */
61
62/* To be sure that everything was well initialized */
63static int dfa_was_initialized = 0;
64static int aux_count = 0;
65
66
67/* convert ATT_* values to the corresponding expected values on the board */
68static const char att2val[8] = {
69 '.', 'X', 'O', 'x', 'o', ',', 'a', '!'
70};
71
72#define EXPECTED_VAL(att_val) att2val[att_val]
73
74
75/************************************************
76 * forward declaration of private functions *
77 ************************************************/
78static void clean_dfa(dfa_t *pdfa);
79static void resize_dfa(dfa_t *pdfa, int max_states, int max_indexes);
80static void create_dfa(dfa_t *pdfa, const char *str, int att_val);
81static void do_sync_product(int l, int r);
82static void sync_product(dfa_t *pout, dfa_t *pleft, dfa_t *pright);
83
84static void dfa_prepare_rotation_data(void);
85
86
87/********************************
88 * manipulating attributes list *
89 ********************************/
90
91/*
92 * Test if val is member of the attributes set att
93 */
94
95static int
96member_att(dfa_t *pdfa, int att, int val)
97{
98 while (att != 0) {
99 if (pdfa->indexes[att].val == val)
100 return 1;
101
102 att = pdfa->indexes[att].next;
103 }
104
105 return 0;
106}
107
108/*
109 * return the union of two attribute sets att1 & att2
110 * repectively from dfa1 and dfa2 into
111 * att in dfa.
112 */
113
114static int
115union_att(dfa_t *pdfa, dfa_t *pdfa1, int att1, dfa_t *pdfa2, int att2)
116{
117 int att;
118 int att_aux;
119
120 /* copy att1 in att */
121 att = 0;
122 while (att1 != 0) {
123 pdfa->last_index++;
124 if (pdfa->last_index >= pdfa->max_indexes)
125 resize_dfa(pdfa, pdfa->max_states, pdfa->max_indexes + DFA_RESIZE_STEP);
126 att_aux = pdfa->last_index;
127
128 pdfa->indexes[att_aux].val = pdfa1->indexes[att1].val;
129 pdfa->indexes[att_aux].next = att;
130 att = att_aux;
131 att1 = pdfa1->indexes[att1].next;
132 }
133
134 /* add to att the new elements of att2 */
135 while (att2 != 0) {
136 if (!member_att(pdfa, att, pdfa2->indexes[att2].val)) {
137 pdfa->last_index++;
138 if (pdfa->last_index >= pdfa->max_indexes)
139 resize_dfa(pdfa, pdfa->max_states, pdfa->max_indexes + DFA_RESIZE_STEP);
140 att_aux = pdfa->last_index;
141
142 pdfa->indexes[att_aux].val = pdfa2->indexes[att2].val;
143 pdfa->indexes[att_aux].next = att;
144 att = att_aux;
145 }
146 att2 = pdfa2->indexes[att2].next;
147 }
148
149 return att;
150}
151
152
153/* Remove all attribute entry repetitions from a dfa.
154 */
155static void
156compactify_att(dfa_t *pdfa)
157{
158 int k;
159 int last = 0;
160 int save_last = pdfa->last_index;
161 int *map;
162 int *search_first;
163 int *search_next;
164 int size = (save_last + 1) * sizeof(int);
165
166 map = malloc(size);
167 map[0] = 0;
168 search_first = malloc(size);
169 memset(search_first, 0, size);
170 search_next = malloc(size);
171 memset(search_next, 0, size);
172
173 for (k = 1; k <= save_last; k++) {
174 int i = search_first[pdfa->indexes[k].val];
175
176 if (i) {
177 while (pdfa->indexes[i].next != pdfa->indexes[k].next) {
178 if (!search_next[i]) {
179 search_next[i] = ++last;
180 i = 0;
181 break;
182 }
183
184 i = search_next[i];
185 }
186 }
187 else
188 search_first[pdfa->indexes[k].val] = ++last;
189
190 if (i)
191 map[k] = i;
192 else {
193 map[k] = last;
194 pdfa->indexes[last] = pdfa->indexes[k];
195 }
196 }
197
198 free(search_first);
199 free(search_next);
200
201 if (last < save_last) {
202 pdfa->last_index = last;
203 for (k = 0; k <= pdfa->last_index; k++)
204 pdfa->indexes[k].next = map[pdfa->indexes[k].next];
205
206 for (k = 0; k <= pdfa->last_state; k++)
207 pdfa->states[k].att = map[pdfa->states[k].att];
208
209 if (0)
210 fprintf(stderr, "compactified: %d attributes left of %d\n",
211 last, save_last);
212
213 compactify_att(pdfa);
214 }
215
216 free(map);
217}
218
219
220/**********************
221 * manipulating dfa's *
222 **********************/
223
224
225/*
226 * return the effective size of a dfa in kB.
227 */
228
229int
230dfa_size(dfa_t *pdfa)
231{
232 int states_size, indexes_size;
233
234 states_size = (pdfa->last_state + 1) * sizeof(state_rt_t);
235 indexes_size = (pdfa->last_index + 1) * sizeof(attrib_rt_t);
236
237 return (states_size + indexes_size + sizeof(dfa_rt_t)) / 1024;
238}
239
240
241/*
242 * resize memory for a dfa
243 */
244
245static void
246resize_dfa(dfa_t *pdfa, int max_states, int max_indexes)
247{
248 state_t *pBuf;
249 attrib_t *pBuf2;
250 int i;
251
252 if (dfa_verbose > 1)
253 fprintf(stderr, "Resizing dfa %s\n", pdfa->name);
254
255 assert(pdfa->last_state <= pdfa->max_states);
256 assert(pdfa->last_index <= pdfa->max_indexes);
257
258 pBuf = realloc(pdfa->states, max_states * sizeof(*pBuf));
259 pBuf2 = realloc(pdfa->indexes, max_indexes * sizeof(*pBuf2));
260 if (pBuf == NULL || pBuf2 == NULL) {
261 fprintf(stderr, "No memory left for dfa: %s", pdfa->name);
262 exit(EXIT_FAILURE);
263 }
264
265 for (i = pdfa->max_states; i < max_states; i++)
266 memset(pBuf + i, 0, sizeof(state_t));
267 for (i = pdfa->max_indexes; i < max_indexes; i++)
268 memset(pBuf2 + i, 0, sizeof(attrib_t));
269
270 pdfa->states = pBuf;
271 pdfa->max_states = max_states;
272 pdfa->indexes = pBuf2;
273 pdfa->max_indexes = max_indexes;
274}
275
276
277
278/*
279 * dump a dfa (debugging purpose).
280 */
281
282static const char *line =
283 "----------------------------------------------------\n";
284
285void
286dump_dfa(FILE *f, dfa_t *pdfa)
287{
288 int i;
289 int att, k;
290
291 fprintf(f, line);
292 fprintf(f, " name : %s\n", pdfa->name);
293 fprintf(f, " Nb states : %7d, max= %d\n", pdfa->last_state + 1,
294 pdfa->max_states);
295 fprintf(f, " Nb Indexes : %7d, max= %d\n", pdfa->last_index,
296 pdfa->max_indexes);
297 fprintf(f, " memory needed : %d Mb\n", dfa_size(pdfa) / 1024);
298 fprintf(f, line);
299
300 if (dfa_size(pdfa) > 10000) /* change this value if needed */
301 return;
302 fprintf(f, " state | . | O | X | # | att \n");
303 fprintf(f, line);
304 for (i = 1; i != pdfa->last_state + 1; i++) {
305 int *pnext = pdfa->states[i].next;
306 fprintf(f, " %6d |", i);
307 fprintf(f, " %6d | %6d | %6d |", pnext[0], pnext[1], pnext[2]);
308 fprintf(f, " %6d |", pnext[OUT_BOARD]);
309 att = pdfa->states[i].att;
310 k = 0;
311 fprintf(f, " %5d:", att);
312 while (att != 0 && k < 10) {
313 fprintf(f, " %4d", pdfa->indexes[att].val);
314 att = pdfa->indexes[att].next;
315 k++;
316 }
317 if (att != 0)
318 fprintf(f, " ...");
319 fprintf(f, "\n");
320 }
321 fprintf(f, line);
322 fflush(f);
323}
324
325
326/*
327 * Reset a dfa
328 */
329
330static void
331clean_dfa(dfa_t *pdfa)
332{
333 memset(pdfa->states, 0, pdfa->max_states * sizeof(state_t));
334 memset(pdfa->indexes, 0, pdfa->max_indexes * sizeof(attrib_t));
335 pdfa->last_state = 1; /* initial state */
336 pdfa->last_index = 0;
337 pdfa->indexes[0].val = -1;
338}
339
340
341/*
342 * allocate memory for a new dfa
343 */
344
345void
346new_dfa(dfa_t *pdfa, const char *name)
347{
348 memset(pdfa, 0, sizeof(dfa_t));
349 resize_dfa(pdfa, DFA_INIT_SIZE, DFA_INIT_SIZE);
350 clean_dfa(pdfa);
351 if (name != NULL)
352 strcpy(pdfa->name, name);
353 else
354 strcpy(pdfa->name, "noname ");
355
356 if (dfa_verbose > 1)
357 fprintf(stderr, "dfa %s is born :)\n", pdfa->name);
358
359}
360
361/*
362 * free memory used by a dfa
363 */
364
365void
366kill_dfa(dfa_t *pdfa)
367{
368 free(pdfa->states);
369 free(pdfa->indexes);
370 if (dfa_verbose > 1)
371 fprintf(stderr, "dfa %s is dead :(\n", pdfa->name);
372
373 memset(pdfa, 0, sizeof(dfa_t));
374}
375
376
377/*
378 * Copy a dfa and resize the destination dfa if necessary.
379 */
380
381void
382copy_dfa(dfa_t *p_to, dfa_t *p_from)
383{
384 assert(p_to != p_from);
385
386 if (p_to->max_states < p_from->last_state)
387 resize_dfa(p_to, p_from->max_states, p_to->max_indexes);
388
389 if (p_to->max_indexes < p_from->last_index)
390 resize_dfa(p_to, p_to->max_states, p_from->max_indexes);
391
392 clean_dfa(p_to);
393
394 memcpy(p_to->states, p_from->states,
395 sizeof(state_t) * (p_from->last_state + 1));
396 memcpy(p_to->indexes, p_from->indexes,
397 sizeof(attrib_t) * (p_from->last_index + 1));
398
399 p_to->last_state = p_from->last_state;
400 p_to->last_index = p_from->last_index;
401}
402
403
404/*
405 * print c dfa:
406 * print the dfa in c format.
407 */
408
409void
410print_c_dfa(FILE *of, const char *name, dfa_t *pdfa)
411{
412 int i;
413
414 if (sizeof(unsigned short) < 2) {
415 fprintf(of, "#error shorts too short");
416 fprintf(stderr, "Error: shorts are expected to be at least 2 bytes long.\n");
417 exit(EXIT_FAILURE);
418 }
419
420 assert(dfa_minmax_delta(pdfa, -1, 1) > -32768);
421 if (dfa_minmax_delta(pdfa, -1, 0) > 32768) {
422 fprintf(of, "#error too many states");
423 fprintf(stderr, "Error: The dfa states are too disperse. Can't fit delta into a short.\n");
424 exit(EXIT_FAILURE);
425 }
426
427 if (pdfa->last_index + 1 > 65535) {
428 fprintf(of, "#error too many states");
429 fprintf(stderr, "Error: Too many index entries. Can't fit delta into a short.\n");
430 exit(EXIT_FAILURE);
431 }
432
433
434 fprintf(of, "\n#include \"dfa-mkpat.h\"\n");
435
436 fprintf(of, "static const state_rt_t state_%s[%d] = {\n",
437 name, pdfa->last_state + 1);
438 for (i = 0; i != pdfa->last_state + 1; i++) {
439 int j;
440 fprintf(of, "{{");
441 for (j = 0; j < 4; j++) {
442 int n = pdfa->states[i].next[j];
443 assert((n == 0) || (abs(n - i) < 32768));
444 fprintf(of, "%d", n ? n - i : 0);
445 if (j != 3)
446 fprintf(of, ",");
447 }
448 fprintf(of, "}, %d},%s", pdfa->states[i].att, ((i+1)%3 ? "\t" : "\n"));
449 }
450 fprintf(of, "};\n\n");
451
452
453 fprintf(of, "static const attrib_rt_t idx_%s[%d] = {\n",
454 name, pdfa->last_index + 1);
455 for (i = 0; i != pdfa->last_index + 1; i++)
456 fprintf(of, "{%d,%d},%s", pdfa->indexes[i].val, pdfa->indexes[i].next,
457 ((i+1)%4 ? "\t" : "\n"));
458 fprintf(of, "};\n\n");
459
460 fprintf(of, "static dfa_rt_t dfa_%s = {\n", name);
461 fprintf(of, " \"%s\",\n", name);
462 fprintf(of, "state_%s,\n", name);
463 fprintf(of, "idx_%s", name);
464 fprintf(of, "};\n");
465}
466
467
468/*
469 * Create a linear dfa from a string and an attributes value
470 * and resize the dfa if needed.
471 *
472 * For example:
473 * create_dfa(pdfa, "Oo?.", 2001)
474 * gives:
475 *
476 * 1 0,1 0,1,2 0
477 * (1,{}) -------> (2,{}) -------> (3,{}) -------> (4,{}) ------> (5,{2001})
478 *
479 * An empty string force a junk pattern : The scanner will always
480 * consider this pattern as active.
481 *
482 * The possible input symbols are :
483 *
484 * '.', ',', '*', '!' for EMPTY expected.
485 * 'X' for BLACK expected.
486 * 'O' for WHITE expected.
487 * 'x' for BLACK|EMPTY expected.
488 * 'o' for WHITE|EMPTY expected.
489 * '#', '+', '-', '|' for OUT_BOARD expected.
490 * '?' for EMPTY|BLACK|WHITE expected.
491 * '$' for EMPTY|BLACK|WHITE|OUT_BOARD expected.
492 */
493
494static void
495create_dfa(dfa_t *pdfa, const char *str, int att_val)
496{
497 int new_state;
498
499 if (dfa_verbose > 1)
500 fprintf(stderr, "linear dfa in %s with string: %s\n", pdfa->name, str);
501
502 assert(str != NULL);
503 assert(pdfa->max_states > 1);
504 assert(pdfa->max_indexes > 1);
505
506 clean_dfa(pdfa);
507 new_state = 1;
508 for (; *str != '\0' && strchr("$#+-|OoXx.?,!a*", *str); str++) {
509 memset(pdfa->states[new_state].next, 0, 4 * sizeof(int));
510 if (strchr("$?.ox,a!*", *str))
511 pdfa->states[new_state].next[0] = new_state + 1;
512 if (strchr("$?Oo", *str))
513 pdfa->states[new_state].next[1] = new_state + 1;
514 if (strchr("$?Xx", *str))
515 pdfa->states[new_state].next[2] = new_state + 1;
516 if (strchr("$#+-|", *str))
517 pdfa->states[new_state].next[OUT_BOARD] = new_state + 1;
518 new_state++;
519 if (new_state >= pdfa->max_states)
520 resize_dfa(pdfa, pdfa->max_states + DFA_RESIZE_STEP,
521 pdfa->max_indexes);
522 }
523 memset(pdfa->states[new_state].next, 0, 4 * sizeof(int));
524
525 pdfa->last_index++;
526 if (pdfa->last_index >= pdfa->max_indexes)
527 resize_dfa(pdfa, pdfa->max_states,
528 pdfa->max_indexes + DFA_RESIZE_STEP);
529
530 memset(&(pdfa->indexes[pdfa->last_index]), 0, sizeof(attrib_t));
531 pdfa->states[new_state].att = pdfa->last_index;
532
533 pdfa->indexes[pdfa->states[new_state].att].val = att_val;
534 pdfa->indexes[pdfa->states[new_state].att].next = 0;
535 pdfa->last_state = new_state;
536}
537
538
539/**************************
540 * Test array with a *
541 * hash table *
542 **************************/
543/* used by sync_product *
544 * to store visited states*
545 **************************/
546
547#define MAX_HASH_VALUE 4096
548
549typedef struct entry {
550 int l, r; /* key */
551 int val; /* value */
552 struct entry *pnext; /* NULL if end of list */
553} entry_t;
554
555typedef struct test_array {
556 entry_t *hash[MAX_HASH_VALUE];
557} test_array_t;
558
559
560/* initialize empty lists */
561static void
562new_test_array(test_array_t *pta)
563{
564 int h;
565
566 for (h = 0; h != MAX_HASH_VALUE ; h++)
567 pta->hash[h] = NULL;
568}
569
570/* Searh for (l, r) in the linked list plist */
571static int
572get_from_entry_list(entry_t *plist, int l, int r)
573{
574 int val = 0;
575
576 while (plist != NULL) {
577 if (plist->l == l && plist->r == r)
578 val = plist->val;
579 plist = plist->pnext;
580 }
581 return val;
582}
583
584/* get the value associated with (l, r) or 0 if none */
585static int
586get_from_test_array(test_array_t *pta, int l, int r)
587{
588 return get_from_entry_list(pta->hash[(l+r) % MAX_HASH_VALUE], l, r);
589}
590
591
592/* insert a new entry at the beginning of the linked list pplist */
593static void
594add_to_entry_list(entry_t **pplist, int l, int r, int val)
595{
596 entry_t *new_entry;
597
598 /* make sure val > 0: val = 0 is used in get_from_entry_list */
599 assert(val > 0);
600 assert(!get_from_entry_list(*pplist, l, r));
601
602 new_entry = malloc(sizeof(*new_entry));
603 if (new_entry == NULL) {
604 fprintf(stderr, "No memory left for new entry\n");
605 exit(EXIT_FAILURE);
606 }
607 new_entry->pnext = *pplist;
608 new_entry->l = l;
609 new_entry->r = r;
610 new_entry->val = val;
611 *pplist = new_entry;
612}
613
614
615/* add a value at (l, r) */
616static void
617add_to_test_array(test_array_t *pta, int l, int r, int val)
618{
619 add_to_entry_list(&(pta->hash[(l+r) % MAX_HASH_VALUE]), l, r, val);
620}
621
622/* free the elements of the linked list plist */
623static void
624free_entry_list(entry_t *plist)
625{
626 entry_t *pentry;
627
628 while (plist != NULL) {
629 pentry = plist;
630 plist = plist->pnext;
631 free(pentry);
632 }
633}
634
635/* free allocated memory */
636static void
637free_test_array(test_array_t *pta)
638{
639 int h;
640
641 for (h = 0; h != MAX_HASH_VALUE; h++) {
642 free_entry_list(pta->hash[h]);
643 pta->hash[h] = NULL;
644 }
645}
646
647
648/*
649 * Synchronization product between two automata.
650 *
651 * L(A) is the set of patterns recognized by the automaton A.
652 *
653 * A syncronized product betwenn two acyclic deterministic automata
654 * A1 and A2 is an acyclic deterministic classifier A1xA2 that
655 * recognize and classify the languages
656 * L(A1), L(A2), L(A1 Union A2) and L(A1 Inter A2).
657 *
658 * This algorithm do the product and the reduction at the same time.
659 *
660 * See Hopcroft & Ullman "The design and analysis of computer algorithms"
661 * Ed. Addison-Wesley, Reading MA, 1974
662 * For the theorical aspects.
663 */
664
665/* globals used to improve readability */
666static dfa_t *gpout, *gpleft, *gpright;
667
668/* Hash table used to test if a state has already been
669 visited and then give its position in the new automaton. */
670static test_array_t gtest;
671
672static void
673do_sync_product(int l, int r)
674{
675 int c;
676 int nextl, nextr;
677 int state;
678
679 state = gpout->last_state;
680
681 /* unify the attributes of states l and r */
682 gpout->states[state].att = union_att(gpout, gpleft, gpleft->states[l].att,
683 gpright, gpright->states[r].att);
684
685 /* scan each possible out-transition */
686 for (c = 0; c != 4; c++) {
687 nextl = gpleft->states[l].next[c];
688 nextr = gpright->states[r].next[c];
689 assert(nextl < gpleft->last_state + 1);
690 assert(nextr < gpright->last_state + 1);
691
692 /* transition to (0,0) mean no transition at all */
693 if (nextl != 0 || nextr != 0) {
694 /* if the out-state doesn't already exist */
695 if (get_from_test_array(&gtest, nextl, nextr) == 0) {
696 /* create it! */
697 gpout->last_state++;
698 if (gpout->last_state >= gpout->max_states)
699 resize_dfa(gpout, gpout->max_states + DFA_RESIZE_STEP,
700 gpout->max_indexes);
701
702 add_to_test_array(&gtest, nextl, nextr, gpout->last_state);
703
704 /* link it */
705 gpout->states[state].next[c] = gpout->last_state;
706
707 /* create also its sub-automaton */
708 do_sync_product(nextl, nextr);
709 }
710 else {
711 /* link it */
712 gpout->states[state].next[c] =
713 get_from_test_array(&gtest, nextl, nextr);
714 }
715 }
716 else {
717 /* no output by c from the actual state */
718 gpout->states[state].next[c] = 0;
719 }
720 }
721}
722
723static void
724sync_product(dfa_t *pout, dfa_t *pleft, dfa_t *pright)
725{
726 pout->last_index = 0;
727
728 if (dfa_verbose > 2) {
729 fprintf(stderr, "Product between %s and %s\n", pleft->name, pright->name);
730 fprintf(stderr, "result in %s\n", pout->name);
731 }
732
733
734 gpout = pout;
735 gpleft = pleft;
736 gpright = pright;
737 new_test_array(&gtest);
738 add_to_test_array(&gtest, 1, 1, 1);
739 pout->last_state = 1;
740
741 do_sync_product(1, 1);
742
743 free_test_array(&gtest);
744}
745
746/*
747 * Init/end functions
748 */
749
750void
751dfa_init(void)
752{
753 int j;
754
755 if (dfa_verbose > 1)
756 fprintf(stderr, "dfa: init\n");
757 dfa_was_initialized++;
758
759 build_spiral_order();
760 dfa_prepare_rotation_data();
761
762 for (j = 0; j < DFA_BINS; j++)
763 new_dfa(&(aux_dfa[j]), "binAux ");
764 new_dfa(&aux_temp, "tempAux ");
765}
766
767void
768dfa_end(void)
769{
770 int j;
771
772 if (dfa_verbose > 1)
773 fprintf(stderr, "dfa: end\n");
774
775 for (j = 0; j < DFA_BINS; j++)
776 kill_dfa(&(aux_dfa[j]));
777 kill_dfa(&aux_temp);
778 dfa_was_initialized--;
779}
780
781
782/*
783 * Returns max or min jump distance from state to next[next_index] for
784 * all states. If next_index < 0, then max/min for all for states.
785 */
786
787int
788dfa_minmax_delta(dfa_t *pdfa, int next_index, int isMin)
789{
790
791 int ret, i, j;
792 assert(next_index <= 3);
793
794 if (isMin)
795 ret = 99999;
796 else
797 ret = -1;
798
799 for (i = 0; i <= pdfa->last_state; i++) {
800 for (j = 0; j < 4; j++) {
801 if (j == next_index || next_index < 0) {
802 int next = pdfa->states[i].next[j];
803 if (!next)
804 continue;
805 if (isMin) {
806 if (ret > next - i)
807 ret = next - i;
808 }
809 else {
810 if (ret < next - i)
811 ret = next - i;
812 }
813 }
814 }
815 }
816
817 return ret;
818}
819
820#define DFA_ALIGN 2
821
822/*
823 * Re-orders DFA into a canonical form, which does a half-hearted
824 * attempt to reduce the size of jumps for all states entries.
825 */
826void
827dfa_shuffle(dfa_t *pdfa)
828{
829 struct state *old_states;
830 int *state_to;
831 int *state_from;
832 int *queue1;
833 int *queue2;
834 int *tempq;
835 int next_new_state;
836 int q1p;
837 int q2p;
838 int i, j;
839
840 state_to = calloc(pdfa->last_state+1, sizeof(*state_to));
841 state_from = calloc(pdfa->last_state+1, sizeof(*state_from));
842
843 queue1 = malloc((pdfa->last_state+1) * sizeof(*queue1));
844 queue2 = malloc((pdfa->last_state+1) * sizeof(*queue2));
845 q1p = 1;
846 q2p = 0;
847 queue1[0] = 1; /* i.e. start at state 1. */
848 state_from[0] = state_to[0] = 0;
849 state_from[1] = state_to[1] = 1;
850 next_new_state = 2;
851
852 while (q1p) {
853 for (i = 0; i < q1p; i++) {
854 for (j = 0; j < 4; j++) {
855 int n = pdfa->states[queue1[i]].next[j];
856 /* next_new_state = DFA_ALIGN * ((next_new_state-1) / DFA_ALIGN) + 1;*/
857 while (n && !state_to[n]) {
858 state_to[n] = next_new_state;
859 state_from[next_new_state] = n;
860 next_new_state++;
861 queue2[q2p++] = n;
862 n = pdfa->states[n].next[0];
863 }
864 }
865 }
866 tempq = queue1;
867 queue1 = queue2;
868 queue2 = tempq;
869 q1p = q2p;
870 q2p = 0;
871 }
872
873 old_states = malloc((pdfa->last_state+1) * sizeof(*old_states));
874 for (i = 1; i <= pdfa->last_state; i++) {
875 for (j = 0; j < 4; j++) {
876 old_states[i].next[j] = pdfa->states[i].next[j];
877 old_states[i].att = pdfa->states[i].att;
878 }
879 }
880 for (i = 1; i <= pdfa->last_state; i++) {
881 for (j = 0; j < 4; j++) {
882 assert(state_to[i] > 0);
883 pdfa->states[i].next[j] = state_to[old_states[state_from[i]].next[j]];
884 }
885 pdfa->states[i].att = old_states[state_from[i]].att;
886 }
887}
888
889
890/* Calculate the maximal number of patterns matched at one point for
891 * one transformation. Multiplying this number by 8 gives an upper
892 * bound for the total number of matched patterns for all
893 * transformation.
894 */
895int
896dfa_calculate_max_matched_patterns(dfa_t *pdfa)
897{
898 int total_max = 0;
899 int *state_max = calloc(pdfa->last_state + 1, sizeof(int));
900 char *queued = calloc(pdfa->last_state + 1, sizeof(char));
901 int *queue = malloc(pdfa->last_state * sizeof(int));
902 int queue_start = 0;
903 int queue_end = 1;
904
905 queue[0] = 1;
906 while (queue_start < queue_end) {
907 int state = queue[queue_start++];
908 int k;
909
910 /* Increment maximal number of matched patterns for each pattern
911 * matched at current `state'.
912 */
913 for (k = pdfa->states[state].att; k; k = pdfa->indexes[k].next)
914 state_max[state]++;
915
916 if (total_max < state_max[state])
917 total_max = state_max[state];
918
919 for (k = 0; k < 4; k++) {
920 int next = pdfa->states[state].next[k];
921
922 if (next != 0) {
923 if (!queued[next]) {
924 queue[queue_end++] = next;
925 queued[next] = 1;
926 }
927
928 if (state_max[next] < state_max[state])
929 state_max[next] = state_max[state];
930 }
931 }
932 }
933
934 assert(queue_end == pdfa->last_state);
935
936 free(state_max);
937 free(queued);
938 free(queue);
939
940 return total_max;
941}
942
943
944/*
945 * Merges cached dfas into the master DFA
946 */
947void
948dfa_finalize(dfa_t *pdfa)
949{
950 int j;
951 int next_bin = aux_count;
952 int last_bin = aux_count + DFA_BINS - 1;
953 while (next_bin + 1 != last_bin) {
954 for (j = aux_count + 1; j <= last_bin; j += 2) {
955 if (j+1 == next_bin)
956 copy_dfa(&aux_dfa[next_bin % DFA_BINS], &aux_dfa[j % DFA_BINS]);
957 else
958 sync_product(&aux_dfa[next_bin % DFA_BINS],
959 &aux_dfa[j % DFA_BINS],
960 &aux_dfa[(j+1) % DFA_BINS]);
961 next_bin++;
962 }
963 last_bin = next_bin - 1;
964 aux_count--;
965 next_bin = aux_count;
966 }
967 copy_dfa(pdfa, &aux_dfa[last_bin % DFA_BINS]);
968
969 compactify_att(pdfa);
970}
971
972/*
973 * Add a new string with attribute att_val into the dfa.
974 * if the new size of the dfa respect some size conditions
975 * return increase in kB or -1 if the pattern was rejected.
976 * This function never rejects string of length <= 1.
977 */
978
979float
980dfa_add_string(dfa_t *pdfa, const char *str, int pattern_index, int ll)
981{
982 dfa_t *new_dfa = &(aux_dfa[aux_count % DFA_BINS]);
983 dfa_t *old_dfa = &(aux_dfa[(aux_count+1) % DFA_BINS]);
984 float ratio;
985
986 if (dfa_verbose > 1) {
987 fprintf(stderr, "Adding to dfa %s the string: %s\n", pdfa->name, str);
988 fprintf(stderr, " pat_ind: %d; rotation: %d at bin: %d\n",
989 pattern_index, ll, aux_count);
990 }
991
992 assert(dfa_was_initialized > 0);
993 assert(pdfa != NULL);
994
995 create_dfa(&aux_temp, str, pattern_index);
996
997 /* then we do the synchronization product with dfa */
998 sync_product(new_dfa, old_dfa, &aux_temp);
999 aux_count++;
1000
1001 ratio = 1;
1002 if (dfa_size(old_dfa) > 0)
1003 ratio = (float) (dfa_size(new_dfa) / dfa_size(old_dfa));
1004
1005 return ratio;
1006}
1007
1008
1009/* Used for quick string rotation. */
1010static int dfa_rotation_data[DFA_BASE * DFA_BASE];
1011
1012static void
1013dfa_prepare_rotation_data(void)
1014{
1015 int k;
1016
1017 for (k = 0; k < DFA_MAX_ORDER; k++)
1018 dfa_rotation_data[DFA_POS(0, 0) + spiral[k][0]] = k;
1019}
1020
1021
1022/* Create a transformation of `string' and store it in
1023 * `rotated_string'. The latter must be of at least DFA_MAX_ORDER
1024 * characters in length. */
1025void
1026dfa_rotate_string(char *rotated_string, const char *string, int transformation)
1027{
1028 if (transformation > 0) {
1029 int k;
1030 int length = strlen(string);
1031 int new_length = 0;
1032
1033 memset(rotated_string, '$', DFA_MAX_ORDER);
1034
1035 for (k = 0; k < length; k++) {
1036 if (string[k] != '$') {
1037 int string_position = dfa_rotation_data[DFA_POS(0, 0)
1038 + spiral[k][transformation]];
1039 rotated_string[string_position] = string[k];
1040 if (string_position + 1 > new_length)
1041 new_length = string_position + 1;
1042 }
1043 }
1044
1045 rotated_string[new_length] = 0;
1046 }
1047 else
1048 strcpy(rotated_string, string);
1049}
1050
1051
1052/*
1053 * Build a pattern string from a pattern. `str' must refer a buffer
1054 * of size greater than DFA_MAX_ORDER.
1055 */
1056void
1057pattern_2_string(struct pattern *pat, struct patval_b *elements,
1058 char *str, int ci, int cj)
1059{
1060 char work_space[DFA_MAX_BOARD * 4][DFA_MAX_BOARD * 4];
1061 int m, n; /* anchor position */
1062 int edges, borders, to_test;
1063 int i, j, k;
1064 char c;
1065
1066 m = DFA_MAX_BOARD * 2 + ci;
1067 n = DFA_MAX_BOARD * 2 + cj; /* position of the anchor */
1068
1069 assert(dfa_was_initialized);
1070 memset(str, 0, DFA_MAX_ORDER);
1071 memset(work_space, '#', sizeof(work_space));
1072
1073 if (dfa_verbose > 0)
1074 fprintf(stderr, "converting pattern into string.\n");
1075
1076 /* basic edge constraints */
1077 for (i = DFA_MAX_BOARD; i != DFA_MAX_BOARD * 3; i++)
1078 for (j = DFA_MAX_BOARD; j != DFA_MAX_BOARD * 3; j++)
1079 work_space[i][j] = '$';
1080
1081 /* pattern mask */
1082 for (i = pat->mini + m; i != pat->maxi + m + 1; i++)
1083 for (j = pat->minj + n; j != pat->maxj + n + 1; j++)
1084 work_space[i][j] = '?';
1085
1086 /* more advanced edge constraints */
1087
1088 /* South constraint */
1089 if (pat->edge_constraints & SOUTH_EDGE) {
1090 for (i = m + pat->maxi + 1; i != DFA_MAX_BOARD * 3; i++)
1091 for (j = 0; j != DFA_MAX_BOARD * 3; j++)
1092 work_space[i][j] = '-';
1093 }
1094
1095 /* East constraint */
1096 if (pat->edge_constraints & EAST_EDGE) {
1097 for (i = 0; i != DFA_MAX_BOARD * 3; i++)
1098 for (j = n + pat->maxj + 1; j != DFA_MAX_BOARD * 3; j++)
1099 work_space[i][j] = '|';
1100 }
1101
1102 /* North constraint */
1103 if (pat->edge_constraints & NORTH_EDGE) {
1104 for (i = 0; i != m + pat->mini; i++)
1105 for (j = 0; j != DFA_MAX_BOARD * 4; j++)
1106 work_space[i][j] = '-';
1107 }
1108
1109 /* West constraint */
1110 if (pat->edge_constraints & WEST_EDGE) {
1111 /* take care not to erase the south edge constraint */
1112 for (i = 0; i != m + pat->maxi + 1; i++)
1113 for (j = 0; j != n + pat->minj; j++)
1114 work_space[i][j] = '|';
1115
1116 /* complete the last corner only if necessary */
1117 if (!(pat->edge_constraints & SOUTH_EDGE)) {
1118 for (i = m + pat->maxi + 1; i != DFA_MAX_BOARD * 3; i++)
1119 for (j = 0; j != n + pat->minj; j++)
1120 work_space[i][j] = '|';
1121 }
1122 }
1123
1124 /* dump */
1125 if (dfa_verbose > 4) {
1126 for (i = DFA_MAX_BOARD - 1; i != DFA_MAX_BOARD * 3 + 1; i++) {
1127 for (j = DFA_MAX_BOARD - 1; j != DFA_MAX_BOARD * 3 + 1; j++) {
1128 if (i == m && j == n)
1129 fprintf(stderr, "s"); /* mark the anchor */
1130 else
1131 fprintf(stderr, "%c", work_space[i][j]);
1132 }
1133 fprintf(stderr, "\n");
1134 }
1135 fprintf(stderr, "\n");
1136 }
1137
1138 /* pattern representation on the work space */
1139 for (k = 0; k != pat->patlen; k++) {
1140 c = EXPECTED_VAL(elements[k].att);
1141 assert(work_space[m + elements[k].x - ci][n + elements[k].y - cj] == '?');
1142 work_space[m + elements[k].x - ci][n + elements[k].y - cj] = c;
1143 }
1144
1145 /* dump */
1146 if (dfa_verbose > 3) {
1147 for (i = DFA_MAX_BOARD - 1; i != DFA_MAX_BOARD * 3 + 1; i++) {
1148 for (j = DFA_MAX_BOARD - 1; j != DFA_MAX_BOARD * 3 + 1; j++) {
1149 if (i == m && j == n)
1150 fprintf(stderr, "s"); /* mark the anchor */
1151 else
1152 fprintf(stderr, "%c", work_space[i][j]);
1153 }
1154 fprintf(stderr, "\n");
1155 }
1156 fprintf(stderr, "\n");
1157 }
1158
1159 /* Now we can build the smallest pattern string possible
1160 * from the anchor */
1161
1162 to_test = pat->patlen; /* How many positions left to test ? */
1163 edges = pat->edge_constraints; /* how many constraint tested ? */
1164 borders = 0xF;
1165 /* we must test at least one intersection by border for
1166 * patterns like
1167 *
1168 * ???
1169 * O.O
1170 * ???
1171 *
1172 * To ensure edge position.
1173 */
1174
1175 for (k = 0;
1176 (k != DFA_MAX_ORDER - 1) && ((borders > 0) || edges || to_test > 0);
1177 k++) {
1178 j = spiral[k][0] % DFA_BASE;
1179 if (j >= DFA_MAX_BOARD)
1180 j -= DFA_BASE;
1181 if (j <= -DFA_MAX_BOARD)
1182 j += DFA_BASE;
1183 i = (spiral[k][0] - j) / DFA_BASE;
1184
1185 if (i == pat->maxi)
1186 borders &= ~SOUTH_EDGE;
1187 if (i == pat->mini)
1188 borders &= ~NORTH_EDGE;
1189 if (j == pat->maxj)
1190 borders &= ~EAST_EDGE;
1191 if (j == pat->minj)
1192 borders &= ~WEST_EDGE;
1193
1194 assert(m + i < DFA_MAX_BOARD * 3 && m + i < DFA_MAX_BOARD * 3);
1195 str[k] = work_space[m + i][n + j];
1196 assert(strchr("XOxo.,a!?$#|-+", str[k]));
1197
1198 if (strchr("XOxo.,a!", str[k]))
1199 to_test--;
1200 if (strchr("#|-+", str[k])) {
1201 if (i > pat->maxi)
1202 edges &= ~SOUTH_EDGE;
1203 if (i < pat->mini)
1204 edges &= ~NORTH_EDGE;
1205 if (j > pat->maxj)
1206 edges &= ~EAST_EDGE;
1207 if (j < pat->minj)
1208 edges &= ~WEST_EDGE;
1209 }
1210 }
1211
1212 assert(k < DFA_MAX_ORDER);
1213 str[k] = '\0'; /* end of string */
1214
1215 if (0 && dfa_verbose > 0)
1216 fprintf(stderr, "converted pattern %s into string: %s\n", pat->name, str);
1217}
1218
1219
1220/**************************************
1221 * Experimental DFA builder *
1222 **************************************/
1223
1224/* This builder differs from the one above in that it builds the whole dfa
1225 * at once. That is, it must have all the patterns to build and cannot add
1226 * pattern by pattern. Currently, it is only used in DFA size optimization
1227 * (it seems to be significantly faster).
1228 */
1229
1230
1231/* Allocate a new dfa_attrib structure from a dynamic array. */
1232static dfa_attrib *
1233dfa_attrib_new(dfa_attrib_array *array, int string_index)
1234{
1235 dfa_attrib *attribute;
1236
1237 if (array->allocated == DFA_ATTRIB_BLOCK_SIZE) {
1238 dfa_attrib_block *new_block = malloc(sizeof(*new_block));
1239 assert(new_block);
1240
1241 new_block->previous = array->last_block;
1242 array->last_block = new_block;
1243 array->allocated = 0;
1244 }
1245
1246 attribute = &(array->last_block->attrib[array->allocated++]);
1247 attribute->next = NULL;
1248 attribute->string_index = string_index;
1249
1250 return attribute;
1251}
1252
1253
1254/* Initialize dfa_attrib_array structure. */
1255static void
1256dfa_attrib_array_reset(dfa_attrib_array *array)
1257{
1258 array->last_block = NULL;
1259 array->allocated = DFA_ATTRIB_BLOCK_SIZE;
1260}
1261
1262
1263/* Clear a dynamic array by freeing all blocks befor `cutoff_point'. */
1264static void
1265dfa_attrib_array_partially_clear(dfa_attrib_block *cutoff_point)
1266{
1267 if (cutoff_point) {
1268 dfa_attrib_block *block = cutoff_point->previous;
1269
1270 while (block) {
1271 dfa_attrib_block *previous = block->previous;
1272 free(block);
1273 block = previous;
1274 }
1275
1276 cutoff_point->previous = NULL;
1277 }
1278}
1279
1280
1281/* Clear a dynamic array completely. All blocks are freed. */
1282static void
1283dfa_attrib_array_clear(dfa_attrib_array *array)
1284{
1285 if (array->last_block) {
1286 dfa_attrib_array_partially_clear(array->last_block);
1287 free(array->last_block);
1288 array->last_block = NULL;
1289 }
1290
1291 array->allocated = DFA_ATTRIB_BLOCK_SIZE;
1292}
1293
1294
1295/* Allocate a new dfa_node structure in a DFA graph. */
1296static dfa_node *
1297dfa_node_new(dfa_graph *graph)
1298{
1299 dfa_node *node;
1300
1301 if (graph->allocated == DFA_NODE_BLOCK_SIZE) {
1302 dfa_node_block *new_block = malloc(sizeof(*new_block));
1303 assert(new_block);
1304
1305 new_block->previous = graph->last_block;
1306 graph->last_block = new_block;
1307 graph->allocated = 0;
1308 }
1309
1310 graph->num_nodes++;
1311 node = &(graph->last_block->node[graph->allocated++]);
1312 memset(node, 0, sizeof(dfa_node));
1313
1314 return node;
1315}
1316
1317
1318/* This is a hash table used to quickly find a DFA node using a linked list
1319 * of its attributes as a key.
1320 */
1321static dfa_hash_entry *dfa_hash_table[DFA_HASH_TABLE_SIZE];
1322static dfa_hash_block *dfa_hash_last_block = NULL;
1323static int dfa_hash_allocated;
1324
1325
1326/* Allocate a dfa_entry structure dynamically. */
1327static dfa_hash_entry *
1328dfa_hash_entry_new(void)
1329{
1330 if (dfa_hash_allocated == DFA_HASH_BLOCK_SIZE) {
1331 dfa_hash_block *new_block = malloc(sizeof(*new_block));
1332 assert(new_block);
1333
1334 new_block->previous = dfa_hash_last_block;
1335 dfa_hash_last_block = new_block;
1336 dfa_hash_allocated = 0;
1337 }
1338
1339 return &(dfa_hash_last_block->entry[dfa_hash_allocated++]);
1340}
1341
1342
1343/* Clear the hash table completely. Used after having finished a graph level. */
1344static void
1345dfa_hash_clear(void)
1346{
1347 memset(dfa_hash_table, 0, DFA_HASH_TABLE_SIZE * sizeof(dfa_hash_entry *));
1348
1349 if (dfa_hash_last_block) {
1350 dfa_hash_block *block = dfa_hash_last_block->previous;
1351
1352 while (block) {
1353 dfa_hash_block *previous = block->previous;
1354 free(block);
1355 block = previous;
1356 }
1357
1358 dfa_hash_last_block->previous = NULL;
1359 dfa_hash_allocated = 0;
1360 }
1361 else
1362 dfa_hash_allocated = DFA_HASH_BLOCK_SIZE;
1363}
1364
1365
1366/* Compute the hash value of a key (linked list of attributes). */
1367static int
1368dfa_hash_value(dfa_attrib *key)
1369{
1370 int hash_value = DFA_HASH_VALUE_1 * key->string_index;
1371 if (key->next) {
1372 hash_value += DFA_HASH_VALUE_2 * key->next->string_index;
1373 if (key->next->next)
1374 hash_value += DFA_HASH_VALUE_3 * key->next->next->string_index;
1375 }
1376
1377 return hash_value % DFA_HASH_TABLE_SIZE;
1378}
1379
1380
1381/* Search for a node with a given key in the hash table. */
1382static dfa_node *
1383dfa_hash_search(dfa_attrib *key)
1384{
1385 int hash_value = dfa_hash_value(key);
1386 dfa_hash_entry *entry;
1387
1388 for (entry = dfa_hash_table[hash_value]; entry; entry = entry->next) {
1389 dfa_attrib *left = key;
1390 dfa_attrib *right = entry->key;
1391
1392 while (left && right) {
1393 if (left->string_index != right->string_index)
1394 break;
1395
1396 left = left->next;
1397 right = right->next;
1398 }
1399
1400 if (!left && !right)
1401 return entry->value;
1402 }
1403
1404 return NULL;
1405}
1406
1407
1408/* Add a node to the table. The list of strings which pass through it is used
1409 * as a key.
1410 */
1411static void
1412dfa_hash_add_node(dfa_node *node)
1413{
1414 int hash_value = dfa_hash_value(node->passing_strings);
1415 dfa_hash_entry *entry;
1416
1417 entry = dfa_hash_entry_new();
1418 entry->next = dfa_hash_table[hash_value];
1419 dfa_hash_table[hash_value] = entry;
1420
1421 entry->key = node->passing_strings;
1422 entry->value = node;
1423}
1424
1425
1426/* DFA iterator. Used to walk the array of nodes in backward direction. */
1427static dfa_node_block *dfa_iterator_block;
1428static int dfa_iterator_node_num;
1429
1430
1431/* Reset the iterator. The last added node of the specified graph becomes
1432 * the current node.
1433 */
1434static dfa_node*
1435dfa_iterator_reset(dfa_graph *graph)
1436{
1437 assert(graph->last_block);
1438
1439 if (graph->allocated > 0) {
1440 dfa_iterator_block = graph->last_block;
1441 dfa_iterator_node_num = graph->allocated - 1;
1442 }
1443 else {
1444 dfa_iterator_block = graph->last_block->previous;
1445 assert(dfa_iterator_block);
1446 dfa_iterator_node_num = DFA_NODE_BLOCK_SIZE - 1;
1447 }
1448
1449 return &(dfa_iterator_block->node[dfa_iterator_node_num]);
1450}
1451
1452
1453/* Shift the current node pointer one node backwards. */
1454static dfa_node*
1455dfa_iterate(void)
1456{
1457 dfa_iterator_node_num--;
1458 if (dfa_iterator_node_num < 0) {
1459 dfa_iterator_block = dfa_iterator_block->previous;
1460 assert(dfa_iterator_block);
1461 dfa_iterator_node_num = DFA_NODE_BLOCK_SIZE - 1;
1462 }
1463
1464 return &(dfa_iterator_block->node[dfa_iterator_node_num]);
1465}
1466
1467
1468/* Initialize a dfa_graph structure. */
1469void
1470dfa_graph_reset(dfa_graph *graph)
1471{
1472 graph->num_nodes = 0;
1473 graph->root = NULL;
1474
1475 graph->last_block = NULL;
1476 graph->allocated = DFA_NODE_BLOCK_SIZE;
1477 dfa_attrib_array_reset(&(graph->attributes));
1478}
1479
1480
1481/* Free all resources associated with the specified DFA graph. */
1482static void
1483dfa_graph_clear(dfa_graph *graph)
1484{
1485 dfa_node_block *block = graph->last_block;
1486 graph->num_nodes = 0;
1487 graph->root = NULL;
1488
1489 while (block) {
1490 dfa_node_block *previous = block->previous;
1491 free(block);
1492 block = previous;
1493 }
1494
1495 graph->last_block = NULL;
1496 graph->allocated = DFA_NODE_BLOCK_SIZE;
1497
1498 dfa_attrib_array_clear(&(graph->attributes));
1499}
1500
1501
1502/* dfa_graph_build_level() builds a level of a graph. Level `n' is a set of
1503 * nodes which correspond to n's element of a string. When matching using a
1504 * dfa, nodes of level `n' are only checked at (n + 1)'s iteration (root node
1505 * is considered to be level -1, but is matched at iteration 0).
1506 */
1507static void
1508dfa_graph_build_level(dfa_graph *graph, char **strings, int level,
1509 dfa_node *terminal_node,
1510 dfa_attrib_array *passing_strings_array)
1511{
1512 int save_num_nodes = graph->num_nodes;
1513 dfa_attrib_block *cutoff_point;
1514 dfa_node *node;
1515 dfa_node *this_terminal_node = dfa_iterator_reset(graph);
1516
1517 cutoff_point = passing_strings_array->last_block;
1518 dfa_hash_clear();
1519
1520 /* Walk through all nodes of the previous level (backwards, but that doesn't
1521 * matter - it's just because iterator works that way).
1522 */
1523 for (node = this_terminal_node; node != terminal_node; node = dfa_iterate()) {
1524 int k;
1525 int num_masks = 0;
1526 char mask[4];
1527 dfa_attrib *passing_string;
1528 dfa_attrib **link = &(node->attributes);
1529 dfa_attrib *new_passing_strings[4];
1530 dfa_attrib **new_link[4];
1531
1532 /* Calculate all different masks for subnodes. For instance, if there are
1533 * three strings passing through a node of level 1, say "X$...", "Xx..."
1534 * and "XO...", there will be three masks: 8 (stands for '#'), 5 ('X' and
1535 * '.') and 2 ('O'). String "X$..." will pass further through all three
1536 * subnodes, "Xx..." - through subnode corresponding to mask 5 and string
1537 * "XO..." - through subnode corresponding to mask 2.
1538 */
1539 for (passing_string = node->passing_strings;
1540 passing_string && num_masks < 4;
1541 passing_string = passing_string->next) {
1542 int index = passing_string->string_index;
1543 char string_mask = strings[index][level];
1544
1545 if (string_mask) {
1546 int limit = num_masks;
1547
1548 for (k = 0; k < limit; k++) {
1549 char common_branches = string_mask & mask[k];
1550
1551 if (common_branches && common_branches != mask[k]) {
1552 /* Split a mask, since the string passes through a "part" of it. */
1553 mask[k] ^= common_branches;
1554 mask[num_masks++] = common_branches;
1555 }
1556
1557 string_mask ^= common_branches;
1558 }
1559
1560 if (string_mask) {
1561 /* If there is no mask corresponding to a (part) of the string's
1562 * mask, add it now.
1563 */
1564 mask[num_masks++] = string_mask;
1565 }
1566 }
1567 else {
1568 /* If the string ends at this level, add its index to the list of
1569 * matched strings of the current node. Not used at the moment,
1570 * since this builder isn't used for actual DFA building.
1571 */
1572 *link = dfa_attrib_new(&(graph->attributes), index);
1573 link = &((*link)->next);
1574 }
1575 }
1576
1577 for (k = 0; k < num_masks; k++)
1578 new_link[k] = &(new_passing_strings[k]);
1579
1580 /* Now, for each mask, create a list of all strings which will follow it
1581 * (pass through a node corresponding to it). It is possible to merge this
1582 * loop with the previous one, but it is simplier to keep them separated.
1583 */
1584 for (passing_string = node->passing_strings; passing_string;
1585 passing_string = passing_string->next) {
1586 int index = passing_string->string_index;
1587
1588 for (k = 0; k < num_masks; k++) {
1589 if (strings[index][level] & mask[k]) {
1590 *(new_link[k]) = dfa_attrib_new(passing_strings_array, index);
1591 new_link[k] = &((*(new_link[k]))->next);
1592 }
1593 }
1594 }
1595
1596 /* Finally, create new nodes for the masks when necessary. */
1597 for (k = 0; k < num_masks; k++) {
1598 int i;
1599
1600 /* Maybe we have already added such a node? */
1601 dfa_node *new_node = dfa_hash_search(new_passing_strings[k]);
1602
1603 if (!new_node) {
1604 /* If not, create it, save the list of passing strings and add the
1605 * new node to hash table.
1606 */
1607 new_node = dfa_node_new(graph);
1608 new_node->passing_strings = new_passing_strings[k];
1609
1610 dfa_hash_add_node(new_node);
1611 }
1612
1613 /* At this moment we convert the masks to actual transitions. These are
1614 * also unused till we use this builder for actual DFA creation.
1615 */
1616 for (i = 0; i < 4; i++) {
1617 if (mask[k] & (1 << i))
1618 node->branch[i] = new_node;
1619 }
1620 }
1621 }
1622
1623 /* Free the lists of passing strings for the previous level. Useful if we
1624 * building an exceptionally huge DFA.
1625 */
1626 dfa_attrib_array_partially_clear(cutoff_point);
1627
1628 if (graph->num_nodes != save_num_nodes) {
1629 /* If we have added any nodes, this level is not the last one. */
1630 dfa_graph_build_level(graph, strings, level + 1, this_terminal_node,
1631 passing_strings_array);
1632 }
1633}
1634
1635
1636/* Convert a pattern to a string of masks. */
1637static char *
1638dfa_prepare_string(const char *string)
1639{
1640 int k;
1641 int l = strlen(string);
1642 char *dfa_string = malloc(l + 1);
1643 assert(dfa_string);
1644
1645 for (k = 0; k < l; k++) {
1646 switch (string[k]) {
1647 case '$': dfa_string[k] = 15; break; /* 1111 */
1648 case '-':
1649 case '|':
1650 case '+':
1651 case '#': dfa_string[k] = 8; break; /* 1000 */
1652 case '.':
1653 case ',':
1654 case '!':
1655 case 'a': dfa_string[k] = 1; break; /* 0001 */
1656 case '?': dfa_string[k] = 7; break; /* 0111 */
1657 case 'O': dfa_string[k] = 2; break; /* 0010 */
1658 case 'X': dfa_string[k] = 4; break; /* 0100 */
1659 case 'o': dfa_string[k] = 3; break; /* 0011 */
1660 case 'x': dfa_string[k] = 5; break; /* 0101 */
1661
1662 default: assert(0); /* Shouldn't happen. */
1663 }
1664 }
1665
1666 dfa_string[l] = 0;
1667 return dfa_string;
1668}
1669
1670
1671/* Initialize a dfa_patterns structure. */
1672void
1673dfa_patterns_reset(dfa_patterns *patterns)
1674{
1675 patterns->num_patterns = 0;
1676 patterns->patterns = NULL;
1677 patterns->last_pattern = NULL;
1678
1679 dfa_graph_reset(&(patterns->graph));
1680}
1681
1682
1683/* Clear the graph and reset all fields of a dfa_patterns structure. */
1684void
1685dfa_patterns_clear(dfa_patterns *patterns)
1686{
1687 dfa_pattern *pattern = patterns->patterns;
1688
1689 while (pattern) {
1690 int k;
1691 dfa_pattern *next = pattern->next;
1692
1693 for (k = 0; k < pattern->num_variations; k++)
1694 free(pattern->variation[k]);
1695
1696 free(pattern);
1697 pattern = next;
1698 }
1699
1700 patterns->num_patterns = 0;
1701 patterns->patterns = NULL;
1702 patterns->last_pattern = NULL;
1703
1704 dfa_graph_clear(&(patterns->graph));
1705}
1706
1707
1708/* Add a pattern to a list. If `index' is equal to the index of the last
1709 * added pattern, add a variation to that pattern instead.
1710 */
1711void
1712dfa_patterns_add_pattern(dfa_patterns *patterns, const char *string, int index)
1713{
1714 dfa_pattern *pattern = NULL;
1715
1716 if (index == patterns->num_patterns - 1) {
1717 assert(patterns->last_pattern);
1718 assert(patterns->last_pattern->num_variations < 8);
1719
1720 pattern = patterns->last_pattern;
1721 }
1722 else {
1723 assert(patterns->num_patterns <= index);
1724
1725 while (patterns->num_patterns <= index) {
1726 patterns->num_patterns++;
1727 pattern = malloc(sizeof(*pattern));
1728 pattern->num_variations = 0;
1729
1730 if (patterns->last_pattern)
1731 patterns->last_pattern->next = pattern;
1732 else
1733 patterns->patterns = pattern;
1734 patterns->last_pattern = pattern;
1735 }
1736
1737 pattern->current_variation = 0;
1738 pattern->next = NULL;
1739 }
1740
1741 pattern->variation[pattern->num_variations++] = dfa_prepare_string(string);
1742}
1743
1744
1745/* Set the variation of the last pattern. Can be used in actual DFA building
1746 * or to set a hint (results of the previous optimization) for optimization.
1747 */
1748void
1749dfa_patterns_set_last_pattern_variation(dfa_patterns *patterns, int variation)
1750{
1751 assert(patterns->last_pattern);
1752 assert(patterns->last_pattern->num_variations > variation);
1753
1754 patterns->last_pattern->current_variation = variation;
1755}
1756
1757
1758/* Make the shortest variation of the last pattern its current variation. It
1759 * is used as a starting point in DFA optimization process.
1760 */
1761void
1762dfa_patterns_select_shortest_variation(dfa_patterns *patterns)
1763{
1764 int k;
1765 int min_length;
1766 dfa_pattern *pattern = patterns->last_pattern;
1767 assert(pattern);
1768
1769 pattern->current_variation = 0;
1770 min_length = strlen(pattern->variation[0]);
1771 for (k = 1; k < pattern->num_variations; k++) {
1772 int length = strlen(pattern->variation[k]);
1773
1774 if (length < min_length) {
1775 pattern->current_variation = k;
1776 min_length = length;
1777 }
1778 }
1779}
1780
1781
1782/* Build a DFA graph for a list of patterns. */
1783void
1784dfa_patterns_build_graph(dfa_patterns *patterns)
1785{
1786 int k = 0;
1787 char **strings;
1788 dfa_attrib_array passing_strings_array;
1789 dfa_attrib **link;
1790 dfa_node *error_state;
1791 dfa_graph *graph = &(patterns->graph);
1792 dfa_pattern *pattern;
1793
1794 strings = malloc(patterns->num_patterns * sizeof(*strings));
1795 assert(strings);
1796
1797 dfa_graph_clear(graph);
1798 dfa_attrib_array_reset(&passing_strings_array);
1799
1800 /* Error state node is used as a terminator for level -1 (root node). */
1801 error_state = dfa_node_new(graph);
1802 graph->root = dfa_node_new(graph);
1803
1804 /* Add all strings as passing through root node (level -1). */
1805 link = &(graph->root->passing_strings);
1806 for (pattern = patterns->patterns; pattern; pattern = pattern->next, k++) {
1807 if (pattern->num_variations > 0) {
1808 assert(pattern->current_variation < pattern->num_variations);
1809 strings[k] = pattern->variation[pattern->current_variation];
1810
1811 *link = dfa_attrib_new(&passing_strings_array, k);
1812 link = &((*link)->next);
1813 }
1814 else
1815 strings[k] = NULL;
1816 }
1817
1818 dfa_graph_build_level(graph, strings, 0, error_state, &passing_strings_array);
1819
1820 free(strings);
1821 dfa_attrib_array_clear(&passing_strings_array);
1822}
1823
1824
1825/* dfa_patterns_optimize_variations() tries to reduce the size of DFA by
1826 * altering pattern variations (in fact, transformations). The algorithm
1827 * is to change several patterns' variations and if this happens to give
1828 * size reduction, to keep the change, otherwise, revert.
1829 *
1830 * This function contains many heuristically chosen values for variation
1831 * changing probability etc. They have been chosen by observing algorithm
1832 * effectiveness and seem to be very good.
1833 *
1834 * Note that we subtract 1 from the number of nodes to be consistent with
1835 * the standard builder, which doesn't count error state.
1836 */
1837int *
1838dfa_patterns_optimize_variations(dfa_patterns *patterns, int iterations)
1839{
1840 int k = 0;
1841 int failed_iterations = 0;
1842 int min_nodes_so_far;
1843 int num_nodes_original;
1844 int *best_variations;
1845 double lower_limit = 2.0 / patterns->num_patterns;
1846 double upper_limit = 6.0 / patterns->num_patterns;
1847 double change_probability = 4.0 / patterns->num_patterns;
1848 dfa_pattern *pattern;
1849
1850 best_variations = malloc(patterns->num_patterns * sizeof(*best_variations));
1851 assert(best_variations);
1852 for (pattern = patterns->patterns; pattern; pattern = pattern->next, k++)
1853 best_variations[k] = pattern->current_variation;
1854
1855 dfa_patterns_build_graph(patterns);
1856 num_nodes_original = patterns->graph.num_nodes;
1857 min_nodes_so_far = num_nodes_original;
1858
1859 fprintf(stderr, "Original number of DFA states: %d\n", min_nodes_so_far - 1);
1860 fprintf(stderr, "Trying to optimize in %d iterations\n", iterations);
1861
1862 gg_srand(num_nodes_original + patterns->num_patterns);
1863
1864 while (iterations--) {
1865 int changed_variations = 0;
1866 int k = 0;
1867
1868 /* Randomly change some variations. */
1869 for (pattern = patterns->patterns; pattern; pattern = pattern->next, k++) {
1870 if (gg_drand() < change_probability && pattern->num_variations > 1) {
1871 int new_variation = gg_rand() % (pattern->num_variations - 1);
1872 if (new_variation >= pattern->current_variation)
1873 new_variation++;
1874 pattern->current_variation = new_variation;
1875 changed_variations++;
1876 }
1877 else
1878 pattern->current_variation = best_variations[k];
1879 }
1880
1881 if (changed_variations == 0) {
1882 iterations++;
1883 continue;
1884 }
1885
1886 fprintf(stderr, ".");
1887 dfa_patterns_build_graph(patterns);
1888
1889 if (patterns->graph.num_nodes < min_nodes_so_far) {
1890 /* If the new set of variations produces smaller dfa, save it. */
1891 int k = 0;
1892 for (pattern = patterns->patterns; pattern; pattern = pattern->next, k++)
1893 best_variations[k] = pattern->current_variation;
1894
1895 fprintf(stderr, "\nOptimized: %d => %d states (%d iterations left)\n",
1896 min_nodes_so_far - 1, patterns->graph.num_nodes - 1, iterations);
1897 min_nodes_so_far = patterns->graph.num_nodes;
1898 failed_iterations = 0;
1899 }
1900 else
1901 failed_iterations++;
1902
1903 if (failed_iterations >= 30) {
1904 /* If haven't succeded in 30 last iterations, try to alter variation
1905 * change probability.
1906 */
1907 double delta = gg_drand() / patterns->num_patterns;
1908 if (change_probability > upper_limit
1909 || (change_probability >= lower_limit && gg_rand() % 2 == 0))
1910 delta = -delta;
1911
1912 change_probability += delta;
1913 failed_iterations = 0;
1914 }
1915 }
1916
1917 fprintf(stderr, "\nTotal optimization result: %d => %d states\n",
1918 num_nodes_original - 1, min_nodes_so_far - 1);
1919
1920 dfa_graph_clear(&(patterns->graph));
1921 return best_variations;
1922}
1923
1924
1925/*
1926 * Local Variables:
1927 * tab-width: 8
1928 * c-basic-offset: 2
1929 * End:
1930 */