Commit | Line | Data |
---|---|---|
49373f41 EW |
1 | /* |
2 | * Copyright (c) 1989 Regents of the University of California. | |
3 | * All rights reserved. | |
4 | * | |
3dd3a9e5 KB |
5 | * This code is derived from software contributed to Berkeley by |
6 | * Edward Wang at The University of California, Berkeley. | |
7 | * | |
87f529ec | 8 | * %sccs.include.redist.c% |
49373f41 EW |
9 | */ |
10 | ||
11 | #ifndef lint | |
af14aa25 | 12 | static char sccsid[] = "@(#)compress.c 3.6 (Berkeley) %G%"; |
49373f41 EW |
13 | #endif /* not lint */ |
14 | ||
15 | #include "ww.h" | |
49373f41 EW |
16 | #include "tt.h" |
17 | ||
17f96a0f EW |
18 | /* special */ |
19 | #include <stdio.h> | |
20 | int cc_trace = 0; | |
21 | FILE *cc_trace_fp; | |
22 | ||
0ab446d2 EW |
23 | /* tunable parameters */ |
24 | ||
17f96a0f EW |
25 | int cc_reverse = 1; |
26 | int cc_sort = 0; | |
27 | int cc_chop = 0; | |
0ab446d2 | 28 | |
17f96a0f EW |
29 | int cc_token_max = 8; /* <= TOKEN_MAX */ |
30 | int cc_token_min = 2; /* > tt.tt_put_token_cost */ | |
31 | int cc_npass0 = 1; | |
32 | int cc_npass1 = 1; | |
0ab446d2 | 33 | |
17f96a0f | 34 | int cc_bufsize = 1024 * 3; /* XXX, or 80 * 24 * 2 */ |
0ab446d2 | 35 | |
17f96a0f | 36 | int cc_ntoken = 8192; |
0ab446d2 | 37 | |
17f96a0f EW |
38 | #define cc_weight XXX |
39 | #ifndef cc_weight | |
40 | int cc_weight = 0; | |
0ab446d2 EW |
41 | #endif |
42 | ||
43 | #define TOKEN_MAX 16 | |
44 | ||
17f96a0f | 45 | struct cc { |
49373f41 | 46 | char string[TOKEN_MAX]; |
0ab446d2 EW |
47 | char length; |
48 | char flag; | |
17f96a0f | 49 | #ifndef cc_weight |
0ab446d2 EW |
50 | short weight; |
51 | #endif | |
52 | long time; /* time last seen */ | |
53 | short bcount; /* count in this buffer */ | |
54 | short ccount; /* count in compression */ | |
55 | short places; /* places in the buffer */ | |
56 | short code; /* token code */ | |
17f96a0f EW |
57 | struct cc *qforw, *qback; |
58 | struct cc *hforw, **hback; | |
49373f41 EW |
59 | }; |
60 | ||
17f96a0f EW |
61 | short cc_thresholds[TOKEN_MAX + 1]; |
62 | #define thresh(length) (cc_thresholds[length]) | |
0ab446d2 | 63 | #define threshp(code, count, length) \ |
17f96a0f | 64 | ((code) >= 0 || (short) (count) >= cc_thresholds[length]) |
0ab446d2 | 65 | |
17f96a0f EW |
66 | #ifndef cc_weight |
67 | short cc_wthresholds[TOKEN_MAX + 1]; | |
68 | #define wthresh(length) (cc_wthresholds[length]) | |
69 | #define wthreshp(weight, length) ((short) (weight) >= cc_wthresholds[length]) | |
0ab446d2 EW |
70 | #else |
71 | #define wthreshp(weight, length) (0) | |
72 | #endif | |
73 | ||
17f96a0f EW |
74 | #ifndef cc_weight |
75 | short cc_wlimits[TOKEN_MAX + 1]; | |
76 | #define wlimit(length) (cc_wlimits[length]) | |
0ab446d2 EW |
77 | #endif |
78 | ||
79 | #define put_token_score(length) ((length) - tt.tt_put_token_cost) | |
80 | ||
17f96a0f | 81 | int cc_score_adjustments[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */ |
0ab446d2 EW |
82 | #define score_adjust(score, p) \ |
83 | do { \ | |
84 | int length = (p)->length; \ | |
85 | int ccount = (p)->ccount; \ | |
86 | if (threshp((p)->code, ccount, length) || \ | |
87 | wthreshp((p)->weight, length)) /* XXX */ \ | |
88 | (score) -= length - tt.tt_put_token_cost; \ | |
89 | else \ | |
17f96a0f | 90 | (score) += cc_score_adjustments[length][ccount]; \ |
0ab446d2 EW |
91 | } while (0) |
92 | ||
17f96a0f | 93 | int cc_initial_scores[TOKEN_MAX + 1][8]; /* XXX, 8 > max of cc_thresholds */ |
0ab446d2 | 94 | |
17f96a0f | 95 | struct cc cc_q0a, cc_q0b, cc_q1a, cc_q1b; |
0ab446d2 | 96 | |
17f96a0f | 97 | #define qinsert(p1, p2) \ |
0ab446d2 | 98 | do { \ |
17f96a0f EW |
99 | register struct cc *forw = (p1)->qforw; \ |
100 | register struct cc *back = (p1)->qback; \ | |
0ab446d2 EW |
101 | back->qforw = forw; \ |
102 | forw->qback = back; \ | |
17f96a0f EW |
103 | forw = (p2)->qforw; \ |
104 | (p1)->qforw = forw; \ | |
105 | forw->qback = (p1); \ | |
106 | (p2)->qforw = (p1); \ | |
107 | (p1)->qback = (p2); \ | |
0ab446d2 EW |
108 | } while (0) |
109 | ||
17f96a0f | 110 | #define qinsertq(q, p) \ |
0ab446d2 | 111 | ((q)->qforw == (q) ? 0 : \ |
17f96a0f EW |
112 | ((q)->qback->qforw = (p)->qforw, \ |
113 | (p)->qforw->qback = (q)->qback, \ | |
114 | (q)->qforw->qback = (p), \ | |
115 | (p)->qforw = (q)->qforw, \ | |
0ab446d2 EW |
116 | (q)->qforw = (q), \ |
117 | (q)->qback = (q))) | |
118 | ||
119 | #define H (14) | |
120 | #define HSIZE (1 << H) | |
17f96a0f EW |
121 | #define hash(h, c) ((((h) >> H - 8 | (h) << 8) ^ (c)) & HSIZE - 1) |
122 | ||
123 | char *cc_buffer; | |
124 | struct cc **cc_output; /* the output array */ | |
125 | short *cc_places[TOKEN_MAX + 1]; | |
126 | short *cc_hashcodes; /* for computing hashcodes */ | |
127 | struct cc **cc_htab; /* the hash table */ | |
128 | struct cc **cc_tokens; /* holds all the active tokens */ | |
129 | struct cc_undo { | |
130 | struct cc **pos; | |
131 | struct cc *val; | |
132 | } *cc_undo; | |
133 | ||
134 | long cc_time, cc_time0; | |
135 | ||
136 | char *cc_tt_ob, *cc_tt_obe; | |
137 | ||
138 | ccinit() | |
49373f41 | 139 | { |
0ab446d2 | 140 | register i, j; |
17f96a0f | 141 | register struct cc *p; |
49373f41 | 142 | |
17f96a0f EW |
143 | if (tt.tt_token_max > cc_token_max) |
144 | tt.tt_token_max = cc_token_max; | |
145 | if (tt.tt_token_min < cc_token_min) | |
146 | tt.tt_token_min = cc_token_min; | |
0ab446d2 | 147 | if (tt.tt_token_min > tt.tt_token_max) { |
49373f41 | 148 | tt.tt_ntoken = 0; |
0ab446d2 | 149 | return 0; |
49373f41 | 150 | } |
17f96a0f EW |
151 | if (tt.tt_ntoken > cc_ntoken / 2) /* not likely */ |
152 | tt.tt_ntoken = cc_ntoken / 2; | |
0ab446d2 | 153 | #define C(x) (sizeof (x) / sizeof *(x)) |
17f96a0f | 154 | for (i = 0; i < C(cc_thresholds); i++) { |
0ab446d2 EW |
155 | int h = i - tt.tt_put_token_cost; |
156 | if (h > 0) | |
17f96a0f | 157 | cc_thresholds[i] = |
0ab446d2 EW |
158 | (tt.tt_set_token_cost + 1 + h - 1) / h + 1; |
159 | else | |
17f96a0f | 160 | cc_thresholds[i] = 0; |
0ab446d2 | 161 | } |
17f96a0f EW |
162 | for (i = 0; i < C(cc_score_adjustments); i++) { |
163 | int t = cc_thresholds[i]; | |
164 | for (j = 0; j < C(*cc_score_adjustments); j++) { | |
0ab446d2 | 165 | if (j >= t) |
17f96a0f | 166 | cc_score_adjustments[i][j] = |
0ab446d2 EW |
167 | - (i - tt.tt_put_token_cost); |
168 | else if (j < t - 1) | |
17f96a0f | 169 | cc_score_adjustments[i][j] = 0; |
0ab446d2 EW |
170 | else |
171 | /* | |
172 | * cost now is | |
173 | * length * (ccount + 1) a | |
174 | * cost before was | |
175 | * set-token-cost + length + | |
176 | * ccount * put-token-cost b | |
177 | * the score adjustment is (b - a) | |
178 | */ | |
17f96a0f | 179 | cc_score_adjustments[i][j] = |
0ab446d2 EW |
180 | tt.tt_set_token_cost + i + |
181 | j * tt.tt_put_token_cost - | |
182 | i * (j + 1); | |
183 | if (j >= t) | |
17f96a0f | 184 | cc_initial_scores[i][j] = 0; |
0ab446d2 EW |
185 | else |
186 | /* | |
187 | * - (set-token-cost + | |
188 | * (length - put-token-cost) - | |
189 | * (length - put-token-cost) * ccount) | |
190 | */ | |
17f96a0f | 191 | cc_initial_scores[i][j] = |
0ab446d2 EW |
192 | - (tt.tt_set_token_cost + |
193 | (i - tt.tt_put_token_cost) - | |
194 | (i - tt.tt_put_token_cost) * j); | |
195 | } | |
196 | } | |
17f96a0f EW |
197 | #ifndef cc_weight |
198 | for (i = 1; i < C(cc_wthresholds); i++) { | |
199 | cc_wthresholds[i] = | |
0ab446d2 EW |
200 | ((tt.tt_set_token_cost + tt.tt_put_token_cost) / i + |
201 | i / 5 + 1) * | |
17f96a0f EW |
202 | cc_weight + 1; |
203 | cc_wlimits[i] = cc_wthresholds[i] + cc_weight; | |
0ab446d2 EW |
204 | } |
205 | #endif | |
206 | #undef C | |
17f96a0f EW |
207 | if ((cc_output = (struct cc **) |
208 | malloc((unsigned) cc_bufsize * sizeof *cc_output)) == 0) | |
0ab446d2 | 209 | goto nomem; |
17f96a0f EW |
210 | if ((cc_hashcodes = (short *) |
211 | malloc((unsigned) cc_bufsize * sizeof *cc_hashcodes)) == 0) | |
0ab446d2 | 212 | goto nomem; |
17f96a0f | 213 | if ((cc_htab = (struct cc **) malloc(HSIZE * sizeof *cc_htab)) == 0) |
49373f41 | 214 | goto nomem; |
17f96a0f | 215 | if ((cc_tokens = (struct cc **) |
0ab446d2 | 216 | malloc((unsigned) |
17f96a0f EW |
217 | (cc_ntoken + tt.tt_token_max - tt.tt_token_min + 1) * |
218 | sizeof *cc_tokens)) == 0) | |
49373f41 | 219 | goto nomem; |
17f96a0f EW |
220 | if ((cc_undo = (struct cc_undo *) |
221 | malloc((unsigned) cc_bufsize * sizeof *cc_undo)) == 0) | |
0ab446d2 EW |
222 | goto nomem; |
223 | for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) | |
17f96a0f EW |
224 | if ((cc_places[i] = (short *) |
225 | malloc((unsigned) cc_bufsize * sizeof **cc_places)) == 0) | |
0ab446d2 | 226 | goto nomem; |
17f96a0f EW |
227 | cc_q0a.qforw = cc_q0a.qback = &cc_q0a; |
228 | cc_q0b.qforw = cc_q0b.qback = &cc_q0b; | |
229 | cc_q1a.qforw = cc_q1a.qback = &cc_q1a; | |
230 | cc_q1b.qforw = cc_q1b.qback = &cc_q1b; | |
231 | if ((p = (struct cc *) malloc((unsigned) cc_ntoken * sizeof *p)) == 0) | |
0ab446d2 EW |
232 | goto nomem; |
233 | for (i = 0; i < tt.tt_ntoken; i++) { | |
234 | p->code = i; | |
235 | p->time = -1; | |
17f96a0f EW |
236 | p->qback = cc_q0a.qback; |
237 | p->qforw = &cc_q0a; | |
0ab446d2 | 238 | p->qback->qforw = p; |
17f96a0f | 239 | cc_q0a.qback = p; |
0ab446d2 | 240 | p++; |
49373f41 | 241 | } |
17f96a0f | 242 | for (; i < cc_ntoken; i++) { |
0ab446d2 EW |
243 | p->code = -1; |
244 | p->time = -1; | |
17f96a0f EW |
245 | p->qback = cc_q1a.qback; |
246 | p->qforw = &cc_q1a; | |
0ab446d2 | 247 | p->qback->qforw = p; |
17f96a0f | 248 | cc_q1a.qback = p; |
0ab446d2 | 249 | p++; |
49373f41 | 250 | } |
17f96a0f EW |
251 | cc_tt_ob = tt_ob; |
252 | cc_tt_obe = tt_obe; | |
253 | if ((cc_buffer = malloc((unsigned) cc_bufsize)) == 0) | |
254 | goto nomem; | |
49373f41 EW |
255 | return 0; |
256 | nomem: | |
257 | wwerrno = WWE_NOMEM; | |
258 | return -1; | |
259 | } | |
260 | ||
17f96a0f | 261 | ccstart() |
49373f41 | 262 | { |
17f96a0f EW |
263 | register struct cc *p; |
264 | int ccflush(); | |
265 | ||
266 | (*tt.tt_flush)(); | |
267 | tt_obp = tt_ob = cc_buffer; | |
268 | tt_obe = tt_ob + cc_bufsize; | |
269 | tt.tt_flush = ccflush; | |
270 | bzero((char *) cc_htab, HSIZE * sizeof *cc_htab); | |
271 | for (p = cc_q0a.qforw; p != &cc_q0a; p = p->qforw) | |
0ab446d2 | 272 | p->hback = 0; |
17f96a0f | 273 | for (p = cc_q1a.qforw; p != &cc_q1a; p = p->qforw) |
0ab446d2 | 274 | p->hback = 0; |
17f96a0f EW |
275 | if (cc_trace) |
276 | cc_trace_fp = fopen("window-trace", "a"); | |
49373f41 EW |
277 | } |
278 | ||
17f96a0f | 279 | ccend() |
49373f41 | 280 | { |
17f96a0f EW |
281 | int ttflush(); |
282 | ||
283 | (*tt.tt_flush)(); | |
284 | tt_obp = tt_ob = cc_tt_ob; | |
285 | tt_obe = cc_tt_obe; | |
286 | tt.tt_flush = ttflush; | |
287 | if (cc_trace_fp != NULL) { | |
288 | (void) fclose(cc_trace_fp); | |
289 | cc_trace_fp = NULL; | |
290 | } | |
291 | } | |
292 | ||
293 | ccflush() | |
294 | { | |
295 | int bufsize = tt_obp - tt_ob; | |
0ab446d2 | 296 | int n; |
17f96a0f | 297 | int ttflush(); |
0ab446d2 | 298 | |
17f96a0f EW |
299 | if (tt_ob != cc_buffer) |
300 | abort(); | |
301 | if (cc_trace_fp != NULL) { | |
302 | (void) fwrite(tt_ob, 1, bufsize, cc_trace_fp); | |
303 | putc(-1, cc_trace_fp); | |
304 | } | |
305 | if (bufsize < tt.tt_token_min) { | |
306 | ttflush(); | |
0ab446d2 | 307 | return; |
0ab446d2 | 308 | } |
17f96a0f EW |
309 | tt_obp = tt_ob = cc_tt_ob; |
310 | tt_obe = cc_tt_obe; | |
311 | tt.tt_flush = ttflush; | |
312 | cc_time0 = cc_time; | |
313 | cc_time += bufsize; | |
314 | n = cc_sweep_phase(cc_buffer, bufsize, cc_tokens); | |
315 | cc_compress_phase(cc_output, bufsize, cc_tokens, n); | |
316 | cc_output_phase(cc_buffer, cc_output, bufsize); | |
317 | ttflush(); | |
318 | tt_obp = tt_ob = cc_buffer; | |
319 | tt_obe = cc_buffer + cc_bufsize; | |
320 | tt.tt_flush = ccflush; | |
0ab446d2 EW |
321 | } |
322 | ||
17f96a0f | 323 | cc_sweep_phase(buffer, bufsize, tokens) |
0ab446d2 | 324 | char *buffer; |
17f96a0f | 325 | struct cc **tokens; |
0ab446d2 | 326 | { |
17f96a0f | 327 | register struct cc **pp = tokens; |
0ab446d2 EW |
328 | register i, n; |
329 | #ifdef STATS | |
330 | int nn, ii; | |
331 | #endif | |
332 | ||
333 | #ifdef STATS | |
17f96a0f EW |
334 | if (verbose >= 0) |
335 | time_begin(); | |
0ab446d2 EW |
336 | if (verbose > 0) |
337 | printf("Sweep:"); | |
338 | #endif | |
17f96a0f | 339 | cc_sweep0(buffer, bufsize, tt.tt_token_min - 1); |
0ab446d2 | 340 | #ifdef STATS |
17f96a0f | 341 | ntoken_stat = 0; |
0ab446d2 EW |
342 | nn = 0; |
343 | ii = 0; | |
344 | #endif | |
345 | for (i = tt.tt_token_min; i <= tt.tt_token_max; i++) { | |
346 | #ifdef STATS | |
347 | if (verbose > 0) { | |
348 | if (ii > 7) { | |
349 | printf("\n "); | |
350 | ii = 0; | |
351 | } | |
352 | ii++; | |
353 | printf(" (%d", i); | |
354 | (void) fflush(stdout); | |
355 | } | |
356 | #endif | |
17f96a0f | 357 | n = cc_sweep(buffer, bufsize, pp, i); |
0ab446d2 EW |
358 | pp += n; |
359 | #ifdef STATS | |
360 | if (verbose > 0) { | |
361 | if (--n > 0) { | |
362 | printf(" %d", n); | |
363 | nn += n; | |
364 | } | |
365 | putchar(')'); | |
366 | } | |
367 | #endif | |
368 | } | |
17f96a0f | 369 | qinsertq(&cc_q1b, &cc_q1a); |
0ab446d2 EW |
370 | #ifdef STATS |
371 | if (verbose > 0) | |
372 | printf("\n %d tokens, %d candidates\n", | |
17f96a0f EW |
373 | ntoken_stat, nn); |
374 | if (verbose >= 0) | |
375 | time_end(); | |
0ab446d2 EW |
376 | #endif |
377 | return pp - tokens; | |
49373f41 EW |
378 | } |
379 | ||
17f96a0f | 380 | cc_sweep0(buffer, n, length) |
0ab446d2 | 381 | char *buffer; |
49373f41 | 382 | { |
17f96a0f EW |
383 | register char *p; |
384 | register short *hc; | |
0ab446d2 | 385 | register i; |
17f96a0f EW |
386 | register short c; |
387 | register short pc = tt.tt_padc; | |
0ab446d2 | 388 | |
17f96a0f EW |
389 | /* n and length are at least 1 */ |
390 | p = buffer++; | |
391 | hc = cc_hashcodes; | |
392 | i = n; | |
393 | do { | |
394 | if ((*hc++ = *p++) == pc) | |
395 | hc[-1] = -1; | |
396 | } while (--i); | |
397 | while (--length) { | |
398 | p = buffer++; | |
399 | hc = cc_hashcodes; | |
400 | for (i = n--; --i;) { | |
401 | if ((c = *p++) == pc || *hc < 0) | |
402 | c = -1; | |
403 | else | |
404 | c = hash(*hc, c); | |
405 | *hc++ = c; | |
0ab446d2 | 406 | } |
17f96a0f | 407 | } |
0ab446d2 EW |
408 | } |
409 | ||
17f96a0f | 410 | cc_sweep(buffer, bufsize, tokens, length) |
0ab446d2 | 411 | char *buffer; |
17f96a0f | 412 | struct cc **tokens; |
0ab446d2 EW |
413 | register length; |
414 | { | |
17f96a0f | 415 | register struct cc *p; |
0ab446d2 | 416 | register char *cp; |
49373f41 | 417 | register i; |
0ab446d2 | 418 | short *hc; |
17f96a0f EW |
419 | short *places = cc_places[length]; |
420 | struct cc **pp = tokens; | |
0ab446d2 | 421 | short threshold = thresh(length); |
17f96a0f | 422 | #ifndef cc_weight |
0ab446d2 EW |
423 | short wthreshold = wthresh(length); |
424 | short limit = wlimit(length); | |
425 | #endif | |
426 | int time; | |
17f96a0f | 427 | short pc = tt.tt_padc; |
49373f41 | 428 | |
0ab446d2 EW |
429 | i = length - 1; |
430 | bufsize -= i; | |
431 | cp = buffer + i; | |
17f96a0f EW |
432 | hc = cc_hashcodes; |
433 | time = cc_time0; | |
0ab446d2 | 434 | for (i = 0; i < bufsize; i++, time++) { |
17f96a0f | 435 | struct cc **h; |
0ab446d2 EW |
436 | |
437 | { | |
438 | register short *hc1 = hc; | |
17f96a0f EW |
439 | register short c = *cp++; |
440 | register short hh; | |
441 | if ((hh = *hc1) < 0 || c == pc) { | |
0ab446d2 EW |
442 | *hc1++ = -1; |
443 | hc = hc1; | |
0ab446d2 | 444 | continue; |
49373f41 | 445 | } |
17f96a0f EW |
446 | h = cc_htab + (*hc1++ = hash(hh, c)); |
447 | hc = hc1; | |
0ab446d2 EW |
448 | } |
449 | for (p = *h; p != 0; p = p->hforw) | |
450 | if (p->length == (char) length) { | |
451 | register char *p1 = p->string; | |
452 | register char *p2 = cp - length; | |
453 | register n = length; | |
454 | do | |
455 | if (*p1++ != *p2++) | |
456 | goto fail; | |
457 | while (--n); | |
458 | break; | |
459 | fail:; | |
460 | } | |
461 | if (p == 0) { | |
17f96a0f EW |
462 | p = cc_q1a.qback; |
463 | if (p == &cc_q1a || | |
464 | p->time >= cc_time0 && p->length == (char) length) | |
0ab446d2 EW |
465 | continue; |
466 | if (p->hback != 0) | |
467 | if ((*p->hback = p->hforw) != 0) | |
468 | p->hforw->hback = p->hback; | |
469 | { | |
470 | register char *p1 = p->string; | |
471 | register char *p2 = cp - length; | |
472 | register n = length; | |
473 | do | |
474 | *p1++ = *p2++; | |
475 | while (--n); | |
476 | } | |
477 | p->length = length; | |
17f96a0f EW |
478 | #ifndef cc_weight |
479 | p->weight = cc_weight; | |
0ab446d2 EW |
480 | #endif |
481 | p->time = time; | |
482 | p->bcount = 1; | |
483 | p->ccount = 0; | |
484 | p->flag = 0; | |
485 | if ((p->hforw = *h) != 0) | |
486 | p->hforw->hback = &p->hforw; | |
487 | *h = p; | |
488 | p->hback = h; | |
17f96a0f | 489 | qinsert(p, &cc_q1a); |
0ab446d2 EW |
490 | places[i] = -1; |
491 | p->places = i; | |
492 | #ifdef STATS | |
17f96a0f | 493 | ntoken_stat++; |
0ab446d2 | 494 | #endif |
17f96a0f EW |
495 | } else if (p->time < cc_time0) { |
496 | #ifndef cc_weight | |
0ab446d2 | 497 | if ((p->weight += p->time - time) < 0) |
17f96a0f EW |
498 | p->weight = cc_weight; |
499 | else if ((p->weight += cc_weight) > limit) | |
0ab446d2 EW |
500 | p->weight = limit; |
501 | #endif | |
502 | p->time = time; | |
503 | p->bcount = 1; | |
504 | p->ccount = 0; | |
505 | if (p->code >= 0) { | |
506 | p->flag = 1; | |
507 | *pp++ = p; | |
508 | } else | |
17f96a0f | 509 | #ifndef cc_weight |
0ab446d2 EW |
510 | if (p->weight >= wthreshold) { |
511 | p->flag = 1; | |
512 | *pp++ = p; | |
17f96a0f | 513 | qinsert(p, &cc_q1b); |
0ab446d2 EW |
514 | } else |
515 | #endif | |
516 | { | |
517 | p->flag = 0; | |
17f96a0f | 518 | qinsert(p, &cc_q1a); |
0ab446d2 EW |
519 | } |
520 | places[i] = -1; | |
521 | p->places = i; | |
522 | #ifdef STATS | |
17f96a0f | 523 | ntoken_stat++; |
0ab446d2 EW |
524 | #endif |
525 | } else if (p->time + length > time) { | |
49373f41 | 526 | /* |
0ab446d2 EW |
527 | * overlapping token, don't count as two and |
528 | * don't update time, but do adjust weight to offset | |
529 | * the difference | |
49373f41 | 530 | */ |
17f96a0f EW |
531 | #ifndef cc_weight |
532 | if (cc_weight != 0) { /* XXX */ | |
0ab446d2 EW |
533 | p->weight += time - p->time; |
534 | if (!p->flag && p->weight >= wthreshold) { | |
535 | p->flag = 1; | |
536 | *pp++ = p; | |
17f96a0f | 537 | qinsert(p, &cc_q1b); |
0ab446d2 EW |
538 | } |
539 | } | |
540 | #endif | |
541 | places[i] = p->places; | |
542 | p->places = i; | |
543 | } else { | |
17f96a0f | 544 | #ifndef cc_weight |
0ab446d2 | 545 | if ((p->weight += p->time - time) < 0) |
17f96a0f EW |
546 | p->weight = cc_weight; |
547 | else if ((p->weight += cc_weight) > limit) | |
0ab446d2 EW |
548 | p->weight = limit; |
549 | #endif | |
550 | p->time = time; | |
551 | p->bcount++; | |
552 | if (!p->flag && | |
553 | /* code must be < 0 if flag false here */ | |
554 | (p->bcount >= threshold | |
17f96a0f | 555 | #ifndef cc_weight |
0ab446d2 EW |
556 | || p->weight >= wthreshold |
557 | #endif | |
558 | )) { | |
559 | p->flag = 1; | |
560 | *pp++ = p; | |
17f96a0f | 561 | qinsert(p, &cc_q1b); |
0ab446d2 EW |
562 | } |
563 | places[i] = p->places; | |
564 | p->places = i; | |
49373f41 | 565 | } |
49373f41 | 566 | } |
0ab446d2 EW |
567 | if ((i = pp - tokens) > 0) { |
568 | *pp = 0; | |
17f96a0f EW |
569 | if (cc_reverse) |
570 | cc_sweep_reverse(tokens, places); | |
571 | if (cc_sort && i > 1) { | |
572 | int cc_token_compare(); | |
0ab446d2 | 573 | qsort((char *) tokens, i, sizeof *tokens, |
17f96a0f | 574 | cc_token_compare); |
0ab446d2 | 575 | } |
17f96a0f EW |
576 | if (cc_chop) { |
577 | if ((i = i * cc_chop / 100) == 0) | |
0ab446d2 EW |
578 | i = 1; |
579 | tokens[i] = 0; | |
580 | } | |
581 | i++; | |
582 | } | |
583 | return i; | |
584 | } | |
585 | ||
17f96a0f EW |
586 | cc_sweep_reverse(pp, places) |
587 | register struct cc **pp; | |
0ab446d2 EW |
588 | register short *places; |
589 | { | |
17f96a0f | 590 | register struct cc *p; |
0ab446d2 EW |
591 | register short front, back, t; |
592 | ||
593 | while ((p = *pp++) != 0) { | |
594 | back = -1; | |
595 | t = p->places; | |
596 | /* the list is never empty */ | |
597 | do { | |
598 | front = places[t]; | |
599 | places[t] = back; | |
600 | back = t; | |
601 | } while ((t = front) >= 0); | |
602 | p->places = back; | |
603 | } | |
604 | } | |
605 | ||
17f96a0f EW |
606 | cc_compress_phase(output, bufsize, tokens, ntoken) |
607 | struct cc **output; | |
608 | struct cc **tokens; | |
0ab446d2 EW |
609 | { |
610 | register i; | |
611 | ||
612 | bzero((char *) output, bufsize * sizeof *output); | |
17f96a0f EW |
613 | for (i = 0; i < cc_npass0; i++) |
614 | cc_compress_phase1(output, tokens, ntoken, 0); | |
615 | for (i = 0; i < cc_npass1; i++) | |
616 | cc_compress_phase1(output, tokens, ntoken, 1); | |
617 | cc_compress_cleanup(output, bufsize); | |
0ab446d2 EW |
618 | } |
619 | ||
17f96a0f EW |
620 | cc_compress_phase1(output, tokens, ntoken, flag) |
621 | register struct cc **output; | |
622 | struct cc **tokens; | |
0ab446d2 | 623 | { |
17f96a0f EW |
624 | register struct cc **pp; |
625 | #ifdef STATS | |
af14aa25 | 626 | register int i = 0; |
17f96a0f EW |
627 | int nt = 0, cc = 0, nc = 0; |
628 | #endif | |
0ab446d2 EW |
629 | |
630 | #ifdef STATS | |
17f96a0f EW |
631 | if (verbose >= 0) |
632 | time_begin(); | |
0ab446d2 EW |
633 | if (verbose > 0) |
634 | printf("Compress:"); | |
635 | #endif | |
636 | pp = tokens; | |
637 | while (pp < tokens + ntoken) { | |
638 | #ifdef STATS | |
639 | if (verbose > 0) { | |
17f96a0f EW |
640 | ntoken_stat = 0; |
641 | ccount_stat = 0; | |
642 | ncover_stat = 0; | |
0ab446d2 EW |
643 | if (i > 2) { |
644 | printf("\n "); | |
645 | i = 0; | |
646 | } | |
647 | i++; | |
648 | printf(" (%d", (*pp)->length); | |
649 | (void) fflush(stdout); | |
650 | } | |
651 | #endif | |
17f96a0f | 652 | pp += cc_compress(output, pp, flag); |
0ab446d2 | 653 | #ifdef STATS |
17f96a0f EW |
654 | if (verbose > 0) { |
655 | printf(" %dt %du %dc)", ntoken_stat, ccount_stat, | |
656 | ncover_stat); | |
657 | nt += ntoken_stat; | |
658 | cc += ccount_stat; | |
659 | nc += ncover_stat; | |
660 | } | |
0ab446d2 EW |
661 | #endif |
662 | } | |
663 | #ifdef STATS | |
664 | if (verbose > 0) | |
17f96a0f EW |
665 | printf("\n total: (%dt %du %dc)\n", nt, cc, nc); |
666 | if (verbose >= 0) | |
667 | time_end(); | |
0ab446d2 EW |
668 | #endif |
669 | } | |
670 | ||
17f96a0f EW |
671 | cc_compress_cleanup(output, bufsize) |
672 | register struct cc **output; | |
0ab446d2 | 673 | { |
17f96a0f | 674 | register struct cc **end; |
0ab446d2 EW |
675 | |
676 | /* the previous output phase may have been interrupted */ | |
17f96a0f | 677 | qinsertq(&cc_q0b, &cc_q0a); |
0ab446d2 | 678 | for (end = output + bufsize; output < end;) { |
17f96a0f | 679 | register struct cc *p; |
0ab446d2 EW |
680 | register length; |
681 | if ((p = *output) == 0) { | |
682 | output++; | |
683 | continue; | |
684 | } | |
685 | length = p->length; | |
686 | if (!p->flag) { | |
687 | } else if (p->code >= 0) { | |
17f96a0f | 688 | qinsert(p, &cc_q0b); |
0ab446d2 EW |
689 | p->flag = 0; |
690 | } else if (p->ccount == 0) { | |
691 | *output = 0; | |
692 | } else if (p->ccount >= thresh(length) | |
17f96a0f | 693 | #ifndef cc_weight |
0ab446d2 EW |
694 | || wthreshp(p->weight, length) |
695 | #endif | |
696 | ) { | |
697 | p->flag = 0; | |
698 | } else { | |
699 | p->ccount = 0; | |
700 | *output = 0; | |
701 | } | |
702 | output += length; | |
703 | } | |
704 | } | |
705 | ||
17f96a0f EW |
706 | cc_compress(output, tokens, flag) |
707 | struct cc **output; | |
708 | struct cc **tokens; | |
0ab446d2 EW |
709 | char flag; |
710 | { | |
17f96a0f EW |
711 | struct cc **pp = tokens; |
712 | register struct cc *p = *pp++; | |
0ab446d2 EW |
713 | int length = p->length; |
714 | int threshold = thresh(length); | |
17f96a0f | 715 | #ifndef cc_weight |
0ab446d2 EW |
716 | short wthreshold = wthresh(length); |
717 | #endif | |
17f96a0f EW |
718 | short *places = cc_places[length]; |
719 | int *initial_scores = cc_initial_scores[length]; | |
0ab446d2 EW |
720 | int initial_score0 = put_token_score(length); |
721 | ||
722 | do { | |
723 | int score; | |
17f96a0f | 724 | register struct cc_undo *undop; |
0ab446d2 EW |
725 | int ccount; |
726 | #ifdef STATS | |
727 | int ncover; | |
728 | #endif | |
729 | int i; | |
730 | ||
731 | ccount = p->ccount; | |
732 | if ((short) ccount >= p->bcount) | |
733 | continue; | |
734 | if (p->code >= 0 || ccount >= threshold) | |
735 | score = 0; | |
17f96a0f | 736 | #ifndef cc_weight |
0ab446d2 EW |
737 | else if (p->weight >= wthreshold) |
738 | /* allow one fewer match than normal */ | |
739 | /* XXX, should adjust for ccount */ | |
740 | score = - tt.tt_set_token_cost; | |
741 | #endif | |
742 | else | |
743 | score = initial_scores[ccount]; | |
17f96a0f | 744 | undop = cc_undo; |
0ab446d2 EW |
745 | #ifdef STATS |
746 | ncover = 0; | |
747 | #endif | |
748 | for (i = p->places; i >= 0; i = places[i]) { | |
17f96a0f EW |
749 | register struct cc **jp; |
750 | register struct cc *x; | |
751 | register struct cc **ip = output + i; | |
0ab446d2 | 752 | register score0 = initial_score0; |
17f96a0f EW |
753 | struct cc **iip = ip + length; |
754 | struct cc_undo *undop1 = undop; | |
0ab446d2 EW |
755 | |
756 | if ((x = *(jp = ip)) != 0) | |
757 | goto z; | |
758 | while (--jp >= output) | |
759 | if ((x = *jp) != 0) { | |
760 | if (jp + x->length > ip) | |
761 | goto z; | |
762 | break; | |
763 | } | |
764 | jp = ip + 1; | |
765 | while (jp < iip) { | |
766 | if ((x = *jp) == 0) { | |
767 | jp++; | |
768 | continue; | |
769 | } | |
770 | z: | |
771 | if (x == p) | |
772 | goto undo; | |
773 | #ifdef STATS | |
774 | ncover++; | |
775 | #endif | |
776 | undop->pos = jp; | |
777 | undop->val = x; | |
778 | undop++; | |
779 | *jp = 0; | |
780 | x->ccount--; | |
781 | score_adjust(score0, x); | |
782 | if (score0 < 0 && flag) | |
783 | goto undo; | |
784 | jp += x->length; | |
785 | } | |
786 | undop->pos = ip; | |
787 | undop->val = 0; | |
788 | undop++; | |
789 | *ip = p; | |
790 | ccount++; | |
791 | score += score0; | |
792 | continue; | |
793 | undo: | |
794 | while (--undop >= undop1) | |
795 | if (*undop->pos = x = undop->val) | |
796 | x->ccount++; | |
797 | undop++; | |
798 | } | |
799 | if (score > 0) { | |
800 | #ifdef STATS | |
17f96a0f EW |
801 | ccount_stat += ccount - p->ccount; |
802 | ntoken_stat++; | |
803 | ncover_stat += ncover; | |
0ab446d2 EW |
804 | #endif |
805 | p->ccount = ccount; | |
806 | } else { | |
17f96a0f | 807 | register struct cc_undo *u = cc_undo; |
0ab446d2 | 808 | while (--undop >= u) { |
17f96a0f | 809 | register struct cc *x; |
0ab446d2 EW |
810 | if (*undop->pos = x = undop->val) |
811 | x->ccount++; | |
812 | } | |
813 | } | |
814 | } while ((p = *pp++) != 0); | |
815 | return pp - tokens; | |
49373f41 EW |
816 | } |
817 | ||
17f96a0f EW |
818 | cc_output_phase(buffer, output, bufsize) |
819 | register char *buffer; | |
820 | register struct cc **output; | |
821 | register bufsize; | |
49373f41 EW |
822 | { |
823 | register i; | |
17f96a0f | 824 | register struct cc *p, *p1; |
49373f41 | 825 | |
17f96a0f | 826 | for (i = 0; i < bufsize;) { |
0ab446d2 | 827 | if ((p = output[i]) == 0) { |
17f96a0f | 828 | ttputc(buffer[i]); |
49373f41 | 829 | i++; |
0ab446d2 EW |
830 | } else if (p->code >= 0) { |
831 | if (--p->ccount == 0) | |
17f96a0f | 832 | qinsert(p, &cc_q0a); |
0ab446d2 | 833 | (*tt.tt_put_token)(p->code, p->string, p->length); |
49373f41 | 834 | wwntokuse++; |
0ab446d2 EW |
835 | wwntoksave += put_token_score(p->length); |
836 | i += p->length; | |
17f96a0f | 837 | } else if ((p1 = cc_q0a.qback) != &cc_q0a) { |
0ab446d2 EW |
838 | p->code = p1->code; |
839 | p1->code = -1; | |
17f96a0f | 840 | qinsert(p1, &cc_q1a); |
0ab446d2 | 841 | if (--p->ccount == 0) |
17f96a0f | 842 | qinsert(p, &cc_q0a); |
0ab446d2 | 843 | else |
17f96a0f | 844 | qinsert(p, &cc_q0b); |
0ab446d2 | 845 | (*tt.tt_set_token)(p->code, p->string, p->length); |
49373f41 EW |
846 | wwntokdef++; |
847 | wwntoksave -= tt.tt_set_token_cost; | |
0ab446d2 | 848 | i += p->length; |
49373f41 | 849 | } else { |
0ab446d2 | 850 | p->ccount--; |
17f96a0f | 851 | ttwrite(p->string, p->length); |
0ab446d2 EW |
852 | wwntokbad++; |
853 | i += p->length; | |
49373f41 EW |
854 | } |
855 | } | |
17f96a0f | 856 | wwntokc += bufsize; |
49373f41 | 857 | } |
0ab446d2 | 858 | |
17f96a0f EW |
859 | cc_token_compare(p1, p2) |
860 | struct cc **p1, **p2; | |
0ab446d2 EW |
861 | { |
862 | return (*p2)->bcount - (*p1)->bcount; | |
863 | } |