Commit | Line | Data |
---|---|---|
7d7124cb TL |
1 | # |
2 | /* diff - differential file comparison | |
3 | * | |
4 | * Uses an algorithm due to Harold Stone, which finds | |
5 | * a pair of longest identical subsequences in the two | |
6 | * files. | |
7 | * | |
8 | * The major goal is to generate the match vector J. | |
9 | * J[i] is the index of the line in file1 corresponding | |
10 | * to line i file0. J[i] = 0 if there is no | |
11 | * such line in file1. | |
12 | * | |
13 | * Lines are hashed so as to work in core. All potential | |
14 | * matches are located by sorting the lines of each file | |
15 | * on the hash (called value\b\b\b\b\b_____). In particular, this | |
16 | * collects the equivalence classes in file1 together. | |
17 | * Subroutine equiv\b\b\b\b____ replaces the value of each line in | |
18 | * file0 by the index of the first element of its | |
19 | * matching equivalence in (the reordered) file1. | |
20 | * To save space equiv\b\b\b\b\b_____ squeezes file1 into a single | |
21 | * array member\b\b\b\b\b\b______ in which the equivalence classes | |
22 | * are simply concatenated, except that their first | |
23 | * members are flagged by changing sign. | |
24 | * | |
25 | * Next the indices that point into member\b\b\b\b\b\b______ are unsorted\b\b\b\b\b\b\b\b_______ into | |
26 | * array class\b\b\b\b\b_____ according to the original order of file0. | |
27 | * | |
28 | * The cleverness lies in routine stone\b\b\b\b\b______. This marches | |
29 | * through the lines of file0, developing a vector klist\b\b\b\b\b_____ | |
30 | * of "k-candidates". At step i a k-candidate is a matched | |
31 | * pair of lines x,y (x in file0 y in file1) such that | |
32 | * there is a common subsequence of lenght k | |
33 | * between the first i lines of file0 and the first y | |
34 | * lines of file1, but there is no such subsequence for | |
35 | * any smaller y. x is the earliest possible mate to y | |
36 | * that occurs in such a subsequence. | |
37 | * | |
38 | * Whenever any of the members of the equivalence class of | |
39 | * lines in file1 matable to a line in file0 has serial number | |
40 | * less than the y of some k-candidate, that k-candidate | |
41 | * with the smallest such y is replaced. The new | |
42 | * k-candidate is chained (via pred\b\b\b\b____) to the current | |
43 | * k-1 candidate so that the actual subsequence can | |
44 | * be recovered. When a member has serial number greater | |
45 | * that the y of all k-candidates, the klist is extended. | |
46 | * At the end, the longest subsequence is pulled out | |
47 | * and placed in the array J by unravel\b\b\b\b\b\b\b_______. | |
48 | * | |
49 | * With J in hand, the matches there recorded are | |
50 | * check\b\b\b\b\b_____ed against reality to assure that no spurious | |
51 | * matches have crept in due to hashing. If they have, | |
52 | * they are broken, and "jackpot " is recorded--a harmless | |
53 | * matter except that a true match for a spuriously | |
54 | * mated line may now be unnecessarily reported as a change. | |
55 | * | |
56 | * Much of the complexity of the program comes simply | |
57 | * from trying to minimize core utilization and | |
58 | * maximize the range of doable problems by dynamically | |
59 | * allocating what is needed and reusing what is not. | |
60 | * The core requirements for problems larger than somewhat | |
61 | * are (in words) 2*length(file0) + length(file1) + | |
62 | * 3*(number of k-candidates installed), typically about | |
63 | * 6n words for files of length n. | |
64 | */ | |
65 | #include <stdio.h> | |
66 | #include <ctype.h> | |
67 | #include <sys/types.h> | |
68 | #include <sys/stat.h> | |
69 | #include <signal.h> | |
70 | #define prints(s) fputs(s,stdout) | |
71 | ||
72 | #define HALFLONG 16 | |
73 | #define low(x) (x&((1L<<HALFLONG)-1)) | |
74 | #define high(x) (x>>HALFLONG) | |
75 | FILE *input[2]; | |
76 | FILE *fopen(); | |
77 | ||
78 | struct cand { | |
79 | int x; | |
80 | int y; | |
81 | int pred; | |
82 | } cand; | |
83 | struct line { | |
84 | int serial; | |
85 | int value; | |
86 | } *file[2], line; | |
87 | int len[2]; | |
88 | struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ | |
89 | int slen[2]; | |
90 | int pref, suff; /*length of prefix and suffix*/ | |
91 | int *class; /*will be overlaid on file[0]*/ | |
92 | int *member; /*will be overlaid on file[1]*/ | |
93 | int *klist; /*will be overlaid on file[0] after class*/ | |
94 | struct cand *clist; /* merely a free storage pot for candidates */ | |
95 | int clen = 0; | |
96 | int *J; /*will be overlaid on class*/ | |
97 | long *ixold; /*will be overlaid on klist*/ | |
98 | long *ixnew; /*will be overlaid on file[1]*/ | |
99 | int opt; /* -1,0,1 = -e,normal,-f */ | |
100 | int status = 2; | |
101 | int anychange = 0; | |
102 | char *empty = ""; | |
103 | int bflag; | |
104 | ||
105 | char *tempfile; /*used when comparing against std input*/ | |
106 | char *mktemp(); | |
107 | char *dummy; /*used in resetting storage search ptr*/ | |
108 | ||
109 | done() | |
110 | { | |
111 | unlink(tempfile); | |
112 | exit(status); | |
113 | } | |
114 | ||
115 | char *talloc(n) | |
116 | { | |
117 | extern char *malloc(); | |
118 | register char *p; | |
119 | p = malloc((unsigned)n); | |
120 | if(p!=NULL) | |
121 | return(p); | |
122 | noroom(); | |
123 | } | |
124 | ||
125 | char *ralloc(p,n) /*compacting reallocation */ | |
126 | char *p; | |
127 | { | |
128 | register char *q; | |
129 | char *realloc(); | |
130 | free(p); | |
131 | free(dummy); | |
132 | dummy = malloc(1); | |
133 | q = realloc(p, (unsigned)n); | |
134 | if(q==NULL) | |
135 | noroom(); | |
136 | return(q); | |
137 | } | |
138 | ||
139 | noroom() | |
140 | { | |
141 | mesg("files too big, try -h\n",empty); | |
142 | done(); | |
143 | } | |
144 | ||
145 | sort(a,n) /*shellsort CACM #201*/ | |
146 | struct line *a; | |
147 | { | |
148 | struct line w; | |
149 | register int j,m; | |
150 | struct line *ai; | |
151 | register struct line *aim; | |
152 | int k; | |
153 | for(j=1;j<=n;j*= 2) | |
154 | m = 2*j - 1; | |
155 | for(m/=2;m!=0;m/=2) { | |
156 | k = n-m; | |
157 | for(j=1;j<=k;j++) { | |
158 | for(ai = &a[j]; ai > a; ai -= m) { | |
159 | aim = &ai[m]; | |
160 | if(aim < ai) | |
161 | break; /*wraparound*/ | |
162 | if(aim->value > ai[0].value || | |
163 | aim->value == ai[0].value && | |
164 | aim->serial > ai[0].serial) | |
165 | break; | |
166 | w.value = ai[0].value; | |
167 | ai[0].value = aim->value; | |
168 | aim->value = w.value; | |
169 | w.serial = ai[0].serial; | |
170 | ai[0].serial = aim->serial; | |
171 | aim->serial = w.serial; | |
172 | } | |
173 | } | |
174 | } | |
175 | } | |
176 | ||
177 | unsort(f, l, b) | |
178 | struct line *f; | |
179 | int *b; | |
180 | { | |
181 | register int *a; | |
182 | register int i; | |
183 | a = (int *)talloc((l+1)*sizeof(int)); | |
184 | for(i=1;i<=l;i++) | |
185 | a[f[i].serial] = f[i].value; | |
186 | for(i=1;i<=l;i++) | |
187 | b[i] = a[i]; | |
188 | free((char *)a); | |
189 | } | |
190 | ||
191 | filename(pa1, pa2) | |
192 | char **pa1, **pa2; | |
193 | { | |
194 | register char *a1, *b1, *a2; | |
195 | char buf[512]; | |
196 | struct stat stbuf; | |
197 | int i, f; | |
198 | a1 = *pa1; | |
199 | a2 = *pa2; | |
200 | if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) { | |
201 | b1 = *pa1 = malloc(100); | |
202 | while(*b1++ = *a1++) ; | |
203 | b1[-1] = '/'; | |
204 | a1 = b1; | |
205 | while(*a1++ = *a2++) | |
206 | if(*a2 && *a2!='/' && a2[-1]=='/') | |
207 | a1 = b1; | |
208 | } | |
209 | else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) { | |
210 | signal(SIGHUP,done); | |
211 | signal(SIGINT,done); | |
212 | signal(SIGPIPE,done); | |
213 | signal(SIGTERM,done); | |
214 | *pa1 = tempfile = mktemp("/tmp/dXXXXX"); | |
215 | if((f=creat(tempfile,0600)) < 0) { | |
216 | mesg("cannot create ",tempfile); | |
217 | done(); | |
218 | } | |
219 | while((i=read(0,buf,512))>0) | |
220 | write(f,buf,i); | |
221 | close(f); | |
222 | } | |
223 | } | |
224 | ||
225 | prepare(i, arg) | |
226 | char *arg; | |
227 | { | |
228 | register struct line *p; | |
229 | register j,h; | |
230 | if((input[i] = fopen(arg,"r")) == NULL){ | |
231 | mesg("cannot open ", arg); | |
232 | done(); | |
233 | } | |
234 | p = (struct line *)talloc(3*sizeof(line)); | |
235 | for(j=0; h=readhash(input[i]);) { | |
236 | p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line)); | |
237 | p[j].value = h; | |
238 | } | |
239 | len[i] = j; | |
240 | file[i] = p; | |
241 | fclose(input[i]); | |
242 | } | |
243 | ||
244 | prune() | |
245 | { | |
246 | register i,j; | |
247 | for(pref=0;pref<len[0]&&pref<len[1]&& | |
248 | file[0][pref+1].value==file[1][pref+1].value; | |
249 | pref++ ) ; | |
250 | for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& | |
251 | file[0][len[0]-suff].value==file[1][len[1]-suff].value; | |
252 | suff++) ; | |
253 | for(j=0;j<2;j++) { | |
254 | sfile[j] = file[j]+pref; | |
255 | slen[j] = len[j]-pref-suff; | |
256 | for(i=0;i<=slen[j];i++) | |
257 | sfile[j][i].serial = i; | |
258 | } | |
259 | } | |
260 | ||
261 | equiv(a,n,b,m,c) | |
262 | struct line *a, *b; | |
263 | int *c; | |
264 | { | |
265 | register int i, j; | |
266 | i = j = 1; | |
267 | while(i<=n && j<=m) { | |
268 | if(a[i].value <b[j].value) | |
269 | a[i++].value = 0; | |
270 | else if(a[i].value == b[j].value) | |
271 | a[i++].value = j; | |
272 | else | |
273 | j++; | |
274 | } | |
275 | while(i <= n) | |
276 | a[i++].value = 0; | |
277 | b[m+1].value = 0; | |
278 | j = 0; | |
279 | while(++j <= m) { | |
280 | c[j] = -b[j].serial; | |
281 | while(b[j+1].value == b[j].value) { | |
282 | j++; | |
283 | c[j] = b[j].serial; | |
284 | } | |
285 | } | |
286 | c[j] = -1; | |
287 | } | |
288 | ||
289 | main(argc, argv) | |
290 | char **argv; | |
291 | { | |
292 | register int k; | |
293 | char **args; | |
294 | ||
295 | args = argv; | |
296 | if(argc>3 && *argv[1]=='-') { | |
297 | argc--; | |
298 | argv++; | |
299 | for(k=1;argv[0][k];k++) { | |
300 | switch(argv[0][k]) { | |
301 | case 'e': | |
302 | opt = -1; | |
303 | break; | |
304 | case 'f': | |
305 | opt = 1; | |
306 | break; | |
307 | case 'b': | |
308 | bflag = 1; | |
309 | break; | |
310 | case 'h': | |
311 | execv("/usr/lib/diffh",args); | |
312 | mesg("cannot find diffh",empty); | |
313 | done(); | |
314 | } | |
315 | } | |
316 | } | |
317 | if(argc!=3) { | |
318 | mesg("arg count",empty); | |
319 | done(); | |
320 | } | |
321 | dummy = malloc(1); | |
322 | ||
323 | filename(&argv[1], &argv[2]); | |
324 | filename(&argv[2], &argv[1]); | |
325 | prepare(0, argv[1]); | |
326 | prepare(1, argv[2]); | |
327 | prune(); | |
328 | sort(sfile[0],slen[0]); | |
329 | sort(sfile[1],slen[1]); | |
330 | ||
331 | member = (int *)file[1]; | |
332 | equiv(sfile[0], slen[0], sfile[1], slen[1], member); | |
333 | member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int)); | |
334 | ||
335 | class = (int *)file[0]; | |
336 | unsort(sfile[0], slen[0], class); | |
337 | class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int)); | |
338 | ||
339 | klist = (int *)talloc((slen[0]+2)*sizeof(int)); | |
340 | clist = (struct cand *)talloc(sizeof(cand)); | |
341 | k = stone(class, slen[0], member, klist); | |
342 | free((char *)member); | |
343 | free((char *)class); | |
344 | ||
345 | J = (int *)talloc((len[0]+2)*sizeof(int)); | |
346 | unravel(klist[k]); | |
347 | free((char *)clist); | |
348 | free((char *)klist); | |
349 | ||
350 | ixold = (long *)talloc((len[0]+2)*sizeof(long)); | |
351 | ixnew = (long *)talloc((len[1]+2)*sizeof(long)); | |
352 | check(argv); | |
353 | output(argv); | |
354 | status = anychange; | |
355 | done(); | |
356 | } | |
357 | ||
358 | stone(a,n,b,c) | |
359 | int *a; | |
360 | int *b; | |
361 | int *c; | |
362 | { | |
363 | register int i, k,y; | |
364 | int j, l; | |
365 | int oldc, tc; | |
366 | int oldl; | |
367 | k = 0; | |
368 | c[0] = newcand(0,0,0); | |
369 | for(i=1; i<=n; i++) { | |
370 | j = a[i]; | |
371 | if(j==0) | |
372 | continue; | |
373 | y = -b[j]; | |
374 | oldl = 0; | |
375 | oldc = c[0]; | |
376 | do { | |
377 | if(y <= clist[oldc].y) | |
378 | continue; | |
379 | l = search(c, k, y); | |
380 | if(l!=oldl+1) | |
381 | oldc = c[l-1]; | |
382 | if(l<=k) { | |
383 | if(clist[c[l]].y <= y) | |
384 | continue; | |
385 | tc = c[l]; | |
386 | c[l] = newcand(i,y,oldc); | |
387 | oldc = tc; | |
388 | oldl = l; | |
389 | } else { | |
390 | c[l] = newcand(i,y,oldc); | |
391 | k++; | |
392 | break; | |
393 | } | |
394 | } while((y=b[++j]) > 0); | |
395 | } | |
396 | return(k); | |
397 | } | |
398 | ||
399 | newcand(x,y,pred) | |
400 | { | |
401 | register struct cand *q; | |
402 | clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand)); | |
403 | q = clist + clen -1; | |
404 | q->x = x; | |
405 | q->y = y; | |
406 | q->pred = pred; | |
407 | return(clen-1); | |
408 | } | |
409 | ||
410 | search(c, k, y) | |
411 | int *c; | |
412 | { | |
413 | register int i, j, l; | |
414 | int t; | |
415 | if(clist[c[k]].y<y) /*quick look for typical case*/ | |
416 | return(k+1); | |
417 | i = 0; | |
418 | j = k+1; | |
419 | while((l=(i+j)/2) > i) { | |
420 | t = clist[c[l]].y; | |
421 | if(t > y) | |
422 | j = l; | |
423 | else if(t < y) | |
424 | i = l; | |
425 | else | |
426 | return(l); | |
427 | } | |
428 | return(l+1); | |
429 | } | |
430 | ||
431 | unravel(p) | |
432 | { | |
433 | register int i; | |
434 | register struct cand *q; | |
435 | for(i=0; i<=len[0]; i++) | |
436 | J[i] = i<=pref ? i: | |
437 | i>len[0]-suff ? i+len[1]-len[0]: | |
438 | 0; | |
439 | for(q=clist+p;q->y!=0;q=clist+q->pred) | |
440 | J[q->x+pref] = q->y+pref; | |
441 | } | |
442 | ||
443 | /* check does double duty: | |
444 | 1. ferret out any fortuitous correspondences due | |
445 | to confounding by hashing (which result in "jackpot") | |
446 | 2. collect random access indexes to the two files */ | |
447 | ||
448 | check(argv) | |
449 | char **argv; | |
450 | { | |
451 | register int i, j; | |
452 | int jackpot; | |
453 | long ctold, ctnew; | |
454 | char c,d; | |
455 | input[0] = fopen(argv[1],"r"); | |
456 | input[1] = fopen(argv[2],"r"); | |
457 | j = 1; | |
458 | ixold[0] = ixnew[0] = 0; | |
459 | jackpot = 0; | |
460 | ctold = ctnew = 0; | |
461 | for(i=1;i<=len[0];i++) { | |
462 | if(J[i]==0) { | |
463 | ixold[i] = ctold += skipline(0); | |
464 | continue; | |
465 | } | |
466 | while(j<J[i]) { | |
467 | ixnew[j] = ctnew += skipline(1); | |
468 | j++; | |
469 | } | |
470 | for(;;) { | |
471 | c = getc(input[0]); | |
472 | d = getc(input[1]); | |
473 | ctold++; | |
474 | ctnew++; | |
475 | if(bflag && isspace(c) && isspace(d)) { | |
476 | do { | |
477 | if(c=='\n') break; | |
478 | ctold++; | |
479 | } while(isspace(c=getc(input[0]))); | |
480 | do { | |
481 | if(d=='\n') break; | |
482 | ctnew++; | |
483 | } while(isspace(d=getc(input[1]))); | |
484 | } | |
485 | if(c!=d) { | |
486 | jackpot++; | |
487 | J[i] = 0; | |
488 | if(c!='\n') | |
489 | ctold += skipline(0); | |
490 | if(d!='\n') | |
491 | ctnew += skipline(1); | |
492 | break; | |
493 | } | |
494 | if(c=='\n') | |
495 | break; | |
496 | } | |
497 | ixold[i] = ctold; | |
498 | ixnew[j] = ctnew; | |
499 | j++; | |
500 | } | |
501 | for(;j<=len[1];j++) { | |
502 | ixnew[j] = ctnew += skipline(1); | |
503 | } | |
504 | fclose(input[0]); | |
505 | fclose(input[1]); | |
506 | /* | |
507 | if(jackpot) | |
508 | mesg("jackpot",empty); | |
509 | */ | |
510 | } | |
511 | ||
512 | skipline(f) | |
513 | { | |
514 | register i; | |
515 | for(i=1;getc(input[f])!='\n';i++) ; | |
516 | return(i); | |
517 | } | |
518 | ||
519 | output(argv) | |
520 | char **argv; | |
521 | { | |
522 | int m; | |
523 | register int i0, i1, j1; | |
524 | int j0; | |
525 | input[0] = fopen(argv[1],"r"); | |
526 | input[1] = fopen(argv[2],"r"); | |
527 | m = len[0]; | |
528 | J[0] = 0; | |
529 | J[m+1] = len[1]+1; | |
530 | if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) { | |
531 | while(i0<=m&&J[i0]==J[i0-1]+1) i0++; | |
532 | j0 = J[i0-1]+1; | |
533 | i1 = i0-1; | |
534 | while(i1<m&&J[i1+1]==0) i1++; | |
535 | j1 = J[i1+1]-1; | |
536 | J[i1] = j1; | |
537 | change(i0,i1,j0,j1); | |
538 | } else for(i0=m;i0>=1;i0=i1-1) { | |
539 | while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; | |
540 | j0 = J[i0+1]-1; | |
541 | i1 = i0+1; | |
542 | while(i1>1&&J[i1-1]==0) i1--; | |
543 | j1 = J[i1-1]+1; | |
544 | J[i1] = j1; | |
545 | change(i1,i0,j1,j0); | |
546 | } | |
547 | if(m==0) | |
548 | change(1,0,1,len[1]); | |
549 | } | |
550 | ||
551 | change(a,b,c,d) | |
552 | { | |
553 | if(a>b&&c>d) return; | |
554 | anychange = 1; | |
555 | if(opt!=1) { | |
556 | range(a,b,","); | |
557 | putchar(a>b?'a':c>d?'d':'c'); | |
558 | if(opt!=-1) range(c,d,","); | |
559 | } else { | |
560 | putchar(a>b?'a':c>d?'d':'c'); | |
561 | range(a,b," "); | |
562 | } | |
563 | putchar('\n'); | |
564 | if(opt==0) { | |
565 | fetch(ixold,a,b,input[0],"< "); | |
566 | if(a<=b&&c<=d) prints("---\n"); | |
567 | } | |
568 | fetch(ixnew,c,d,input[1],opt==0?"> ":empty); | |
569 | if(opt!=0&&c<=d) prints(".\n"); | |
570 | } | |
571 | ||
572 | range(a,b,separator) | |
573 | char *separator; | |
574 | { | |
575 | printf("%d", a>b?b:a); | |
576 | if(a<b) { | |
577 | printf("%s%d", separator, b); | |
578 | } | |
579 | } | |
580 | ||
581 | fetch(f,a,b,lb,s) | |
582 | long *f; | |
583 | FILE *lb; | |
584 | char *s; | |
585 | { | |
586 | register int i, j; | |
587 | register int nc; | |
588 | for(i=a;i<=b;i++) { | |
589 | fseek(lb,f[i-1],0); | |
590 | nc = f[i]-f[i-1]; | |
591 | prints(s); | |
592 | for(j=0;j<nc;j++) | |
593 | putchar(getc(lb)); | |
594 | } | |
595 | } | |
596 | ||
597 | /* hashing has the effect of | |
598 | * arranging line in 7-bit bytes and then | |
599 | * summing 1-s complement in 16-bit hunks | |
600 | */ | |
601 | ||
602 | readhash(f) | |
603 | FILE *f; | |
604 | { | |
605 | long sum; | |
606 | register unsigned shift; | |
607 | register space; | |
608 | register t; | |
609 | sum = 1; | |
610 | space = 0; | |
611 | if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) { | |
612 | if(t==-1) | |
613 | return(0); | |
614 | sum += (long)t << (shift%=HALFLONG); | |
615 | } | |
616 | else for(shift=0;;) { | |
617 | switch(t=getc(f)) { | |
618 | case -1: | |
619 | return(0); | |
620 | case '\t': | |
621 | case ' ': | |
622 | space++; | |
623 | continue; | |
624 | default: | |
625 | if(space) { | |
626 | shift += 7; | |
627 | space = 0; | |
628 | } | |
629 | sum += (long)t << (shift%=HALFLONG); | |
630 | shift += 7; | |
631 | continue; | |
632 | case '\n': | |
633 | break; | |
634 | } | |
635 | break; | |
636 | } | |
637 | sum = low(sum) + high(sum); | |
638 | return((short)low(sum) + (short)high(sum)); | |
639 | } | |
640 | ||
641 | mesg(s,t) | |
642 | char *s, *t; | |
643 | { | |
644 | fprintf(stderr,"diff: %s%s\n",s,t); | |
645 | } |