Research V7 development
[unix-history] / usr / src / cmd / diff.c
CommitLineData
13d94d7c
DM
1#
2/* diff - differential file comparison
3*
4* Uses an algorithm due to Harold Stone, which finds
5* a pair of longest identical subsequences in the two
6* files.
7*
8* The major goal is to generate the match vector J.
9* J[i] is the index of the line in file1 corresponding
10* to line i file0. J[i] = 0 if there is no
11* such line in file1.
12*
13* Lines are hashed so as to work in core. All potential
14* matches are located by sorting the lines of each file
15* on the hash (called value\b\b\b\b\b_____). In particular, this
16* collects the equivalence classes in file1 together.
17* Subroutine equiv\b\b\b\b____ replaces the value of each line in
18* file0 by the index of the first element of its
19* matching equivalence in (the reordered) file1.
20* To save space equiv\b\b\b\b\b_____ squeezes file1 into a single
21* array member\b\b\b\b\b\b______ in which the equivalence classes
22* are simply concatenated, except that their first
23* members are flagged by changing sign.
24*
25* Next the indices that point into member\b\b\b\b\b\b______ are unsorted\b\b\b\b\b\b\b\b_______ into
26* array class\b\b\b\b\b_____ according to the original order of file0.
27*
28* The cleverness lies in routine stone\b\b\b\b\b______. This marches
29* through the lines of file0, developing a vector klist\b\b\b\b\b_____
30* of "k-candidates". At step i a k-candidate is a matched
31* pair of lines x,y (x in file0 y in file1) such that
32* there is a common subsequence of lenght k
33* between the first i lines of file0 and the first y
34* lines of file1, but there is no such subsequence for
35* any smaller y. x is the earliest possible mate to y
36* that occurs in such a subsequence.
37*
38* Whenever any of the members of the equivalence class of
39* lines in file1 matable to a line in file0 has serial number
40* less than the y of some k-candidate, that k-candidate
41* with the smallest such y is replaced. The new
42* k-candidate is chained (via pred\b\b\b\b____) to the current
43* k-1 candidate so that the actual subsequence can
44* be recovered. When a member has serial number greater
45* that the y of all k-candidates, the klist is extended.
46* At the end, the longest subsequence is pulled out
47* and placed in the array J by unravel\b\b\b\b\b\b\b_______.
48*
49* With J in hand, the matches there recorded are
50* check\b\b\b\b\b_____ed against reality to assure that no spurious
51* matches have crept in due to hashing. If they have,
52* they are broken, and "jackpot " is recorded--a harmless
53* matter except that a true match for a spuriously
54* mated line may now be unnecessarily reported as a change.
55*
56* Much of the complexity of the program comes simply
57* from trying to minimize core utilization and
58* maximize the range of doable problems by dynamically
59* allocating what is needed and reusing what is not.
60* The core requirements for problems larger than somewhat
61* are (in words) 2*length(file0) + length(file1) +
62* 3*(number of k-candidates installed), typically about
63* 6n words for files of length n.
64*/
65#include <stdio.h>
66#include <ctype.h>
67#include <sys/types.h>
68#include <sys/stat.h>
69#include <signal.h>
70#define prints(s) fputs(s,stdout)
71
72#define HALFLONG 16
73#define low(x) (x&((1L<<HALFLONG)-1))
74#define high(x) (x>>HALFLONG)
75FILE *input[2];
76FILE *fopen();
77
78struct cand {
79 int x;
80 int y;
81 int pred;
82} cand;
83struct line {
84 int serial;
85 int value;
86} *file[2], line;
87int len[2];
88struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/
89int slen[2];
90int pref, suff; /*length of prefix and suffix*/
91int *class; /*will be overlaid on file[0]*/
92int *member; /*will be overlaid on file[1]*/
93int *klist; /*will be overlaid on file[0] after class*/
94struct cand *clist; /* merely a free storage pot for candidates */
95int clen = 0;
96int *J; /*will be overlaid on class*/
97long *ixold; /*will be overlaid on klist*/
98long *ixnew; /*will be overlaid on file[1]*/
99int opt; /* -1,0,1 = -e,normal,-f */
100int status = 2;
101int anychange = 0;
102char *empty = "";
103int bflag;
104
105char *tempfile; /*used when comparing against std input*/
106char *mktemp();
107char *dummy; /*used in resetting storage search ptr*/
108
109done()
110{
111 unlink(tempfile);
112 exit(status);
113}
114
115char *talloc(n)
116{
117 extern char *malloc();
118 register char *p;
119 p = malloc((unsigned)n);
120 if(p!=NULL)
121 return(p);
122 noroom();
123}
124
125char *ralloc(p,n) /*compacting reallocation */
126char *p;
127{
128 register char *q;
129 char *realloc();
130 free(p);
131 free(dummy);
132 dummy = malloc(1);
133 q = realloc(p, (unsigned)n);
134 if(q==NULL)
135 noroom();
136 return(q);
137}
138
139noroom()
140{
141 mesg("files too big, try -h\n",empty);
142 done();
143}
144
145sort(a,n) /*shellsort CACM #201*/
146struct line *a;
147{
148 struct line w;
149 register int j,m;
150 struct line *ai;
151 register struct line *aim;
152 int k;
153 for(j=1;j<=n;j*= 2)
154 m = 2*j - 1;
155 for(m/=2;m!=0;m/=2) {
156 k = n-m;
157 for(j=1;j<=k;j++) {
158 for(ai = &a[j]; ai > a; ai -= m) {
159 aim = &ai[m];
160 if(aim < ai)
161 break; /*wraparound*/
162 if(aim->value > ai[0].value ||
163 aim->value == ai[0].value &&
164 aim->serial > ai[0].serial)
165 break;
166 w.value = ai[0].value;
167 ai[0].value = aim->value;
168 aim->value = w.value;
169 w.serial = ai[0].serial;
170 ai[0].serial = aim->serial;
171 aim->serial = w.serial;
172 }
173 }
174 }
175}
176
177unsort(f, l, b)
178struct line *f;
179int *b;
180{
181 register int *a;
182 register int i;
183 a = (int *)talloc((l+1)*sizeof(int));
184 for(i=1;i<=l;i++)
185 a[f[i].serial] = f[i].value;
186 for(i=1;i<=l;i++)
187 b[i] = a[i];
188 free((char *)a);
189}
190
191filename(pa1, pa2)
192char **pa1, **pa2;
193{
194 register char *a1, *b1, *a2;
195 char buf[512];
196 struct stat stbuf;
197 int i, f;
198 a1 = *pa1;
199 a2 = *pa2;
200 if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) {
201 b1 = *pa1 = malloc(100);
202 while(*b1++ = *a1++) ;
203 b1[-1] = '/';
204 a1 = b1;
205 while(*a1++ = *a2++)
206 if(*a2 && *a2!='/' && a2[-1]=='/')
207 a1 = b1;
208 }
209 else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) {
210 signal(SIGHUP,done);
211 signal(SIGINT,done);
212 signal(SIGPIPE,done);
213 signal(SIGTERM,done);
214 *pa1 = tempfile = mktemp("/tmp/dXXXXX");
215 if((f=creat(tempfile,0600)) < 0) {
216 mesg("cannot create ",tempfile);
217 done();
218 }
219 while((i=read(0,buf,512))>0)
220 write(f,buf,i);
221 close(f);
222 }
223}
224
225prepare(i, arg)
226char *arg;
227{
228 register struct line *p;
229 register j,h;
230 if((input[i] = fopen(arg,"r")) == NULL){
231 mesg("cannot open ", arg);
232 done();
233 }
234 p = (struct line *)talloc(3*sizeof(line));
235 for(j=0; h=readhash(input[i]);) {
236 p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line));
237 p[j].value = h;
238 }
239 len[i] = j;
240 file[i] = p;
241 fclose(input[i]);
242}
243
244prune()
245{
246 register i,j;
247 for(pref=0;pref<len[0]&&pref<len[1]&&
248 file[0][pref+1].value==file[1][pref+1].value;
249 pref++ ) ;
250 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
251 file[0][len[0]-suff].value==file[1][len[1]-suff].value;
252 suff++) ;
253 for(j=0;j<2;j++) {
254 sfile[j] = file[j]+pref;
255 slen[j] = len[j]-pref-suff;
256 for(i=0;i<=slen[j];i++)
257 sfile[j][i].serial = i;
258 }
259}
260
261equiv(a,n,b,m,c)
262struct line *a, *b;
263int *c;
264{
265 register int i, j;
266 i = j = 1;
267 while(i<=n && j<=m) {
268 if(a[i].value <b[j].value)
269 a[i++].value = 0;
270 else if(a[i].value == b[j].value)
271 a[i++].value = j;
272 else
273 j++;
274 }
275 while(i <= n)
276 a[i++].value = 0;
277 b[m+1].value = 0;
278 j = 0;
279 while(++j <= m) {
280 c[j] = -b[j].serial;
281 while(b[j+1].value == b[j].value) {
282 j++;
283 c[j] = b[j].serial;
284 }
285 }
286 c[j] = -1;
287}
288
289main(argc, argv)
290char **argv;
291{
292 register int k;
293 char **args;
294
295 args = argv;
296 if(argc>3 && *argv[1]=='-') {
297 argc--;
298 argv++;
299 for(k=1;argv[0][k];k++) {
300 switch(argv[0][k]) {
301 case 'e':
302 opt = -1;
303 break;
304 case 'f':
305 opt = 1;
306 break;
307 case 'b':
308 bflag = 1;
309 break;
310 case 'h':
311 execv("/usr/lib/diffh",args);
312 mesg("cannot find diffh",empty);
313 done();
314 }
315 }
316 }
317 if(argc!=3) {
318 mesg("arg count",empty);
319 done();
320 }
321 dummy = malloc(1);
322
323 filename(&argv[1], &argv[2]);
324 filename(&argv[2], &argv[1]);
325 prepare(0, argv[1]);
326 prepare(1, argv[2]);
327 prune();
328 sort(sfile[0],slen[0]);
329 sort(sfile[1],slen[1]);
330
331 member = (int *)file[1];
332 equiv(sfile[0], slen[0], sfile[1], slen[1], member);
333 member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
334
335 class = (int *)file[0];
336 unsort(sfile[0], slen[0], class);
337 class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
338
339 klist = (int *)talloc((slen[0]+2)*sizeof(int));
340 clist = (struct cand *)talloc(sizeof(cand));
341 k = stone(class, slen[0], member, klist);
342 free((char *)member);
343 free((char *)class);
344
345 J = (int *)talloc((len[0]+2)*sizeof(int));
346 unravel(klist[k]);
347 free((char *)clist);
348 free((char *)klist);
349
350 ixold = (long *)talloc((len[0]+2)*sizeof(long));
351 ixnew = (long *)talloc((len[1]+2)*sizeof(long));
352 check(argv);
353 output(argv);
354 status = anychange;
355 done();
356}
357
358stone(a,n,b,c)
359int *a;
360int *b;
361int *c;
362{
363 register int i, k,y;
364 int j, l;
365 int oldc, tc;
366 int oldl;
367 k = 0;
368 c[0] = newcand(0,0,0);
369 for(i=1; i<=n; i++) {
370 j = a[i];
371 if(j==0)
372 continue;
373 y = -b[j];
374 oldl = 0;
375 oldc = c[0];
376 do {
377 if(y <= clist[oldc].y)
378 continue;
379 l = search(c, k, y);
380 if(l!=oldl+1)
381 oldc = c[l-1];
382 if(l<=k) {
383 if(clist[c[l]].y <= y)
384 continue;
385 tc = c[l];
386 c[l] = newcand(i,y,oldc);
387 oldc = tc;
388 oldl = l;
389 } else {
390 c[l] = newcand(i,y,oldc);
391 k++;
392 break;
393 }
394 } while((y=b[++j]) > 0);
395 }
396 return(k);
397}
398
399newcand(x,y,pred)
400{
401 register struct cand *q;
402 clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand));
403 q = clist + clen -1;
404 q->x = x;
405 q->y = y;
406 q->pred = pred;
407 return(clen-1);
408}
409
410search(c, k, y)
411int *c;
412{
413 register int i, j, l;
414 int t;
415 if(clist[c[k]].y<y) /*quick look for typical case*/
416 return(k+1);
417 i = 0;
418 j = k+1;
419 while((l=(i+j)/2) > i) {
420 t = clist[c[l]].y;
421 if(t > y)
422 j = l;
423 else if(t < y)
424 i = l;
425 else
426 return(l);
427 }
428 return(l+1);
429}
430
431unravel(p)
432{
433 register int i;
434 register struct cand *q;
435 for(i=0; i<=len[0]; i++)
436 J[i] = i<=pref ? i:
437 i>len[0]-suff ? i+len[1]-len[0]:
438 0;
439 for(q=clist+p;q->y!=0;q=clist+q->pred)
440 J[q->x+pref] = q->y+pref;
441}
442
443/* check does double duty:
4441. ferret out any fortuitous correspondences due
445to confounding by hashing (which result in "jackpot")
4462. collect random access indexes to the two files */
447
448check(argv)
449char **argv;
450{
451 register int i, j;
452 int jackpot;
453 long ctold, ctnew;
454 char c,d;
455 input[0] = fopen(argv[1],"r");
456 input[1] = fopen(argv[2],"r");
457 j = 1;
458 ixold[0] = ixnew[0] = 0;
459 jackpot = 0;
460 ctold = ctnew = 0;
461 for(i=1;i<=len[0];i++) {
462 if(J[i]==0) {
463 ixold[i] = ctold += skipline(0);
464 continue;
465 }
466 while(j<J[i]) {
467 ixnew[j] = ctnew += skipline(1);
468 j++;
469 }
470 for(;;) {
471 c = getc(input[0]);
472 d = getc(input[1]);
473 ctold++;
474 ctnew++;
475 if(bflag && isspace(c) && isspace(d)) {
476 do {
477 if(c=='\n') break;
478 ctold++;
479 } while(isspace(c=getc(input[0])));
480 do {
481 if(d=='\n') break;
482 ctnew++;
483 } while(isspace(d=getc(input[1])));
484 }
485 if(c!=d) {
486 jackpot++;
487 J[i] = 0;
488 if(c!='\n')
489 ctold += skipline(0);
490 if(d!='\n')
491 ctnew += skipline(1);
492 break;
493 }
494 if(c=='\n')
495 break;
496 }
497 ixold[i] = ctold;
498 ixnew[j] = ctnew;
499 j++;
500 }
501 for(;j<=len[1];j++) {
502 ixnew[j] = ctnew += skipline(1);
503 }
504 fclose(input[0]);
505 fclose(input[1]);
506/*
507 if(jackpot)
508 mesg("jackpot",empty);
509*/
510}
511
512skipline(f)
513{
514 register i;
515 for(i=1;getc(input[f])!='\n';i++) ;
516 return(i);
517}
518
519output(argv)
520char **argv;
521{
522 int m;
523 register int i0, i1, j1;
524 int j0;
525 input[0] = fopen(argv[1],"r");
526 input[1] = fopen(argv[2],"r");
527 m = len[0];
528 J[0] = 0;
529 J[m+1] = len[1]+1;
530 if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) {
531 while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
532 j0 = J[i0-1]+1;
533 i1 = i0-1;
534 while(i1<m&&J[i1+1]==0) i1++;
535 j1 = J[i1+1]-1;
536 J[i1] = j1;
537 change(i0,i1,j0,j1);
538 } else for(i0=m;i0>=1;i0=i1-1) {
539 while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
540 j0 = J[i0+1]-1;
541 i1 = i0+1;
542 while(i1>1&&J[i1-1]==0) i1--;
543 j1 = J[i1-1]+1;
544 J[i1] = j1;
545 change(i1,i0,j1,j0);
546 }
547 if(m==0)
548 change(1,0,1,len[1]);
549}
550
551change(a,b,c,d)
552{
553 if(a>b&&c>d) return;
554 anychange = 1;
555 if(opt!=1) {
556 range(a,b,",");
557 putchar(a>b?'a':c>d?'d':'c');
558 if(opt!=-1) range(c,d,",");
559 } else {
560 putchar(a>b?'a':c>d?'d':'c');
561 range(a,b," ");
562 }
563 putchar('\n');
564 if(opt==0) {
565 fetch(ixold,a,b,input[0],"< ");
566 if(a<=b&&c<=d) prints("---\n");
567 }
568 fetch(ixnew,c,d,input[1],opt==0?"> ":empty);
569 if(opt!=0&&c<=d) prints(".\n");
570}
571
572range(a,b,separator)
573char *separator;
574{
575 printf("%d", a>b?b:a);
576 if(a<b) {
577 printf("%s%d", separator, b);
578 }
579}
580
581fetch(f,a,b,lb,s)
582long *f;
583FILE *lb;
584char *s;
585{
586 register int i, j;
587 register int nc;
588 for(i=a;i<=b;i++) {
589 fseek(lb,f[i-1],0);
590 nc = f[i]-f[i-1];
591 prints(s);
592 for(j=0;j<nc;j++)
593 putchar(getc(lb));
594 }
595}
596
597/* hashing has the effect of
598 * arranging line in 7-bit bytes and then
599 * summing 1-s complement in 16-bit hunks
600*/
601
602readhash(f)
603FILE *f;
604{
605 long sum;
606 register unsigned shift;
607 register space;
608 register t;
609 sum = 1;
610 space = 0;
611 if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) {
612 if(t==-1)
613 return(0);
614 sum += (long)t << (shift%=HALFLONG);
615 }
616 else for(shift=0;;) {
617 switch(t=getc(f)) {
618 case -1:
619 return(0);
620 case '\t':
621 case ' ':
622 space++;
623 continue;
624 default:
625 if(space) {
626 shift += 7;
627 space = 0;
628 }
629 sum += (long)t << (shift%=HALFLONG);
630 shift += 7;
631 continue;
632 case '\n':
633 break;
634 }
635 break;
636 }
637 sum = low(sum) + high(sum);
638 return((short)low(sum) + (short)high(sum));
639}
640
641mesg(s,t)
642char *s, *t;
643{
644 fprintf(stderr,"diff: %s%s\n",s,t);
645}