Bell 32V development
authorTom London <tbl@research.uucp>
Mon, 11 Dec 1978 00:48:39 +0000 (19:48 -0500)
committerTom London <tbl@research.uucp>
Mon, 11 Dec 1978 00:48:39 +0000 (19:48 -0500)
Work on file usr/src/cmd/diff.c

Co-Authored-By: John Reiser <jfr@research.uucp>
Synthesized-from: 32v

usr/src/cmd/diff.c [new file with mode: 0644]

diff --git a/usr/src/cmd/diff.c b/usr/src/cmd/diff.c
new file mode 100644 (file)
index 0000000..5620965
--- /dev/null
@@ -0,0 +1,645 @@
+#
+/*     diff - differential file comparison
+*
+*      Uses an algorithm due to Harold Stone, which finds
+*      a pair of longest identical subsequences in the two
+*      files.
+*
+*      The major goal is to generate the match vector J.
+*      J[i] is the index of the line in file1 corresponding
+*      to line i file0. J[i] = 0 if there is no
+*      such line in file1.
+*
+*      Lines are hashed so as to work in core. All potential
+*      matches are located by sorting the lines of each file
+*      on the hash (called value\b\b\b\b\b_____). In particular, this
+*      collects the equivalence classes in file1 together.
+*      Subroutine equiv\b\b\b\b____  replaces the value of each line in
+*      file0 by the index of the first element of its 
+*      matching equivalence in (the reordered) file1.
+*      To save space equiv\b\b\b\b\b_____ squeezes file1 into a single
+*      array member\b\b\b\b\b\b______ in which the equivalence classes
+*      are simply concatenated, except that their first
+*      members are flagged by changing sign.
+*
+*      Next the indices that point into member\b\b\b\b\b\b______ are unsorted\b\b\b\b\b\b\b\b_______   into
+*      array class\b\b\b\b\b_____ according to the original order of file0.
+*
+*      The cleverness lies in routine stone\b\b\b\b\b______. This marches
+*      through the lines of file0, developing a vector klist\b\b\b\b\b_____
+*      of "k-candidates". At step i a k-candidate is a matched
+*      pair of lines x,y (x in file0 y in file1) such that
+*      there is a common subsequence of lenght k
+*      between the first i lines of file0 and the first y 
+*      lines of file1, but there is no such subsequence for
+*      any smaller y. x is the earliest possible mate to y
+*      that occurs in such a subsequence.
+*
+*      Whenever any of the members of the equivalence class of
+*      lines in file1 matable to a line in file0 has serial number 
+*      less than the y of some k-candidate, that k-candidate 
+*      with the smallest such y is replaced. The new 
+*      k-candidate is chained (via pred\b\b\b\b____) to the current
+*      k-1 candidate so that the actual subsequence can
+*      be recovered. When a member has serial number greater
+*      that the y of all k-candidates, the klist is extended.
+*      At the end, the longest subsequence is pulled out
+*      and placed in the array J by unravel\b\b\b\b\b\b\b_______.
+*
+*      With J in hand, the matches there recorded are
+*      check\b\b\b\b\b_____ed against reality to assure that no spurious
+*      matches have crept in due to hashing. If they have,
+*      they are broken, and "jackpot " is recorded--a harmless
+*      matter except that a true match for a spuriously
+*      mated line may now be unnecessarily reported as a change.
+*
+*      Much of the complexity of the program comes simply
+*      from trying to minimize core utilization and
+*      maximize the range of doable problems by dynamically
+*      allocating what is needed and reusing what is not.
+*      The core requirements for problems larger than somewhat
+*      are (in words) 2*length(file0) + length(file1) +
+*      3*(number of k-candidates installed),  typically about
+*      6n words for files of length n. 
+*/
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <signal.h>
+#define        prints(s)       fputs(s,stdout)
+
+#define HALFLONG 16
+#define low(x) (x&((1L<<HALFLONG)-1))
+#define high(x)        (x>>HALFLONG)
+FILE *input[2];
+FILE *fopen();
+
+struct cand {
+       int x;
+       int y;
+       int pred;
+} cand;
+struct line {
+       int serial;
+       int value;
+} *file[2], line;
+int len[2];
+struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/
+int slen[2];
+int pref, suff;        /*length of prefix and suffix*/
+int *class;    /*will be overlaid on file[0]*/
+int *member;   /*will be overlaid on file[1]*/
+int *klist;            /*will be overlaid on file[0] after class*/
+struct cand *clist;    /* merely a free storage pot for candidates */
+int clen = 0;
+int *J;                /*will be overlaid on class*/
+long *ixold;   /*will be overlaid on klist*/
+long *ixnew;   /*will be overlaid on file[1]*/
+int opt;       /* -1,0,1 = -e,normal,-f */
+int status = 2;
+int anychange = 0;
+char *empty = "";
+int bflag;
+
+char *tempfile;        /*used when comparing against std input*/
+char *mktemp();
+char *dummy;   /*used in resetting storage search ptr*/
+
+done()
+{
+       unlink(tempfile);
+       exit(status);
+}
+
+char *talloc(n)
+{
+       extern char *malloc();
+       register char *p;
+       p = malloc((unsigned)n);
+       if(p!=NULL)
+               return(p);
+       noroom();
+}
+
+char *ralloc(p,n)      /*compacting reallocation */
+char *p;
+{
+       register char *q;
+       char *realloc();
+       free(p);
+       free(dummy);
+       dummy = malloc(1);
+       q = realloc(p, (unsigned)n);
+       if(q==NULL)
+               noroom();
+       return(q);
+}
+
+noroom()
+{
+       mesg("files too big, try -h\n",empty);
+       done();
+}
+
+sort(a,n)      /*shellsort CACM #201*/
+struct line *a;
+{
+       struct line w;
+       register int j,m;
+       struct line *ai;
+       register struct line *aim;
+       int k;
+       for(j=1;j<=n;j*= 2)
+               m = 2*j - 1;
+       for(m/=2;m!=0;m/=2) {
+               k = n-m;
+               for(j=1;j<=k;j++) {
+                       for(ai = &a[j]; ai > a; ai -= m) {
+                               aim = &ai[m];
+                               if(aim < ai)
+                                       break;  /*wraparound*/
+                               if(aim->value > ai[0].value ||
+                                  aim->value == ai[0].value &&
+                                  aim->serial > ai[0].serial)
+                                       break;
+                               w.value = ai[0].value;
+                               ai[0].value = aim->value;
+                               aim->value = w.value;
+                               w.serial = ai[0].serial;
+                               ai[0].serial = aim->serial;
+                               aim->serial = w.serial;
+                       }
+               }
+       }
+}
+
+unsort(f, l, b)
+struct line *f;
+int *b;
+{
+       register int *a;
+       register int i;
+       a = (int *)talloc((l+1)*sizeof(int));
+       for(i=1;i<=l;i++)
+               a[f[i].serial] = f[i].value;
+       for(i=1;i<=l;i++)
+               b[i] = a[i];
+       free((char *)a);
+}
+
+filename(pa1, pa2)
+char **pa1, **pa2;
+{
+       register char *a1, *b1, *a2;
+       char buf[512];
+       struct stat stbuf;
+       int i, f;
+       a1 = *pa1;
+       a2 = *pa2;
+       if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) {
+               b1 = *pa1 = malloc(100);
+               while(*b1++ = *a1++) ;
+               b1[-1] = '/';
+               a1 = b1;
+               while(*a1++ = *a2++)
+                       if(*a2 && *a2!='/' && a2[-1]=='/')
+                               a1 = b1;
+       }
+       else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) {
+               signal(SIGHUP,done);
+               signal(SIGINT,done);
+               signal(SIGPIPE,done);
+               signal(SIGTERM,done);
+               *pa1 = tempfile = mktemp("/tmp/dXXXXX");
+               if((f=creat(tempfile,0600)) < 0) {
+                       mesg("cannot create ",tempfile);
+                       done();
+               }
+               while((i=read(0,buf,512))>0)
+                       write(f,buf,i);
+               close(f);
+       }
+}
+
+prepare(i, arg)
+char *arg;
+{
+       register struct line *p;
+       register j,h;
+       if((input[i] = fopen(arg,"r")) == NULL){
+               mesg("cannot open ", arg);
+               done();
+       }
+       p = (struct line *)talloc(3*sizeof(line));
+       for(j=0; h=readhash(input[i]);) {
+               p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line));
+               p[j].value = h;
+       }
+       len[i] = j;
+       file[i] = p;
+       fclose(input[i]);
+}
+
+prune()
+{
+       register i,j;
+       for(pref=0;pref<len[0]&&pref<len[1]&&
+               file[0][pref+1].value==file[1][pref+1].value;
+               pref++ ) ;
+       for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
+               file[0][len[0]-suff].value==file[1][len[1]-suff].value;
+               suff++) ;
+       for(j=0;j<2;j++) {
+               sfile[j] = file[j]+pref;
+               slen[j] = len[j]-pref-suff;
+               for(i=0;i<=slen[j];i++)
+                       sfile[j][i].serial = i;
+       }
+}
+
+equiv(a,n,b,m,c)
+struct line *a, *b;
+int *c;
+{
+       register int i, j;
+       i = j = 1;
+       while(i<=n && j<=m) {
+               if(a[i].value <b[j].value)
+                       a[i++].value = 0;
+               else if(a[i].value == b[j].value)
+                       a[i++].value = j;
+               else
+                       j++;
+       }
+       while(i <= n)
+               a[i++].value = 0;
+       b[m+1].value = 0;
+       j = 0;
+       while(++j <= m) {
+               c[j] = -b[j].serial;
+               while(b[j+1].value == b[j].value) {
+                       j++;
+                       c[j] = b[j].serial;
+               }
+       }
+       c[j] = -1;
+}
+
+main(argc, argv)
+char **argv;
+{
+       register int k;
+       char **args;
+
+       args = argv;
+       if(argc>3 && *argv[1]=='-') {
+               argc--;
+               argv++;
+               for(k=1;argv[0][k];k++) {
+                       switch(argv[0][k]) {
+                       case 'e':
+                               opt = -1;
+                               break;
+                       case 'f':
+                               opt = 1;
+                               break;
+                       case 'b':
+                               bflag = 1;
+                               break;
+                       case 'h':
+                               execv("/usr/lib/diffh",args);
+                               mesg("cannot find diffh",empty);
+                               done();
+                       }
+               }
+       }
+       if(argc!=3) {
+               mesg("arg count",empty);
+               done();
+       }
+       dummy = malloc(1);
+
+       filename(&argv[1], &argv[2]);
+       filename(&argv[2], &argv[1]);
+       prepare(0, argv[1]);
+       prepare(1, argv[2]);
+       prune();
+       sort(sfile[0],slen[0]);
+       sort(sfile[1],slen[1]);
+
+       member = (int *)file[1];
+       equiv(sfile[0], slen[0], sfile[1], slen[1], member);
+       member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
+
+       class = (int *)file[0];
+       unsort(sfile[0], slen[0], class);
+       class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
+
+       klist = (int *)talloc((slen[0]+2)*sizeof(int));
+       clist = (struct cand *)talloc(sizeof(cand));
+       k = stone(class, slen[0], member, klist);
+       free((char *)member);
+       free((char *)class);
+
+       J = (int *)talloc((len[0]+2)*sizeof(int));
+       unravel(klist[k]);
+       free((char *)clist);
+       free((char *)klist);
+
+       ixold = (long *)talloc((len[0]+2)*sizeof(long));
+       ixnew = (long *)talloc((len[1]+2)*sizeof(long));
+       check(argv);
+       output(argv);
+       status = anychange;
+       done();
+}
+
+stone(a,n,b,c)
+int *a;
+int *b;
+int *c;
+{
+       register int i, k,y;
+       int j, l;
+       int oldc, tc;
+       int oldl;
+       k = 0;
+       c[0] = newcand(0,0,0);
+       for(i=1; i<=n; i++) {
+               j = a[i];
+               if(j==0)
+                       continue;
+               y = -b[j];
+               oldl = 0;
+               oldc = c[0];
+               do {
+                       if(y <= clist[oldc].y)
+                               continue;
+                       l = search(c, k, y);
+                       if(l!=oldl+1)
+                               oldc = c[l-1];
+                       if(l<=k) {
+                               if(clist[c[l]].y <= y)
+                                       continue;
+                               tc = c[l];
+                               c[l] = newcand(i,y,oldc);
+                               oldc = tc;
+                               oldl = l;
+                       } else {
+                               c[l] = newcand(i,y,oldc);
+                               k++;
+                               break;
+                       }
+               } while((y=b[++j]) > 0);
+       }
+       return(k);
+}
+
+newcand(x,y,pred)
+{
+       register struct cand *q;
+       clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand));
+       q = clist + clen -1;
+       q->x = x;
+       q->y = y;
+       q->pred = pred;
+       return(clen-1);
+}
+
+search(c, k, y)
+int *c;
+{
+       register int i, j, l;
+       int t;
+       if(clist[c[k]].y<y)     /*quick look for typical case*/
+               return(k+1);
+       i = 0;
+       j = k+1;
+       while((l=(i+j)/2) > i) {
+               t = clist[c[l]].y;
+               if(t > y)
+                       j = l;
+               else if(t < y)
+                       i = l;
+               else
+                       return(l);
+       }
+       return(l+1);
+}
+
+unravel(p)
+{
+       register int i;
+       register struct cand *q;
+       for(i=0; i<=len[0]; i++)
+               J[i] =  i<=pref ? i:
+                       i>len[0]-suff ? i+len[1]-len[0]:
+                       0;
+       for(q=clist+p;q->y!=0;q=clist+q->pred)
+               J[q->x+pref] = q->y+pref;
+}
+
+/* check does double duty:
+1.  ferret out any fortuitous correspondences due
+to confounding by hashing (which result in "jackpot")
+2.  collect random access indexes to the two files */
+
+check(argv)
+char **argv;
+{
+       register int i, j;
+       int jackpot;
+       long ctold, ctnew;
+       char c,d;
+       input[0] = fopen(argv[1],"r");
+       input[1] = fopen(argv[2],"r");
+       j = 1;
+       ixold[0] = ixnew[0] = 0;
+       jackpot = 0;
+       ctold = ctnew = 0;
+       for(i=1;i<=len[0];i++) {
+               if(J[i]==0) {
+                       ixold[i] = ctold += skipline(0);
+                       continue;
+               }
+               while(j<J[i]) {
+                       ixnew[j] = ctnew += skipline(1);
+                       j++;
+               }
+               for(;;) {
+                       c = getc(input[0]);
+                       d = getc(input[1]);
+                       ctold++;
+                       ctnew++;
+                       if(bflag && isspace(c) && isspace(d)) {
+                               do {
+                                       if(c=='\n') break;
+                                       ctold++;
+                               } while(isspace(c=getc(input[0])));
+                               do {
+                                       if(d=='\n') break;
+                                       ctnew++;
+                               } while(isspace(d=getc(input[1])));
+                       }
+                       if(c!=d) {
+                               jackpot++;
+                               J[i] = 0;
+                               if(c!='\n')
+                                       ctold += skipline(0);
+                               if(d!='\n')
+                                       ctnew += skipline(1);
+                               break;
+                       }
+                       if(c=='\n')
+                               break;
+               }
+               ixold[i] = ctold;
+               ixnew[j] = ctnew;
+               j++;
+       }
+       for(;j<=len[1];j++) {
+               ixnew[j] = ctnew += skipline(1);
+       }
+       fclose(input[0]);
+       fclose(input[1]);
+/*
+       if(jackpot)
+               mesg("jackpot",empty);
+*/
+}
+
+skipline(f)
+{
+       register i;
+       for(i=1;getc(input[f])!='\n';i++) ;
+       return(i);
+}
+
+output(argv)
+char **argv;
+{
+       int m;
+       register int i0, i1, j1;
+       int j0;
+       input[0] = fopen(argv[1],"r");
+       input[1] = fopen(argv[2],"r");
+       m = len[0];
+       J[0] = 0;
+       J[m+1] = len[1]+1;
+       if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) {
+               while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
+               j0 = J[i0-1]+1;
+               i1 = i0-1;
+               while(i1<m&&J[i1+1]==0) i1++;
+               j1 = J[i1+1]-1;
+               J[i1] = j1;
+               change(i0,i1,j0,j1);
+       } else for(i0=m;i0>=1;i0=i1-1) {
+               while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
+               j0 = J[i0+1]-1;
+               i1 = i0+1;
+               while(i1>1&&J[i1-1]==0) i1--;
+               j1 = J[i1-1]+1;
+               J[i1] = j1;
+               change(i1,i0,j1,j0);
+       }
+       if(m==0)
+               change(1,0,1,len[1]);
+}
+
+change(a,b,c,d)
+{
+       if(a>b&&c>d) return;
+       anychange = 1;
+       if(opt!=1) {
+               range(a,b,",");
+               putchar(a>b?'a':c>d?'d':'c');
+               if(opt!=-1) range(c,d,",");
+       } else {
+               putchar(a>b?'a':c>d?'d':'c');
+               range(a,b," ");
+       }
+       putchar('\n');
+       if(opt==0) {
+               fetch(ixold,a,b,input[0],"< ");
+               if(a<=b&&c<=d) prints("---\n");
+       }
+       fetch(ixnew,c,d,input[1],opt==0?"> ":empty);
+       if(opt!=0&&c<=d) prints(".\n");
+}
+
+range(a,b,separator)
+char *separator;
+{
+       printf("%d", a>b?b:a);
+       if(a<b) {
+               printf("%s%d", separator, b);
+       }
+}
+
+fetch(f,a,b,lb,s)
+long *f;
+FILE *lb;
+char *s;
+{
+       register int i, j;
+       register int nc;
+       for(i=a;i<=b;i++) {
+               fseek(lb,f[i-1],0);
+               nc = f[i]-f[i-1];
+               prints(s);
+               for(j=0;j<nc;j++)
+                       putchar(getc(lb));
+       }
+}
+
+/* hashing has the effect of
+ * arranging line in 7-bit bytes and then
+ * summing 1-s complement in 16-bit hunks 
+*/
+
+readhash(f)
+FILE *f;
+{
+       long sum;
+       register unsigned shift;
+       register space;
+       register t;
+       sum = 1;
+       space = 0;
+       if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) {
+               if(t==-1)
+                       return(0);
+               sum += (long)t << (shift%=HALFLONG);
+       }
+       else for(shift=0;;) {
+               switch(t=getc(f)) {
+               case -1:
+                       return(0);
+               case '\t':
+               case ' ':
+                       space++;
+                       continue;
+               default:
+                       if(space) {
+                               shift += 7;
+                               space = 0;
+                       }
+                       sum += (long)t << (shift%=HALFLONG);
+                       shift += 7;
+                       continue;
+               case '\n':
+                       break;
+               }
+               break;
+       }
+       sum = low(sum) + high(sum);
+       return((short)low(sum) + (short)high(sum));
+}
+
+mesg(s,t)
+char *s, *t;
+{
+       fprintf(stderr,"diff: %s%s\n",s,t);
+}