/* diff - differential file comparison
*
* Uses an algorithm due to Harold Stone, which finds
* a pair of longest identical subsequences in the two
* files.
*
* The major goal is to generate the match vector J.
* J[i] is the index of the line in file1 corresponding
* to line i of file0. J[i] = 0 if there is no
* such line in file1.
* Lines are hashed so as to work in core. All potential
* matches are located by sorting the lines of each file
* on the hash (called value). In particular, this
* collects the equivalence classes in file1 together.
* Subroutine equiv replaces the value of each line in
* file0 by the index of the first element of its
* matching equivalence class in (the reordered) file1.
* To save space equiv squeezes file1 into a single
* array member in which the equivalence classes
* are simply concatenated, except that their first
* members are flagged by changing sign.
* Next the indices that point into member are unsorted into
* array class according to the original order of file0.
* The cleverness lies in routine stone. This marches
* through the lines of file0, developing a vector klist
* of "k-candidates". At step i a k-candidate is a matched
* pair of lines x,y (x in file0, y in file1) such that
* there is a common subsequence of length k
* between the first i lines of file0 and the first y
* lines of file1, but there is no such subsequence for
* any smaller y. x is the earliest possible mate to y
* that occurs in such a subsequence.
* Whenever any of the members of the equivalence class of
* lines in file1 matable to a line in file0 has serial number
* less than the y of some k-candidate, that k-candidate
* with the smallest such y is replaced. The new
* k-candidate is chained (via pred) to the current
* k-1 candidate so that the actual subsequence can
* be recovered. When a member has serial number greater
* than the y of all k-candidates, the klist is extended.
* At the end, the longest subsequence is pulled out
* and placed in the array J by unravel.
* With J in hand, the matches there recorded are
* checked against reality to assure that no spurious
* matches have crept in due to hashing. If they have,
* they are broken, and "jackpot" is recorded--a harmless
* matter except that a true match for a spuriously
* mated line may now be unnecessarily reported as a change.
* Much of the complexity of the program comes simply
* from trying to minimize core utilization and
* maximize the range of doable problems by dynamically
* allocating what is needed and reusing what is not.
* The core requirements for all but small problems
* are (in words) 2*length(file0) + length(file1) +
* 3*(number of k-candidates installed), typically about
* 6n words for files of length n.
*/
/* prints(s): write the string s to standard output; no newline is appended. */
#define prints(s) fputs((s), stdout)
/*
 * low(x): the low-order HALFLONG bits of x.
 * high(x): x shifted right by HALFLONG bits.
 * HALFLONG is defined outside these lines (presumably half the bit width of
 * a long -- confirm at its definition).
 * The parameter is parenthesized so that an argument containing an operator
 * of lower precedence than & or >> (for example a|b) is evaluated as a whole
 * before it is masked or shifted.
 */
#define low(x) ((x) & ((1L << HALFLONG) - 1))
#define high(x) ((x) >> HALFLONG)
/* The two line tables, shortened by pruning the common prefix and suffix. */
struct line *sfile[2];
int pref;               /* length of the pruned prefix */
int suff;               /* length of the pruned suffix */
int *class;             /* will be overlaid on file[0] */
int *member;            /* will be overlaid on file[1] */
int *klist;             /* will be overlaid on file[0] after class */
struct cand *clist;     /* merely a free storage pot for candidates */
int *J;                 /* will be overlaid on class */
long *ixold;            /* will be overlaid on klist */
long *ixnew;            /* will be overlaid on file[1] */
int opt;                /* -1,0,1 = -e,normal,-f */
char *tempfile;         /* used when comparing against std input */
char *dummy;            /* used in resetting storage search ptr */
char *ralloc(p
,n
) /*compacting reallocation */
q
= realloc(p
, (unsigned)n
);
mesg("files too big, try -h\n",empty
);
sort(a
,n
) /*shellsort CACM #201*/
register struct line
*aim
;
for(ai
= &a
[j
]; ai
> a
; ai
-= m
) {
if(aim
->value
> ai
[0].value
||
aim
->value
== ai
[0].value
&&
aim
->serial
> ai
[0].serial
)
ai
[0].value
= aim
->value
;
ai
[0].serial
= aim
->serial
;
a
= (int *)talloc((l
+1)*sizeof(int));
a
[f
[i
].serial
] = f
[i
].value
;
register char *a1
, *b1
, *a2
;
if(stat(a1
,&stbuf
)!=-1 && ((stbuf
.st_mode
&S_IFMT
)==S_IFDIR
)) {
if(*a2
&& *a2
!='/' && a2
[-1]=='/')
else if(a1
[0]=='-'&&a1
[1]==0&&tempfile
==0) {
*pa1
= tempfile
= mktemp("/tmp/dXXXXX");
if((f
=creat(tempfile
,0600)) < 0) {
mesg("cannot create ",tempfile
);
while((i
=read(0,buf
,BUFSIZ
))>0)
if((input
[i
] = fopen(arg
,"r")) == NULL
){
mesg("cannot open ", arg
);
p
= (struct line
*)talloc(3*sizeof(line
));
for(j
=0; h
=readhash(input
[i
]);) {
p
= (struct line
*)ralloc((char *)p
,(++j
+3)*sizeof(line
));
for(pref
=0;pref
<len
[0]&&pref
<len
[1]&&
file
[0][pref
+1].value
==file
[1][pref
+1].value
;
for(suff
=0;suff
<len
[0]-pref
&&suff
<len
[1]-pref
&&
file
[0][len
[0]-suff
].value
==file
[1][len
[1]-suff
].value
;
slen
[j
] = len
[j
]-pref
-suff
;
if(a
[i
].value
<b
[j
].value
)
else if(a
[i
].value
== b
[j
].value
)
while(b
[j
+1].value
== b
[j
].value
) {
if(argc
>3 && *argv
[1]=='-') {
for(k
=1;argv
[0][k
];k
++) {
execv("/usr/lib/diffh",args
);
mesg("cannot find diffh",empty
);
filename(&argv
[1], &argv
[2]);
filename(&argv
[2], &argv
[1]);
equiv(sfile
[0], slen
[0], sfile
[1], slen
[1], member
);
member
= (int *)ralloc((char *)member
,(slen
[1]+2)*sizeof(int));
unsort(sfile
[0], slen
[0], class);
class = (int *)ralloc((char *)class,(slen
[0]+2)*sizeof(int));
klist
= (int *)talloc((slen
[0]+2)*sizeof(int));
clist
= (struct cand
*)talloc(sizeof(cand
));
k
= stone(class, slen
[0], member
, klist
);
J
= (int *)talloc((len
[0]+2)*sizeof(int));
ixold
= (long *)talloc((len
[0]+2)*sizeof(long));
ixnew
= (long *)talloc((len
[1]+2)*sizeof(long));
c
[l
] = newcand(i
,y
,oldc
);
c
[l
] = newcand(i
,y
,oldc
);
clist
= (struct cand
*)ralloc((char *)clist
,++clen
*sizeof(cand
));
if(clist
[c
[k
]].y
<y
) /*quick look for typical case*/
i
>len
[0]-suff
? i
+len
[1]-len
[0]:
for(q
=clist
+p
;q
->y
!=0;q
=clist
+q
->pred
)
J
[q
->x
+pref
] = q
->y
+pref
;
/* check does double duty:
1. ferret out any fortuitous correspondences due
to confounding by hashing (which result in "jackpot")
2. collect random access indexes to the two files */
input
[0] = fopen(argv
[1],"r");
input
[1] = fopen(argv
[2],"r");
ixold
[i
] = ctold
+= skipline(0);
ixnew
[j
] = ctnew
+= skipline(1);
if(bflag
&& isspace(c
) && isspace(d
)) {
} while(isspace(c
=getc(input
[0])));
} while(isspace(d
=getc(input
[1])));
ixnew
[j
] = ctnew
+= skipline(1);
for(i
=1;getc(input
[f
])!='\n';i
++) ;
input
[0] = fopen(argv
[1],"r");
input
[1] = fopen(argv
[2],"r");
if(opt
!=-1) for(i0
=1;i0
<=m
;i0
=i1
+1) {
while(i0
<=m
&&J
[i0
]==J
[i0
-1]+1) i0
++;
while(i1
<m
&&J
[i1
+1]==0) i1
++;
} else for(i0
=m
;i0
>=1;i0
=i1
-1) {
while(i0
>=1&&J
[i0
]==J
[i0
+1]-1&&J
[i0
]!=0) i0
--;
while(i1
>1&&J
[i1
-1]==0) i1
--;
putchar(a
>b
?'a':c
>d
?'d':'c');
if(opt
!=-1) range(c
,d
,",");
putchar(a
>b
?'a':c
>d
?'d':'c');
fetch(ixold
,a
,b
,input
[0],"< ");
if(a
<=b
&&c
<=d
) prints("---\n");
fetch(ixnew
,c
,d
,input
[1],opt
==0?"> ":empty
);
if(opt
!=0&&c
<=d
) prints(".\n");
printf("%s%d", separator
, b
);
/* hashing has the effect of
 * arranging the line in 7-bit bytes and then
 * summing 1's-complement in 16-bit hunks
 */
if(!bflag
) for(shift
=0;(t
=getc(f
))!='\n';shift
+=7) {
sum
+= (long)t
<< (shift
%=HALFLONG
);
sum
+= (long)t
<< (shift
%=HALFLONG
);
sum
= low(sum
) + high(sum
);
return((short)low(sum
) + (short)high(sum
));
fprintf(stderr
,"diff: %s%s\n",s
,t
);