Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Efficiently compare files, boolean outcome only (equal / not equal). |
2 | ||
3 | Tricks (used in this order): | |
4 | - Files with identical type, size & mtime are assumed to be clones | |
5 | - Files with different type or size cannot be identical | |
6 | - We keep a cache of outcomes of earlier comparisons | |
7 | - We don't fork a process to run 'cmp' but read the files ourselves | |
8 | """ | |
9 | ||
10 | import os | |
11 | ||
# Outcomes of earlier comparisons, filled in and consulted by cmp():
# maps (f1, f2) -> (signature of f1, signature of f2, outcome).
cache = {}
13 | ||
def cmp(f1, f2, shallow=1):
    """Tell whether two files are equal, consulting the cache first.

    Returns 1 when the files are considered identical and 0 otherwise.
    With a true `shallow`, files whose signatures (type, size, mtime)
    match are reported identical without reading them.
    Exceptions raised while statting or reading either file propagate.
    """
    sig1 = sig(os.stat(f1))
    sig2 = sig(os.stat(f2))

    # 8 is the type value of a plain file; anything else is always
    # reported as different.
    if not (sig1[0] == 8 and sig2[0] == 8):
        return 0

    # Matching type, size and mtime: assume the files are the same.
    if shallow and sig1 == sig2:
        return 1

    # Differing type or size: the contents cannot be identical.
    if sig1[:2] != sig2[:2]:
        return 0

    # Same type and size: see whether this pair was compared before.
    pair = (f1, f2)
    try:
        cached = cache[pair]
    except KeyError:
        # Never compared before.
        cached = None
    if cached is not None:
        old_sig1, old_sig2, verdict = cached
        if sig1 == old_sig1 and sig2 == old_sig2:
            # Neither file changed since the cached comparison.
            return verdict
        # Otherwise the cached entry is stale; fall through and redo it.

    # Compare the actual contents and remember the result.
    verdict = do_cmp(f1, f2)
    cache[pair] = sig1, sig2, verdict
    return verdict
44 | ||
def sig(st):
    """Return the signature (type, size, mtime) taken from raw stat data.

    `st` is indexed like the tuple form of a stat result:
        0-5: st_mode, st_ino, st_dev, st_nlink, st_uid, st_gid
        6-9: st_size, st_atime, st_mtime, st_ctime

    The type is st_mode with its low twelve bits dropped, always as an
    exact integer (8 for a regular file on POSIX-style modes).
    """
    # Shift instead of dividing by 4096: `/` performs true division on
    # interpreters where it does not floor, which would leave the
    # permission bits in as a fraction and make the type compare unequal
    # to any integer file-type code.
    file_type = st[0] >> 12
    size = st[6]
    mtime = st[8]
    return file_type, size, mtime
53 | ||
def do_cmp(f1, f2):
    """Compare the contents of two files, really.

    Both files are read in binary mode, one buffer at a time.
    Returns 1 if every buffer matches up to end of file, 0 as soon as a
    pair of buffers differs.  Errors from opening or reading either file
    propagate to the caller; both files are closed in every case.
    """
    bufsize = 8*1024 # Could be tuned
    fp1 = open(f1, 'rb')
    try:
        fp2 = open(f2, 'rb')
        try:
            while 1:
                b1 = fp1.read(bufsize)
                b2 = fp2.read(bufsize)
                if b1 != b2:
                    return 0
                if not b1:
                    # Both buffers are empty: end of both files reached
                    # without finding a difference.
                    return 1
        finally:
            fp2.close()
    finally:
        fp1.close()