Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | #! /usr/bin/env python |
2 | ||
3 | """ | |
4 | Module difflib -- helpers for computing deltas between objects. | |
5 | ||
6 | Function get_close_matches(word, possibilities, n=3, cutoff=0.6): | |
7 | Use SequenceMatcher to return list of the best "good enough" matches. | |
8 | ||
9 | Function context_diff(a, b): | |
10 | For two lists of strings, return a delta in context diff format. | |
11 | ||
12 | Function ndiff(a, b): | |
13 | Return a delta: the difference between `a` and `b` (lists of strings). | |
14 | ||
15 | Function restore(delta, which): | |
16 | Return one of the two sequences that generated an ndiff delta. | |
17 | ||
18 | Function unified_diff(a, b): | |
19 | For two lists of strings, return a delta in unified diff format. | |
20 | ||
21 | Class SequenceMatcher: | |
22 | A flexible class for comparing pairs of sequences of any type. | |
23 | ||
24 | Class Differ: | |
25 | For producing human-readable deltas from sequences of lines of text. | |
26 | ||
27 | Class HtmlDiff: | |
28 | For producing HTML side by side comparison with change highlights. | |
29 | """ | |
30 | ||
31 | __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', | |
32 | 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', | |
33 | 'unified_diff', 'HtmlDiff'] | |
34 | ||
35 | import heapq | |
36 | ||
37 | def _calculate_ratio(matches, length): | |
38 | if length: | |
39 | return 2.0 * matches / length | |
40 | return 1.0 | |
41 | ||
class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm is a little fancier than the one published in the late 1980's
    by Ratcliff and Obershelp under the hyperbolic name "gestalt pattern
    matching":  find the longest contiguous matching subsequence that
    contains no "junk" elements, then apply the same idea recursively to
    the pieces of the sequences to the left and to the right of that match.
    This does not yield minimal edit sequences, but does tend to yield
    matches that "look right" to people.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>> round(s.ratio(), 3)
    0.866

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches.

    If you're only interested in where the sequences match, use
    .get_matching_blocks(); if you want to know how to change the first
    sequence into the second, use .get_opcodes().

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Timing:  Basic Ratcliff-Obershelp is cubic time worst case and
    quadratic time expected case.  SequenceMatcher is quadratic time for
    the worst case and has expected-case behavior dependent in a
    complicated way on how many elements the sequences have in common;
    best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    get_grouped_opcodes(n=3)
        Return a generator of opcode groups with up to n lines of context.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0",
        i.e. no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional args a and b are the two sequences to be compared; both
        default to empty strings.  The elements of a and b must be
        hashable.  See also .set_seqs(), .set_seq1() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of 'replace', 'delete', 'insert', 'equal'
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the __contains__ method of a hidden dict.
        #      DOES NOT WORK for x in a!
        # isbpopular
        #      for x in b, isbpopular(x) is true iff b is reasonably long
        #      (at least 200 elements) and x accounts for more than 1% of
        #      its elements.  DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        # Invalidate caches that depend on the first sequence.
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order.  When
    # self.isjunk is defined, junk elements don't show up in this map at
    # all, which stops the central find_longest_match method from starting
    # any matching block at a junk element.  Also creates the fast isbjunk
    # function.  b2j also does not contain entries for "popular" elements
    # (elements accounting for more than 1% of a sequence of >= 200
    # elements); this adaptive notion of semi-junk yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;".
    # Note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly.

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # The first trick is to build b2j ignoring the possibility of
        # junk; throwing the junk out afterwards is much cheaper than
        # building b2j "right" from the start.
        b = self.b
        n = len(b)
        self.b2j = b2j = {}
        populardict = {}
        for i, elt in enumerate(b):
            if elt in b2j:
                indices = b2j[elt]
                if n >= 200 and len(indices) * 100 > n:
                    # elt accounts for > 1% of a long b: treat as popular.
                    populardict[elt] = 1
                    del indices[:]
                else:
                    indices.append(i)
            else:
                b2j[elt] = [i]

        # Purge leftover indices for popular elements.
        for elt in populardict:
            del b2j[elt]

        # Now b2j.keys() contains elements uniquely, so isjunk is called
        # at most once per unique element instead of once per element.
        isjunk = self.isjunk
        junkdict = {}
        if isjunk:
            for d in populardict, b2j:
                # list() snapshot: we delete from d while iterating
                # (mutating a dict during keys() iteration is an error).
                for elt in list(d.keys()):
                    if isjunk(elt):
                        junkdict[elt] = 1
                        del d[elt]

        # Now for x in b, isjunk(x) == x in junkdict, but the latter is
        # much faster.  While there may be a lot of junk in the sequence,
        # the number of *unique* junk elements is probably small, so the
        # memory burden of keeping junkdict alive is likely trivial.
        self.isbjunk = junkdict.__contains__
        self.isbpopular = populardict.__contains__

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION: stripping common prefix or suffix would be incorrect.
        # E.g. comparing "ab" with "acab": the longest matching block is
        # "ab", but if the common prefix "a" is stripped first we end up
        # claiming "ca" was inserted in the middle -- minimal, but
        # unintuitive ("it's obvious" that "ac" was inserted at the front).

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in range(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each side
        # of it too; it saves post-processing the (possibly considerable)
        # expense of figuring out what to do with it.  In the case of an
        # empty interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append((la, lb, 0))
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> s = SequenceMatcher(None, "qabxcd", "abycdf")
        >>> [op[0] for op in s.get_opcodes()]
        ['delete', 'equal', 'replace', 'equal', 'insert']
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append((tag, i, ai, j, bj))
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append(('equal', ai, i, bj, j))
        return answer

    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with up to n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = [str(x) for x in range(1, 40)]
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1, j2))
        if group and not (len(group) == 1 and group[0][0] == 'equal'):
            yield group

    @staticmethod
    def _calc_ratio(matches, length):
        # 2.0*M/T; two empty sequences are considered identical (1.0).
        return 2.0 * matches / length if length else 1.0

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = sum(triple[-1] for triple in self.get_matching_blocks())
        return self._calc_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        matches = 0
        for elt in self.a:
            if elt in avail:
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return self._calc_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return self._calc_ratio(min(la, lb), la + lb)
668 | ||
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher()
    matcher.set_seq2(word)          # cache word's stats; vary seq1 below
    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Cheapest upper bounds first; only compute the real (expensive)
        # ratio when the candidate could still clear the cutoff.
        if (matcher.real_quick_ratio() >= cutoff and
                matcher.quick_ratio() >= cutoff and
                matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), candidate))

    # Keep the n best scorers (highest ratio first), then strip the scores.
    return [candidate for score, candidate in heapq.nlargest(n, scored)]
716 | ||
717 | def _count_leading(line, ch): | |
718 | """ | |
719 | Return number of `ch` characters at the start of `line`. | |
720 | ||
721 | Example: | |
722 | ||
723 | >>> _count_leading(' abc', ' ') | |
724 | 3 | |
725 | """ | |
726 | ||
727 | i, n = 0, len(line) | |
728 | while i < n and line[i] == ch: | |
729 | i += 1 | |
730 | return i | |
731 | ||
732 | class Differ: | |
733 | r""" | |
734 | Differ is a class for comparing sequences of lines of text, and | |
735 | producing human-readable differences or deltas. Differ uses | |
736 | SequenceMatcher both to compare sequences of lines, and to compare | |
737 | sequences of characters within similar (near-matching) lines. | |
738 | ||
739 | Each line of a Differ delta begins with a two-letter code: | |
740 | ||
741 | '- ' line unique to sequence 1 | |
742 | '+ ' line unique to sequence 2 | |
743 | ' ' line common to both sequences | |
744 | '? ' line not present in either input sequence | |
745 | ||
746 | Lines beginning with '? ' attempt to guide the eye to intraline | |
747 | differences, and were not present in either input sequence. These lines | |
748 | can be confusing if the sequences contain tab characters. | |
749 | ||
750 | Note that Differ makes no claim to produce a *minimal* diff. To the | |
751 | contrary, minimal diffs are often counter-intuitive, because they synch | |
752 | up anywhere possible, sometimes accidental matches 100 pages apart. | |
753 | Restricting synch points to contiguous matches preserves some notion of | |
754 | locality, at the occasional cost of producing a longer diff. | |
755 | ||
756 | Example: Comparing two texts. | |
757 | ||
758 | First we set up the texts, sequences of individual single-line strings | |
759 | ending with newlines (such sequences can also be obtained from the | |
760 | `readlines()` method of file-like objects): | |
761 | ||
762 | >>> text1 = ''' 1. Beautiful is better than ugly. | |
763 | ... 2. Explicit is better than implicit. | |
764 | ... 3. Simple is better than complex. | |
765 | ... 4. Complex is better than complicated. | |
766 | ... '''.splitlines(1) | |
767 | >>> len(text1) | |
768 | 4 | |
769 | >>> text1[0][-1] | |
770 | '\n' | |
771 | >>> text2 = ''' 1. Beautiful is better than ugly. | |
772 | ... 3. Simple is better than complex. | |
773 | ... 4. Complicated is better than complex. | |
774 | ... 5. Flat is better than nested. | |
775 | ... '''.splitlines(1) | |
776 | ||
777 | Next we instantiate a Differ object: | |
778 | ||
779 | >>> d = Differ() | |
780 | ||
781 | Note that when instantiating a Differ object we may pass functions to | |
782 | filter out line and character 'junk'. See Differ.__init__ for details. | |
783 | ||
784 | Finally, we compare the two: | |
785 | ||
786 | >>> result = list(d.compare(text1, text2)) | |
787 | ||
788 | 'result' is a list of strings, so let's pretty-print it: | |
789 | ||
790 | >>> from pprint import pprint as _pprint | |
791 | >>> _pprint(result) | |
792 | [' 1. Beautiful is better than ugly.\n', | |
793 | '- 2. Explicit is better than implicit.\n', | |
794 | '- 3. Simple is better than complex.\n', | |
795 | '+ 3. Simple is better than complex.\n', | |
796 | '? ++\n', | |
797 | '- 4. Complex is better than complicated.\n', | |
798 | '? ^ ---- ^\n', | |
799 | '+ 4. Complicated is better than complex.\n', | |
800 | '? ++++ ^ ^\n', | |
801 | '+ 5. Flat is better than nested.\n'] | |
802 | ||
803 | As a single multi-line string it looks like this: | |
804 | ||
805 | >>> print ''.join(result), | |
806 | 1. Beautiful is better than ugly. | |
807 | - 2. Explicit is better than implicit. | |
808 | - 3. Simple is better than complex. | |
809 | + 3. Simple is better than complex. | |
810 | ? ++ | |
811 | - 4. Complex is better than complicated. | |
812 | ? ^ ---- ^ | |
813 | + 4. Complicated is better than complex. | |
814 | ? ++++ ^ ^ | |
815 | + 5. Flat is better than nested. | |
816 | ||
817 | Methods: | |
818 | ||
819 | __init__(linejunk=None, charjunk=None) | |
820 | Construct a text differencer, with optional filters. | |
821 | ||
822 | compare(a, b) | |
823 | Compare two sequences of lines; generate the resulting delta. | |
824 | """ | |
825 | ||
826 | def __init__(self, linejunk=None, charjunk=None): | |
827 | """ | |
828 | Construct a text differencer, with optional filters. | |
829 | ||
830 | The two optional keyword parameters are for filter functions: | |
831 | ||
832 | - `linejunk`: A function that should accept a single string argument, | |
833 | and return true iff the string is junk. The module-level function | |
834 | `IS_LINE_JUNK` may be used to filter out lines without visible | |
835 | characters, except for at most one splat ('#'). It is recommended | |
836 | to leave linejunk None; as of Python 2.3, the underlying | |
837 | SequenceMatcher class has grown an adaptive notion of "noise" lines | |
838 | that's better than any static definition the author has ever been | |
839 | able to craft. | |
840 | ||
841 | - `charjunk`: A function that should accept a string of length 1. The | |
842 | module-level function `IS_CHARACTER_JUNK` may be used to filter out | |
843 | whitespace characters (a blank or tab; **note**: bad idea to include | |
844 | newline in this!). Use of IS_CHARACTER_JUNK is recommended. | |
845 | """ | |
846 | ||
847 | self.linejunk = linejunk | |
848 | self.charjunk = charjunk | |
849 | ||
850 | def compare(self, a, b): | |
851 | r""" | |
852 | Compare two sequences of lines; generate the resulting delta. | |
853 | ||
854 | Each sequence must contain individual single-line strings ending with | |
855 | newlines. Such sequences can be obtained from the `readlines()` method | |
856 | of file-like objects. The delta generated also consists of newline- | |
857 | terminated strings, ready to be printed as-is via the writeline() | |
858 | method of a file-like object. | |
859 | ||
860 | Example: | |
861 | ||
862 | >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), | |
863 | ... 'ore\ntree\nemu\n'.splitlines(1))), | |
864 | - one | |
865 | ? ^ | |
866 | + ore | |
867 | ? ^ | |
868 | - two | |
869 | - three | |
870 | ? - | |
871 | + tree | |
872 | + emu | |
873 | """ | |
874 | ||
875 | cruncher = SequenceMatcher(self.linejunk, a, b) | |
876 | for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): | |
877 | if tag == 'replace': | |
878 | g = self._fancy_replace(a, alo, ahi, b, blo, bhi) | |
879 | elif tag == 'delete': | |
880 | g = self._dump('-', a, alo, ahi) | |
881 | elif tag == 'insert': | |
882 | g = self._dump('+', b, blo, bhi) | |
883 | elif tag == 'equal': | |
884 | g = self._dump(' ', a, alo, ahi) | |
885 | else: | |
886 | raise ValueError, 'unknown tag %r' % (tag,) | |
887 | ||
888 | for line in g: | |
889 | yield line | |
890 | ||
891 | def _dump(self, tag, x, lo, hi): | |
892 | """Generate comparison results for a same-tagged range.""" | |
893 | for i in xrange(lo, hi): | |
894 | yield '%s %s' % (tag, x[i]) | |
895 | ||
896 | def _plain_replace(self, a, alo, ahi, b, blo, bhi): | |
897 | assert alo < ahi and blo < bhi | |
898 | # dump the shorter block first -- reduces the burden on short-term | |
899 | # memory if the blocks are of very different sizes | |
900 | if bhi - blo < ahi - alo: | |
901 | first = self._dump('+', b, blo, bhi) | |
902 | second = self._dump('-', a, alo, ahi) | |
903 | else: | |
904 | first = self._dump('-', a, alo, ahi) | |
905 | second = self._dump('+', b, blo, bhi) | |
906 | ||
907 | for g in first, second: | |
908 | for line in g: | |
909 | yield line | |
910 | ||
911 | def _fancy_replace(self, a, alo, ahi, b, blo, bhi): | |
912 | r""" | |
913 | When replacing one block of lines with another, search the blocks | |
914 | for *similar* lines; the best-matching pair (if any) is used as a | |
915 | synch point, and intraline difference marking is done on the | |
916 | similar pair. Lots of work, but often worth it. | |
917 | ||
918 | Example: | |
919 | ||
920 | >>> d = Differ() | |
921 | >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, | |
922 | ... ['abcdefGhijkl\n'], 0, 1) | |
923 | >>> print ''.join(results), | |
924 | - abcDefghiJkl | |
925 | ? ^ ^ ^ | |
926 | + abcdefGhijkl | |
927 | ? ^ ^ ^ | |
928 | """ | |
929 | ||
930 | # don't synch up unless the lines have a similarity score of at | |
931 | # least cutoff; best_ratio tracks the best score seen so far | |
932 | best_ratio, cutoff = 0.74, 0.75 | |
933 | cruncher = SequenceMatcher(self.charjunk) | |
934 | eqi, eqj = None, None # 1st indices of equal lines (if any) | |
935 | ||
936 | # search for the pair that matches best without being identical | |
937 | # (identical lines must be junk lines, & we don't want to synch up | |
938 | # on junk -- unless we have to) | |
939 | for j in xrange(blo, bhi): | |
940 | bj = b[j] | |
941 | cruncher.set_seq2(bj) | |
942 | for i in xrange(alo, ahi): | |
943 | ai = a[i] | |
944 | if ai == bj: | |
945 | if eqi is None: | |
946 | eqi, eqj = i, j | |
947 | continue | |
948 | cruncher.set_seq1(ai) | |
949 | # computing similarity is expensive, so use the quick | |
950 | # upper bounds first -- have seen this speed up messy | |
951 | # compares by a factor of 3. | |
952 | # note that ratio() is only expensive to compute the first | |
953 | # time it's called on a sequence pair; the expensive part | |
954 | # of the computation is cached by cruncher | |
955 | if cruncher.real_quick_ratio() > best_ratio and \ | |
956 | cruncher.quick_ratio() > best_ratio and \ | |
957 | cruncher.ratio() > best_ratio: | |
958 | best_ratio, best_i, best_j = cruncher.ratio(), i, j | |
959 | if best_ratio < cutoff: | |
960 | # no non-identical "pretty close" pair | |
961 | if eqi is None: | |
962 | # no identical pair either -- treat it as a straight replace | |
963 | for line in self._plain_replace(a, alo, ahi, b, blo, bhi): | |
964 | yield line | |
965 | return | |
966 | # no close pair, but an identical pair -- synch up on that | |
967 | best_i, best_j, best_ratio = eqi, eqj, 1.0 | |
968 | else: | |
969 | # there's a close pair, so forget the identical pair (if any) | |
970 | eqi = None | |
971 | ||
972 | # a[best_i] very similar to b[best_j]; eqi is None iff they're not | |
973 | # identical | |
974 | ||
975 | # pump out diffs from before the synch point | |
976 | for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): | |
977 | yield line | |
978 | ||
979 | # do intraline marking on the synch pair | |
980 | aelt, belt = a[best_i], b[best_j] | |
981 | if eqi is None: | |
982 | # pump out a '-', '?', '+', '?' quad for the synched lines | |
983 | atags = btags = "" | |
984 | cruncher.set_seqs(aelt, belt) | |
985 | for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): | |
986 | la, lb = ai2 - ai1, bj2 - bj1 | |
987 | if tag == 'replace': | |
988 | atags += '^' * la | |
989 | btags += '^' * lb | |
990 | elif tag == 'delete': | |
991 | atags += '-' * la | |
992 | elif tag == 'insert': | |
993 | btags += '+' * lb | |
994 | elif tag == 'equal': | |
995 | atags += ' ' * la | |
996 | btags += ' ' * lb | |
997 | else: | |
998 | raise ValueError, 'unknown tag %r' % (tag,) | |
999 | for line in self._qformat(aelt, belt, atags, btags): | |
1000 | yield line | |
1001 | else: | |
1002 | # the synch pair is identical | |
1003 | yield ' ' + aelt | |
1004 | ||
1005 | # pump out diffs from after the synch point | |
1006 | for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): | |
1007 | yield line | |
1008 | ||
1009 | def _fancy_helper(self, a, alo, ahi, b, blo, bhi): | |
1010 | g = [] | |
1011 | if alo < ahi: | |
1012 | if blo < bhi: | |
1013 | g = self._fancy_replace(a, alo, ahi, b, blo, bhi) | |
1014 | else: | |
1015 | g = self._dump('-', a, alo, ahi) | |
1016 | elif blo < bhi: | |
1017 | g = self._dump('+', b, blo, bhi) | |
1018 | ||
1019 | for line in g: | |
1020 | yield line | |
1021 | ||
1022 | def _qformat(self, aline, bline, atags, btags): | |
1023 | r""" | |
1024 | Format "?" output and deal with leading tabs. | |
1025 | ||
1026 | Example: | |
1027 | ||
1028 | >>> d = Differ() | |
1029 | >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n', | |
1030 | ... ' ^ ^ ^ ', '+ ^ ^ ^ ') | |
1031 | >>> for line in results: print repr(line) | |
1032 | ... | |
1033 | '- \tabcDefghiJkl\n' | |
1034 | '? \t ^ ^ ^\n' | |
1035 | '+ \t\tabcdefGhijkl\n' | |
1036 | '? \t ^ ^ ^\n' | |
1037 | """ | |
1038 | ||
1039 | # Can hurt, but will probably help most of the time. | |
1040 | common = min(_count_leading(aline, "\t"), | |
1041 | _count_leading(bline, "\t")) | |
1042 | common = min(common, _count_leading(atags[:common], " ")) | |
1043 | atags = atags[common:].rstrip() | |
1044 | btags = btags[common:].rstrip() | |
1045 | ||
1046 | yield "- " + aline | |
1047 | if atags: | |
1048 | yield "? %s%s\n" % ("\t" * common, atags) | |
1049 | ||
1050 | yield "+ " + bline | |
1051 | if btags: | |
1052 | yield "? %s%s\n" % ("\t" * common, btags) | |
1053 | ||
1054 | # With respect to junk, an earlier version of ndiff simply refused to | |
1055 | # *start* a match with a junk element. The result was cases like this: | |
1056 | # before: private Thread currentThread; | |
1057 | # after: private volatile Thread currentThread; | |
1058 | # If you consider whitespace to be junk, the longest contiguous match | |
1059 | # not starting with junk is "e Thread currentThread". So ndiff reported | |
1060 | # that "e volatil" was inserted between the 't' and the 'e' in "private". | |
1061 | # While an accurate view, to people that's absurd. The current version | |
1062 | # looks for matching blocks that are entirely junk-free, then extends the | |
1063 | # longest one of those as far as possible but only with matching junk. | |
1064 | # So now "currentThread" is matched, then extended to suck up the | |
1065 | # preceding blank; then "private" is matched, and extended to suck up the | |
1066 | # following blank; then "Thread" is matched; and finally ndiff reports | |
1067 | # that "volatile " was inserted before "Thread". The only quibble | |
1068 | # remaining is that perhaps it was really the case that " volatile" | |
1069 | # was inserted after "private". I can live with that <wink>. | |
1070 | ||
1071 | import re | |
1072 | ||
def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return True iff `line` is ignorable: blank, or whitespace plus at
    most a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """
    # `pat` is bound once, at definition time, to a precompiled matcher.
    matched = pat(line)
    return matched is not None
1088 | ||
def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return True iff `ch` is an ignorable character: a blank or a tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """
    return ws.find(ch) >= 0
1106 | ||
1107 | ||
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The
    modification times are normally expressed in the format returned by
    time.ctime().

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            # The file header is emitted once, before the first hunk.
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
        # Hunk ranges are 1-based, but by unified-diff convention an
        # *empty* range begins at the line just before the range (e.g.
        # "@@ -0,0 +1,3 @@" for an insertion at the very top of a file).
        # The previous unconditional "i1+1" start was wrong for empty
        # ranges.
        if i2 > i1:
            astart = i1 + 1
        else:
            astart = i1
        if j2 > j1:
            bstart = j1 + 1
        else:
            bstart = j1
        yield "@@ -%d,%d +%d,%d @@%s" % (astart, i2-i1, bstart, j2-j1,
                                         lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line
1168 | ||
1169 | # See http://www.unix.org/single_unix_specification/ | |
def context_diff(a, b, fromfile='', tofile='',
                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a context diff.

    Context diffs show the changed lines plus 'n' lines of surrounding
    context (default three).

    By default, the diff control lines (those with *** or ---) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.  For inputs that do not have trailing newlines, set the
    lineterm argument to "" so that the output will be uniformly newline
    free.

    The context diff format normally has a header for filenames and
    modification times.  Any or all of these may be specified using
    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate';
    unspecified values default to blanks.  The modification times are
    normally expressed in the format returned by time.ctime().
    """
    prefix = {'insert': '+ ', 'delete': '- ', 'replace': '! ',
              'equal': '  '}
    header_pending = True
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if header_pending:
            yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
            header_pending = False

        yield '***************%s' % (lineterm,)

        # "from" side: range header, then lines unless the group touches
        # nothing on this side.
        lo, hi = group[0][1], group[-1][2]
        if hi - lo >= 2:
            yield '*** %d,%d ****%s' % (lo + 1, hi, lineterm)
        else:
            yield '*** %d ****%s' % (hi, lineterm)
        if [op for op in group if op[0] in ('replace', 'delete')]:
            for tag, i1, i2, _, _ in group:
                if tag != 'insert':
                    for line in a[i1:i2]:
                        yield prefix[tag] + line

        # "to" side, same structure.
        lo, hi = group[0][3], group[-1][4]
        if hi - lo >= 2:
            yield '--- %d,%d ----%s' % (lo + 1, hi, lineterm)
        else:
            yield '--- %d ----%s' % (hi, lineterm)
        if [op for op in group if op[0] in ('replace', 'insert')]:
            for tag, _, _, j1, j2 in group:
                if tag != 'delete':
                    for line in b[j1:j2]:
                        yield prefix[tag] + line
1244 | ||
def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
    r"""
    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.

    `linejunk` and `charjunk` are optional junk-filter functions (or
    None), passed straight through to Differ:

    - linejunk: accepts a single string argument and returns true iff
      the string is a junk line.  The default None is recommended; the
      adaptive "noise" detection in SequenceMatcher usually does better
      than a static filter.

    - charjunk: accepts a string of length 1 and returns true iff the
      character is junk.  Defaults to the module-level
      IS_CHARACTER_JUNK, which filters out blanks and tabs (never
      include newline in this!).

    Tools/scripts/ndiff.py is a command-line front-end to this function.
    """
    differ = Differ(linejunk, charjunk)
    return differ.compare(a, b)
1280 | ||
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    """Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to be compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    import re

    # regular expression for finding intraline change indices
    change_re = re.compile('(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup
                          around the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- index into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be
                     a passed parameter.  It is present as a keyword argument
                     to maintain memory of the current line numbers between
                     calls of this function (the mutable default is
                     deliberate shared state, not a bug).

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.  Walk right-to-left so earlier
            # indices stay valid as text is inserted.
            for key,(begin,end) in sub_info[::-1]:
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            # strip the two-character ndiff code prefix ('- ', '+ ', '  ')
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with what the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        # num_blanks_pending tracks the surplus of '+' lines over '-' lines
        # seen so far (positive: "to" side is ahead); the catch-up loops
        # below emit blank filler lines so the two sides stay aligned.
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4
            # lines so we can do some very readable comparisons.
            while len(lines) < 4:
                try:
                    lines.append(diff_lines_iterator.next())
                except StopIteration:
                    # 'X' is a sentinel marking exhaustion of the diff
                    lines.append('X')
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so
                # the corresponding add/delete lines get a matching blank
                # line so all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('--?+') or s.startswith('--+') or \
                 s.startswith('- '):
                # in delete block and see a intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith('+ ') or s.startswith('+-'):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (lines[:] copy: the first call must not consume the entry
                # the second call still needs)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                raise StopIteration
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                from_line, to_line, found_diff =line_iterator.next()
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        while True:
            yield line_pair_iterator.next()
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                from_line, to_line, found_diff = line_pair_iterator.next()
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator.  (index > context means lines were
            # dropped from the circular queue, i.e. this hunk is not
            # contiguous with the previous output.)
            if index > context:
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            while(lines_to_write):
                from_line, to_line, found_diff = line_pair_iterator.next()
                # If another change within the context, extend the context
                if found_diff:
                    lines_to_write = context-1
                else:
                    lines_to_write -= 1
                yield from_line, to_line, found_diff
1544 | ||
1545 | ||
# HTML boilerplate used by HtmlDiff.make_file().  The %(...)s slots are
# filled with the CSS style block, the generated diff table and the legend.
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=ISO-8859-1" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""

# CSS classes keyed to the markup emitted by make_table(): header cells,
# "next change" navigation cells, and added/changed/deleted highlights.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton of the side by side table produced by HtmlDiff.make_table().
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Legend table explaining the highlight colors and the (f)/(n)/(t)
# navigation links.  The &nbsp; padding around "Added" keeps the colored
# cell from collapsing to nothing.
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""
1601 | ||
class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    # Alias the module-level templates as class attributes so a subclass
    # can customize any of them without touching the module globals.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    # Class-wide counter used by _make_prefix() so anchor prefixes stay
    # unique across every table generated in this process.
    _default_prefix = 0

    def __init__(self, tabsize=8, wrapcolumn=None, linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and
            descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
                  context=False, numlines=5):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """
        return self._file_template % dict(
            styles=self._styles,
            legend=self._legend,
            table=self.make_table(fromlines, tolines, fromdesc, todesc,
                                  context=context, numlines=numlines))

    def _tab_newline_replace(self, fromlines, tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces
            line = line.replace(' ', '\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ', '\t')
            return line.replace('\0', ' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines, tolines

    def _split_line(self, data_list, line_num, text):
        """Builds list of text lines by splitting text lines at wrap point

        This function will determine if the input text line needs to be
        wrapped (split) into separate lines.  If so, the first wrap point
        will be determined and the first line appended to the output
        text line list.  This function is used recursively to handle
        the second part of the split line to further split it.
        """
        # if blank line or context separator, just add it to the output list
        if not line_num:
            data_list.append((line_num, text))
            return

        # if line text doesn't need wrapping, just add it to the output list
        size = len(text)
        max_width = self._wrapcolumn
        # each intra-line change region is marked with '\0'+flag and closed
        # with '\1' -- three characters of markup with no visible width, so
        # discount them before comparing against the wrap column
        if (size <= max_width) or \
           ((size - (text.count('\0') * 3)) <= max_width):
            data_list.append((line_num, text))
            return

        # scan text looking for the wrap point, keeping track if the wrap
        # point is inside markers
        i = 0
        n = 0
        mark = ''
        while n < max_width and i < size:
            if text[i] == '\0':
                # start-of-markup: remember the flag char, costs no width
                i += 1
                mark = text[i]
                i += 1
            elif text[i] == '\1':
                # end-of-markup: costs no width
                i += 1
                mark = ''
            else:
                i += 1
                n += 1

        # wrap point is inside text, break it up into separate lines
        line1 = text[:i]
        line2 = text[i:]

        # if wrap point is inside markers, place end marker at end of first
        # line and start marker at beginning of second line because each
        # line will have its own table tag markup around it.
        if mark:
            line1 = line1 + '\1'
            line2 = '\0' + mark + line2

        # tack on first line onto the output list
        data_list.append((line_num, line1))

        # use this routine again to wrap the remaining text; '>' is the
        # continuation marker shown in place of a line number
        self._split_line(data_list, '>', line2)

    def _line_wrapper(self, diffs):
        """Returns iterator that splits (wraps) mdiff text lines"""

        # pull from/to data and flags from mdiff iterator
        for fromdata, todata, flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata, todata, flag
                continue
            (fromline, fromtext), (toline, totext) = fromdata, todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist, tolist = [], []
            self._split_line(fromlist, fromline, fromtext)
            self._split_line(tolist, toline, totext)
            # yield from/to line in pairs inserting blank lines as
            # necessary when one side has more wrapped lines
            while fromlist or tolist:
                if fromlist:
                    fromdata = fromlist.pop(0)
                else:
                    fromdata = ('', ' ')
                if tolist:
                    todata = tolist.pop(0)
                else:
                    todata = ('', ' ')
                yield fromdata, todata, flag

    def _collect_lines(self, diffs):
        """Collects mdiff output into separate lists

        Before storing the mdiff from/to data into a list, it is converted
        into a single line of text with HTML markup.
        """
        fromlist, tolist, flaglist = [], [], []
        # pull from/to data and flags from mdiff style iterator
        for fromdata, todata, flag in diffs:
            try:
                # store HTML markup of the lines into the lists
                fromlist.append(self._format_line(0, flag, *fromdata))
                tolist.append(self._format_line(1, flag, *todata))
            except TypeError:
                # exceptions occur for lines where context separators go:
                # fromdata/todata are None there and cannot be unpacked
                fromlist.append(None)
                tolist.append(None)
            flaglist.append(flag)
        return fromlist, tolist, flaglist

    def _format_line(self, side, flag, linenum, text):
        """Returns HTML markup of "from" / "to" text lines

        side -- 0 or 1 indicating "from" or "to" text
        flag -- indicates if difference on line
        linenum -- line number (used for line number column)
        text -- line text to be marked up
        """
        try:
            linenum = '%d' % linenum
            id_attr = ' id="%s%s"' % (self._prefix[side], linenum)
        except TypeError:
            # handle blank lines where linenum is '>' or ''
            id_attr = ''
        # escape characters that would otherwise be parsed as HTML markup
        # ('&' first, so the entities we add are not themselves re-escaped)
        text = text.replace("&", "&amp;"). \
                    replace(">", "&gt;"). \
                    replace("<", "&lt;")

        # make space non-breakable so they don't get compressed or line
        # wrapped
        text = text.replace(' ', '&nbsp;').rstrip()

        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
               % (id_attr, linenum, text)

    def _make_prefix(self):
        """Create unique anchor prefixes"""

        # Generate a unique anchor prefix so multiple tables
        # can exist on the same HTML page without conflicts.
        fromprefix = "from%d_" % HtmlDiff._default_prefix
        toprefix = "to%d_" % HtmlDiff._default_prefix
        HtmlDiff._default_prefix += 1
        # store prefixes so line format method has access
        self._prefix = [fromprefix, toprefix]

    def _convert_flags(self, fromlist, tolist, flaglist, context, numlines):
        """Makes list of "next" links"""

        # all anchor names will be generated using the unique "to" prefix
        toprefix = self._prefix[1]

        # process change flags, generating middle column of next anchors/links
        next_id = [''] * len(flaglist)
        next_href = [''] * len(flaglist)
        num_chg, in_change = 0, False
        last = 0
        for i, flag in enumerate(flaglist):
            if flag:
                if not in_change:
                    in_change = True
                    last = i
                    # at the beginning of a change, drop an anchor a few lines
                    # (the context lines) before the change for the previous
                    # link
                    i = max(0, i - numlines)
                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,
                                                              num_chg)
                    # at the beginning of a change, drop a link to the next
                    # change
                    num_chg += 1
                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
                        toprefix, num_chg)
            else:
                in_change = False
        # check for cases where there is no content to avoid exceptions
        if not flaglist:
            flaglist = [False]
            next_id = ['']
            next_href = ['']
            last = 0
            if context:
                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
                tolist = fromlist
            else:
                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
        # if not a change on first line, drop a link
        if not flaglist[0]:
            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
        # redo the last link to link to the top
        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)

        return fromlist, tolist, flaglist, next_href, next_id

    def make_table(self, fromlines, tolines, fromdesc='', todesc='',
                   context=False, numlines=5):
        """Returns HTML table of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        """

        # make unique anchor prefixes so that multiple tables may exist
        # on the same page without conflict.
        self._make_prefix()

        # change tabs to spaces before it gets more difficult after we insert
        # markup
        fromlines, tolines = self._tab_newline_replace(fromlines, tolines)

        # create diffs iterator which generates side by side from/to data
        if context:
            context_lines = numlines
        else:
            context_lines = None
        diffs = _mdiff(fromlines, tolines, context_lines,
                       linejunk=self._linejunk, charjunk=self._charjunk)

        # set up iterator to wrap lines that exceed desired width
        if self._wrapcolumn:
            diffs = self._line_wrapper(diffs)

        # collect up from/to lines and flags into lists (also format the
        # lines)
        fromlist, tolist, flaglist = self._collect_lines(diffs)

        # process change flags, generating middle column of next anchors/links
        fromlist, tolist, flaglist, next_href, next_id = self._convert_flags(
            fromlist, tolist, flaglist, context, numlines)

        # cStringIO is the Python 2 accelerated StringIO implementation;
        # imported locally so the cost is paid only when a table is built
        import cStringIO
        s = cStringIO.StringIO()
        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
              '<td class="diff_next">%s</td>%s</tr>\n'
        for i in range(len(flaglist)):
            if flaglist[i] is None:
                # mdiff yields None on separator lines; skip the bogus ones
                # generated for the first line
                if i > 0:
                    s.write('        </tbody>        \n        <tbody>\n')
            else:
                s.write(fmt % (next_id[i], next_href[i], fromlist[i],
                               next_href[i], tolist[i]))
        if fromdesc or todesc:
            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
                '<th class="diff_next"><br /></th>',
                '<th colspan="2" class="diff_header">%s</th>' % todesc)
        else:
            header_row = ''

        # fill in the table skeleton
        table = self._table_template % dict(
            data_rows=s.getvalue(),
            header_row=header_row,
            prefix=self._prefix[1])

        # swap the intra-line difference markers left by _mdiff() for real
        # HTML markup, and turn the place-holder tabs back into
        # non-breakable spaces (see _tab_newline_replace)
        return table.replace('\0+', '<span class="diff_add">'). \
                     replace('\0-', '<span class="diff_sub">'). \
                     replace('\0^', '<span class="diff_chg">'). \
                     replace('\1', '</span>'). \
                     replace('\t', '&nbsp;')
1950 | ||
# Remove the regex helper from the module namespace so it is not exposed
# as a public attribute of this module.  NOTE(review): presumes a
# module-level ``import re`` earlier in the file (not visible in this
# chunk) -- confirm.
del re
1952 | ||
def restore(delta, which):
    r"""
    Generate one of the two sequences that generated a delta.

    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
    prefixes.

    Examples:

    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
    three
    >>> print ''.join(restore(diff, 2)),
    ore
    tree
    emu
    """
    try:
        # file 1 lines carry the "- " prefix, file 2 lines carry "+ "
        tag = {1: "- ", 2: "+ "}[int(which)]
    except KeyError:
        # raise with call syntax (valid in Python 2 and 3) rather than the
        # Python-2-only "raise E, value" statement form
        raise ValueError('unknown delta choice (must be 1 or 2): %r'
                         % which)
    # unchanged lines ("  ") belong to both sequences; lines with the other
    # tag, and the "? " intra-line hint lines, are dropped
    prefixes = ("  ", tag)
    for line in delta:
        if line[:2] in prefixes:
            yield line[2:]
1984 | ||
def _test():
    """Run this module's doctests and return doctest's (failed, attempted)
    result object."""
    import doctest
    import difflib
    results = doctest.testmod(difflib)
    return results
1988 | ||
# Running the module directly exercises its doctests.
if __name__ == "__main__":
    _test()