Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | package Algorithm::Diff; |
2 | use strict; | |
3 | use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); | |
4 | use integer; # see below in _replaceNextLargerWith() for mod to make | |
5 | # if you don't use this | |
6 | require Exporter; | |
7 | @ISA = qw(Exporter); | |
8 | @EXPORT = qw(); | |
9 | @EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff); | |
10 | $VERSION = sprintf('%d.%02d', (q$Revision: 1.15 $ =~ /\d+/g)); | |
11 | ||
12 | # McIlroy-Hunt diff algorithm | |
13 | # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com> | |
14 | # by Ned Konz, perl@bike-nomad.com | |
15 | ||
16 | =head1 NAME | |
17 | ||
18 | Algorithm::Diff - Compute `intelligent' differences between two files / lists | |
19 | ||
20 | =head1 SYNOPSIS | |
21 | ||
22 | use Algorithm::Diff qw(diff sdiff LCS traverse_sequences | |
23 | traverse_balanced); | |
24 | ||
25 | @lcs = LCS( \@seq1, \@seq2 ); | |
26 | ||
27 | @lcs = LCS( \@seq1, \@seq2, $key_generation_function ); | |
28 | ||
29 | $lcsref = LCS( \@seq1, \@seq2 ); | |
30 | ||
31 | $lcsref = LCS( \@seq1, \@seq2, $key_generation_function ); | |
32 | ||
33 | @diffs = diff( \@seq1, \@seq2 ); | |
34 | ||
35 | @diffs = diff( \@seq1, \@seq2, $key_generation_function ); | |
36 | ||
37 | @sdiffs = sdiff( \@seq1, \@seq2 ); | |
38 | ||
39 | @sdiffs = sdiff( \@seq1, \@seq2, $key_generation_function ); | |
40 | ||
41 | traverse_sequences( \@seq1, \@seq2, | |
42 | { MATCH => $callback, | |
43 | DISCARD_A => $callback, | |
44 | DISCARD_B => $callback, | |
45 | } ); | |
46 | ||
47 | traverse_sequences( \@seq1, \@seq2, | |
48 | { MATCH => $callback, | |
49 | DISCARD_A => $callback, | |
50 | DISCARD_B => $callback, | |
51 | }, | |
52 | $key_generation_function ); | |
53 | ||
54 | traverse_balanced( \@seq1, \@seq2, | |
55 | { MATCH => $callback, | |
56 | DISCARD_A => $callback, | |
57 | DISCARD_B => $callback, | |
58 | CHANGE => $callback, | |
59 | } ); | |
60 | ||
61 | =head1 INTRODUCTION | |
62 | ||
63 | (by Mark-Jason Dominus) | |
64 | ||
65 | I once read an article written by the authors of C<diff>; they said | |
66 | that they hard worked very hard on the algorithm until they found the | |
67 | right one. | |
68 | ||
69 | I think what they ended up using (and I hope someone will correct me, | |
70 | because I am not very confident about this) was the `longest common | |
71 | subsequence' method. in the LCS problem, you have two sequences of | |
72 | items: | |
73 | ||
74 | a b c d f g h j q z | |
75 | ||
76 | a b c d e f g i j k r x y z | |
77 | ||
78 | and you want to find the longest sequence of items that is present in | |
79 | both original sequences in the same order. That is, you want to find | |
80 | a new sequence I<S> which can be obtained from the first sequence by | |
81 | deleting some items, and from the secend sequence by deleting other | |
82 | items. You also want I<S> to be as long as possible. In this case | |
83 | I<S> is | |
84 | ||
85 | a b c d f g j z | |
86 | ||
87 | From there it's only a small step to get diff-like output: | |
88 | ||
89 | e h i k q r x y | |
90 | + - + + - + + + | |
91 | ||
92 | This module solves the LCS problem. It also includes a canned | |
93 | function to generate C<diff>-like output. | |
94 | ||
95 | It might seem from the example above that the LCS of two sequences is | |
96 | always pretty obvious, but that's not always the case, especially when | |
97 | the two sequences have many repeated elements. For example, consider | |
98 | ||
99 | a x b y c z p d q | |
100 | a b c a x b y c z | |
101 | ||
102 | A naive approach might start by matching up the C<a> and C<b> that | |
103 | appear at the beginning of each sequence, like this: | |
104 | ||
105 | a x b y c z p d q | |
106 | a b c a b y c z | |
107 | ||
108 | This finds the common subsequence C<a b c z>. But actually, the LCS | |
109 | is C<a x b y c z>: | |
110 | ||
111 | a x b y c z p d q | |
112 | a b c a x b y c z | |
113 | ||
114 | =head1 USAGE | |
115 | ||
116 | This module provides three exportable functions, which we'll deal with in | |
117 | ascending order of difficulty: C<LCS>, | |
118 | C<diff>, C<sdiff>, C<traverse_sequences>, and C<traverse_balanced>. | |
119 | ||
120 | =head2 C<LCS> | |
121 | ||
122 | Given references to two lists of items, LCS returns an array containing their | |
123 | longest common subsequence. In scalar context, it returns a reference to | |
124 | such a list. | |
125 | ||
126 | @lcs = LCS( \@seq1, \@seq2 ); | |
127 | $lcsref = LCS( \@seq1, \@seq2 ); | |
128 | ||
129 | C<LCS> may be passed an optional third parameter; this is a CODE | |
130 | reference to a key generation function. See L</KEY GENERATION | |
131 | FUNCTIONS>. | |
132 | ||
133 | @lcs = LCS( \@seq1, \@seq2, $keyGen ); | |
134 | $lcsref = LCS( \@seq1, \@seq2, $keyGen ); | |
135 | ||
136 | Additional parameters, if any, will be passed to the key generation | |
137 | routine. | |
138 | ||
139 | =head2 C<diff> | |
140 | ||
141 | @diffs = diff( \@seq1, \@seq2 ); | |
142 | $diffs_ref = diff( \@seq1, \@seq2 ); | |
143 | ||
144 | C<diff> computes the smallest set of additions and deletions necessary | |
145 | to turn the first sequence into the second, and returns a description | |
146 | of these changes. The description is a list of I<hunks>; each hunk | |
147 | represents a contiguous section of items which should be added, | |
148 | deleted, or replaced. The return value of C<diff> is a list of | |
149 | hunks, or, in scalar context, a reference to such a list. | |
150 | ||
151 | Here is an example: The diff of the following two sequences: | |
152 | ||
153 | a b c e h j l m n p | |
154 | b c d e f j k l m r s t | |
155 | ||
156 | Result: | |
157 | ||
158 | [ | |
159 | [ [ '-', 0, 'a' ] ], | |
160 | ||
161 | [ [ '+', 2, 'd' ] ], | |
162 | ||
163 | [ [ '-', 4, 'h' ] , | |
164 | [ '+', 4, 'f' ] ], | |
165 | ||
166 | [ [ '+', 6, 'k' ] ], | |
167 | ||
168 | [ [ '-', 8, 'n' ], | |
169 | [ '-', 9, 'p' ], | |
170 | [ '+', 9, 'r' ], | |
171 | [ '+', 10, 's' ], | |
172 | [ '+', 11, 't' ], | |
173 | ] | |
174 | ] | |
175 | ||
176 | There are five hunks here. The first hunk says that the C<a> at | |
177 | position 0 of the first sequence should be deleted (C<->). The second | |
178 | hunk says that the C<d> at position 2 of the second sequence should | |
179 | be inserted (C<+>). The third hunk says that the C<h> at position 4 | |
180 | of the first sequence should be removed and replaced with the C<f> | |
181 | from position 4 of the second sequence. The other two hunks similarly. | |
182 | ||
183 | C<diff> may be passed an optional third parameter; this is a CODE | |
184 | reference to a key generation function. See L</KEY GENERATION | |
185 | FUNCTIONS>. | |
186 | ||
187 | Additional parameters, if any, will be passed to the key generation | |
188 | routine. | |
189 | ||
190 | =head2 C<sdiff> | |
191 | ||
192 | @sdiffs = sdiff( \@seq1, \@seq2 ); | |
193 | $sdiffs_ref = sdiff( \@seq1, \@seq2 ); | |
194 | ||
195 | C<sdiff> computes all necessary components to show two sequences | |
196 | and their minimized differences side by side, just like the | |
197 | Unix-utility I<sdiff> does: | |
198 | ||
199 | same same | |
200 | before | after | |
201 | old < - | |
202 | - > new | |
203 | ||
204 | It returns a list of array refs, each pointing to an array of | |
205 | display instructions. In scalar context it returns a reference | |
206 | to such a list. | |
207 | ||
208 | Display instructions consist of three elements: A modifier indicator | |
209 | (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified, | |
210 | C<c>: Element changed) and the value of the old and new elements, to | |
211 | be displayed side by side. | |
212 | ||
213 | An C<sdiff> of the following two sequences: | |
214 | ||
215 | a b c e h j l m n p | |
216 | b c d e f j k l m r s t | |
217 | ||
218 | results in | |
219 | ||
220 | [ [ '-', 'a', '' ], | |
221 | [ 'u', 'b', 'b' ], | |
222 | [ 'u', 'c', 'c' ], | |
223 | [ '+', '', 'd' ], | |
224 | [ 'u', 'e', 'e' ], | |
225 | [ 'c', 'h', 'f' ], | |
226 | [ 'u', 'j', 'j' ], | |
227 | [ '+', '', 'k' ], | |
228 | [ 'u', 'l', 'l' ], | |
229 | [ 'u', 'm', 'm' ], | |
230 | [ 'c', 'n', 'r' ], | |
231 | [ 'c', 'p', 's' ], | |
232 | [ '+', '', 't' ] ] | |
233 | ||
234 | C<sdiff> may be passed an optional third parameter; this is a CODE | |
235 | reference to a key generation function. See L</KEY GENERATION | |
236 | FUNCTIONS>. | |
237 | ||
238 | Additional parameters, if any, will be passed to the key generation | |
239 | routine. | |
240 | ||
241 | =head2 C<traverse_sequences> | |
242 | ||
243 | C<traverse_sequences> is the most general facility provided by this | |
244 | module; C<diff> and C<LCS> are implemented as calls to it. | |
245 | ||
246 | Imagine that there are two arrows. Arrow A points to an element of sequence A, | |
247 | and arrow B points to an element of the sequence B. Initially, the arrows | |
248 | point to the first elements of the respective sequences. C<traverse_sequences> | |
249 | will advance the arrows through the sequences one element at a time, calling an | |
250 | appropriate user-specified callback function before each advance. It | |
251 | willadvance the arrows in such a way that if there are equal elements C<$A[$i]> | |
252 | and C<$B[$j]> which are equal and which are part of the LCS, there will be | |
253 | some moment during the execution of C<traverse_sequences> when arrow A is | |
254 | pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens, | |
255 | C<traverse_sequences> will call the C<MATCH> callback function and then it will | |
256 | advance both arrows. | |
257 | ||
258 | Otherwise, one of the arrows is pointing to an element of its sequence that is | |
259 | not part of the LCS. C<traverse_sequences> will advance that arrow and will | |
260 | call the C<DISCARD_A> or the C<DISCARD_B> callback, depending on which arrow it | |
261 | advanced. If both arrows point to elements that are not part of the LCS, then | |
262 | C<traverse_sequences> will advance one of them and call the appropriate | |
263 | callback, but it is not specified which it will call. | |
264 | ||
265 | The arguments to C<traverse_sequences> are the two sequences to traverse, and a | |
266 | hash which specifies the callback functions, like this: | |
267 | ||
268 | traverse_sequences( \@seq1, \@seq2, | |
269 | { MATCH => $callback_1, | |
270 | DISCARD_A => $callback_2, | |
271 | DISCARD_B => $callback_3, | |
272 | } ); | |
273 | ||
274 | Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least the | |
275 | indices of the two arrows as their arguments. They are not expected to return | |
276 | any values. If a callback is omitted from the table, it is not called. | |
277 | ||
278 | Callbacks for A_FINISHED and B_FINISHED are invoked with at least the | |
279 | corresponding index in A or B. | |
280 | ||
281 | If arrow A reaches the end of its sequence, before arrow B does, | |
282 | C<traverse_sequences> will call the C<A_FINISHED> callback when it advances | |
283 | arrow B, if there is such a function; if not it will call C<DISCARD_B> instead. | |
284 | Similarly if arrow B finishes first. C<traverse_sequences> returns when both | |
285 | arrows are at the ends of their respective sequences. It returns true on | |
286 | success and false on failure. At present there is no way to fail. | |
287 | ||
288 | C<traverse_sequences> may be passed an optional fourth parameter; this is a | |
289 | CODE reference to a key generation function. See L</KEY GENERATION FUNCTIONS>. | |
290 | ||
291 | Additional parameters, if any, will be passed to the key generation function. | |
292 | ||
293 | =head2 C<traverse_balanced> | |
294 | ||
295 | C<traverse_balanced> is an alternative to C<traverse_sequences>. It | |
296 | uses a different algorithm to iterate through the entries in the | |
297 | computed LCS. Instead of sticking to one side and showing element changes | |
298 | as insertions and deletions only, it will jump back and forth between | |
299 | the two sequences and report I<changes> occurring as deletions on one | |
300 | side followed immediatly by an insertion on the other side. | |
301 | ||
302 | In addition to the | |
303 | C<DISCARD_A>, | |
304 | C<DISCARD_B>, and | |
305 | C<MATCH> | |
306 | callbacks supported by C<traverse_sequences>, C<traverse_balanced> supports | |
307 | a C<CHANGE> callback indicating that one element got C<replaced> by another: | |
308 | ||
309 | traverse_sequences( \@seq1, \@seq2, | |
310 | { MATCH => $callback_1, | |
311 | DISCARD_A => $callback_2, | |
312 | DISCARD_B => $callback_3, | |
313 | CHANGE => $callback_4, | |
314 | } ); | |
315 | ||
316 | If no C<CHANGE> callback is specified, C<traverse_balanced> | |
317 | will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions, | |
318 | therefore resulting in a similar behaviour as C<traverse_sequences> | |
319 | with different order of events. | |
320 | ||
321 | C<traverse_balanced> might be a bit slower than C<traverse_sequences>, | |
322 | noticable only while processing huge amounts of data. | |
323 | ||
324 | The C<sdiff> function of this module | |
325 | is implemented as call to C<traverse_balanced>. | |
326 | ||
327 | =head1 KEY GENERATION FUNCTIONS | |
328 | ||
329 | C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter. | |
330 | This is a CODE reference to a key generating (hashing) function that should | |
331 | return a string that uniquely identifies a given element. It should be the | |
332 | case that if two elements are to be considered equal, their keys should be the | |
333 | same (and the other way around). If no key generation function is provided, | |
334 | the key will be the element as a string. | |
335 | ||
336 | By default, comparisons will use "eq" and elements will be turned into keys | |
337 | using the default stringizing operator '""'. | |
338 | ||
339 | Where this is important is when you're comparing something other than strings. | |
340 | If it is the case that you have multiple different objects that should be | |
341 | considered to be equal, you should supply a key generation function. Otherwise, | |
342 | you have to make sure that your arrays contain unique references. | |
343 | ||
344 | For instance, consider this example: | |
345 | ||
346 | package Person; | |
347 | ||
348 | sub new | |
349 | { | |
350 | my $package = shift; | |
351 | return bless { name => '', ssn => '', @_ }, $package; | |
352 | } | |
353 | ||
354 | sub clone | |
355 | { | |
356 | my $old = shift; | |
357 | my $new = bless { %$old }, ref($old); | |
358 | } | |
359 | ||
360 | sub hash | |
361 | { | |
362 | return shift()->{'ssn'}; | |
363 | } | |
364 | ||
365 | my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' ); | |
366 | my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' ); | |
367 | my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' ); | |
368 | my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' ); | |
369 | my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' ); | |
370 | ||
371 | If you did this: | |
372 | ||
373 | my $array1 = [ $person1, $person2, $person4 ]; | |
374 | my $array2 = [ $person1, $person3, $person4, $person5 ]; | |
375 | Algorithm::Diff::diff( $array1, $array2 ); | |
376 | ||
377 | everything would work out OK (each of the objects would be converted | |
378 | into a string like "Person=HASH(0x82425b0)" for comparison). | |
379 | ||
380 | But if you did this: | |
381 | ||
382 | my $array1 = [ $person1, $person2, $person4 ]; | |
383 | my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; | |
384 | Algorithm::Diff::diff( $array1, $array2 ); | |
385 | ||
386 | $person4 and $person4->clone() (which have the same name and SSN) | |
387 | would be seen as different objects. If you wanted them to be considered | |
388 | equivalent, you would have to pass in a key generation function: | |
389 | ||
390 | my $array1 = [ $person1, $person2, $person4 ]; | |
391 | my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; | |
392 | Algorithm::Diff::diff( $array1, $array2, \&Person::hash ); | |
393 | ||
394 | This would use the 'ssn' field in each Person as a comparison key, and | |
395 | so would consider $person4 and $person4->clone() as equal. | |
396 | ||
397 | You may also pass additional parameters to the key generation function | |
398 | if you wish. | |
399 | ||
400 | =head1 AUTHOR | |
401 | ||
402 | This version by Ned Konz, perl@bike-nomad.com | |
403 | ||
404 | =head1 LICENSE | |
405 | ||
406 | Copyright (c) 2000-2002 Ned Konz. All rights reserved. | |
407 | This program is free software; | |
408 | you can redistribute it and/or modify it under the same terms | |
409 | as Perl itself. | |
410 | ||
411 | =head1 CREDITS | |
412 | ||
413 | Versions through 0.59 (and much of this documentation) were written by: | |
414 | ||
415 | Mark-Jason Dominus, mjd-perl-diff@plover.com | |
416 | ||
417 | This version borrows the documentation and names of the routines | |
418 | from Mark-Jason's, but has all new code in Diff.pm. | |
419 | ||
420 | This code was adapted from the Smalltalk code of | |
421 | Mario Wolczko <mario@wolczko.com>, which is available at | |
422 | ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st | |
423 | ||
424 | C<sdiff> and C<traverse_balanced> were written by Mike Schilli | |
425 | <m@perlmeister.com>. | |
426 | ||
427 | The algorithm is that described in | |
428 | I<A Fast Algorithm for Computing Longest Common Subsequences>, | |
429 | CACM, vol.20, no.5, pp.350-353, May 1977, with a few | |
430 | minor improvements to improve the speed. | |
431 | ||
432 | =cut | |
433 | ||
434 | # Create a hash that maps each element of $aCollection to the set of positions | |
435 | # it occupies in $aCollection, restricted to the elements within the range of | |
436 | # indexes specified by $start and $end. | |
437 | # The fourth parameter is a subroutine reference that will be called to | |
438 | # generate a string to use as a key. | |
439 | # Additional parameters, if any, will be passed to this subroutine. | |
440 | # | |
441 | # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen ); | |
442 | ||
443 | sub _withPositionsOfInInterval | |
444 | { | |
445 | my $aCollection = shift; # array ref | |
446 | my $start = shift; | |
447 | my $end = shift; | |
448 | my $keyGen = shift; | |
449 | my %d; | |
450 | my $index; | |
451 | for ( $index = $start ; $index <= $end ; $index++ ) | |
452 | { | |
453 | my $element = $aCollection->[$index]; | |
454 | my $key = &$keyGen( $element, @_ ); | |
455 | if ( exists( $d{$key} ) ) | |
456 | { | |
457 | unshift ( @{ $d{$key} }, $index ); | |
458 | } | |
459 | else | |
460 | { | |
461 | $d{$key} = [$index]; | |
462 | } | |
463 | } | |
464 | return wantarray ? %d : \%d; | |
465 | } | |
466 | ||
467 | # Find the place at which aValue would normally be inserted into the array. If | |
468 | # that place is already occupied by aValue, do nothing, and return undef. If | |
469 | # the place does not exist (i.e., it is off the end of the array), add it to | |
470 | # the end, otherwise replace the element at that point with aValue. | |
471 | # It is assumed that the array's values are numeric. | |
472 | # This is where the bulk (75%) of the time is spent in this module, so try to | |
473 | # make it fast! | |
474 | ||
475 | sub _replaceNextLargerWith | |
476 | { | |
477 | my ( $array, $aValue, $high ) = @_; | |
478 | $high ||= $#$array; | |
479 | ||
480 | # off the end? | |
481 | if ( $high == -1 || $aValue > $array->[-1] ) | |
482 | { | |
483 | push ( @$array, $aValue ); | |
484 | return $high + 1; | |
485 | } | |
486 | ||
487 | # binary search for insertion point... | |
488 | my $low = 0; | |
489 | my $index; | |
490 | my $found; | |
491 | while ( $low <= $high ) | |
492 | { | |
493 | $index = ( $high + $low ) / 2; | |
494 | ||
495 | # $index = int(( $high + $low ) / 2); # without 'use integer' | |
496 | $found = $array->[$index]; | |
497 | ||
498 | if ( $aValue == $found ) | |
499 | { | |
500 | return undef; | |
501 | } | |
502 | elsif ( $aValue > $found ) | |
503 | { | |
504 | $low = $index + 1; | |
505 | } | |
506 | else | |
507 | { | |
508 | $high = $index - 1; | |
509 | } | |
510 | } | |
511 | ||
512 | # now insertion point is in $low. | |
513 | $array->[$low] = $aValue; # overwrite next larger | |
514 | return $low; | |
515 | } | |
516 | ||
517 | # This method computes the longest common subsequence in $a and $b. | |
518 | ||
519 | # Result is array or ref, whose contents is such that | |
520 | # $a->[ $i ] == $b->[ $result[ $i ] ] | |
521 | # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined. | |
522 | ||
523 | # An additional argument may be passed; this is a hash or key generating | |
524 | # function that should return a string that uniquely identifies the given | |
525 | # element. It should be the case that if the key is the same, the elements | |
526 | # will compare the same. If this parameter is undef or missing, the key | |
527 | # will be the element as a string. | |
528 | ||
529 | # By default, comparisons will use "eq" and elements will be turned into keys | |
530 | # using the default stringizing operator '""'. | |
531 | ||
532 | # Additional parameters, if any, will be passed to the key generation routine. | |
533 | ||
534 | sub _longestCommonSubsequence | |
535 | { | |
536 | my $a = shift; # array ref | |
537 | my $b = shift; # array ref | |
538 | my $keyGen = shift; # code ref | |
539 | my $compare; # code ref | |
540 | ||
541 | # set up code refs | |
542 | # Note that these are optimized. | |
543 | if ( !defined($keyGen) ) # optimize for strings | |
544 | { | |
545 | $keyGen = sub { $_[0] }; | |
546 | $compare = sub { my ( $a, $b ) = @_; $a eq $b }; | |
547 | } | |
548 | else | |
549 | { | |
550 | $compare = sub { | |
551 | my $a = shift; | |
552 | my $b = shift; | |
553 | &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ ); | |
554 | }; | |
555 | } | |
556 | ||
557 | my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) = | |
558 | ( 0, $#$a, 0, $#$b, [] ); | |
559 | ||
560 | # First we prune off any common elements at the beginning | |
561 | while ( $aStart <= $aFinish | |
562 | and $bStart <= $bFinish | |
563 | and &$compare( $a->[$aStart], $b->[$bStart], @_ ) ) | |
564 | { | |
565 | $matchVector->[ $aStart++ ] = $bStart++; | |
566 | } | |
567 | ||
568 | # now the end | |
569 | while ( $aStart <= $aFinish | |
570 | and $bStart <= $bFinish | |
571 | and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) ) | |
572 | { | |
573 | $matchVector->[ $aFinish-- ] = $bFinish--; | |
574 | } | |
575 | ||
576 | # Now compute the equivalence classes of positions of elements | |
577 | my $bMatches = | |
578 | _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ ); | |
579 | my $thresh = []; | |
580 | my $links = []; | |
581 | ||
582 | my ( $i, $ai, $j, $k ); | |
583 | for ( $i = $aStart ; $i <= $aFinish ; $i++ ) | |
584 | { | |
585 | $ai = &$keyGen( $a->[$i], @_ ); | |
586 | if ( exists( $bMatches->{$ai} ) ) | |
587 | { | |
588 | $k = 0; | |
589 | for $j ( @{ $bMatches->{$ai} } ) | |
590 | { | |
591 | ||
592 | # optimization: most of the time this will be true | |
593 | if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j ) | |
594 | { | |
595 | $thresh->[$k] = $j; | |
596 | } | |
597 | else | |
598 | { | |
599 | $k = _replaceNextLargerWith( $thresh, $j, $k ); | |
600 | } | |
601 | ||
602 | # oddly, it's faster to always test this (CPU cache?). | |
603 | if ( defined($k) ) | |
604 | { | |
605 | $links->[$k] = | |
606 | [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ]; | |
607 | } | |
608 | } | |
609 | } | |
610 | } | |
611 | ||
612 | if (@$thresh) | |
613 | { | |
614 | for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] ) | |
615 | { | |
616 | $matchVector->[ $link->[1] ] = $link->[2]; | |
617 | } | |
618 | } | |
619 | ||
620 | return wantarray ? @$matchVector : $matchVector; | |
621 | } | |
622 | ||
623 | sub traverse_sequences | |
624 | { | |
625 | my $a = shift; # array ref | |
626 | my $b = shift; # array ref | |
627 | my $callbacks = shift || {}; | |
628 | my $keyGen = shift; | |
629 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; | |
630 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; | |
631 | my $finishedACallback = $callbacks->{'A_FINISHED'}; | |
632 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; | |
633 | my $finishedBCallback = $callbacks->{'B_FINISHED'}; | |
634 | my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); | |
635 | ||
636 | # Process all the lines in @$matchVector | |
637 | my $lastA = $#$a; | |
638 | my $lastB = $#$b; | |
639 | my $bi = 0; | |
640 | my $ai; | |
641 | ||
642 | for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ ) | |
643 | { | |
644 | my $bLine = $matchVector->[$ai]; | |
645 | if ( defined($bLine) ) # matched | |
646 | { | |
647 | &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; | |
648 | &$matchCallback( $ai, $bi++, @_ ); | |
649 | } | |
650 | else | |
651 | { | |
652 | &$discardACallback( $ai, $bi, @_ ); | |
653 | } | |
654 | } | |
655 | ||
656 | # The last entry (if any) processed was a match. | |
657 | # $ai and $bi point just past the last matching lines in their sequences. | |
658 | ||
659 | while ( $ai <= $lastA or $bi <= $lastB ) | |
660 | { | |
661 | ||
662 | # last A? | |
663 | if ( $ai == $lastA + 1 and $bi <= $lastB ) | |
664 | { | |
665 | if ( defined($finishedACallback) ) | |
666 | { | |
667 | &$finishedACallback( $lastA, @_ ); | |
668 | $finishedACallback = undef; | |
669 | } | |
670 | else | |
671 | { | |
672 | &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB; | |
673 | } | |
674 | } | |
675 | ||
676 | # last B? | |
677 | if ( $bi == $lastB + 1 and $ai <= $lastA ) | |
678 | { | |
679 | if ( defined($finishedBCallback) ) | |
680 | { | |
681 | &$finishedBCallback( $lastB, @_ ); | |
682 | $finishedBCallback = undef; | |
683 | } | |
684 | else | |
685 | { | |
686 | &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA; | |
687 | } | |
688 | } | |
689 | ||
690 | &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA; | |
691 | &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB; | |
692 | } | |
693 | ||
694 | return 1; | |
695 | } | |
696 | ||
697 | sub traverse_balanced | |
698 | { | |
699 | my $a = shift; # array ref | |
700 | my $b = shift; # array ref | |
701 | my $callbacks = shift || {}; | |
702 | my $keyGen = shift; | |
703 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; | |
704 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; | |
705 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; | |
706 | my $changeCallback = $callbacks->{'CHANGE'}; | |
707 | my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); | |
708 | ||
709 | # Process all the lines in match vector | |
710 | my $lastA = $#$a; | |
711 | my $lastB = $#$b; | |
712 | my $bi = 0; | |
713 | my $ai = 0; | |
714 | my $ma = -1; | |
715 | my $mb; | |
716 | ||
717 | while (1) | |
718 | { | |
719 | ||
720 | # Find next match indices $ma and $mb | |
721 | do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] ); | |
722 | ||
723 | last if $ma > $#$matchVector; # end of matchVector? | |
724 | $mb = $matchVector->[$ma]; | |
725 | ||
726 | # Proceed with discard a/b or change events until | |
727 | # next match | |
728 | while ( $ai < $ma || $bi < $mb ) | |
729 | { | |
730 | ||
731 | if ( $ai < $ma && $bi < $mb ) | |
732 | { | |
733 | ||
734 | # Change | |
735 | if ( defined $changeCallback ) | |
736 | { | |
737 | &$changeCallback( $ai++, $bi++, @_ ); | |
738 | } | |
739 | else | |
740 | { | |
741 | &$discardACallback( $ai++, $bi, @_ ); | |
742 | &$discardBCallback( $ai, $bi++, @_ ); | |
743 | } | |
744 | } | |
745 | elsif ( $ai < $ma ) | |
746 | { | |
747 | &$discardACallback( $ai++, $bi, @_ ); | |
748 | } | |
749 | else | |
750 | { | |
751 | ||
752 | # $bi < $mb | |
753 | &$discardBCallback( $ai, $bi++, @_ ); | |
754 | } | |
755 | } | |
756 | ||
757 | # Match | |
758 | &$matchCallback( $ai++, $bi++, @_ ); | |
759 | } | |
760 | ||
761 | while ( $ai <= $lastA || $bi <= $lastB ) | |
762 | { | |
763 | if ( $ai <= $lastA && $bi <= $lastB ) | |
764 | { | |
765 | ||
766 | # Change | |
767 | if ( defined $changeCallback ) | |
768 | { | |
769 | &$changeCallback( $ai++, $bi++, @_ ); | |
770 | } | |
771 | else | |
772 | { | |
773 | &$discardACallback( $ai++, $bi, @_ ); | |
774 | &$discardBCallback( $ai, $bi++, @_ ); | |
775 | } | |
776 | } | |
777 | elsif ( $ai <= $lastA ) | |
778 | { | |
779 | &$discardACallback( $ai++, $bi, @_ ); | |
780 | } | |
781 | else | |
782 | { | |
783 | ||
784 | # $bi <= $lastB | |
785 | &$discardBCallback( $ai, $bi++, @_ ); | |
786 | } | |
787 | } | |
788 | ||
789 | return 1; | |
790 | } | |
791 | ||
792 | sub LCS | |
793 | { | |
794 | my $a = shift; # array ref | |
795 | my $matchVector = _longestCommonSubsequence( $a, @_ ); | |
796 | my @retval; | |
797 | my $i; | |
798 | for ( $i = 0 ; $i <= $#$matchVector ; $i++ ) | |
799 | { | |
800 | if ( defined( $matchVector->[$i] ) ) | |
801 | { | |
802 | push ( @retval, $a->[$i] ); | |
803 | } | |
804 | } | |
805 | return wantarray ? @retval : \@retval; | |
806 | } | |
807 | ||
808 | sub diff | |
809 | { | |
810 | my $a = shift; # array ref | |
811 | my $b = shift; # array ref | |
812 | my $retval = []; | |
813 | my $hunk = []; | |
814 | my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) }; | |
815 | my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) }; | |
816 | my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; | |
817 | traverse_sequences( $a, $b, | |
818 | { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ ); | |
819 | &$match(); | |
820 | return wantarray ? @$retval : $retval; | |
821 | } | |
822 | ||
823 | sub sdiff | |
824 | { | |
825 | my $a = shift; # array ref | |
826 | my $b = shift; # array ref | |
827 | my $retval = []; | |
828 | my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) }; | |
829 | my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) }; | |
830 | my $change = sub { | |
831 | push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] ); | |
832 | }; | |
833 | my $match = sub { | |
834 | push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] ); | |
835 | }; | |
836 | traverse_balanced( | |
837 | $a, | |
838 | $b, | |
839 | { | |
840 | MATCH => $match, | |
841 | DISCARD_A => $discard, | |
842 | DISCARD_B => $add, | |
843 | CHANGE => $change, | |
844 | }, | |
845 | @_ | |
846 | ); | |
847 | return wantarray ? @$retval : $retval; | |
848 | } | |
849 | ||
850 | 1; |