| 1 | package Algorithm::Diff; |
| 2 | use strict; |
| 3 | use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); |
| 4 | use integer; # see below in _replaceNextLargerWith() for mod to make |
| 5 | # if you don't use this |
| 6 | require Exporter; |
| 7 | @ISA = qw(Exporter); |
| 8 | @EXPORT = qw(); |
| 9 | @EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff); |
| 10 | $VERSION = sprintf('%d.%02d', (q$Revision: 1.15 $ =~ /\d+/g)); |
| 11 | |
| 12 | # McIlroy-Hunt diff algorithm |
| 13 | # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com> |
| 14 | # by Ned Konz, perl@bike-nomad.com |
| 15 | |
| 16 | =head1 NAME |
| 17 | |
| 18 | Algorithm::Diff - Compute `intelligent' differences between two files / lists |
| 19 | |
| 20 | =head1 SYNOPSIS |
| 21 | |
| 22 | use Algorithm::Diff qw(diff sdiff LCS traverse_sequences |
| 23 | traverse_balanced); |
| 24 | |
| 25 | @lcs = LCS( \@seq1, \@seq2 ); |
| 26 | |
| 27 | @lcs = LCS( \@seq1, \@seq2, $key_generation_function ); |
| 28 | |
| 29 | $lcsref = LCS( \@seq1, \@seq2 ); |
| 30 | |
| 31 | $lcsref = LCS( \@seq1, \@seq2, $key_generation_function ); |
| 32 | |
| 33 | @diffs = diff( \@seq1, \@seq2 ); |
| 34 | |
| 35 | @diffs = diff( \@seq1, \@seq2, $key_generation_function ); |
| 36 | |
| 37 | @sdiffs = sdiff( \@seq1, \@seq2 ); |
| 38 | |
| 39 | @sdiffs = sdiff( \@seq1, \@seq2, $key_generation_function ); |
| 40 | |
| 41 | traverse_sequences( \@seq1, \@seq2, |
| 42 | { MATCH => $callback, |
| 43 | DISCARD_A => $callback, |
| 44 | DISCARD_B => $callback, |
| 45 | } ); |
| 46 | |
| 47 | traverse_sequences( \@seq1, \@seq2, |
| 48 | { MATCH => $callback, |
| 49 | DISCARD_A => $callback, |
| 50 | DISCARD_B => $callback, |
| 51 | }, |
| 52 | $key_generation_function ); |
| 53 | |
| 54 | traverse_balanced( \@seq1, \@seq2, |
| 55 | { MATCH => $callback, |
| 56 | DISCARD_A => $callback, |
| 57 | DISCARD_B => $callback, |
| 58 | CHANGE => $callback, |
| 59 | } ); |
| 60 | |
| 61 | =head1 INTRODUCTION |
| 62 | |
| 63 | (by Mark-Jason Dominus) |
| 64 | |
| 65 | I once read an article written by the authors of C<diff>; they said |
| 66 | that they hard worked very hard on the algorithm until they found the |
| 67 | right one. |
| 68 | |
| 69 | I think what they ended up using (and I hope someone will correct me, |
| 70 | because I am not very confident about this) was the `longest common |
| 71 | subsequence' method. in the LCS problem, you have two sequences of |
| 72 | items: |
| 73 | |
| 74 | a b c d f g h j q z |
| 75 | |
| 76 | a b c d e f g i j k r x y z |
| 77 | |
| 78 | and you want to find the longest sequence of items that is present in |
| 79 | both original sequences in the same order. That is, you want to find |
| 80 | a new sequence I<S> which can be obtained from the first sequence by |
| 81 | deleting some items, and from the secend sequence by deleting other |
| 82 | items. You also want I<S> to be as long as possible. In this case |
| 83 | I<S> is |
| 84 | |
| 85 | a b c d f g j z |
| 86 | |
| 87 | From there it's only a small step to get diff-like output: |
| 88 | |
| 89 | e h i k q r x y |
| 90 | + - + + - + + + |
| 91 | |
| 92 | This module solves the LCS problem. It also includes a canned |
| 93 | function to generate C<diff>-like output. |
| 94 | |
| 95 | It might seem from the example above that the LCS of two sequences is |
| 96 | always pretty obvious, but that's not always the case, especially when |
| 97 | the two sequences have many repeated elements. For example, consider |
| 98 | |
| 99 | a x b y c z p d q |
| 100 | a b c a x b y c z |
| 101 | |
| 102 | A naive approach might start by matching up the C<a> and C<b> that |
| 103 | appear at the beginning of each sequence, like this: |
| 104 | |
| 105 | a x b y c z p d q |
| 106 | a b c a b y c z |
| 107 | |
| 108 | This finds the common subsequence C<a b c z>. But actually, the LCS |
| 109 | is C<a x b y c z>: |
| 110 | |
| 111 | a x b y c z p d q |
| 112 | a b c a x b y c z |
| 113 | |
| 114 | =head1 USAGE |
| 115 | |
| 116 | This module provides three exportable functions, which we'll deal with in |
| 117 | ascending order of difficulty: C<LCS>, |
| 118 | C<diff>, C<sdiff>, C<traverse_sequences>, and C<traverse_balanced>. |
| 119 | |
| 120 | =head2 C<LCS> |
| 121 | |
| 122 | Given references to two lists of items, LCS returns an array containing their |
| 123 | longest common subsequence. In scalar context, it returns a reference to |
| 124 | such a list. |
| 125 | |
| 126 | @lcs = LCS( \@seq1, \@seq2 ); |
| 127 | $lcsref = LCS( \@seq1, \@seq2 ); |
| 128 | |
| 129 | C<LCS> may be passed an optional third parameter; this is a CODE |
| 130 | reference to a key generation function. See L</KEY GENERATION |
| 131 | FUNCTIONS>. |
| 132 | |
| 133 | @lcs = LCS( \@seq1, \@seq2, $keyGen ); |
| 134 | $lcsref = LCS( \@seq1, \@seq2, $keyGen ); |
| 135 | |
| 136 | Additional parameters, if any, will be passed to the key generation |
| 137 | routine. |
| 138 | |
| 139 | =head2 C<diff> |
| 140 | |
| 141 | @diffs = diff( \@seq1, \@seq2 ); |
| 142 | $diffs_ref = diff( \@seq1, \@seq2 ); |
| 143 | |
| 144 | C<diff> computes the smallest set of additions and deletions necessary |
| 145 | to turn the first sequence into the second, and returns a description |
| 146 | of these changes. The description is a list of I<hunks>; each hunk |
| 147 | represents a contiguous section of items which should be added, |
| 148 | deleted, or replaced. The return value of C<diff> is a list of |
| 149 | hunks, or, in scalar context, a reference to such a list. |
| 150 | |
| 151 | Here is an example: The diff of the following two sequences: |
| 152 | |
| 153 | a b c e h j l m n p |
| 154 | b c d e f j k l m r s t |
| 155 | |
| 156 | Result: |
| 157 | |
| 158 | [ |
| 159 | [ [ '-', 0, 'a' ] ], |
| 160 | |
| 161 | [ [ '+', 2, 'd' ] ], |
| 162 | |
| 163 | [ [ '-', 4, 'h' ] , |
| 164 | [ '+', 4, 'f' ] ], |
| 165 | |
| 166 | [ [ '+', 6, 'k' ] ], |
| 167 | |
| 168 | [ [ '-', 8, 'n' ], |
| 169 | [ '-', 9, 'p' ], |
| 170 | [ '+', 9, 'r' ], |
| 171 | [ '+', 10, 's' ], |
| 172 | [ '+', 11, 't' ], |
| 173 | ] |
| 174 | ] |
| 175 | |
| 176 | There are five hunks here. The first hunk says that the C<a> at |
| 177 | position 0 of the first sequence should be deleted (C<->). The second |
| 178 | hunk says that the C<d> at position 2 of the second sequence should |
| 179 | be inserted (C<+>). The third hunk says that the C<h> at position 4 |
| 180 | of the first sequence should be removed and replaced with the C<f> |
| 181 | from position 4 of the second sequence. The other two hunks similarly. |
| 182 | |
| 183 | C<diff> may be passed an optional third parameter; this is a CODE |
| 184 | reference to a key generation function. See L</KEY GENERATION |
| 185 | FUNCTIONS>. |
| 186 | |
| 187 | Additional parameters, if any, will be passed to the key generation |
| 188 | routine. |
| 189 | |
| 190 | =head2 C<sdiff> |
| 191 | |
| 192 | @sdiffs = sdiff( \@seq1, \@seq2 ); |
| 193 | $sdiffs_ref = sdiff( \@seq1, \@seq2 ); |
| 194 | |
| 195 | C<sdiff> computes all necessary components to show two sequences |
| 196 | and their minimized differences side by side, just like the |
| 197 | Unix-utility I<sdiff> does: |
| 198 | |
| 199 | same same |
| 200 | before | after |
| 201 | old < - |
| 202 | - > new |
| 203 | |
| 204 | It returns a list of array refs, each pointing to an array of |
| 205 | display instructions. In scalar context it returns a reference |
| 206 | to such a list. |
| 207 | |
| 208 | Display instructions consist of three elements: A modifier indicator |
| 209 | (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified, |
| 210 | C<c>: Element changed) and the value of the old and new elements, to |
| 211 | be displayed side by side. |
| 212 | |
| 213 | An C<sdiff> of the following two sequences: |
| 214 | |
| 215 | a b c e h j l m n p |
| 216 | b c d e f j k l m r s t |
| 217 | |
| 218 | results in |
| 219 | |
| 220 | [ [ '-', 'a', '' ], |
| 221 | [ 'u', 'b', 'b' ], |
| 222 | [ 'u', 'c', 'c' ], |
| 223 | [ '+', '', 'd' ], |
| 224 | [ 'u', 'e', 'e' ], |
| 225 | [ 'c', 'h', 'f' ], |
| 226 | [ 'u', 'j', 'j' ], |
| 227 | [ '+', '', 'k' ], |
| 228 | [ 'u', 'l', 'l' ], |
| 229 | [ 'u', 'm', 'm' ], |
| 230 | [ 'c', 'n', 'r' ], |
| 231 | [ 'c', 'p', 's' ], |
| 232 | [ '+', '', 't' ] ] |
| 233 | |
| 234 | C<sdiff> may be passed an optional third parameter; this is a CODE |
| 235 | reference to a key generation function. See L</KEY GENERATION |
| 236 | FUNCTIONS>. |
| 237 | |
| 238 | Additional parameters, if any, will be passed to the key generation |
| 239 | routine. |
| 240 | |
| 241 | =head2 C<traverse_sequences> |
| 242 | |
| 243 | C<traverse_sequences> is the most general facility provided by this |
| 244 | module; C<diff> and C<LCS> are implemented as calls to it. |
| 245 | |
| 246 | Imagine that there are two arrows. Arrow A points to an element of sequence A, |
| 247 | and arrow B points to an element of the sequence B. Initially, the arrows |
| 248 | point to the first elements of the respective sequences. C<traverse_sequences> |
| 249 | will advance the arrows through the sequences one element at a time, calling an |
| 250 | appropriate user-specified callback function before each advance. It |
| 251 | willadvance the arrows in such a way that if there are equal elements C<$A[$i]> |
| 252 | and C<$B[$j]> which are equal and which are part of the LCS, there will be |
| 253 | some moment during the execution of C<traverse_sequences> when arrow A is |
| 254 | pointing to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens, |
| 255 | C<traverse_sequences> will call the C<MATCH> callback function and then it will |
| 256 | advance both arrows. |
| 257 | |
| 258 | Otherwise, one of the arrows is pointing to an element of its sequence that is |
| 259 | not part of the LCS. C<traverse_sequences> will advance that arrow and will |
| 260 | call the C<DISCARD_A> or the C<DISCARD_B> callback, depending on which arrow it |
| 261 | advanced. If both arrows point to elements that are not part of the LCS, then |
| 262 | C<traverse_sequences> will advance one of them and call the appropriate |
| 263 | callback, but it is not specified which it will call. |
| 264 | |
| 265 | The arguments to C<traverse_sequences> are the two sequences to traverse, and a |
| 266 | hash which specifies the callback functions, like this: |
| 267 | |
| 268 | traverse_sequences( \@seq1, \@seq2, |
| 269 | { MATCH => $callback_1, |
| 270 | DISCARD_A => $callback_2, |
| 271 | DISCARD_B => $callback_3, |
| 272 | } ); |
| 273 | |
| 274 | Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least the |
| 275 | indices of the two arrows as their arguments. They are not expected to return |
| 276 | any values. If a callback is omitted from the table, it is not called. |
| 277 | |
| 278 | Callbacks for A_FINISHED and B_FINISHED are invoked with at least the |
| 279 | corresponding index in A or B. |
| 280 | |
| 281 | If arrow A reaches the end of its sequence, before arrow B does, |
| 282 | C<traverse_sequences> will call the C<A_FINISHED> callback when it advances |
| 283 | arrow B, if there is such a function; if not it will call C<DISCARD_B> instead. |
| 284 | Similarly if arrow B finishes first. C<traverse_sequences> returns when both |
| 285 | arrows are at the ends of their respective sequences. It returns true on |
| 286 | success and false on failure. At present there is no way to fail. |
| 287 | |
| 288 | C<traverse_sequences> may be passed an optional fourth parameter; this is a |
| 289 | CODE reference to a key generation function. See L</KEY GENERATION FUNCTIONS>. |
| 290 | |
| 291 | Additional parameters, if any, will be passed to the key generation function. |
| 292 | |
| 293 | =head2 C<traverse_balanced> |
| 294 | |
| 295 | C<traverse_balanced> is an alternative to C<traverse_sequences>. It |
| 296 | uses a different algorithm to iterate through the entries in the |
| 297 | computed LCS. Instead of sticking to one side and showing element changes |
| 298 | as insertions and deletions only, it will jump back and forth between |
| 299 | the two sequences and report I<changes> occurring as deletions on one |
| 300 | side followed immediatly by an insertion on the other side. |
| 301 | |
| 302 | In addition to the |
| 303 | C<DISCARD_A>, |
| 304 | C<DISCARD_B>, and |
| 305 | C<MATCH> |
| 306 | callbacks supported by C<traverse_sequences>, C<traverse_balanced> supports |
| 307 | a C<CHANGE> callback indicating that one element got C<replaced> by another: |
| 308 | |
| 309 | traverse_sequences( \@seq1, \@seq2, |
| 310 | { MATCH => $callback_1, |
| 311 | DISCARD_A => $callback_2, |
| 312 | DISCARD_B => $callback_3, |
| 313 | CHANGE => $callback_4, |
| 314 | } ); |
| 315 | |
| 316 | If no C<CHANGE> callback is specified, C<traverse_balanced> |
| 317 | will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions, |
| 318 | therefore resulting in a similar behaviour as C<traverse_sequences> |
| 319 | with different order of events. |
| 320 | |
| 321 | C<traverse_balanced> might be a bit slower than C<traverse_sequences>, |
| 322 | noticable only while processing huge amounts of data. |
| 323 | |
| 324 | The C<sdiff> function of this module |
| 325 | is implemented as call to C<traverse_balanced>. |
| 326 | |
| 327 | =head1 KEY GENERATION FUNCTIONS |
| 328 | |
| 329 | C<diff>, C<LCS>, and C<traverse_sequences> accept an optional last parameter. |
| 330 | This is a CODE reference to a key generating (hashing) function that should |
| 331 | return a string that uniquely identifies a given element. It should be the |
| 332 | case that if two elements are to be considered equal, their keys should be the |
| 333 | same (and the other way around). If no key generation function is provided, |
| 334 | the key will be the element as a string. |
| 335 | |
| 336 | By default, comparisons will use "eq" and elements will be turned into keys |
| 337 | using the default stringizing operator '""'. |
| 338 | |
| 339 | Where this is important is when you're comparing something other than strings. |
| 340 | If it is the case that you have multiple different objects that should be |
| 341 | considered to be equal, you should supply a key generation function. Otherwise, |
| 342 | you have to make sure that your arrays contain unique references. |
| 343 | |
| 344 | For instance, consider this example: |
| 345 | |
| 346 | package Person; |
| 347 | |
| 348 | sub new |
| 349 | { |
| 350 | my $package = shift; |
| 351 | return bless { name => '', ssn => '', @_ }, $package; |
| 352 | } |
| 353 | |
| 354 | sub clone |
| 355 | { |
| 356 | my $old = shift; |
| 357 | my $new = bless { %$old }, ref($old); |
| 358 | } |
| 359 | |
| 360 | sub hash |
| 361 | { |
| 362 | return shift()->{'ssn'}; |
| 363 | } |
| 364 | |
| 365 | my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' ); |
| 366 | my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' ); |
| 367 | my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' ); |
| 368 | my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' ); |
| 369 | my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' ); |
| 370 | |
| 371 | If you did this: |
| 372 | |
| 373 | my $array1 = [ $person1, $person2, $person4 ]; |
| 374 | my $array2 = [ $person1, $person3, $person4, $person5 ]; |
| 375 | Algorithm::Diff::diff( $array1, $array2 ); |
| 376 | |
| 377 | everything would work out OK (each of the objects would be converted |
| 378 | into a string like "Person=HASH(0x82425b0)" for comparison). |
| 379 | |
| 380 | But if you did this: |
| 381 | |
| 382 | my $array1 = [ $person1, $person2, $person4 ]; |
| 383 | my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; |
| 384 | Algorithm::Diff::diff( $array1, $array2 ); |
| 385 | |
| 386 | $person4 and $person4->clone() (which have the same name and SSN) |
| 387 | would be seen as different objects. If you wanted them to be considered |
| 388 | equivalent, you would have to pass in a key generation function: |
| 389 | |
| 390 | my $array1 = [ $person1, $person2, $person4 ]; |
| 391 | my $array2 = [ $person1, $person3, $person4->clone(), $person5 ]; |
| 392 | Algorithm::Diff::diff( $array1, $array2, \&Person::hash ); |
| 393 | |
| 394 | This would use the 'ssn' field in each Person as a comparison key, and |
| 395 | so would consider $person4 and $person4->clone() as equal. |
| 396 | |
| 397 | You may also pass additional parameters to the key generation function |
| 398 | if you wish. |
| 399 | |
| 400 | =head1 AUTHOR |
| 401 | |
| 402 | This version by Ned Konz, perl@bike-nomad.com |
| 403 | |
| 404 | =head1 LICENSE |
| 405 | |
| 406 | Copyright (c) 2000-2002 Ned Konz. All rights reserved. |
| 407 | This program is free software; |
| 408 | you can redistribute it and/or modify it under the same terms |
| 409 | as Perl itself. |
| 410 | |
| 411 | =head1 CREDITS |
| 412 | |
| 413 | Versions through 0.59 (and much of this documentation) were written by: |
| 414 | |
| 415 | Mark-Jason Dominus, mjd-perl-diff@plover.com |
| 416 | |
| 417 | This version borrows the documentation and names of the routines |
| 418 | from Mark-Jason's, but has all new code in Diff.pm. |
| 419 | |
| 420 | This code was adapted from the Smalltalk code of |
| 421 | Mario Wolczko <mario@wolczko.com>, which is available at |
| 422 | ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st |
| 423 | |
| 424 | C<sdiff> and C<traverse_balanced> were written by Mike Schilli |
| 425 | <m@perlmeister.com>. |
| 426 | |
| 427 | The algorithm is that described in |
| 428 | I<A Fast Algorithm for Computing Longest Common Subsequences>, |
| 429 | CACM, vol.20, no.5, pp.350-353, May 1977, with a few |
| 430 | minor improvements to improve the speed. |
| 431 | |
| 432 | =cut |
| 433 | |
| 434 | # Create a hash that maps each element of $aCollection to the set of positions |
| 435 | # it occupies in $aCollection, restricted to the elements within the range of |
| 436 | # indexes specified by $start and $end. |
| 437 | # The fourth parameter is a subroutine reference that will be called to |
| 438 | # generate a string to use as a key. |
| 439 | # Additional parameters, if any, will be passed to this subroutine. |
| 440 | # |
| 441 | # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen ); |
| 442 | |
| 443 | sub _withPositionsOfInInterval |
| 444 | { |
| 445 | my $aCollection = shift; # array ref |
| 446 | my $start = shift; |
| 447 | my $end = shift; |
| 448 | my $keyGen = shift; |
| 449 | my %d; |
| 450 | my $index; |
| 451 | for ( $index = $start ; $index <= $end ; $index++ ) |
| 452 | { |
| 453 | my $element = $aCollection->[$index]; |
| 454 | my $key = &$keyGen( $element, @_ ); |
| 455 | if ( exists( $d{$key} ) ) |
| 456 | { |
| 457 | unshift ( @{ $d{$key} }, $index ); |
| 458 | } |
| 459 | else |
| 460 | { |
| 461 | $d{$key} = [$index]; |
| 462 | } |
| 463 | } |
| 464 | return wantarray ? %d : \%d; |
| 465 | } |
| 466 | |
| 467 | # Find the place at which aValue would normally be inserted into the array. If |
| 468 | # that place is already occupied by aValue, do nothing, and return undef. If |
| 469 | # the place does not exist (i.e., it is off the end of the array), add it to |
| 470 | # the end, otherwise replace the element at that point with aValue. |
| 471 | # It is assumed that the array's values are numeric. |
| 472 | # This is where the bulk (75%) of the time is spent in this module, so try to |
| 473 | # make it fast! |
| 474 | |
| 475 | sub _replaceNextLargerWith |
| 476 | { |
| 477 | my ( $array, $aValue, $high ) = @_; |
| 478 | $high ||= $#$array; |
| 479 | |
| 480 | # off the end? |
| 481 | if ( $high == -1 || $aValue > $array->[-1] ) |
| 482 | { |
| 483 | push ( @$array, $aValue ); |
| 484 | return $high + 1; |
| 485 | } |
| 486 | |
| 487 | # binary search for insertion point... |
| 488 | my $low = 0; |
| 489 | my $index; |
| 490 | my $found; |
| 491 | while ( $low <= $high ) |
| 492 | { |
| 493 | $index = ( $high + $low ) / 2; |
| 494 | |
| 495 | # $index = int(( $high + $low ) / 2); # without 'use integer' |
| 496 | $found = $array->[$index]; |
| 497 | |
| 498 | if ( $aValue == $found ) |
| 499 | { |
| 500 | return undef; |
| 501 | } |
| 502 | elsif ( $aValue > $found ) |
| 503 | { |
| 504 | $low = $index + 1; |
| 505 | } |
| 506 | else |
| 507 | { |
| 508 | $high = $index - 1; |
| 509 | } |
| 510 | } |
| 511 | |
| 512 | # now insertion point is in $low. |
| 513 | $array->[$low] = $aValue; # overwrite next larger |
| 514 | return $low; |
| 515 | } |
| 516 | |
| 517 | # This method computes the longest common subsequence in $a and $b. |
| 518 | |
| 519 | # Result is array or ref, whose contents is such that |
| 520 | # $a->[ $i ] == $b->[ $result[ $i ] ] |
| 521 | # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined. |
| 522 | |
| 523 | # An additional argument may be passed; this is a hash or key generating |
| 524 | # function that should return a string that uniquely identifies the given |
| 525 | # element. It should be the case that if the key is the same, the elements |
| 526 | # will compare the same. If this parameter is undef or missing, the key |
| 527 | # will be the element as a string. |
| 528 | |
| 529 | # By default, comparisons will use "eq" and elements will be turned into keys |
| 530 | # using the default stringizing operator '""'. |
| 531 | |
| 532 | # Additional parameters, if any, will be passed to the key generation routine. |
| 533 | |
| 534 | sub _longestCommonSubsequence |
| 535 | { |
| 536 | my $a = shift; # array ref |
| 537 | my $b = shift; # array ref |
| 538 | my $keyGen = shift; # code ref |
| 539 | my $compare; # code ref |
| 540 | |
| 541 | # set up code refs |
| 542 | # Note that these are optimized. |
| 543 | if ( !defined($keyGen) ) # optimize for strings |
| 544 | { |
| 545 | $keyGen = sub { $_[0] }; |
| 546 | $compare = sub { my ( $a, $b ) = @_; $a eq $b }; |
| 547 | } |
| 548 | else |
| 549 | { |
| 550 | $compare = sub { |
| 551 | my $a = shift; |
| 552 | my $b = shift; |
| 553 | &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ ); |
| 554 | }; |
| 555 | } |
| 556 | |
| 557 | my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) = |
| 558 | ( 0, $#$a, 0, $#$b, [] ); |
| 559 | |
| 560 | # First we prune off any common elements at the beginning |
| 561 | while ( $aStart <= $aFinish |
| 562 | and $bStart <= $bFinish |
| 563 | and &$compare( $a->[$aStart], $b->[$bStart], @_ ) ) |
| 564 | { |
| 565 | $matchVector->[ $aStart++ ] = $bStart++; |
| 566 | } |
| 567 | |
| 568 | # now the end |
| 569 | while ( $aStart <= $aFinish |
| 570 | and $bStart <= $bFinish |
| 571 | and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) ) |
| 572 | { |
| 573 | $matchVector->[ $aFinish-- ] = $bFinish--; |
| 574 | } |
| 575 | |
| 576 | # Now compute the equivalence classes of positions of elements |
| 577 | my $bMatches = |
| 578 | _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ ); |
| 579 | my $thresh = []; |
| 580 | my $links = []; |
| 581 | |
| 582 | my ( $i, $ai, $j, $k ); |
| 583 | for ( $i = $aStart ; $i <= $aFinish ; $i++ ) |
| 584 | { |
| 585 | $ai = &$keyGen( $a->[$i], @_ ); |
| 586 | if ( exists( $bMatches->{$ai} ) ) |
| 587 | { |
| 588 | $k = 0; |
| 589 | for $j ( @{ $bMatches->{$ai} } ) |
| 590 | { |
| 591 | |
| 592 | # optimization: most of the time this will be true |
| 593 | if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j ) |
| 594 | { |
| 595 | $thresh->[$k] = $j; |
| 596 | } |
| 597 | else |
| 598 | { |
| 599 | $k = _replaceNextLargerWith( $thresh, $j, $k ); |
| 600 | } |
| 601 | |
| 602 | # oddly, it's faster to always test this (CPU cache?). |
| 603 | if ( defined($k) ) |
| 604 | { |
| 605 | $links->[$k] = |
| 606 | [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ]; |
| 607 | } |
| 608 | } |
| 609 | } |
| 610 | } |
| 611 | |
| 612 | if (@$thresh) |
| 613 | { |
| 614 | for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] ) |
| 615 | { |
| 616 | $matchVector->[ $link->[1] ] = $link->[2]; |
| 617 | } |
| 618 | } |
| 619 | |
| 620 | return wantarray ? @$matchVector : $matchVector; |
| 621 | } |
| 622 | |
| 623 | sub traverse_sequences |
| 624 | { |
| 625 | my $a = shift; # array ref |
| 626 | my $b = shift; # array ref |
| 627 | my $callbacks = shift || {}; |
| 628 | my $keyGen = shift; |
| 629 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; |
| 630 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; |
| 631 | my $finishedACallback = $callbacks->{'A_FINISHED'}; |
| 632 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; |
| 633 | my $finishedBCallback = $callbacks->{'B_FINISHED'}; |
| 634 | my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); |
| 635 | |
| 636 | # Process all the lines in @$matchVector |
| 637 | my $lastA = $#$a; |
| 638 | my $lastB = $#$b; |
| 639 | my $bi = 0; |
| 640 | my $ai; |
| 641 | |
| 642 | for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ ) |
| 643 | { |
| 644 | my $bLine = $matchVector->[$ai]; |
| 645 | if ( defined($bLine) ) # matched |
| 646 | { |
| 647 | &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; |
| 648 | &$matchCallback( $ai, $bi++, @_ ); |
| 649 | } |
| 650 | else |
| 651 | { |
| 652 | &$discardACallback( $ai, $bi, @_ ); |
| 653 | } |
| 654 | } |
| 655 | |
| 656 | # The last entry (if any) processed was a match. |
| 657 | # $ai and $bi point just past the last matching lines in their sequences. |
| 658 | |
| 659 | while ( $ai <= $lastA or $bi <= $lastB ) |
| 660 | { |
| 661 | |
| 662 | # last A? |
| 663 | if ( $ai == $lastA + 1 and $bi <= $lastB ) |
| 664 | { |
| 665 | if ( defined($finishedACallback) ) |
| 666 | { |
| 667 | &$finishedACallback( $lastA, @_ ); |
| 668 | $finishedACallback = undef; |
| 669 | } |
| 670 | else |
| 671 | { |
| 672 | &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB; |
| 673 | } |
| 674 | } |
| 675 | |
| 676 | # last B? |
| 677 | if ( $bi == $lastB + 1 and $ai <= $lastA ) |
| 678 | { |
| 679 | if ( defined($finishedBCallback) ) |
| 680 | { |
| 681 | &$finishedBCallback( $lastB, @_ ); |
| 682 | $finishedBCallback = undef; |
| 683 | } |
| 684 | else |
| 685 | { |
| 686 | &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA; |
| 687 | } |
| 688 | } |
| 689 | |
| 690 | &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA; |
| 691 | &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB; |
| 692 | } |
| 693 | |
| 694 | return 1; |
| 695 | } |
| 696 | |
| 697 | sub traverse_balanced |
| 698 | { |
| 699 | my $a = shift; # array ref |
| 700 | my $b = shift; # array ref |
| 701 | my $callbacks = shift || {}; |
| 702 | my $keyGen = shift; |
| 703 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; |
| 704 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; |
| 705 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; |
| 706 | my $changeCallback = $callbacks->{'CHANGE'}; |
| 707 | my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ ); |
| 708 | |
| 709 | # Process all the lines in match vector |
| 710 | my $lastA = $#$a; |
| 711 | my $lastB = $#$b; |
| 712 | my $bi = 0; |
| 713 | my $ai = 0; |
| 714 | my $ma = -1; |
| 715 | my $mb; |
| 716 | |
| 717 | while (1) |
| 718 | { |
| 719 | |
| 720 | # Find next match indices $ma and $mb |
| 721 | do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] ); |
| 722 | |
| 723 | last if $ma > $#$matchVector; # end of matchVector? |
| 724 | $mb = $matchVector->[$ma]; |
| 725 | |
| 726 | # Proceed with discard a/b or change events until |
| 727 | # next match |
| 728 | while ( $ai < $ma || $bi < $mb ) |
| 729 | { |
| 730 | |
| 731 | if ( $ai < $ma && $bi < $mb ) |
| 732 | { |
| 733 | |
| 734 | # Change |
| 735 | if ( defined $changeCallback ) |
| 736 | { |
| 737 | &$changeCallback( $ai++, $bi++, @_ ); |
| 738 | } |
| 739 | else |
| 740 | { |
| 741 | &$discardACallback( $ai++, $bi, @_ ); |
| 742 | &$discardBCallback( $ai, $bi++, @_ ); |
| 743 | } |
| 744 | } |
| 745 | elsif ( $ai < $ma ) |
| 746 | { |
| 747 | &$discardACallback( $ai++, $bi, @_ ); |
| 748 | } |
| 749 | else |
| 750 | { |
| 751 | |
| 752 | # $bi < $mb |
| 753 | &$discardBCallback( $ai, $bi++, @_ ); |
| 754 | } |
| 755 | } |
| 756 | |
| 757 | # Match |
| 758 | &$matchCallback( $ai++, $bi++, @_ ); |
| 759 | } |
| 760 | |
| 761 | while ( $ai <= $lastA || $bi <= $lastB ) |
| 762 | { |
| 763 | if ( $ai <= $lastA && $bi <= $lastB ) |
| 764 | { |
| 765 | |
| 766 | # Change |
| 767 | if ( defined $changeCallback ) |
| 768 | { |
| 769 | &$changeCallback( $ai++, $bi++, @_ ); |
| 770 | } |
| 771 | else |
| 772 | { |
| 773 | &$discardACallback( $ai++, $bi, @_ ); |
| 774 | &$discardBCallback( $ai, $bi++, @_ ); |
| 775 | } |
| 776 | } |
| 777 | elsif ( $ai <= $lastA ) |
| 778 | { |
| 779 | &$discardACallback( $ai++, $bi, @_ ); |
| 780 | } |
| 781 | else |
| 782 | { |
| 783 | |
| 784 | # $bi <= $lastB |
| 785 | &$discardBCallback( $ai, $bi++, @_ ); |
| 786 | } |
| 787 | } |
| 788 | |
| 789 | return 1; |
| 790 | } |
| 791 | |
| 792 | sub LCS |
| 793 | { |
| 794 | my $a = shift; # array ref |
| 795 | my $matchVector = _longestCommonSubsequence( $a, @_ ); |
| 796 | my @retval; |
| 797 | my $i; |
| 798 | for ( $i = 0 ; $i <= $#$matchVector ; $i++ ) |
| 799 | { |
| 800 | if ( defined( $matchVector->[$i] ) ) |
| 801 | { |
| 802 | push ( @retval, $a->[$i] ); |
| 803 | } |
| 804 | } |
| 805 | return wantarray ? @retval : \@retval; |
| 806 | } |
| 807 | |
| 808 | sub diff |
| 809 | { |
| 810 | my $a = shift; # array ref |
| 811 | my $b = shift; # array ref |
| 812 | my $retval = []; |
| 813 | my $hunk = []; |
| 814 | my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) }; |
| 815 | my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) }; |
| 816 | my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; |
| 817 | traverse_sequences( $a, $b, |
| 818 | { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ ); |
| 819 | &$match(); |
| 820 | return wantarray ? @$retval : $retval; |
| 821 | } |
| 822 | |
| 823 | sub sdiff |
| 824 | { |
| 825 | my $a = shift; # array ref |
| 826 | my $b = shift; # array ref |
| 827 | my $retval = []; |
| 828 | my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) }; |
| 829 | my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) }; |
| 830 | my $change = sub { |
| 831 | push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] ); |
| 832 | }; |
| 833 | my $match = sub { |
| 834 | push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] ); |
| 835 | }; |
| 836 | traverse_balanced( |
| 837 | $a, |
| 838 | $b, |
| 839 | { |
| 840 | MATCH => $match, |
| 841 | DISCARD_A => $discard, |
| 842 | DISCARD_B => $add, |
| 843 | CHANGE => $change, |
| 844 | }, |
| 845 | @_ |
| 846 | ); |
| 847 | return wantarray ? @$retval : $retval; |
| 848 | } |
| 849 | |
| 850 | 1; |