Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | # This is a version of Algorithm::Diff that uses only a comparison function, |
2 | # like versions <= 0.59 used to. | |
3 | # $Revision: 1.3 $ | |
4 | ||
5 | package Algorithm::DiffOld; | |
6 | use strict; | |
7 | use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); | |
8 | use integer; # see below in _replaceNextLargerWith() for mod to make | |
9 | # if you don't use this | |
10 | require Exporter; | |
11 | @ISA = qw(Exporter); | |
12 | @EXPORT = qw(); | |
13 | @EXPORT_OK = qw(LCS diff traverse_sequences); | |
14 | $VERSION = 1.10; # manually tracking Algorithm::Diff | |
15 | ||
16 | # McIlroy-Hunt diff algorithm | |
17 | # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com> | |
18 | # by Ned Konz, perl@bike-nomad.com | |
19 | ||
20 | =head1 NAME | |
21 | ||
22 | Algorithm::DiffOld - Compute `intelligent' differences between two files / lists | |
23 | but use the old (<=0.59) interface. | |
24 | ||
25 | =head1 NOTE | |
26 | ||
27 | This has been provided as part of the Algorithm::Diff package by Ned Konz. | |
28 | This particular module is B<ONLY> for people who B<HAVE> to have the old | |
29 | interface, which uses a comparison function rather than a key generating | |
30 | function. | |
31 | ||
32 | Because each of the lines in one array have to be compared with each | |
33 | of the lines in the other array, this does M*N comparisions. This can | |
34 | be very slow. I clocked it at taking 18 times as long as the stock | |
35 | version of Algorithm::Diff for a 4000-line file. It will get worse | |
36 | quadratically as array sizes increase. | |
37 | ||
38 | =head1 SYNOPSIS | |
39 | ||
40 | use Algorithm::DiffOld qw(diff LCS traverse_sequences); | |
41 | ||
42 | @lcs = LCS( \@seq1, \@seq2, $comparison_function ); | |
43 | ||
44 | $lcsref = LCS( \@seq1, \@seq2, $comparison_function ); | |
45 | ||
46 | @diffs = diff( \@seq1, \@seq2, $comparison_function ); | |
47 | ||
48 | traverse_sequences( \@seq1, \@seq2, | |
49 | { MATCH => $callback, | |
50 | DISCARD_A => $callback, | |
51 | DISCARD_B => $callback, | |
52 | }, | |
53 | $comparison_function ); | |
54 | ||
55 | =head1 COMPARISON FUNCTIONS | |
56 | ||
57 | Each of the main routines should be passed a comparison function. If you | |
58 | aren't passing one in, B<use Algorithm::Diff instead>. | |
59 | ||
60 | These functions should return a true value when two items should compare | |
61 | as equal. | |
62 | ||
63 | For instance, | |
64 | ||
65 | @lcs = LCS( \@seq1, \@seq2, sub { my ($a, $b) = @_; $a eq $b } ); | |
66 | ||
67 | but if that is all you're doing with your comparison function, just use | |
68 | Algorithm::Diff and let it do this (this is its default). | |
69 | ||
70 | Or: | |
71 | ||
72 | sub someFunkyComparisonFunction | |
73 | { | |
74 | my ($a, $b) = @_; | |
75 | $a =~ m{$b}; | |
76 | } | |
77 | ||
78 | @diffs = diff( \@lines, \@patterns, \&someFunkyComparisonFunction ); | |
79 | ||
80 | which would allow you to diff an array @lines which consists of text | |
81 | lines with an array @patterns which consists of regular expressions. | |
82 | ||
83 | This is actually the reason I wrote this version -- there is no way | |
84 | to do this with a key generation function as in the stock Algorithm::Diff. | |
85 | ||
86 | =cut | |
87 | ||
88 | # Find the place at which aValue would normally be inserted into the array. If | |
89 | # that place is already occupied by aValue, do nothing, and return undef. If | |
90 | # the place does not exist (i.e., it is off the end of the array), add it to | |
91 | # the end, otherwise replace the element at that point with aValue. | |
92 | # It is assumed that the array's values are numeric. | |
93 | # This is where the bulk (75%) of the time is spent in this module, so try to | |
94 | # make it fast! | |
95 | ||
96 | sub _replaceNextLargerWith | |
97 | { | |
98 | my ( $array, $aValue, $high ) = @_; | |
99 | $high ||= $#$array; | |
100 | ||
101 | # off the end? | |
102 | if ( $high == -1 || $aValue > $array->[ -1 ] ) | |
103 | { | |
104 | push( @$array, $aValue ); | |
105 | return $high + 1; | |
106 | } | |
107 | ||
108 | # binary search for insertion point... | |
109 | my $low = 0; | |
110 | my $index; | |
111 | my $found; | |
112 | while ( $low <= $high ) | |
113 | { | |
114 | $index = ( $high + $low ) / 2; | |
115 | # $index = int(( $high + $low ) / 2); # without 'use integer' | |
116 | $found = $array->[ $index ]; | |
117 | ||
118 | if ( $aValue == $found ) | |
119 | { | |
120 | return undef; | |
121 | } | |
122 | elsif ( $aValue > $found ) | |
123 | { | |
124 | $low = $index + 1; | |
125 | } | |
126 | else | |
127 | { | |
128 | $high = $index - 1; | |
129 | } | |
130 | } | |
131 | ||
132 | # now insertion point is in $low. | |
133 | $array->[ $low ] = $aValue; # overwrite next larger | |
134 | return $low; | |
135 | } | |
136 | ||
137 | # This method computes the longest common subsequence in $a and $b. | |
138 | ||
139 | # Result is array or ref, whose contents is such that | |
140 | # $a->[ $i ] == $b->[ $result[ $i ] ] | |
141 | # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined. | |
142 | ||
143 | # An additional argument may be passed; this is a CODE ref to a comparison | |
144 | # routine. By default, comparisons will use "eq" . | |
145 | # Note that this routine will be called as many as M*N times, so make it fast! | |
146 | ||
147 | # Additional parameters, if any, will be passed to the key generation routine. | |
148 | ||
149 | sub _longestCommonSubsequence | |
150 | { | |
151 | my $a = shift; # array ref | |
152 | my $b = shift; # array ref | |
153 | my $compare = shift || sub { my $a = shift; my $b = shift; $a eq $b }; | |
154 | ||
155 | my $aStart = 0; | |
156 | my $aFinish = $#$a; | |
157 | my $bStart = 0; | |
158 | my $bFinish = $#$b; | |
159 | my $matchVector = []; | |
160 | ||
161 | # First we prune off any common elements at the beginning | |
162 | while ( $aStart <= $aFinish | |
163 | and $bStart <= $bFinish | |
164 | and &$compare( $a->[ $aStart ], $b->[ $bStart ], @_ ) ) | |
165 | { | |
166 | $matchVector->[ $aStart++ ] = $bStart++; | |
167 | } | |
168 | ||
169 | # now the end | |
170 | while ( $aStart <= $aFinish | |
171 | and $bStart <= $bFinish | |
172 | and &$compare( $a->[ $aFinish ], $b->[ $bFinish ], @_ ) ) | |
173 | { | |
174 | $matchVector->[ $aFinish-- ] = $bFinish--; | |
175 | } | |
176 | ||
177 | my $thresh = []; | |
178 | my $links = []; | |
179 | ||
180 | my ( $i, $ai, $j, $k ); | |
181 | for ( $i = $aStart; $i <= $aFinish; $i++ ) | |
182 | { | |
183 | $k = 0; | |
184 | # look for each element of @b between $bStart and $bFinish | |
185 | # that matches $a->[ $i ], in reverse order | |
186 | for ($j = $bFinish; $j >= $bStart; $j--) | |
187 | { | |
188 | next if ! &$compare( $a->[$i], $b->[$j] ); | |
189 | # optimization: most of the time this will be true | |
190 | if ( $k | |
191 | and $thresh->[ $k ] > $j | |
192 | and $thresh->[ $k - 1 ] < $j ) | |
193 | { | |
194 | $thresh->[ $k ] = $j; | |
195 | } | |
196 | else | |
197 | { | |
198 | $k = _replaceNextLargerWith( $thresh, $j, $k ); | |
199 | } | |
200 | ||
201 | # oddly, it's faster to always test this (CPU cache?). | |
202 | if ( defined( $k ) ) | |
203 | { | |
204 | $links->[ $k ] = | |
205 | [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ]; | |
206 | } | |
207 | } | |
208 | } | |
209 | ||
210 | if ( @$thresh ) | |
211 | { | |
212 | for ( my $link = $links->[ $#$thresh ]; $link; $link = $link->[ 0 ] ) | |
213 | { | |
214 | $matchVector->[ $link->[ 1 ] ] = $link->[ 2 ]; | |
215 | } | |
216 | } | |
217 | ||
218 | return wantarray ? @$matchVector : $matchVector; | |
219 | } | |
220 | ||
221 | sub traverse_sequences | |
222 | { | |
223 | my $a = shift; # array ref | |
224 | my $b = shift; # array ref | |
225 | my $callbacks = shift || { }; | |
226 | my $compare = shift; | |
227 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; | |
228 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; | |
229 | my $finishedACallback = $callbacks->{'A_FINISHED'}; | |
230 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; | |
231 | my $finishedBCallback = $callbacks->{'B_FINISHED'}; | |
232 | my $matchVector = _longestCommonSubsequence( $a, $b, $compare, @_ ); | |
233 | # Process all the lines in match vector | |
234 | my $lastA = $#$a; | |
235 | my $lastB = $#$b; | |
236 | my $bi = 0; | |
237 | my $ai; | |
238 | for ( $ai = 0; $ai <= $#$matchVector; $ai++ ) | |
239 | { | |
240 | my $bLine = $matchVector->[ $ai ]; | |
241 | if ( defined( $bLine ) ) # matched | |
242 | { | |
243 | &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; | |
244 | &$matchCallback( $ai, $bi++, @_ ); | |
245 | } | |
246 | else | |
247 | { | |
248 | &$discardACallback( $ai, $bi, @_ ); | |
249 | } | |
250 | } | |
251 | # the last entry (if any) processed was a match. | |
252 | ||
253 | if ( defined( $finishedBCallback ) && $ai <= $lastA ) | |
254 | { | |
255 | &$finishedBCallback( $bi, @_ ); | |
256 | } | |
257 | else | |
258 | { | |
259 | &$discardACallback( $ai++, $bi, @_ ) while ( $ai <= $lastA ); | |
260 | } | |
261 | ||
262 | if ( defined( $finishedACallback ) && $bi <= $lastB ) | |
263 | { | |
264 | &$finishedACallback( $ai, @_ ); | |
265 | } | |
266 | else | |
267 | { | |
268 | &$discardBCallback( $ai, $bi++, @_ ) while ( $bi <= $lastB ); | |
269 | } | |
270 | return 1; | |
271 | } | |
272 | ||
273 | sub LCS | |
274 | { | |
275 | my $a = shift; # array ref | |
276 | my $matchVector = _longestCommonSubsequence( $a, @_ ); | |
277 | my @retval; | |
278 | my $i; | |
279 | for ( $i = 0; $i <= $#$matchVector; $i++ ) | |
280 | { | |
281 | if ( defined( $matchVector->[ $i ] ) ) | |
282 | { | |
283 | push( @retval, $a->[ $i ] ); | |
284 | } | |
285 | } | |
286 | return wantarray ? @retval : \@retval; | |
287 | } | |
288 | ||
289 | sub diff | |
290 | { | |
291 | my $a = shift; # array ref | |
292 | my $b = shift; # array ref | |
293 | my $retval = []; | |
294 | my $hunk = []; | |
295 | my $discard = sub { push( @$hunk, [ '-', $_[ 0 ], $a->[ $_[ 0 ] ] ] ) }; | |
296 | my $add = sub { push( @$hunk, [ '+', $_[ 1 ], $b->[ $_[ 1 ] ] ] ) }; | |
297 | my $match = sub { push( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; | |
298 | traverse_sequences( $a, $b, | |
299 | { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, | |
300 | @_ ); | |
301 | &$match(); | |
302 | return wantarray ? @$retval : $retval; | |
303 | } | |
304 | ||
305 | 1; |