1 | 5 | simandl | <?php |
2 | | | /** |
3 | | | * Class used internally by Text_Diff to actually compute the diffs. |
4 | | | * |
5 | | | * This class is implemented using native PHP code. |
6 | | | * |
7 | | | * The algorithm used here is mostly lifted from the perl module |
8 | | | * Algorithm::Diff (version 1.06) by Ned Konz, which is available at: |
9 | | | * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip |
10 | | | * |
11 | | | * More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html |
12 | | | * |
13 | | | * Some ideas (and a bit of code) are taken from analyze.c, of GNU |
14 | | | * diffutils-2.7, which can be found at: |
15 | | | * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz |
16 | | | * |
17 | | | * Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from |
18 | | | * Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this |
19 | | | * code was written by him, and is used/adapted with his permission. |
20 | | | * |
21 | | | * $Horde: framework/Text_Diff/Diff/Engine/native.php,v 1.7.2.4 2008/01/04 10:38:10 jan Exp $ |
22 | | | * |
23 | | | * Copyright 2004-2008 The Horde Project (http://www.horde.org/) |
24 | | | * |
25 | | | * See the enclosed file COPYING for license information (LGPL). If you did |
26 | | | * not receive this file, see http://opensource.org/licenses/lgpl-license.php. |
27 | | | * |
28 | | | * @author Geoffrey T. Dairiki <dairiki@dairiki.org> |
29 | | | * @package Text_Diff |
30 | | | */ |
31 | | | class Text_Diff_Engine_native { |
32 | | | |
33 | | | function diff($from_lines, $to_lines) |
34 | | | { |
35 | | | array_walk($from_lines, array('Text_Diff', 'trimNewlines')); |
36 | | | array_walk($to_lines, array('Text_Diff', 'trimNewlines')); |
37 | | | |
38 | | | $n_from = count($from_lines); |
39 | | | $n_to = count($to_lines); |
40 | | | |
41 | | | $this->xchanged = $this->ychanged = array(); |
42 | | | $this->xv = $this->yv = array(); |
43 | | | $this->xind = $this->yind = array(); |
44 | | | unset($this->seq); |
45 | | | unset($this->in_seq); |
46 | | | unset($this->lcs); |
47 | | | |
48 | | | // Skip leading common lines. |
49 | | | for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) { |
50 | | | if ($from_lines[$skip] !== $to_lines[$skip]) { |
51 | | | break; |
52 | | | } |
53 | | | $this->xchanged[$skip] = $this->ychanged[$skip] = false; |
54 | | | } |
55 | | | |
56 | | | // Skip trailing common lines. |
57 | | | $xi = $n_from; $yi = $n_to; |
58 | | | for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) { |
59 | | | if ($from_lines[$xi] !== $to_lines[$yi]) { |
60 | | | break; |
61 | | | } |
62 | | | $this->xchanged[$xi] = $this->ychanged[$yi] = false; |
63 | | | } |
64 | | | |
65 | | | // Ignore lines which do not exist in both files. |
66 | | | for ($xi = $skip; $xi < $n_from - $endskip; $xi++) { |
67 | | | $xhash[$from_lines[$xi]] = 1; |
68 | | | } |
69 | | | for ($yi = $skip; $yi < $n_to - $endskip; $yi++) { |
70 | | | $line = $to_lines[$yi]; |
71 | | | if (($this->ychanged[$yi] = empty($xhash[$line]))) { |
72 | | | continue; |
73 | | | } |
74 | | | $yhash[$line] = 1; |
75 | | | $this->yv[] = $line; |
76 | | | $this->yind[] = $yi; |
77 | | | } |
78 | | | for ($xi = $skip; $xi < $n_from - $endskip; $xi++) { |
79 | | | $line = $from_lines[$xi]; |
80 | | | if (($this->xchanged[$xi] = empty($yhash[$line]))) { |
81 | | | continue; |
82 | | | } |
83 | | | $this->xv[] = $line; |
84 | | | $this->xind[] = $xi; |
85 | | | } |
86 | | | |
87 | | | // Find the LCS. |
88 | | | $this->_compareseq(0, count($this->xv), 0, count($this->yv)); |
89 | | | |
90 | | | // Merge edits when possible. |
91 | | | $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged); |
92 | | | $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged); |
93 | | | |
94 | | | // Compute the edit operations. |
95 | | | $edits = array(); |
96 | | | $xi = $yi = 0; |
97 | | | while ($xi < $n_from || $yi < $n_to) { |
98 | | | assert($yi < $n_to || $this->xchanged[$xi]); |
99 | | | assert($xi < $n_from || $this->ychanged[$yi]); |
100 | | | |
101 | | | // Skip matching "snake". |
102 | | | $copy = array(); |
103 | | | while ($xi < $n_from && $yi < $n_to |
104 | | | && !$this->xchanged[$xi] && !$this->ychanged[$yi]) { |
105 | | | $copy[] = $from_lines[$xi++]; |
106 | | | ++$yi; |
107 | | | } |
108 | | | if ($copy) { |
109 | | | $edits[] = &new Text_Diff_Op_copy($copy); |
110 | | | } |
111 | | | |
112 | | | // Find deletes & adds. |
113 | | | $delete = array(); |
114 | | | while ($xi < $n_from && $this->xchanged[$xi]) { |
115 | | | $delete[] = $from_lines[$xi++]; |
116 | | | } |
117 | | | |
118 | | | $add = array(); |
119 | | | while ($yi < $n_to && $this->ychanged[$yi]) { |
120 | | | $add[] = $to_lines[$yi++]; |
121 | | | } |
122 | | | |
123 | | | if ($delete && $add) { |
124 | | | $edits[] = &new Text_Diff_Op_change($delete, $add); |
125 | | | } elseif ($delete) { |
126 | | | $edits[] = &new Text_Diff_Op_delete($delete); |
127 | | | } elseif ($add) { |
128 | | | $edits[] = &new Text_Diff_Op_add($add); |
129 | | | } |
130 | | | } |
131 | | | |
132 | | | return $edits; |
133 | | | } |
134 | | | |
135 | | | /** |
136 | | | * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF, |
137 | | | * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized |
138 | | | * segments. |
139 | | | * |
140 | | | * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an array of |
141 | | | * NCHUNKS+1 (X, Y) indexes giving the diving points between sub |
142 | | | * sequences. The first sub-sequence is contained in (X0, X1), (Y0, Y1), |
143 | | | * the second in (X1, X2), (Y1, Y2) and so on. Note that (X0, Y0) == |
144 | | | * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM). |
145 | | | * |
146 | | | * This function assumes that the first lines of the specified portions of |
147 | | | * the two files do not match, and likewise that the last lines do not |
148 | | | * match. The caller must trim matching lines from the beginning and end |
149 | | | * of the portions it is going to specify. |
150 | | | */ |
151 | | | function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks) |
152 | | | { |
153 | | | $flip = false; |
154 | | | |
155 | | | if ($xlim - $xoff > $ylim - $yoff) { |
156 | | | /* Things seems faster (I'm not sure I understand why) when the |
157 | | | * shortest sequence is in X. */ |
158 | | | $flip = true; |
159 | | | list ($xoff, $xlim, $yoff, $ylim) |
160 | | | = array($yoff, $ylim, $xoff, $xlim); |
161 | | | } |
162 | | | |
163 | | | if ($flip) { |
164 | | | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
165 | | | $ymatches[$this->xv[$i]][] = $i; |
166 | | | } |
167 | | | } else { |
168 | | | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
169 | | | $ymatches[$this->yv[$i]][] = $i; |
170 | | | } |
171 | | | } |
172 | | | |
173 | | | $this->lcs = 0; |
174 | | | $this->seq[0]= $yoff - 1; |
175 | | | $this->in_seq = array(); |
176 | | | $ymids[0] = array(); |
177 | | | |
178 | | | $numer = $xlim - $xoff + $nchunks - 1; |
179 | | | $x = $xoff; |
180 | | | for ($chunk = 0; $chunk < $nchunks; $chunk++) { |
181 | | | if ($chunk > 0) { |
182 | | | for ($i = 0; $i <= $this->lcs; $i++) { |
183 | | | $ymids[$i][$chunk - 1] = $this->seq[$i]; |
184 | | | } |
185 | | | } |
186 | | | |
187 | | | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks); |
188 | | | for (; $x < $x1; $x++) { |
189 | | | $line = $flip ? $this->yv[$x] : $this->xv[$x]; |
190 | | | if (empty($ymatches[$line])) { |
191 | | | continue; |
192 | | | } |
193 | | | $matches = $ymatches[$line]; |
194 | | | reset($matches); |
195 | | | while (list(, $y) = each($matches)) { |
196 | | | if (empty($this->in_seq[$y])) { |
197 | | | $k = $this->_lcsPos($y); |
198 | | | assert($k > 0); |
199 | | | $ymids[$k] = $ymids[$k - 1]; |
200 | | | break; |
201 | | | } |
202 | | | } |
203 | | | while (list(, $y) = each($matches)) { |
204 | | | if ($y > $this->seq[$k - 1]) { |
205 | | | assert($y <= $this->seq[$k]); |
206 | | | /* Optimization: this is a common case: next match is |
207 | | | * just replacing previous match. */ |
208 | | | $this->in_seq[$this->seq[$k]] = false; |
209 | | | $this->seq[$k] = $y; |
210 | | | $this->in_seq[$y] = 1; |
211 | | | } elseif (empty($this->in_seq[$y])) { |
212 | | | $k = $this->_lcsPos($y); |
213 | | | assert($k > 0); |
214 | | | $ymids[$k] = $ymids[$k - 1]; |
215 | | | } |
216 | | | } |
217 | | | } |
218 | | | } |
219 | | | |
220 | | | $seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff); |
221 | | | $ymid = $ymids[$this->lcs]; |
222 | | | for ($n = 0; $n < $nchunks - 1; $n++) { |
223 | | | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks); |
224 | | | $y1 = $ymid[$n] + 1; |
225 | | | $seps[] = $flip ? array($y1, $x1) : array($x1, $y1); |
226 | | | } |
227 | | | $seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim); |
228 | | | |
229 | | | return array($this->lcs, $seps); |
230 | | | } |
231 | | | |
232 | | | function _lcsPos($ypos) |
233 | | | { |
234 | | | $end = $this->lcs; |
235 | | | if ($end == 0 || $ypos > $this->seq[$end]) { |
236 | | | $this->seq[++$this->lcs] = $ypos; |
237 | | | $this->in_seq[$ypos] = 1; |
238 | | | return $this->lcs; |
239 | | | } |
240 | | | |
241 | | | $beg = 1; |
242 | | | while ($beg < $end) { |
243 | | | $mid = (int)(($beg + $end) / 2); |
244 | | | if ($ypos > $this->seq[$mid]) { |
245 | | | $beg = $mid + 1; |
246 | | | } else { |
247 | | | $end = $mid; |
248 | | | } |
249 | | | } |
250 | | | |
251 | | | assert($ypos != $this->seq[$end]); |
252 | | | |
253 | | | $this->in_seq[$this->seq[$end]] = false; |
254 | | | $this->seq[$end] = $ypos; |
255 | | | $this->in_seq[$ypos] = 1; |
256 | | | return $end; |
257 | | | } |
258 | | | |
259 | | | /** |
260 | | | * Finds LCS of two sequences. |
261 | | | * |
262 | | | * The results are recorded in the vectors $this->{x,y}changed[], by |
263 | | | * storing a 1 in the element for each line that is an insertion or |
264 | | | * deletion (ie. is not in the LCS). |
265 | | | * |
266 | | | * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1. |
267 | | | * |
268 | | | * Note that XLIM, YLIM are exclusive bounds. All line numbers are |
269 | | | * origin-0 and discarded lines are not counted. |
270 | | | */ |
271 | | | function _compareseq ($xoff, $xlim, $yoff, $ylim) |
272 | | | { |
273 | | | /* Slide down the bottom initial diagonal. */ |
274 | | | while ($xoff < $xlim && $yoff < $ylim |
275 | | | && $this->xv[$xoff] == $this->yv[$yoff]) { |
276 | | | ++$xoff; |
277 | | | ++$yoff; |
278 | | | } |
279 | | | |
280 | | | /* Slide up the top initial diagonal. */ |
281 | | | while ($xlim > $xoff && $ylim > $yoff |
282 | | | && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) { |
283 | | | --$xlim; |
284 | | | --$ylim; |
285 | | | } |
286 | | | |
287 | | | if ($xoff == $xlim || $yoff == $ylim) { |
288 | | | $lcs = 0; |
289 | | | } else { |
290 | | | /* This is ad hoc but seems to work well. $nchunks = |
291 | | | * sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); $nchunks = |
292 | | | * max(2,min(8,(int)$nchunks)); */ |
293 | | | $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1; |
294 | | | list($lcs, $seps) |
295 | | | = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks); |
296 | | | } |
297 | | | |
298 | | | if ($lcs == 0) { |
299 | | | /* X and Y sequences have no common subsequence: mark all |
300 | | | * changed. */ |
301 | | | while ($yoff < $ylim) { |
302 | | | $this->ychanged[$this->yind[$yoff++]] = 1; |
303 | | | } |
304 | | | while ($xoff < $xlim) { |
305 | | | $this->xchanged[$this->xind[$xoff++]] = 1; |
306 | | | } |
307 | | | } else { |
308 | | | /* Use the partitions to split this problem into subproblems. */ |
309 | | | reset($seps); |
310 | | | $pt1 = $seps[0]; |
311 | | | while ($pt2 = next($seps)) { |
312 | | | $this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]); |
313 | | | $pt1 = $pt2; |
314 | | | } |
315 | | | } |
316 | | | } |
317 | | | |
318 | | | /** |
319 | | | * Adjusts inserts/deletes of identical lines to join changes as much as |
320 | | | * possible. |
321 | | | * |
322 | | | * We do something when a run of changed lines include a line at one end |
323 | | | * and has an excluded, identical line at the other. We are free to |
324 | | | * choose which identical line is included. `compareseq' usually chooses |
325 | | | * the one at the beginning, but usually it is cleaner to consider the |
326 | | | * following identical line to be the "change". |
327 | | | * |
328 | | | * This is extracted verbatim from analyze.c (GNU diffutils-2.7). |
329 | | | */ |
330 | | | function _shiftBoundaries($lines, &$changed, $other_changed) |
331 | | | { |
332 | | | $i = 0; |
333 | | | $j = 0; |
334 | | | |
335 | | | assert('count($lines) == count($changed)'); |
336 | | | $len = count($lines); |
337 | | | $other_len = count($other_changed); |
338 | | | |
339 | | | while (1) { |
340 | | | /* Scan forward to find the beginning of another run of |
341 | | | * changes. Also keep track of the corresponding point in the |
342 | | | * other file. |
343 | | | * |
344 | | | * Throughout this code, $i and $j are adjusted together so that |
345 | | | * the first $i elements of $changed and the first $j elements of |
346 | | | * $other_changed both contain the same number of zeros (unchanged |
347 | | | * lines). |
348 | | | * |
349 | | | * Furthermore, $j is always kept so that $j == $other_len or |
350 | | | * $other_changed[$j] == false. */ |
351 | | | while ($j < $other_len && $other_changed[$j]) { |
352 | | | $j++; |
353 | | | } |
354 | | | |
355 | | | while ($i < $len && ! $changed[$i]) { |
356 | | | assert('$j < $other_len && ! $other_changed[$j]'); |
357 | | | $i++; $j++; |
358 | | | while ($j < $other_len && $other_changed[$j]) { |
359 | | | $j++; |
360 | | | } |
361 | | | } |
362 | | | |
363 | | | if ($i == $len) { |
364 | | | break; |
365 | | | } |
366 | | | |
367 | | | $start = $i; |
368 | | | |
369 | | | /* Find the end of this run of changes. */ |
370 | | | while (++$i < $len && $changed[$i]) { |
371 | | | continue; |
372 | | | } |
373 | | | |
374 | | | do { |
375 | | | /* Record the length of this run of changes, so that we can |
376 | | | * later determine whether the run has grown. */ |
377 | | | $runlength = $i - $start; |
378 | | | |
379 | | | /* Move the changed region back, so long as the previous |
380 | | | * unchanged line matches the last changed one. This merges |
381 | | | * with previous changed regions. */ |
382 | | | while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) { |
383 | | | $changed[--$start] = 1; |
384 | | | $changed[--$i] = false; |
385 | | | while ($start > 0 && $changed[$start - 1]) { |
386 | | | $start--; |
387 | | | } |
388 | | | assert('$j > 0'); |
389 | | | while ($other_changed[--$j]) { |
390 | | | continue; |
391 | | | } |
392 | | | assert('$j >= 0 && !$other_changed[$j]'); |
393 | | | } |
394 | | | |
395 | | | /* Set CORRESPONDING to the end of the changed run, at the |
396 | | | * last point where it corresponds to a changed run in the |
397 | | | * other file. CORRESPONDING == LEN means no such point has |
398 | | | * been found. */ |
399 | | | $corresponding = $j < $other_len ? $i : $len; |
400 | | | |
401 | | | /* Move the changed region forward, so long as the first |
402 | | | * changed line matches the following unchanged one. This |
403 | | | * merges with following changed regions. Do this second, so |
404 | | | * that if there are no merges, the changed region is moved |
405 | | | * forward as far as possible. */ |
406 | | | while ($i < $len && $lines[$start] == $lines[$i]) { |
407 | | | $changed[$start++] = false; |
408 | | | $changed[$i++] = 1; |
409 | | | while ($i < $len && $changed[$i]) { |
410 | | | $i++; |
411 | | | } |
412 | | | |
413 | | | assert('$j < $other_len && ! $other_changed[$j]'); |
414 | | | $j++; |
415 | | | if ($j < $other_len && $other_changed[$j]) { |
416 | | | $corresponding = $i; |
417 | | | while ($j < $other_len && $other_changed[$j]) { |
418 | | | $j++; |
419 | | | } |
420 | | | } |
421 | | | } |
422 | | | } while ($runlength != $i - $start); |
423 | | | |
424 | | | /* If possible, move the fully-merged run of changes back to a |
425 | | | * corresponding run in the other file. */ |
426 | | | while ($corresponding < $i) { |
427 | | | $changed[--$start] = 1; |
428 | | | $changed[--$i] = 0; |
429 | | | assert('$j > 0'); |
430 | | | while ($other_changed[--$j]) { |
431 | | | continue; |
432 | | | } |
433 | | | assert('$j >= 0 && !$other_changed[$j]'); |
434 | | | } |
435 | | | } |
436 | | | } |
437 | | | |
438 | | | } |