001: /*
002: * Gruntspud
003: *
004: * Copyright (C) 2002 Brett Smith.
005: *
006: * Written by: Brett Smith <t_magicthize@users.sourceforge.net>
007: *
008: * This program is free software; you can redistribute it and/or
009: * modify it under the terms of the GNU Library General Public License
010: * as published by the Free Software Foundation; either version 2 of
011: * the License, or (at your option) any later version.
012: * This program is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
015: * GNU Library General Public License for more details.
016: *
017: * You should have received a copy of the GNU Library General Public
018: * License along with this program; if not, write to the Free Software
019: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
020: */
021:
022: /* $Log: Diff.java,v $
023: /* Revision 1.4 2003/07/21 20:25:08 t_magicthize
024: /* Preparation for release.
025: /*
026: /* Revision 1.3 2003/02/01 11:50:09 t_magicthize
027: /* Preparation for release 0_3_0-beta
028: /*
029: /* Revision 1.2 2003/01/30 23:37:19 t_magicthize
030: /* Global source format using jalopy
031: /*
032: /* Revision 1.1 2002/08/17 16:14:10 t_magicthize
033: /* Added built in diff viewer
034: /*
035: * Revision 1.3 2000/03/03 21:58:03 stuart
036: * move discard_confusing_lines and shift_boundaries to class file_data
037: *
038: * Revision 1.2 2000/03/02 16:37:38 stuart
039: * Add GPL and copyright
040: *
041: */
042: package bmsi.util;
043:
044: import java.util.Hashtable;
045:
046: /** A class to compare vectors of objects. The result of comparison
047: is a list of <code>change</code> objects which form an
048: edit script. The objects compared are traditionally lines
049: of text from two files. Comparison options such as "ignore
050: whitespace" are implemented by modifying the <code>equals</code>
051: and <code>hashcode</code> methods for the objects compared.
052: <p>
053: The basic algorithm is described in: </br>
054: "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
055: Algorithmica Vol. 1 No. 2, 1986, p 251.
056: <p>
057: This class outputs different results from GNU diff 1.15 on some
058: inputs. Our results are actually better (smaller change list, smaller
059: total size of changes), but it would be nice to know why. Perhaps
060: there is a memory overwrite bug in GNU diff 1.15.
061: @author Stuart D. Gathman, translated from GNU diff 1.15
062: Copyright (C) 2000 Business Management Systems, Inc.
063: <p>
064: This program is free software; you can redistribute it and/or modify
065: it under the terms of the GNU General Public License as published by
066: the Free Software Foundation; either version 1, or (at your option)
067: any later version.
068: <p>
069: This program is distributed in the hope that it will be useful,
070: but WITHOUT ANY WARRANTY; without even the implied warranty of
071: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
072: GNU General Public License for more details.
073: <p>
074: You should have received a copy of the <a href=COPYING.txt>
075: GNU General Public License</a>
076: along with this program; if not, write to the Free Software
077: Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
078: */
079: public class Diff {
080: /** 1 more than the maximum equivalence value used for this or its
081: sibling file. */
082: private int equiv_max = 1;
083:
084: /** When set to true, the comparison uses a heuristic to speed it up.
085: With this heuristic, for files with a constant small density
086: of changes, the algorithm is linear in the file size. */
087: public boolean heuristic = false;
088:
089: /** When set to true, the algorithm returns a guarranteed minimal
090: set of changes. This makes things slower, sometimes much slower. */
091: public boolean no_discards = false;
092: private int[] xvec; /* Vectors being compared. */
093: private int[] yvec; /* Vectors being compared. */
094: private int[] fdiag; /* Vector, indexed by diagonal, containing
095: the X coordinate of the point furthest
096: along the given diagonal in the forward
097: search of the edit matrix. */
098: private int[] bdiag; /* Vector, indexed by diagonal, containing
099: the X coordinate of the point furthest
100: along the given diagonal in the backward
101: search of the edit matrix. */
102: private int fdiagoff;
103: private int bdiagoff;
104: private final file_data[] filevec = new file_data[2];
105: private int cost;
106: private boolean inhibit = false;
107:
108: /** Prepare to find differences between two arrays. Each element of
109: the arrays is translated to an "equivalence number" based on
110: the result of <code>equals</code>. The original Object arrays
111: are no longer needed for computing the differences. They will
112: be needed again later to print the results of the comparison as
113: an edit script, if desired.
114: */
115: public Diff(Object[] a, Object[] b) {
116: Hashtable h = new Hashtable(a.length + b.length);
117: filevec[0] = new file_data(a, h);
118: filevec[1] = new file_data(b, h);
119: }
120:
121: /** Find the midpoint of the shortest edit script for a specified
122: portion of the two files.
123: We scan from the beginnings of the files, and simultaneously from the ends,
124: doing a breadth-first search through the space of edit-sequence.
125: When the two searches meet, we have found the midpoint of the shortest
126: edit sequence.
127: The value returned is the number of the diagonal on which the midpoint lies.
128: The diagonal number equals the number of inserted lines minus the number
129: of deleted lines (counting only lines before the midpoint).
130: The edit cost is stored into COST; this is the total number of
131: lines inserted or deleted (counting only lines before the midpoint).
132: This function assumes that the first lines of the specified portions
133: of the two files do not match, and likewise that the last lines do not
134: match. The caller must trim matching lines from the beginning and end
135: of the portions it is going to specify.
136: Note that if we return the "wrong" diagonal value, or if
137: the value of bdiag at that diagonal is "wrong",
138: the worst this can do is cause suboptimal diff output.
139: It cannot cause incorrect diff output. */
140: private int diag(int xoff, int xlim, int yoff, int ylim) {
141: final int[] fd = fdiag; // Give the compiler a chance.
142: final int[] bd = bdiag; // Additional help for the compiler.
143: final int[] xv = xvec; // Still more help for the compiler.
144: final int[] yv = yvec; // And more and more . . .
145: final int dmin = xoff - ylim; // Minimum valid diagonal.
146: final int dmax = xlim - yoff; // Maximum valid diagonal.
147: final int fmid = xoff - yoff; // Center diagonal of top-down search.
148: final int bmid = xlim - ylim; // Center diagonal of bottom-up search.
149: int fmin = fmid; // Limits of top-down search.
150: int fmax = fmid; // Limits of top-down search.
151: int bmin = bmid; // Limits of bottom-up search.
152: int bmax = bmid; // Limits of bottom-up search.
153:
154: /* True if southeast corner is on an odd
155: diagonal with respect to the northwest. */
156: final boolean odd = ((fmid - bmid) & 1) != 0;
157:
158: fd[fdiagoff + fmid] = xoff;
159: bd[bdiagoff + bmid] = xlim;
160:
161: for (int c = 1;; ++c) {
162: int d; /* Active diagonal. */
163: boolean big_snake = false;
164:
165: /* Extend the top-down search by an edit step in each diagonal. */
166: if (fmin > dmin)
167: fd[(fdiagoff + --fmin) - 1] = -1;
168: else
169: ++fmin;
170:
171: if (fmax < dmax)
172: fd[fdiagoff + ++fmax + 1] = -1;
173: else
174: --fmax;
175:
176: for (d = fmax; d >= fmin; d -= 2) {
177: int x;
178: int y;
179: int oldx;
180: int tlo = fd[(fdiagoff + d) - 1];
181: int thi = fd[fdiagoff + d + 1];
182:
183: if (tlo >= thi)
184: x = tlo + 1;
185: else
186: x = thi;
187:
188: oldx = x;
189: y = x - d;
190:
191: while ((x < xlim) && (y < ylim) && (xv[x] == yv[y])) {
192: ++x;
193: ++y;
194: }
195:
196: if ((x - oldx) > 20)
197: big_snake = true;
198:
199: fd[fdiagoff + d] = x;
200:
201: if (odd && (bmin <= d) && (d <= bmax)
202: && (bd[bdiagoff + d] <= fd[fdiagoff + d])) {
203: cost = (2 * c) - 1;
204:
205: return d;
206: }
207: }
208:
209: /* Similar extend the bottom-up search. */
210: if (bmin > dmin)
211: bd[(bdiagoff + --bmin) - 1] = Integer.MAX_VALUE;
212: else
213: ++bmin;
214:
215: if (bmax < dmax)
216: bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
217: else
218: --bmax;
219:
220: for (d = bmax; d >= bmin; d -= 2) {
221: int x;
222: int y;
223: int oldx;
224: int tlo = bd[(bdiagoff + d) - 1];
225: int thi = bd[bdiagoff + d + 1];
226:
227: if (tlo < thi)
228: x = tlo;
229: else
230: x = thi - 1;
231:
232: oldx = x;
233: y = x - d;
234:
235: while ((x > xoff) && (y > yoff)
236: && (xv[x - 1] == yv[y - 1])) {
237: --x;
238: --y;
239: }
240:
241: if ((oldx - x) > 20)
242: big_snake = true;
243:
244: bd[bdiagoff + d] = x;
245:
246: if (!odd && (fmin <= d) && (d <= fmax)
247: && (bd[bdiagoff + d] <= fd[fdiagoff + d])) {
248: cost = 2 * c;
249:
250: return d;
251: }
252: }
253:
254: /* Heuristic: check occasionally for a diagonal that has made
255: lots of progress compared with the edit distance.
256: If we have any such, find the one that has made the most
257: progress and return it as if it had succeeded.
258: With this heuristic, for files with a constant small density
259: of changes, the algorithm is linear in the file size. */
260: if ((c > 200) && big_snake && heuristic) {
261: int best = 0;
262: int bestpos = -1;
263:
264: for (d = fmax; d >= fmin; d -= 2) {
265: int dd = d - fmid;
266:
267: if ((((fd[fdiagoff + d] - xoff) * 2) - dd) > (12 * (c + ((dd > 0) ? dd
268: : (-dd))))) {
269: if ((((fd[fdiagoff + d] * 2) - dd) > best)
270: && ((fd[fdiagoff + d] - xoff) > 20)
271: && ((fd[fdiagoff + d] - d - yoff) > 20)) {
272: int k;
273: int x = fd[fdiagoff + d];
274:
275: /* We have a good enough best diagonal;
276: now insist that it end with a significant snake. */
277: for (k = 1; k <= 20; k++)
278: if (xvec[x - k] != yvec[x - d - k])
279: break;
280:
281: if (k == 21) {
282: best = (fd[fdiagoff + d] * 2) - dd;
283: bestpos = d;
284: }
285: }
286: }
287: }
288:
289: if (best > 0) {
290: cost = (2 * c) - 1;
291:
292: return bestpos;
293: }
294:
295: best = 0;
296:
297: for (d = bmax; d >= bmin; d -= 2) {
298: int dd = d - bmid;
299:
300: if ((((xlim - bd[bdiagoff + d]) * 2) + dd) > (12 * (c + ((dd > 0) ? dd
301: : (-dd))))) {
302: if (((((xlim - bd[bdiagoff + d]) * 2) + dd) > best)
303: && ((xlim - bd[bdiagoff + d]) > 20)
304: && ((ylim - (bd[bdiagoff + d] - d)) > 20)) {
305: /* We have a good enough best diagonal;
306: now insist that it end with a significant snake. */
307: int k;
308: int x = bd[bdiagoff + d];
309:
310: for (k = 0; k < 20; k++)
311: if (xvec[x + k] != yvec[x - d + k])
312: break;
313:
314: if (k == 20) {
315: best = ((xlim - bd[bdiagoff + d]) * 2)
316: + dd;
317: bestpos = d;
318: }
319: }
320: }
321: }
322:
323: if (best > 0) {
324: cost = (2 * c) - 1;
325:
326: return bestpos;
327: }
328: }
329: }
330: }
331:
332: /** Compare in detail contiguous subsequences of the two files
333: which are known, as a whole, to match each other.
334: The results are recorded in the vectors filevec[N].changed_flag, by
335: storing a 1 in the element for each line that is an insertion or deletion.
336: The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
337: Note that XLIM, YLIM are exclusive bounds.
338: All line numbers are origin-0 and discarded lines are not counted. */
339: private void compareseq(int xoff, int xlim, int yoff, int ylim) {
340: /* Slide down the bottom initial diagonal. */
341: while ((xoff < xlim) && (yoff < ylim)
342: && (xvec[xoff] == yvec[yoff])) {
343: ++xoff;
344: ++yoff;
345: }
346:
347: /* Slide up the top initial diagonal. */
348: while ((xlim > xoff) && (ylim > yoff)
349: && (xvec[xlim - 1] == yvec[ylim - 1])) {
350: --xlim;
351: --ylim;
352: }
353:
354: /* Handle simple cases. */
355: if (xoff == xlim)
356: while (yoff < ylim)
357: filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true;
358: else if (yoff == ylim)
359: while (xoff < xlim)
360: filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true;
361: else {
362: /* Find a point of correspondence in the middle of the files. */
363: int d = diag(xoff, xlim, yoff, ylim);
364: int c = cost;
365: int f = fdiag[fdiagoff + d];
366: int b = bdiag[bdiagoff + d];
367:
368: if (c == 1)
369:
370: /* This should be impossible, because it implies that
371: one of the two subsequences is empty,
372: and that case was handled above without calling `diag'.
373: Let's verify that this is true. */
374: throw new IllegalArgumentException("Empty subsequence");
375: else {
376: /* Use that point to split this problem into two subproblems. */
377: compareseq(xoff, b, yoff, b - d);
378:
379: /* This used to use f instead of b,
380: but that is incorrect!
381: It is not necessarily the case that diagonal d
382: has a snake from b to f. */
383: compareseq(b, xlim, b - d, ylim);
384: }
385: }
386: }
387:
388: /** Discard lines from one file that have no matches in the other file.
389: */
390: private void discard_confusing_lines() {
391: filevec[0].discard_confusing_lines(filevec[1]);
392: filevec[1].discard_confusing_lines(filevec[0]);
393: }
394:
395: /** Adjust inserts/deletes of blank lines to join changes
396: as much as possible.
397: */
398: private void shift_boundaries() {
399: if (inhibit)
400: return;
401:
402: filevec[0].shift_boundaries(filevec[1]);
403: filevec[1].shift_boundaries(filevec[0]);
404: }
405:
406: /** Scan the tables of which lines are inserted and deleted,
407: producing an edit script in reverse order. */
408: private change build_reverse_script() {
409: change script = null;
410: final boolean[] changed0 = filevec[0].changed_flag;
411: final boolean[] changed1 = filevec[1].changed_flag;
412: final int len0 = filevec[0].buffered_lines;
413: final int len1 = filevec[1].buffered_lines;
414:
415: /* Note that changedN[len0] does exist, and contains 0. */
416: int i0 = 0;
417:
418: /* Note that changedN[len0] does exist, and contains 0. */
419: int i1 = 0;
420:
421: while ((i0 < len0) || (i1 < len1)) {
422: if (changed0[1 + i0] || changed1[1 + i1]) {
423: int line0 = i0;
424: int line1 = i1;
425:
426: /* Find # lines changed here in each file. */
427: while (changed0[1 + i0])
428: ++i0;
429:
430: while (changed1[1 + i1])
431: ++i1;
432:
433: /* Record this change. */
434: script = new change(line0, line1, i0 - line0, i1
435: - line1, script);
436: }
437:
438: /* We have reached lines in the two files that match each other. */
439: i0++;
440: i1++;
441: }
442:
443: return script;
444: }
445:
446: /** Scan the tables of which lines are inserted and deleted,
447: producing an edit script in forward order. */
448: private change build_script() {
449: change script = null;
450: final boolean[] changed0 = filevec[0].changed_flag;
451: final boolean[] changed1 = filevec[1].changed_flag;
452: final int len0 = filevec[0].buffered_lines;
453: final int len1 = filevec[1].buffered_lines;
454: int i0 = len0;
455: int i1 = len1;
456:
457: /* Note that changedN[-1] does exist, and contains 0. */
458: while ((i0 >= 0) || (i1 >= 0)) {
459: if (changed0[i0] || changed1[i1]) {
460: int line0 = i0;
461: int line1 = i1;
462:
463: /* Find # lines changed here in each file. */
464: while (changed0[i0])
465: --i0;
466:
467: while (changed1[i1])
468: --i1;
469:
470: /* Record this change. */
471: script = new change(i0, i1, line0 - i0, line1 - i1,
472: script);
473: }
474:
475: /* We have reached lines in the two files that match each other. */
476: i0--;
477: i1--;
478: }
479:
480: return script;
481: }
482:
483: /* Report the differences of two files. DEPTH is the current directory
484: depth. */
485: public change diff_2(final boolean reverse) {
486: /* Some lines are obviously insertions or deletions
487: because they don't match anything. Detect them now,
488: and avoid even thinking about them in the main comparison algorithm. */
489: discard_confusing_lines();
490:
491: /* Now do the main comparison algorithm, considering just the
492: undiscarded lines. */
493: xvec = filevec[0].undiscarded;
494: yvec = filevec[1].undiscarded;
495:
496: int diags = filevec[0].nondiscarded_lines
497: + filevec[1].nondiscarded_lines + 3;
498: fdiag = new int[diags];
499: fdiagoff = filevec[1].nondiscarded_lines + 1;
500: bdiag = new int[diags];
501: bdiagoff = filevec[1].nondiscarded_lines + 1;
502:
503: compareseq(0, filevec[0].nondiscarded_lines, 0,
504: filevec[1].nondiscarded_lines);
505: fdiag = null;
506: bdiag = null;
507:
508: /* Modify the results slightly to make them prettier
509: in cases where that can validly be done. */
510: shift_boundaries();
511:
512: /* Get the results of comparison in the form of a chain
513: of `struct change's -- an edit script. */
514: if (reverse)
515: return build_reverse_script();
516: else
517:
518: return build_script();
519: }
520:
521: /** The result of comparison is an "edit script": a chain of change objects.
522: Each change represents one place where some lines are deleted
523: and some are inserted.
524: LINE0 and LINE1 are the first affected lines in the two files (origin 0).
525: DELETED is the number of lines deleted here from file 0.
526: INSERTED is the number of lines inserted here in file 1.
527: If DELETED is 0 then LINE0 is the number of the line before
528: which the insertion was done; vice versa for INSERTED and LINE1. */
529: public static class change {
530: /** Previous or next edit command. */
531: public change link;
532:
533: /** # lines of file 1 changed here. */
534: public final int inserted;
535:
536: /** # lines of file 0 changed here. */
537: public final int deleted;
538:
539: /** Line number of 1st deleted line. */
540: public final int line0;
541:
542: /** Line number of 1st inserted line. */
543: public final int line1;
544:
545: /** Cons an additional entry onto the front of an edit script OLD.
546: LINE0 and LINE1 are the first affected lines in the two files (origin 0).
547: DELETED is the number of lines deleted here from file 0.
548: INSERTED is the number of lines inserted here in file 1.
549: If DELETED is 0 then LINE0 is the number of the line before
550: which the insertion was done; vice versa for INSERTED and LINE1. */
551: change(int line0, int line1, int deleted, int inserted,
552: change old) {
553: this .line0 = line0;
554: this .line1 = line1;
555: this .inserted = inserted;
556: this .deleted = deleted;
557: this .link = old;
558:
559: //System.err.println(line0+","+line1+","+inserted+","+deleted);
560: }
561: }
562:
563: /** Data on one input file being compared.
564: */
565: class file_data {
566: /** Number of elements (lines) in this file. */
567: final int buffered_lines;
568:
569: /** Vector, indexed by line number, containing an equivalence code for
570: each line. It is this vector that is actually compared with that
571: of another file to generate differences. */
572: private final int[] equivs;
573:
574: /** Vector, like the previous one except that
575: the elements for discarded lines have been squeezed out. */
576: final int[] undiscarded;
577:
578: /** Vector mapping virtual line numbers (not counting discarded lines)
579: to real ones (counting those lines). Both are origin-0. */
580: final int[] realindexes;
581:
582: /** Total number of nondiscarded lines. */
583: int nondiscarded_lines;
584:
585: /** Array, indexed by real origin-1 line number,
586: containing true for a line that is an insertion or a deletion.
587: The results of comparison are stored here. */
588: boolean[] changed_flag;
589:
590: file_data(Object[] data, Hashtable h) {
591: buffered_lines = data.length;
592:
593: equivs = new int[buffered_lines];
594: undiscarded = new int[buffered_lines];
595: realindexes = new int[buffered_lines];
596:
597: for (int i = 0; i < data.length; ++i) {
598: Integer ir = (Integer) h.get(data[i]);
599:
600: if (ir == null)
601: h
602: .put(data[i], new Integer(
603: equivs[i] = equiv_max++));
604: else
605: equivs[i] = ir.intValue();
606: }
607: }
608:
609: /** Allocate changed array for the results of comparison. */
610: void clear() {
611: /* Allocate a flag for each line of each file, saying whether that line
612: is an insertion or deletion.
613: Allocate an extra element, always zero, at each end of each vector.
614: */
615: changed_flag = new boolean[buffered_lines + 2];
616: }
617:
618: /** Return equiv_count[I] as the number of lines in this file
619: that fall in equivalence class I.
620: @return the array of equivalence class counts.
621: */
622: int[] equivCount() {
623: int[] equiv_count = new int[equiv_max];
624:
625: for (int i = 0; i < buffered_lines; ++i)
626: ++equiv_count[equivs[i]];
627:
628: return equiv_count;
629: }
630:
631: /** Discard lines that have no matches in another file.
632: A line which is discarded will not be considered by the actual
633: comparison algorithm; it will be as if that line were not in the file.
634: The file's `realindexes' table maps virtual line numbers
635: (which don't count the discarded lines) into real line numbers;
636: this is how the actual comparison algorithm produces results
637: that are comprehensible when the discarded lines are counted.
638: <p>
639: When we discard a line, we also mark it as a deletion or insertion
640: so that it will be printed in the output.
641: @param f the other file
642: */
643: void discard_confusing_lines(file_data f) {
644: clear();
645:
646: /* Set up table of which lines are going to be discarded. */
647: final byte[] discarded = discardable(f.equivCount());
648:
649: /* Don't really discard the provisional lines except when they occur
650: in a run of discardables, with nonprovisionals at the beginning
651: and end. */
652: filterDiscards(discarded);
653:
654: /* Actually discard the lines. */
655: discard(discarded);
656: }
657:
658: /** Mark to be discarded each line that matches no line of another file.
659: If a line matches many lines, mark it as provisionally discardable.
660: @see equivCount()
661: @param counts The count of each equivalence number for the other file.
662: @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
663: for each line
664: */
665: private byte[] discardable(final int[] counts) {
666: final int end = buffered_lines;
667: final byte[] discards = new byte[end];
668: final int[] equivs = this .equivs;
669: int many = 5;
670: int tem = end / 64;
671:
672: /* Multiply MANY by approximate square root of number of lines.
673: That is the threshold for provisionally discardable lines. */
674: while ((tem = tem >> 2) > 0)
675: many *= 2;
676:
677: for (int i = 0; i < end; i++) {
678: int nmatch;
679:
680: if (equivs[i] == 0)
681: continue;
682:
683: nmatch = counts[equivs[i]];
684:
685: if (nmatch == 0)
686: discards[i] = 1;
687: else if (nmatch > many)
688: discards[i] = 2;
689: }
690:
691: return discards;
692: }
693:
694: /** Don't really discard the provisional lines except when they occur
695: in a run of discardables, with nonprovisionals at the beginning
696: and end. */
697: private void filterDiscards(final byte[] discards) {
698: final int end = buffered_lines;
699:
700: for (int i = 0; i < end; i++) {
701: /* Cancel provisional discards not in middle of run of discards. */
702: if (discards[i] == 2)
703: discards[i] = 0;
704: else if (discards[i] != 0) {
705: /* We have found a nonprovisional discard. */
706: int j;
707: int length;
708: int provisional = 0;
709:
710: /* Find end of this run of discardable lines.
711: Count how many are provisionally discardable. */
712: for (j = i; j < end; j++) {
713: if (discards[j] == 0)
714: break;
715:
716: if (discards[j] == 2)
717: ++provisional;
718: }
719:
720: /* Cancel provisional discards at end, and shrink the run. */
721: while ((j > i) && (discards[j - 1] == 2)) {
722: discards[--j] = 0;
723: --provisional;
724: }
725:
726: /* Now we have the length of a run of discardable lines
727: whose first and last are not provisional. */
728: length = j - i;
729:
730: /* If 1/4 of the lines in the run are provisional,
731: cancel discarding of all provisional lines in the run. */
732: if ((provisional * 4) > length) {
733: while (j > i)
734:
735: if (discards[--j] == 2)
736: discards[j] = 0;
737: } else {
738: int consec;
739: int minimum = 1;
740: int tem = length / 4;
741:
742: /* MINIMUM is approximate square root of LENGTH/4.
743: A subrun of two or more provisionals can stand
744: when LENGTH is at least 16.
745: A subrun of 4 or more can stand when LENGTH >= 64. */
746: while ((tem = tem >> 2) > 0)
747: minimum *= 2;
748:
749: minimum++;
750:
751: /* Cancel any subrun of MINIMUM or more provisionals
752: within the larger run. */
753: for (j = 0, consec = 0; j < length; j++)
754: if (discards[i + j] != 2)
755: consec = 0;
756: else if (minimum == ++consec)
757:
758: /* Back up to start of subrun, to cancel it all. */
759: j -= consec;
760: else if (minimum < consec)
761: discards[i + j] = 0;
762:
763: /* Scan from beginning of run
764: until we find 3 or more nonprovisionals in a row
765: or until the first nonprovisional at least 8 lines in.
766: Until that point, cancel any provisionals. */
767: for (j = 0, consec = 0; j < length; j++) {
768: if ((j >= 8) && (discards[i + j] == 1))
769: break;
770:
771: if (discards[i + j] == 2) {
772: consec = 0;
773: discards[i + j] = 0;
774: } else if (discards[i + j] == 0)
775: consec = 0;
776: else
777: consec++;
778:
779: if (consec == 3)
780: break;
781: }
782:
783: /* I advances to the last line of the run. */
784: i += (length - 1);
785:
786: /* Same thing, from end. */
787: for (j = 0, consec = 0; j < length; j++) {
788: if ((j >= 8) && (discards[i - j] == 1))
789: break;
790:
791: if (discards[i - j] == 2) {
792: consec = 0;
793: discards[i - j] = 0;
794: } else if (discards[i - j] == 0)
795: consec = 0;
796: else
797: consec++;
798:
799: if (consec == 3)
800: break;
801: }
802: }
803: }
804: }
805: }
806:
807: /** Actually discard the lines.
808: @param discards flags lines to be discarded
809: */
810: private void discard(final byte[] discards) {
811: final int end = buffered_lines;
812: int j = 0;
813:
814: for (int i = 0; i < end; ++i)
815: if (no_discards || (discards[i] == 0)) {
816: undiscarded[j] = equivs[i];
817: realindexes[j++] = i;
818: } else
819: changed_flag[1 + i] = true;
820:
821: nondiscarded_lines = j;
822: }
823:
824: /** Adjust inserts/deletes of blank lines to join changes
825: as much as possible.
826: We do something when a run of changed lines include a blank
827: line at one end and have an excluded blank line at the other.
828: We are free to choose which blank line is included.
829: `compareseq' always chooses the one at the beginning,
830: but usually it is cleaner to consider the following blank line
831: to be the "change". The only exception is if the preceding blank line
832: would join this change to other changes.
833: @param f the file being compared against
834: */
835: void shift_boundaries(file_data f) {
836: final boolean[] changed = changed_flag;
837: final boolean[] other_changed = f.changed_flag;
838: int i = 0;
839: int j = 0;
840: int i_end = buffered_lines;
841: int preceding = -1;
842: int other_preceding = -1;
843:
844: for (;;) {
845: int start;
846: int end;
847: int other_start;
848:
849: /* Scan forwards to find beginning of another run of changes.
850: Also keep track of the corresponding point in the other file. */
851: while ((i < i_end) && !changed[1 + i]) {
852: while (other_changed[1 + j++])
853:
854: /* Non-corresponding lines in the other file
855: will count as the preceding batch of changes. */
856: other_preceding = j;
857:
858: i++;
859: }
860:
861: if (i == i_end)
862: break;
863:
864: start = i;
865: other_start = j;
866:
867: for (;;) {
868: /* Now find the end of this run of changes. */
869: while ((i < i_end) && changed[1 + i])
870: i++;
871:
872: end = i;
873:
874: /* If the first changed line matches the following unchanged one,
875: and this run does not follow right after a previous run,
876: and there are no lines deleted from the other file here,
877: then classify the first changed line as unchanged
878: and the following line as changed in its place. */
879: /* You might ask, how could this run follow right after another?
880: Only because the previous run was shifted here. */
881: if ((end != i_end)
882: && (equivs[start] == equivs[end])
883: && !other_changed[1 + j]
884: && (end != i_end)
885: && !(((preceding >= 0) && (start == preceding)) || ((other_preceding >= 0) && (other_start == other_preceding)))) {
886: changed[1 + end++] = true;
887: changed[1 + start++] = false;
888: ++i;
889: ++j;
890: } else
891:
892: break;
893: }
894:
895: preceding = i;
896: other_preceding = j;
897: }
898: }
899: }
900: }
|