001: /*
002: * Copyright (C) 2001, 2002 Robert MacGrogan
003: *
004: * This program is free software; you can redistribute it and/or modify
005: * it under the terms of the GNU General Public License as published by
006: * the Free Software Foundation; either version 2 of the License, or
007: * (at your option) any later version.
008: *
009: * This program is distributed in the hope that it will be useful,
010: * but WITHOUT ANY WARRANTY; without even the implied warranty of
011: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
012: * GNU General Public License for more details.
013: *
014: * You should have received a copy of the GNU General Public License
015: * along with this program; if not, write to the Free Software
016: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
017: *
018: *
019: * $Archive: SourceJammer$
020: * $FileName: FileData.java$
021: * $FileID: 4688$
022: *
023: * Last change:
024: * $AuthorName: Rob MacGrogan$
025: * $Date: 5/1/03 6:39 PM$
026: * $Comment: Modify header to show that this module is GPL, not GLPL.$
027: */
028: package JLibDiff;
029:
030: import java.util.Hashtable;
031:
032: /**
033: * Title: $FileName: FileData.java$
034: * @version $VerNum: 3$
035: * @author $AuthorName: Rob MacGrogan$<br><br>
036: *
037: * $Description: $<br>
038: * $KeyWordsOff: $
039: *
040: * This class is used by GnuDiffAlgorithm. Please not that this module is only
041: * released under the GNU Public License.
042: */
043: public class FileData {
044:
045: /** Number of elements (lines) in this file. */
046: private int numLinesInBuffer;
047:
048: /** Vector, indexed by line number, containing an equivalence code for
049: each line. It is this vector that is actually compared with that
050: of another file to generate differences. */
051: private int[] equivs;
052:
053: /** Vector, like the previous one except that
054: the elements for discarded lines have been squeezed out. */
055: private int[] undiscarded;
056:
057: /** Vector mapping virtual line numbers (not counting discarded lines)
058: to real ones (counting those lines). Both are origin-0. */
059: private int[] realindexes;
060:
061: /** Total number of nondiscarded lines. */
062: private int numLinesNotDiscarded;
063:
064: /** Array, indexed by real origin-1 line number,
065: containing true for a line that is an insertion or a deletion.
066: The results of comparison are stored here. */
067: private boolean[] changedFlags;
068:
069: private DiffMaker maker = null;
070:
071: public FileData(Object[] data, Hashtable h, DiffMaker maker) {
072: this .maker = maker;
073: numLinesInBuffer = data.length;
074:
075: equivs = new int[numLinesInBuffer];
076: undiscarded = new int[numLinesInBuffer];
077: realindexes = new int[numLinesInBuffer];
078:
079: for (int i = 0; i < data.length; ++i) {
080: Integer ir = (Integer) h.get(data[i]);
081: if (ir == null) {
082: h.put(data[i], new Integer(equivs[i] = maker
083: .getEquivMax()));
084: maker.incrimentEquivMax();
085: } else {
086: equivs[i] = ir.intValue();
087: }
088: }
089: }
090:
091: /** Allocate changed array for the results of comparison. */
092: void clear() {
093: /* Allocate a flag for each line of each file, saying whether that line
094: is an insertion or deletion.
095: Allocate an extra element, always zero, at each end of each vector.
096: */
097: changedFlags = new boolean[numLinesInBuffer + 2];
098: }
099:
100: /** Return equiv_count[I] as the number of lines in this file
101: that fall in equivalence class I.
102: @return the array of equivalence class counts.
103: */
104: int[] equivCount() {
105: int[] equiv_count = new int[maker.getEquivMax()];
106: for (int i = 0; i < numLinesInBuffer; ++i) {
107: ++equiv_count[equivs[i]];
108: }
109: return equiv_count;
110: }
111:
112: /** Discard lines that have no matches in another file.
113:
114: A line which is discarded will not be considered by the actual
115: comparison algorithm; it will be as if that line were not in the file.
116: The file's `realindexes' table maps virtual line numbers
117: (which don't count the discarded lines) into real line numbers;
118: this is how the actual comparison algorithm produces results
119: that are comprehensible when the discarded lines are counted.
120: <p>
121: When we discard a line, we also mark it as a deletion or insertion
122: so that it will be printed in the output.
123: @param f the other file
124: */
125: void discardConfusingLines(FileData f) {
126: clear();
127: /* Set up table of which lines are going to be discarded. */
128: final byte[] discarded = discardable(f.equivCount());
129:
130: /* Don't really discard the provisional lines except when they occur
131: in a run of discardables, with nonprovisionals at the beginning
132: and end. */
133: filterDiscards(discarded);
134:
135: /* Actually discard the lines. */
136: discard(discarded);
137: }
138:
139: /** Mark to be discarded each line that matches no line of another file.
140: If a line matches many lines, mark it as provisionally discardable.
141: @see equivCount()
142: @param counts The count of each equivalence number for the other file.
143: @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
144: for each line
145: */
146:
147: private byte[] discardable(final int[] counts) {
148: final int end = numLinesInBuffer;
149: final byte[] discards = new byte[end];
150: final int[] equivs = this .equivs;
151: int many = 5;
152: int tem = end / 64;
153:
154: /* Multiply MANY by approximate square root of number of lines.
155: That is the threshold for provisionally discardable lines. */
156: while ((tem = tem >> 2) > 0)
157: many *= 2;
158:
159: for (int i = 0; i < end; i++) {
160: int nmatch;
161: if (equivs[i] == 0)
162: continue;
163: nmatch = counts[equivs[i]];
164: if (nmatch == 0)
165: discards[i] = 1;
166: else if (nmatch > many)
167: discards[i] = 2;
168: }
169: return discards;
170: }
171:
172: /** Don't really discard the provisional lines except when they occur
173: in a run of discardables, with nonprovisionals at the beginning
174: and end. */
175:
176: private void filterDiscards(final byte[] discards) {
177: final int end = numLinesInBuffer;
178:
179: for (int i = 0; i < end; i++) {
180: /* Cancel provisional discards not in middle of run of discards. */
181: if (discards[i] == 2)
182: discards[i] = 0;
183: else if (discards[i] != 0) {
184: /* We have found a nonprovisional discard. */
185: int j;
186: int length;
187: int provisional = 0;
188:
189: /* Find end of this run of discardable lines.
190: Count how many are provisionally discardable. */
191: for (j = i; j < end; j++) {
192: if (discards[j] == 0)
193: break;
194: if (discards[j] == 2)
195: ++provisional;
196: }
197:
198: /* Cancel provisional discards at end, and shrink the run. */
199: while (j > i && discards[j - 1] == 2) {
200: discards[--j] = 0;
201: --provisional;
202: }
203:
204: /* Now we have the length of a run of discardable lines
205: whose first and last are not provisional. */
206: length = j - i;
207:
208: /* If 1/4 of the lines in the run are provisional,
209: cancel discarding of all provisional lines in the run. */
210: if (provisional * 4 > length) {
211: while (j > i)
212: if (discards[--j] == 2)
213: discards[j] = 0;
214: } else {
215: int consec;
216: int minimum = 1;
217: int tem = length / 4;
218:
219: /* MINIMUM is approximate square root of LENGTH/4.
220: A subrun of two or more provisionals can stand
221: when LENGTH is at least 16.
222: A subrun of 4 or more can stand when LENGTH >= 64. */
223: while ((tem = tem >> 2) > 0)
224: minimum *= 2;
225: minimum++;
226:
227: /* Cancel any subrun of MINIMUM or more provisionals
228: within the larger run. */
229: for (j = 0, consec = 0; j < length; j++)
230: if (discards[i + j] != 2)
231: consec = 0;
232: else if (minimum == ++consec)
233: /* Back up to start of subrun, to cancel it all. */
234: j -= consec;
235: else if (minimum < consec)
236: discards[i + j] = 0;
237:
238: /* Scan from beginning of run
239: until we find 3 or more nonprovisionals in a row
240: or until the first nonprovisional at least 8 lines in.
241: Until that point, cancel any provisionals. */
242: for (j = 0, consec = 0; j < length; j++) {
243: if (j >= 8 && discards[i + j] == 1)
244: break;
245: if (discards[i + j] == 2) {
246: consec = 0;
247: discards[i + j] = 0;
248: } else if (discards[i + j] == 0)
249: consec = 0;
250: else
251: consec++;
252: if (consec == 3)
253: break;
254: }
255:
256: /* I advances to the last line of the run. */
257: i += length - 1;
258:
259: /* Same thing, from end. */
260: for (j = 0, consec = 0; j < length; j++) {
261: if (j >= 8 && discards[i - j] == 1)
262: break;
263: if (discards[i - j] == 2) {
264: consec = 0;
265: discards[i - j] = 0;
266: } else if (discards[i - j] == 0)
267: consec = 0;
268: else
269: consec++;
270: if (consec == 3)
271: break;
272: }
273: }
274: }
275: }
276: }
277:
278: /** Actually discard the lines.
279: @param discards flags lines to be discarded
280: */
281: private void discard(final byte[] discards) {
282: final int end = numLinesInBuffer;
283: int j = 0;
284: for (int i = 0; i < end; ++i)
285: if (maker.isNoDiscards() || discards[i] == 0) {
286: undiscarded[j] = equivs[i];
287: realindexes[j++] = i;
288: } else
289: changedFlags[1 + i] = true;
290: numLinesNotDiscarded = j;
291: }
292:
293: /** Adjust inserts/deletes of blank lines to join changes
294: as much as possible.
295:
296: We do something when a run of changed lines include a blank
297: line at one end and have an excluded blank line at the other.
298: We are free to choose which blank line is included.
299: `compareseq' always chooses the one at the beginning,
300: but usually it is cleaner to consider the following blank line
301: to be the "change". The only exception is if the preceding blank line
302: would join this change to other changes.
303: @param f the file being compared against
304: */
305:
306: void shift_boundaries(FileData f) {
307: final boolean[] changed = changedFlags;
308: final boolean[] other_changed = f.changedFlags;
309: int i = 0;
310: int j = 0;
311: int i_end = numLinesInBuffer;
312: int preceding = -1;
313: int other_preceding = -1;
314:
315: for (;;) {
316: int start, end, other_start;
317:
318: /* Scan forwards to find beginning of another run of changes.
319: Also keep track of the corresponding point in the other file. */
320:
321: while (i < i_end && !changed[1 + i]) {
322: while (other_changed[1 + j++])
323: /* Non-corresponding lines in the other file
324: will count as the preceding batch of changes. */
325: other_preceding = j;
326: i++;
327: }
328:
329: if (i == i_end)
330: break;
331:
332: start = i;
333: other_start = j;
334:
335: for (;;) {
336: /* Now find the end of this run of changes. */
337:
338: while (i < i_end && changed[1 + i])
339: i++;
340: end = i;
341:
342: /* If the first changed line matches the following unchanged one,
343: and this run does not follow right after a previous run,
344: and there are no lines deleted from the other file here,
345: then classify the first changed line as unchanged
346: and the following line as changed in its place. */
347:
348: /* You might ask, how could this run follow right after another?
349: Only because the previous run was shifted here. */
350:
351: if (end != i_end
352: && equivs[start] == equivs[end]
353: && !other_changed[1 + j]
354: && end != i_end
355: && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) {
356: changed[1 + end++] = true;
357: changed[1 + start++] = false;
358: ++i;
359: /* Since one line-that-matches is now before this run
360: instead of after, we must advance in the other file
361: to keep in synch. */
362: ++j;
363: } else
364: break;
365: }
366:
367: preceding = i;
368: other_preceding = j;
369: }
370: }
371:
372: /**
373: * Returns the undiscarded.
374: * @return int[]
375: */
376: public int[] getUndiscarded() {
377: return undiscarded;
378: }
379:
380: /**
381: * Returns the numLinesNotDiscarded.
382: * @return int
383: */
384: public int getNumLinesNotDiscarded() {
385: return numLinesNotDiscarded;
386: }
387:
388: /**
389: * Returns the changedFlags.
390: * @return boolean[]
391: */
392: public boolean[] getChangedFlags() {
393: return changedFlags;
394: }
395:
396: /**
397: * Returns the realindexes.
398: * @return int[]
399: */
400: public int[] getRealindexes() {
401: return realindexes;
402: }
403:
404: /**
405: * Returns the numLinesInBuffer.
406: * @return int
407: */
408: public int getNumLinesInBuffer() {
409: return numLinesInBuffer;
410: }
411:
412: }
|