001: /*
002: * ====================================================================
003: *
004: * The Apache Software License, Version 1.1
005: *
006: * Copyright (c) 1999-2003 The Apache Software Foundation.
007: * All rights reserved.
008: *
009: * Redistribution and use in source and binary forms, with or without
010: * modification, are permitted provided that the following conditions
011: * are met:
012: *
013: * 1. Redistributions of source code must retain the above copyright
014: * notice, this list of conditions and the following disclaimer.
015: *
016: * 2. Redistributions in binary form must reproduce the above copyright
017: * notice, this list of conditions and the following disclaimer in
018: * the documentation and/or other materials provided with the
019: * distribution.
020: *
021: * 3. The end-user documentation included with the redistribution, if
022: * any, must include the following acknowledgement:
023: * "This product includes software developed by the
024: * Apache Software Foundation (http://www.apache.org/)."
025: * Alternately, this acknowledgement may appear in the software itself,
026: * if and wherever such third-party acknowledgements normally appear.
027: *
028: * 4. The names "The Jakarta Project", "Commons", and "Apache Software
029: * Foundation" must not be used to endorse or promote products derived
030: * from this software without prior written permission. For written
031: * permission, please contact apache@apache.org.
032: *
033: * 5. Products derived from this software may not be called "Apache"
034: * nor may "Apache" appear in their names without prior written
035: * permission of the Apache Software Foundation.
036: *
037: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
038: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
039: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
040: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
041: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
042: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
043: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
044: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
045: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
046: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
047: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
048: * SUCH DAMAGE.
049: * ====================================================================
050: *
051: * This software consists of voluntary contributions made by many
052: * individuals on behalf of the Apache Software Foundation. For more
053: * information on the Apache Software Foundation, please see
054: * <http://www.apache.org/>.
055: *
056: */
057:
058: package org.apache.commons.jrcs.diff;
059:
060: import java.util.Arrays;
061: import java.util.HashMap;
062: import java.util.HashSet;
063: import java.util.Map;
064: import java.util.Set;
065:
066: /**
067: * Implements a simple differencing algortithm.
068: * <p>
069: *
070: * @date $Date: 2006-04-13 08:25:49 -0400 (Thu, 13 Apr 2006) $
071: * @version $Revision: 7756 $
072: * @author <a href="mailto:juanco@suigeneris.org">Juanco Anez</a>
073: * <p>
074: * <b>Overview of Algorithm</b>
075: * </p>
076: * <p>
077: * <i>by <a href='http://www.topmeadow.net/bwm'> bwm</a>
078: * </p>
079: * <p>
080: * The algorithm is optimised for situations where the input sequences
081: * have few repeated objects. If it is given input with many repeated
082: * objects it will report sub-optimal changes. However, given
083: * appropriate input, it is fast, and linear in memory usage.
084: * </p>
085: * <p>
086: * The algorithm consists of the following steps:
087: * </p>
088: * <ul>
089: * <li>compute an equivalence set for the input data</li>
090: * <li>translate each element of the orginal and revised input
091: * sequences to a member of the equivalence set </li>
092: * <li>match the the input sequences to determine the deltas, i.e. the
093: * differences between the original and revised sequences.</li>
094: * </ul>
095: * <p>
096: * The first step is to compute a an equivalence set for the input data.
097: * The equivalence set is computed from objects that are in the original
098: * input sequence
099: * </p>
100: *
101: * <pre>
102: * eq(x) = the index of the first occurence of x in the original sequence.
103: * </pre>
104: *
105: * <p>
106: * With this equivalence function, the algorithm can compare integers
107: * rather than strings, which is considerably more efficient.
108: * </p>
109: * <p>
110: * The second step is to compute the datastructure on which the
111: * algorithm will operate. Having computed the equivalence function in
112: * the previous step, we can compute two arrays where indx[i] =
113: * eqs(orig[i]) and jndx[i] = eqs(rev[i]). The algorithm can now operate
114: * on indx and jndx instead of orig and rev. Thus, comparisons are then
115: * on O(int == int) instead of O(Object.equals(Object)).
116: * </p>
117: * <p>
118: * The algorithm now matches indx and jndx. Whilst indx[i] == jndx[i] it
119: * skips matching objects in the sequence. In seeking to match objects
120: * in the input sequence it assumes that each object is likely to be
121: * unique. It uses the known characteristics of the unique equivalence
122: * function. It can tell from the eq value if this object appeared in
123: * the other sequence at all. If it did not, there is no point in
124: * searching for a match.
125: * </p>
126: * <p>
127: * Recall that the eq function value is the index earliest occurrence in
128: * the orig sequence. This information is used to search efficiently for
129: * the next match. The algorithm is perfect when all input objects are
130: * unique, but degrades when input objects are not unique. When input
131: * objects are not unique an optimal match may not be found, but a
132: * correct match will be.
133: * </p>
134: * <p>
135: * Having identified common matching objects in the orig and revised
136: * sequences, the differences between them are easily computed.
137: * </p>
138: * @see Delta
139: * @see Revision Modifications: 27/Apr/2003 bwm Added some comments whilst
140: * trying to figure out the algorithm 03 May 2003 bwm Created this
141: * implementation class by refactoring it out of the Diff class to enable
142: * plug in difference algorithms
143: */
144: public class SimpleDiff implements DiffAlgorithm {
145:
146: static final int NOT_FOUND_i = -2;
147:
148: static final int NOT_FOUND_j = -1;
149:
150: static final int EOS = Integer.MAX_VALUE;
151:
152: public SimpleDiff() {
153: }
154:
155: protected int scan(int[] ndx, int i, int target) {
156: while (ndx[i] < target) {
157: i++;
158: }
159: return i;
160: }
161:
162: /**
163: * Compute the difference between original and revised sequences.
164: *
165: * @param orig
166: * The original sequence.
167: * @param rev
168: * The revised sequence to be compared with the original.
169: * @return A Revision object describing the differences.
170: * @throws DifferenciationFailedException
171: * if the diff could not be computed.
172: */
173: public Revision diff(Object[] orig, Object[] rev)
174: throws DifferentiationFailedException {
175: // create map eqs, such that for each item in both orig and rev
176: // eqs(item) = firstOccurrence(item, orig);
177: Map eqs = buildEqSet(orig, rev);
178:
179: // create an array such that
180: // indx[i] = NOT_FOUND_i if orig[i] is not in rev
181: // indx[i] = firstOccurrence(orig[i], orig)
182: int[] indx = buildIndex(eqs, orig, NOT_FOUND_i);
183:
184: // create an array such that
185: // jndx[j] = NOT_FOUND_j if orig[j] is not in rev
186: // jndx[j] = firstOccurrence(rev[j], orig)
187: int[] jndx = buildIndex(eqs, rev, NOT_FOUND_j);
188:
189: // what in effect has been done is to build a unique hash
190: // for each item that is in both orig and rev
191: // and to label each item in orig and new with that hash value
192: // or a marker that the item is not common to both.
193:
194: eqs = null; // let gc know we're done with this
195:
196: Revision deltas = new Revision(); // !!! new Revision()
197: int i = 0;
198: int j = 0;
199:
200: // skip matching
201: // skip leading items that are equal
202: // could be written
203: // for (i=0; indx[i] != EOS && indx[i] == jndx[i]; i++);
204: // j = i;
205: for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++) {
206: /* void */
207: }
208:
209: while (indx[i] != jndx[j]) { // only equal if both == EOS
210: // they are different
211: int ia = i;
212: int ja = j;
213:
214: // size of this delta
215: do {
216: // look down rev for a match
217: // stop at a match
218: // or if the FO(rev[j]) > FO(orig[i])
219: // or at the end
220: while (jndx[j] < 0 || jndx[j] < indx[i]) {
221: j++;
222: }
223: // look down orig for a match
224: // stop at a match
225: // or if the FO(orig[i]) > FO(rev[j])
226: // or at the end
227: while (indx[i] < 0 || indx[i] < jndx[j]) {
228: i++;
229: }
230:
231: // this doesn't do a compare each line with each other line
232: // so it won't find all matching lines
233: } while (indx[i] != jndx[j]);
234:
235: // on exit we have a match
236:
237: // they are equal, reverse any exedent matches
238: // it is possible to overshoot, so count back matching items
239: while (i > ia && j > ja && indx[i - 1] == jndx[j - 1]) {
240: --i;
241: --j;
242: }
243:
244: deltas.addDelta(Delta.newDelta(new Chunk(orig, ia, i - ia),
245: new Chunk(rev, ja, j - ja)));
246: // skip matching
247: for (; indx[i] != EOS && indx[i] == jndx[j]; i++, j++) {
248: /* void */
249: }
250: }
251: return deltas;
252: }
253:
254: /**
255: * create a <code>Map</code> from each common item in orig and rev to the
256: * index of its first occurrence in orig
257: *
258: * @param orig
259: * the original sequence of items
260: * @param rev
261: * the revised sequence of items
262: */
263: protected Map buildEqSet(Object[] orig, Object[] rev) {
264: // construct a set of the objects that orig and rev have in common
265:
266: // first construct a set containing all the elements in orig
267: Set items = new HashSet(Arrays.asList(orig));
268:
269: // then remove all those not in rev
270: items.retainAll(Arrays.asList(rev));
271:
272: Map eqs = new HashMap();
273: for (int i = 0; i < orig.length; i++) {
274: // if its a common item and hasn't been found before
275: if (items.contains(orig[i])) {
276: // add it to the map
277: eqs.put(orig[i], new Integer(i));
278: // and make sure its not considered again
279: items.remove(orig[i]);
280: }
281: }
282: return eqs;
283: }
284:
285: /**
286: * build a an array such each a[i] = eqs([i]) or NF if eqs([i]) undefined
287: *
288: * @param eqs
289: * a mapping from Object to Integer
290: * @param seq
291: * a sequence of objects
292: * @param NF
293: * the not found marker
294: */
295: protected int[] buildIndex(Map eqs, Object[] seq, int NF) {
296: int[] result = new int[seq.length + 1];
297: for (int i = 0; i < seq.length; i++) {
298: Integer value = (Integer) eqs.get(seq[i]);
299: if (value == null || value.intValue() < 0) {
300: result[i] = NF;
301: } else {
302: result[i] = value.intValue();
303: }
304: }
305: result[seq.length] = EOS;
306: return result;
307: }
308:
309: }
|