001: /*
002: * Copyright (c) 1998-2006 Caucho Technology -- all rights reserved
003: *
004: * This file is part of Resin(R) Open Source
005: *
006: * Each copy or derived work must preserve the copyright notice and this
007: * notice unmodified.
008: *
009: * Resin Open Source is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU General Public License as published by
011: * the Free Software Foundation; either version 2 of the License, or
012: * (at your option) any later version.
013: *
014: * Resin Open Source is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
017: * of NON-INFRINGEMENT. See the GNU General Public License for more
018: * details.
019: *
020: * You should have received a copy of the GNU General Public License
021: * along with Resin Open Source; if not, write to the
022: * Free SoftwareFoundation, Inc.
023: * 59 Temple Place, Suite 330
024: * Boston, MA 02111-1307 USA
025: *
026: * @author Scott Ferguson
027: */
028:
029: package com.caucho.util;
030:
031: import com.caucho.vfs.*;
032:
033: import java.io.*;
034: import java.util.*;
035:
036: public class BinaryHashDiff {
037: //private static long PRIME = (1L << 61) - 1;
038: // private static long PRIME = (1L << 31) - 1;
039:
040: // Prime table at http://primes.utm.edu/lists/2small/0bit.html
041: // 2^54 to avoid long overflow when multiplying byte data and MUL
042: private static long PRIME = (1L << 54) - 33;
043: private static long MUL = 251;
044:
045: int _oldLength;
046: ArrayList<TempBuffer> _oldBuffers;
047: byte[][] _oldBytes;
048:
049: int _copyMin = 8;
050: int _chunkSize = 4;
051: int _chunkCount;
052:
053: long _hashFactor;
054:
055: long[] _hashArray;
056: int[] _suffixArray;
057:
058: int[] _charMin;
059: int[] _charMax;
060:
061: public void delta(OutputStream out, InputStream oldFile,
062: InputStream newFile) throws IOException {
063: readBuffers(oldFile);
064:
065: processOrigFile();
066:
067: processNewFile(out, newFile);
068: }
069:
070: protected void add(OutputStream out, byte[] buffer, int offset,
071: int length) throws IOException {
072: System.out.println("ADD(" + length + "): '"
073: + new String(buffer, offset, length) + "'");
074: }
075:
076: protected void copy(OutputStream out, int offset, int length)
077: throws IOException {
078: System.out.println("COPY(" + offset + "," + length + ")");
079: }
080:
081: private void readBuffers(InputStream oldFile) throws IOException {
082: _oldBuffers = new ArrayList<TempBuffer>();
083: _oldLength = 0;
084:
085: while (true) {
086: TempBuffer buf = TempBuffer.allocate();
087: byte[] buffer = buf.getBuffer();
088:
089: int sublen = readAll(oldFile, buffer);
090:
091: if (sublen < 0) {
092: TempBuffer.free(buf);
093: break;
094: }
095:
096: _oldLength += sublen;
097: _oldBuffers.add(buf);
098:
099: if (sublen < buffer.length)
100: break;
101: }
102:
103: _oldBytes = new byte[_oldBuffers.size()][];
104: for (int i = 0; i < _oldBytes.length; i++) {
105: _oldBytes[i] = _oldBuffers.get(i).getBuffer();
106: }
107: }
108:
109: private void processOrigFile() {
110: long factor = 1;
111:
112: for (int i = 1; i < _chunkSize; i++) {
113: factor = (MUL * factor) % PRIME;
114: }
115:
116: _hashFactor = factor;
117:
118: _chunkCount = _oldLength / _chunkSize;
119:
120: _hashArray = new long[_chunkCount];
121: _suffixArray = new int[_chunkCount];
122:
123: int chunkSize = _chunkSize;
124: int chunkCount = _chunkCount;
125:
126: long[] hashArray = _hashArray;
127: int[] suffixArray = _suffixArray;
128:
129: byte[][] data = _oldBytes;
130:
131: for (int i = 0; i < chunkCount; i++) {
132: int offset = i * chunkSize;
133:
134: suffixArray[i] = i;
135:
136: byte[] buffer = data[offset / TempBuffer.SIZE];
137:
138: offset = offset % TempBuffer.SIZE;
139:
140: long hash = 0;
141: for (int k = 0; k < chunkSize; k++) {
142: hash = hash(hash, buffer[offset + k], 0, factor);
143: }
144:
145: hashArray[i] = hash;
146: }
147:
148: sort(suffixArray, hashArray);
149: }
150:
151: private static void sort(int[] suffixArray, long[] hashArray) {
152: int length = suffixArray.length;
153: int dataLength = length;
154:
155: sort(suffixArray, hashArray, dataLength, 0, length);
156: }
157:
158: private static void sort(int[] suffixArray, long[] hashArray,
159: int dataLength, int min, int max) {
160: int delta = max - min;
161:
162: if (delta < 2) {
163: } else if (delta == 2) {
164: int aIndex = suffixArray[min];
165: int bIndex = suffixArray[min + 1];
166:
167: long aValue = hashArray[aIndex];
168: long bValue = hashArray[bIndex];
169:
170: if (bValue < aValue) {
171: suffixArray[min] = bIndex;
172: suffixArray[min + 1] = aIndex;
173: }
174: } else {
175: int pivotIndex = suffixArray[min];
176: long pivotValue = hashArray[pivotIndex];
177: int pivotMax = max;
178:
179: int pivot = min;
180:
181: while (pivot + 1 < pivotMax) {
182: long value = hashArray[suffixArray[pivot + 1]];
183:
184: if (value < pivotValue) {
185: suffixArray[pivot] = suffixArray[pivot + 1];
186: suffixArray[pivot + 1] = pivotIndex;
187:
188: pivot += 1;
189: } else {
190: int temp = suffixArray[pivotMax - 1];
191: suffixArray[pivotMax - 1] = suffixArray[pivot + 1];
192: suffixArray[pivot + 1] = temp;
193:
194: pivotMax -= 1;
195: }
196: }
197:
198: if (min < pivot) {
199: sort(suffixArray, hashArray, dataLength, min, pivot);
200: sort(suffixArray, hashArray, dataLength, pivot, max);
201: } else {
202: sort(suffixArray, hashArray, dataLength, pivot + 1, max);
203: }
204: }
205: }
206:
207: private static int suffixCompareTo(int a, int b, long[] hashArray,
208: int length) {
209: int sublen = length - a;
210:
211: if (length - b < sublen)
212: sublen = length - b;
213:
214: for (int i = 0; i < sublen; i++) {
215: long a1 = hashArray[a + i];
216: long b1 = hashArray[b + i];
217:
218: if (a1 < b1)
219: return -1;
220: else if (b1 < a1)
221: return 1;
222: }
223:
224: return 0;
225: }
226:
227: private void processNewFile(OutputStream out, InputStream newIn)
228: throws IOException {
229: TempBuffer tempBuf = TempBuffer.allocate();
230: byte[] buffer = tempBuf.getBuffer();
231: ;
232:
233: int length = readAll(newIn, buffer);
234:
235: if (length < _chunkSize)
236: return;
237:
238: int prevOffset = 0;
239: int offset = 0;
240:
241: int chunkSize = _chunkSize;
242:
243: byte[][] data = _oldBytes;
244: int[] suffixArray = _suffixArray;
245:
246: long[] hashArray = _hashArray;
247: long hashFactor = _hashFactor;
248:
249: long hash = 0;
250:
251: for (int i = 0; i < chunkSize - 1; i++) {
252: hash = hash(hash, buffer[i], 0, hashFactor);
253: }
254:
255: loop: for (; offset + chunkSize < length; offset += 1) {
256: byte oldValue = 0;
257:
258: if (offset > 0)
259: oldValue = buffer[offset - 1];
260:
261: hash = hash(hash, buffer[offset + chunkSize - 1], oldValue,
262: hashFactor);
263:
264: int suffixIndex = findBlock(hash, suffixArray, hashArray);
265:
266: if (suffixIndex < 0)
267: continue;
268:
269: int suffixOffset = suffixArray[suffixIndex] + 1;
270: int dataOffset = offset + chunkSize;
271: long prevHash = hash;
272:
273: loop_match: for (; dataOffset + chunkSize < length
274: && suffixOffset < hashArray.length; dataOffset += chunkSize, suffixOffset += 1) {
275: long hash2 = 0;
276:
277: for (int i = 0; i < chunkSize; i++) {
278: hash2 = hash(hash2, buffer[dataOffset + i], 0,
279: hashFactor);
280: }
281:
282: if (hash2 == hashArray[suffixOffset]) {
283: } else if (hash2 < hashArray[suffixOffset]) {
284: int delta = suffixOffset - suffixArray[suffixIndex];
285:
286: for (int i = suffixIndex - 1; i >= 0
287: && hashArray[suffixArray[i]] == hash
288: && suffixArray[i] + delta < hashArray.length
289: && hashArray[suffixArray[i] + delta - 1] == prevHash; i--) {
290: if (hashArray[suffixArray[i] + delta] == hash2) {
291: suffixIndex = i;
292: suffixOffset = suffixArray[i] + delta;
293: prevHash = hash2;
294:
295: // XXX: also need to revalidate in between
296: continue loop_match;
297: }
298: }
299:
300: break;
301: } else {
302: int delta = suffixOffset - suffixArray[suffixIndex];
303:
304: for (int i = suffixIndex + 1; i < suffixArray.length
305: && hashArray[suffixArray[i]] == hash
306: && suffixArray[i] + delta < hashArray.length
307: && hashArray[suffixArray[i] + delta - 1] == prevHash; i++) {
308: if (hashArray[suffixArray[i] + delta] == hash2) {
309: suffixIndex = i;
310: suffixOffset = suffixArray[i] + delta;
311: prevHash = hash2;
312: // XXX: also need to revalidate in between
313:
314: continue loop_match;
315: }
316: }
317:
318: break;
319: }
320:
321: prevHash = hash2;
322: }
323:
324: if (dataOffset - offset >= _copyMin) {
325: if (prevOffset < offset)
326: add(out, buffer, prevOffset, offset - prevOffset);
327:
328: copy(out, suffixArray[suffixIndex] * _chunkSize,
329: dataOffset - offset);
330:
331: hash = 0;
332: offset = dataOffset - 1;
333: for (int i = 0; i < _chunkSize; i++)
334: hash = hash(hash, buffer[offset + i], 0, hashFactor);
335:
336: prevOffset = offset + 1;
337: }
338: }
339:
340: if (prevOffset < length)
341: add(out, buffer, prevOffset, length - prevOffset);
342:
343: TempBuffer.free(tempBuf);
344: }
345:
346: private int findBlock(long hash, int[] suffixArray, long[] hashArray) {
347: int min = 0;
348: int max = suffixArray.length;
349:
350: while (min < max) {
351: int pivot = (min + max) / 2;
352:
353: long hashValue = hashArray[suffixArray[pivot]];
354:
355: if (hash == hashValue)
356: return pivot;
357: else if (hash < hashValue)
358: max = pivot;
359: else
360: min = pivot + 1;
361: }
362:
363: return -1;
364: }
365:
366: private int readAll(InputStream is, byte[] buffer)
367: throws IOException {
368: int offset = 0;
369:
370: while (offset < buffer.length) {
371: int sublen = buffer.length - offset;
372:
373: sublen = is.read(buffer, offset, sublen);
374:
375: if (sublen < 0)
376: return offset > 0 ? offset : -1;
377:
378: offset += sublen;
379: }
380:
381: return buffer.length;
382: }
383:
384: public void testHash() {
385: Random random = new Random();
386:
387: byte[] data = new byte[8192];
388:
389: for (int k = 0; k < 256 * 1024 * 1024; k++) {
390: int size = random.nextInt(256);
391:
392: if (size < 1)
393: size = 1;
394:
395: long factor = 1;
396:
397: for (int i = 1; i < size; i++) {
398: factor = (MUL * factor) % PRIME;
399: }
400:
401: byte d0 = (byte) random.nextInt();
402:
403: for (int i = 0; i < size; i++) {
404: data[i] = (byte) random.nextInt();
405: }
406:
407: long hash = 0;
408: hash = hash(hash, d0, 0, factor);
409: for (int i = 0; i < size - 1; i++) {
410: hash = hash(hash, data[i], 0, factor);
411: }
412: hash = hash(hash, data[size - 1], d0, factor);
413:
414: long oldHash = hash;
415:
416: hash = 0;
417: for (int i = 0; i < size; i++) {
418: hash = hash(hash, data[i], 0, factor);
419: }
420:
421: long newHash = hash;
422:
423: if (oldHash != newHash)
424: System.out.println("OLD: " + oldHash + " " + newHash);
425: }
426: }
427:
428: // rabin-karp hash (PRIME = (1L << 54) - 33, MUL = 13);
429: private static long hash(long hash, int newData, int oldData,
430: long factor) {
431: oldData = oldData & 0xff;
432: newData = newData & 0xff;
433:
434: long old = ((PRIME << 8) + hash - factor * oldData) % PRIME;
435:
436: return (MUL * old + newData) % PRIME;
437: }
438:
439: }
|