001: package st.ata.util;
002:
003: import java.util.Hashtable;
004:
005: /**
006:
007: <p> This class provides methods that construct fingerprints of strings
008: of bytes via operations in <i>GF[2^d]</i> for <i>0 < d <= 64</i>.
009: <i>GF[2^d]</i> is represented as the set of polynomials of degree
010: <i>d</i> with coefficients in <i>Z(2)</i>, modulo an irreducible
011: polynomial <i>P</i> of degree <i>d</i>. The representation of
012: polynomials is as an unsigned binary number in which the least
013: significant exponent is kept in the most significant bit.
014:
015: <p> Let S be a string of bytes and <i>g(S)</i> the string obtained by
016: taking the byte <code>0x01</code> followed by eight <code>0x00</code>
017: bytes followed by <code>S</code>. Let <i>f(S)</i> be the polynomial
018: associated to the string <i>S</i> viewed as a polynomial with
019: coefficients in the field <i>Z(2)</i>. The fingerprint of S is simply
020: the value <i>f(g(S))</i> modulo <i>P</i>. Because polynomials are
021: represented with the least significant coefficient in the most
022: significant bit, fingerprints of degree <i>d</i> are stored in the
023: <code>d</code> <strong>most</code> significant bits of a long word.
024:
025: <p> Fingerprints can be used as a probably unique id for the input
026: string. More precisely, if <i>P</i> is chosen at random among
027: irreducible polynomials of degree <i>d</i>, then the probability that
028: any two strings <i>A</i> and <i>B</i> have the same fingerprint is
029: less than <i>max(|A|,|B|)/2^(d+1)</i> where <i>|A|</i> is the length
030: of A in bits.
031:
032: <p> The routines named <code>extend[8]</code> and <code>fp[8]</code>
033: return reduced results, while <code>extend_[byte/char/int/long]</code>
034: do not. An <em>un</em>reduced result is a number that is equal (mod
035: </code>polynomial</code> to the desired fingerprint but may have
036: degree <code>degree</code> or higher. The method <code>reduce</code>
037: reduces such a result to a polynomial of degree less than
038: <code>degree</code>. Obtaining reduced results takes longer than
039: obtaining unreduced results; thus, when fingerprinting long strings,
040: it's better to obtain irreduced results inside the fingerprinting loop
041: and use <code>reduce</code> to reduce to a fingerprint after the loop.
042:
043: */
044:
045: // Tested by: TestFPGenerator
046: @SuppressWarnings("unchecked")
047: public final class FPGenerator {
048:
049: /** Return a fingerprint generator. The fingerprints generated
050: will have degree <code>degree</code> and will be generated by
051: <code>polynomial</code>. If a generator based on
052: <code>polynomial</code> has already been created, it will be
053: returned. Requires that <code>polynomial</code> is an
054: irreducible polynomial of degree <code>degree</code> (the
055: array <code>polynomials</code> contains some irreducible
056: polynomials). */
057: public static FPGenerator make(long polynomial, int degree) {
058: Long l = new Long(polynomial);
059: FPGenerator fpgen = (FPGenerator) generators.get(l);
060: if (fpgen == null) {
061: fpgen = new FPGenerator(polynomial, degree);
062: generators.put(l, fpgen);
063: }
064: return fpgen;
065: }
066:
067: private static final Hashtable generators = new Hashtable(10);
068:
069: private static final long zero = 0;
070: private static final long one = 0x8000000000000000L;
071:
072: /** Return a value equal (mod <code>polynomial</code>) to
073: <code>fp</code> and of degree less than <code>degree</code>. */
074: public long reduce(long fp) {
075: int N = (8 - degree / 8);
076: long local = (N == 8 ? 0 : fp & (-1L << 8 * N));
077: long temp = zero;
078: for (int i = 0; i < N; i++) {
079: temp ^= ByteModTable[8 + i][((int) fp) & 0xff];
080: fp >>>= 8;
081: }
082: ;
083: return local ^ temp;
084: }
085:
086: /** Extends <code>f</code> with lower eight bits of <code>v</code>
087: with<em>out</em> full reduction. In other words, returns a
088: polynomial that is equal (mod <code>polynomial</code>) to the
089: desired fingerprint but may be of higher degree than the
090: desired fingerprint. */
091: public long extend_byte(long f, int v) {
092: f ^= (0xff & v);
093: int i = (int) f;
094: long result = (f >>> 8);
095: result ^= ByteModTable[7][i & 0xff];
096: return result;
097: }
098:
099: /** Extends <code>f</code> with lower sixteen bits of <code>v</code>.
100: Does not reduce. */
101: public long extend_char(long f, int v) {
102: f ^= (0xffff & v);
103: int i = (int) f;
104: long result = (f >>> 16);
105: result ^= ByteModTable[6][i & 0xff];
106: i >>>= 8;
107: result ^= ByteModTable[7][i & 0xff];
108: return result;
109: }
110:
111: /** Extends <code>f</code> with (all bits of) <code>v</code>.
112: Does not reduce. */
113: public long extend_int(long f, int v) {
114: f ^= (0xffffffffL & v);
115: int i = (int) f;
116: long result = (f >>> 32);
117: result ^= ByteModTable[4][i & 0xff];
118: i >>>= 8;
119: result ^= ByteModTable[5][i & 0xff];
120: i >>>= 8;
121: result ^= ByteModTable[6][i & 0xff];
122: i >>>= 8;
123: result ^= ByteModTable[7][i & 0xff];
124: return result;
125: }
126:
127: /** Extends <code>f</code> with <code>v</code>.
128: Does not reduce. */
129: public long extend_long(long f, long v) {
130: f ^= v;
131: long result = ByteModTable[0][(int) (f & 0xff)];
132: f >>>= 8;
133: result ^= ByteModTable[1][(int) (f & 0xff)];
134: f >>>= 8;
135: result ^= ByteModTable[2][(int) (f & 0xff)];
136: f >>>= 8;
137: result ^= ByteModTable[3][(int) (f & 0xff)];
138: f >>>= 8;
139: result ^= ByteModTable[4][(int) (f & 0xff)];
140: f >>>= 8;
141: result ^= ByteModTable[5][(int) (f & 0xff)];
142: f >>>= 8;
143: result ^= ByteModTable[6][(int) (f & 0xff)];
144: f >>>= 8;
145: result ^= ByteModTable[7][(int) (f & 0xff)];
146: return result;
147: }
148:
149: /** Compute fingerprint of "n" bytes of "buf" starting from
150: "buf[start]". Requires "[start, start+n)" is in bounds. */
151: public long fp(byte[] buf, int start, int n) {
152: return extend(empty, buf, start, n);
153: }
154:
155: /** Compute fingerprint of (all bits of) "n" characters of "buf"
156: starting from "buf[i]". Requires "[i, i+n)" is in bounds. */
157: public long fp(char[] buf, int start, int n) {
158: return extend(empty, buf, start, n);
159: }
160:
161: // COMMENTED OUT TO REMOVE Dependency on st.ata.util.Text
162: // /** Compute fingerprint of (all bits of) <code>t</code> */
163: // public long fp(Text t) {
164: // return extend(empty, t);
165: // }
166: /** Compute fingerprint of (all bits of) the characters of "s". */
167: public long fp(CharSequence s) {
168: return extend(empty, s);
169: }
170:
171: /** Compute fingerprint of (all bits of) "n" characters of "buf"
172: starting from "buf[i]". Requires "[i, i+n)" is in bounds. */
173: public long fp(int[] buf, int start, int n) {
174: return extend(empty, buf, start, n);
175: }
176:
177: /** Compute fingerprint of (all bits of) "n" characters of "buf"
178: starting from "buf[i]". Requires "[i, i+n)" is in bounds. */
179: public long fp(long[] buf, int start, int n) {
180: return extend(empty, buf, start, n);
181: }
182:
183: /** Compute fingerprint of the lower eight bits of the characters
184: of "s". */
185: public long fp8(String s) {
186: return extend8(empty, s);
187: }
188:
189: /** Compute fingerprint of the lower eight bits of "n" characters
190: of "buf" starting from "buf[i]". Requires "[i, i+n)" is in
191: bounds. */
192: public long fp8(char[] buf, int start, int n) {
193: return extend8(empty, buf, start, n);
194: }
195:
196: /** Extends fingerprint <code>f</code> by adding the low eight
197: bits of "b". */
198: public long extend(long f, byte v) {
199: return reduce(extend_byte(f, v));
200: }
201:
202: /** Extends fingerprint <code>f</code> by adding (all bits of)
203: "v". */
204: public long extend(long f, char v) {
205: return reduce(extend_char(f, v));
206: }
207:
208: /** Extends fingerprint <code>f</code> by adding (all bits of)
209: "v". */
210: public long extend(long f, int v) {
211: return reduce(extend_int(f, v));
212: }
213:
214: /** Extends fingerprint <code>f</code> by adding (all bits of)
215: "v". */
216: public long extend(long f, long v) {
217: return reduce(extend_long(f, v));
218: }
219:
220: /** Extends fingerprint <code>f</code> by adding "n" bytes of
221: "buf" starting from "buf[start]".
222: Result is reduced.
223: Requires "[i, i+n)" is in bounds. */
224: public long extend(long f, byte[] buf, int start, int n) {
225: for (int i = 0; i < n; i++) {
226: f = extend_byte(f, buf[start + i]);
227: }
228: return reduce(f);
229: }
230:
231: /** Extends fingerprint <code>f</code> by adding (all bits of) "n"
232: characters of "buf" starting from "buf[i]".
233: Result is reduced.
234: Requires "[i, i+n)" is in bounds. */
235: public long extend(long f, char[] buf, int start, int n) {
236: for (int i = 0; i < n; i++) {
237: f = extend_char(f, buf[start + i]);
238: }
239: return reduce(f);
240: }
241:
242: /** Extends fingerprint <code>f</code> by adding (all bits of)
243: the characters of "s".
244: Result is reduced. */
245: public long extend(long f, CharSequence s) {
246: int n = s.length();
247: for (int i = 0; i < n; i++) {
248: int v = (int) s.charAt(i);
249: f = extend_char(f, v);
250: }
251: return reduce(f);
252: }
253:
254: // COMMENTED OUT TO REMOVE Dependency on st.ata.util.Text
255: // /** Extends fingerprint <code>f</code> by adding (all bits of)
256: // * <code>t</code> */
257: // public long extend(long f, Text t) {
258: // return extend(f, t.buf, t.start, t.length());
259: // }
260:
261: /** Extends fingerprint <code>f</code> by adding (all bits of) "n"
262: characters of "buf" starting from "buf[i]".
263: Result is reduced.
264: Requires "[i, i+n)" is in bounds. */
265: public long extend(long f, int[] buf, int start, int n) {
266: for (int i = 0; i < n; i++) {
267: f = extend_int(f, buf[start + i]);
268: }
269: return reduce(f);
270: }
271:
272: /** Extends fingerprint <code>f</code> by adding (all bits of) "n"
273: characters of "buf" starting from "buf[i]".
274: Result is reduced.
275: Requires "[i, i+n)" is in bounds. */
276: public long extend(long f, long[] buf, int start, int n) {
277: for (int i = 0; i < n; i++) {
278: f = extend_long(f, buf[start + i]);
279: }
280: return reduce(f);
281: }
282:
283: /** Extends fingerprint <code>f</code> by adding the lower eight
284: bits of the characters of "s".
285: Result is reduced. */
286: public long extend8(long f, String s) {
287: int n = s.length();
288: for (int i = 0; i < n; i++) {
289: int x = (int) s.charAt(i);
290: f = extend_byte(f, x);
291: }
292: return reduce(f);
293: }
294:
295: /** Extends fingerprint <code>f</code> by adding the lower eight
296: bits of "n" characters of "buf" starting from "buf[i]".
297: Result is reduced.
298: Requires "[i, i+n)" is in bounds. */
299: public long extend8(long f, char[] buf, int start, int n) {
300: for (int i = 0; i < n; i++) {
301: f = extend_byte(f, buf[start + i]);
302: }
303: return reduce(f);
304: }
305:
306: /** Fingerprint of the empty string of bytes. */
307: public final long empty;
308:
309: /** The number of bits in fingerprints generated by
310: <code>this</code>. */
311: public final int degree;
312:
313: /** The polynomial used by <code>this</code> to generate
314: fingerprints. */
315: public long polynomial;
316:
317: /** Result of reducing certain polynomials. Specifically, if
318: <code>f(S)</code> is bit string <code>S</code> interpreted as
319: a polynomial, <code>f(ByteModTable[i][j])</code> equals
320: <code>mod(x^(127 - 8*i) * f(j), P)</code>. */
321: private long[][] ByteModTable;
322:
323: /** Create a fingerprint generator. The fingerprints generated
324: will have degree <code>degree</code> and will be generated by
325: <code>polynomial</code>. Requires that
326: <code>polynomial</code> is an irreducible polynomial of degree
327: <code>degree</code> (the array <code>polynomials</code>
328: contains some irreducible polynomials). */
329: private FPGenerator(long polynomial, int degree) {
330: this .degree = degree;
331: this .polynomial = polynomial;
332: ByteModTable = new long[16][256];
333:
334: long[] PowerTable = new long[128];
335:
336: long x_to_the_i = one;
337: long x_to_the_degree_minus_one = (one >>> (degree - 1));
338: for (int i = 0; i < 128; i++) {
339: // Invariants:
340: // x_to_the_i = mod(x^i, polynomial)
341: // forall 0 <= j < i, PowerTable[i] = mod(x^i, polynomial)
342: PowerTable[i] = x_to_the_i;
343: boolean overflow = ((x_to_the_i & x_to_the_degree_minus_one) != 0);
344: x_to_the_i >>>= 1;
345: if (overflow) {
346: x_to_the_i ^= polynomial;
347: }
348: }
349: this .empty = PowerTable[64];
350:
351: for (int i = 0; i < 16; i++) {
352: // Invariant: forall 0 <= i' < i, forall 0 <= j' < 256,
353: // ByteModTable[i'][j'] = mod(x^(127 - 8*i') * f(j'), polynomial)
354: for (int j = 0; j < 256; j++) {
355: // Invariant: forall 0 <= i' < i, forall 0 <= j' < j,
356: // ByteModTable[i'][j'] = mod(x^(degree+i')*f(j'),polynomial)
357: long v = zero;
358: for (int k = 0; k < 8; k++) {
359: // Invariant:
360: // v = mod(x^(degree+i) * f(j & ((1<<k)-1)), polynomial)
361: if ((j & (1 << k)) != 0) {
362: v ^= PowerTable[127 - i * 8 - k];
363: }
364: }
365: ByteModTable[i][j] = v;
366: }
367: }
368: }
369:
370: /** Array of irreducible polynomials. For each degree
371: <code>d</code> between 1 and 64 (inclusive),
372: <code>polynomials[d][i]</code> is an irreducible polynomial of
373: degree <code>d</code>. There are at least two irreducible
374: polynomials for each degree. */
375: public static final long polynomials[][] = { null,
376: { 0xC000000000000000L, 0xC000000000000000L },
377: { 0xE000000000000000L, 0xE000000000000000L },
378: { 0xD000000000000000L, 0xB000000000000000L },
379: { 0xF800000000000000L, 0xF800000000000000L },
380: { 0xEC00000000000000L, 0xBC00000000000000L },
381: { 0xDA00000000000000L, 0xB600000000000000L },
382: { 0xE500000000000000L, 0xE500000000000000L },
383: { 0x9680000000000000L, 0xD480000000000000L },
384: { 0x80C0000000000000L, 0x8840000000000000L },
385: { 0xB0A0000000000000L, 0xE9A0000000000000L },
386: { 0xD9F0000000000000L, 0xC9B0000000000000L },
387: { 0xE758000000000000L, 0xDE98000000000000L },
388: { 0xE42C000000000000L, 0x94E4000000000000L },
389: { 0xD4CE000000000000L, 0xB892000000000000L },
390: { 0xE2AB000000000000L, 0x9E39000000000000L },
391: { 0xCCE4800000000000L, 0x9783800000000000L },
392: { 0xD8F8C00000000000L, 0xA9CDC00000000000L },
393: { 0x9A28200000000000L, 0xFD79E00000000000L },
394: { 0xC782500000000000L, 0x96CD300000000000L },
395: { 0xBEE6880000000000L, 0xE902C80000000000L },
396: { 0x86D7E40000000000L, 0xF066340000000000L },
397: { 0x9888060000000000L, 0x910ABE0000000000L },
398: { 0xFF30E30000000000L, 0xB27EFB0000000000L },
399: { 0x8E375B8000000000L, 0xA03D948000000000L },
400: { 0xD1415C4000000000L, 0xF5357CC000000000L },
401: { 0x91A916E000000000L, 0xB6CE66E000000000L },
402: { 0xE6D2FC5000000000L, 0xD55882B000000000L },
403: { 0x9A3BA0B800000000L, 0xFBD654E800000000L },
404: { 0xAEA5D2E400000000L, 0xF0E533AC00000000L },
405: { 0xDA88B7BE00000000L, 0xAA3AAEDE00000000L },
406: { 0xBA75BB4300000000L, 0xF5A811C500000000L },
407: { 0x9B6C9A2F80000000L, 0x9603CCED80000000L },
408: { 0xFABB538840000000L, 0xE2747090C0000000L },
409: { 0x8358898EA0000000L, 0x8C698D3D20000000L },
410: { 0xDA8ABD5BF0000000L, 0xF6DF3A0AF0000000L },
411: { 0xB090C3F758000000L, 0xD3B4D3D298000000L },
412: { 0xAD9882F5BC000000L, 0x88DA4FB544000000L },
413: { 0xC3C366272A000000L, 0xDCCF2A2262000000L },
414: { 0x9BC0224A97000000L, 0xAF5D96F273000000L },
415: { 0x8643FFF621800000L, 0x8E390C6EDC800000L },
416: { 0xE45C01919BC00000L, 0xCBB34C8945C00000L },
417: { 0x80D8141BC2E00000L, 0x886AFC3912200000L },
418: { 0xF605807C26500000L, 0xA3B92D28F6300000L },
419: { 0xCE9A2CFC76280000L, 0x98400C1921280000L },
420: { 0xF61894904C040000L, 0xC8BE6DBCEC8C0000L },
421: { 0xE3A44C104D160000L, 0xCA84A59443760000L },
422: { 0xC7E84953A11B0000L, 0xD9D4F6AA02CB0000L },
423: { 0xC26CDD1C9A358000L, 0x8BE8478434328000L },
424: { 0xAE125DBEB198C000L, 0xFCC5DBFD5AAAC000L },
425: { 0x86DE52A079A6A000L, 0xC5F16BD883816000L },
426: { 0xDF82486AAFE37000L, 0xA293EC735692D000L },
427: { 0xE91ABA275C272800L, 0xD686192874E3C800L },
428: { 0x963D0960DAB3FC00L, 0xBA9DE62072621400L },
429: { 0xA2188C4E8A46CE00L, 0xD31F75BCB4977E00L },
430: { 0xC43A416020A6CB00L, 0x99F57FECA12B3900L },
431: { 0xA4F72EF82A58AE80L, 0xCECE4391B81DA380L },
432: { 0xB39F119264BC0940L, 0x80A277D20DABB9C0L },
433: { 0xFD6616C0CBFA0B20L, 0xED16E64117DC11A0L },
434: { 0xFFA8BC44327B5390L, 0xEDFB13DB3B66C210L },
435: { 0xCAE8EB99E73AB548L, 0xC86135B6EA2F0B98L },
436: { 0xBA49BADCDD19B16CL, 0x8F1944AFB18564C4L },
437: { 0xECFC86D765EABBEEL, 0x9190E1C46CC13702L },
438: { 0xE1F8D6B3195D6D97L, 0xDF70267FF5E4C979L },
439: { 0xD74307D3FD3382DBL, 0x9999B3FFDC769B48L } };
440:
441: /** The standard 64-bit fingerprint generator using
442: <code>polynomials[0][64]</code>. */
443: public static final FPGenerator std64 = make(polynomials[64][0], 64);
444:
445: /** A standard 32-bit fingerprint generator using
446: <code>polynomials[0][32]</code>. */
447: public static final FPGenerator std32 = make(polynomials[32][0], 32);
448:
449: // Below added by St.Ack on 09/30/2004.
450: /** A standard 40-bit fingerprint generator using
451: <code>polynomials[0][40]</code>. */
452: public static final FPGenerator std40 = make(polynomials[40][0], 40);
453: /** A standard 24-bit fingerprint generator using
454: <code>polynomials[0][24]</code>. */
455: public static final FPGenerator std24 = make(polynomials[24][0], 24);
456: }
|