001: /*
002: * Enhydra Java Application Server Project
003: *
004: * The contents of this file are subject to the Enhydra Public License
005: * Version 1.1 (the "License"); you may not use this file except in
006: * compliance with the License. You may obtain a copy of the License on
007: * the Enhydra web site ( http://www.enhydra.org/ ).
008: *
009: * Software distributed under the License is distributed on an "AS IS"
010: * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
011: * the License for the specific terms governing rights and limitations
012: * under the License.
013: *
014: * The Initial Developer of the Enhydra Application Server is Lutris
015: * Technologies, Inc. The Enhydra Application Server and portions created
016: * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
017: * All Rights Reserved.
018: *
019: * Contributor(s):
020: *
021: * $Id: BMByteSearch.java,v 1.2 2006-06-15 13:47:00 sinisa Exp $
022: */
023:
024: package com.lutris.util;
025:
026: /**
027: * Implements the Boyer-Moore pattern matching algorithm for a given
028: * byte pattern. This object implements searches on signed byte arrays.
029: * The algorithm was obtained from "Computer Algorithms - Introduction to
030: * Design and Analysis, Second Edition" by Sara Baase.
031: */
032: public class BMByteSearch {
033:
034: /**
035: * Byte array, beginning at index 1 (for algorithmic convenience),
036: * that contains the intended search pattern data.
037: */
038: private byte[] P;
039:
040: /**
041: * The length of the search pattern.
042: */
043: private int m;
044:
045: /**
046: * Table of jump distances for each mismatched character in the
047: * alphabet for a given search pattern. Must be recomputed for
048: * each new pattern.
049: */
050: private int[] charJump;
051:
052: /**
053: * Table of partial suffix match jump distances for a given pattern.
054: * Must be recomputed for each new pattern.
055: */
056: private int[] matchJump;
057:
058: /**
059: * Creates a precomputed Boyer-Moore byte string search object
060: * from the given pattern. The unicode characters in <code>pattern</code>
061: * are truncated if greater than 255, and converted in twos-complement
062: * fashion, to appropriate negative byte values, if necessary.
063: * This method is provided as a convenience for searching for patterns
064: * within 8 bit byte strings composed of character data.
065: *
066: * @param pattern The pattern create this object for.
067: */
068: public BMByteSearch(String pattern) {
069: genPatternFromCharArray(pattern.toCharArray());
070: computeJumps();
071: computeMatchJumps();
072: }
073:
074: /**
075: * Creates a precomputed Boyer-Moore byte string search object
076: * from the given pattern.
077: *
078: * @param pattern Binary pattern to search for.
079: */
080: public BMByteSearch(byte[] pattern) {
081: genPatternFromByteArray(pattern, 0, pattern.length);
082: computeJumps();
083: computeMatchJumps();
084: }
085:
086: /**
087: * Creates a precomputed Boyer-Moore byte string search object
088: * from a portion of the given pattern array.
089: *
090: * @param pattern Byte array containing a pattern to search for.
091: * @param offset Offset to beginning of search pattern.
092: * @param length Length of the search pattern.
093: */
094: public BMByteSearch(byte[] pattern, int offset, int length) {
095: genPatternFromByteArray(pattern, offset, length);
096: computeJumps();
097: computeMatchJumps();
098: }
099:
100: /**
101: * Compares two integers and returns the lesser value.
102: *
103: * @param i1 First integer to compare.
104: * @param i2 Second integer to compare.
105: * @return The lesser of <code>i1</code> or <code>i2</code>.
106: */
107: private static final int min(int i1, int i2) {
108: return (i1 < i2) ? i1 : i2;
109: }
110:
111: /**
112: * Compares two integers and returns the greater value.
113: *
114: * @param i1 First integer to compare.
115: * @param i2 Second integer to compare.
116: * @return The greater of <code>i1</code> or <code>i2</code>.
117: */
118: private static final int max(int i1, int i2) {
119: return (i1 > i2) ? i1 : i2;
120: }
121:
122: /**
123: * Generates the pattern byte string <code>P</code> from a portion
124: * of another byte string.
125: *
126: * @param bytes The byte string from which to extract the pattern.
127: * @param off The array index within <code>bytes</code> from
128: * which to extract the pattern.
129: * @param length The number of characters to extract from
130: * <code>bytes</code> into the pattern.
131: */
132: private final void genPatternFromByteArray(byte[] bytes, int off,
133: int length) {
134: int i, j;
135: m = length;
136: // 31.03.2003. patch
137: // P = new byte[length];
138: P = new byte[length + 1];
139: for (i = 1, j = off; i <= length; i++, j++)
140: P[i] = bytes[j];
141: }
142:
143: /**
144: * Generates the pattern byte string <code>P</code> from a character
145: * array. The signed unicode characters are truncated to 8 bits, and
146: * converted into signed byte values. Characters between 128 and 255
147: * are converted to their signed negative counterpart in
148: * twos-complement fashion by subtracting 256.
149: *
150: * @param chars Unsigned unicode character array to turn into
151: * a signed byte array.
152: */
153: private final void genPatternFromCharArray(char[] chars) {
154: m = chars.length;
155: P = new byte[m + 1];
156: for (int i = 1; i <= m; i++) {
157: if (chars[i - 1] > 127)
158: P[i] = (byte) ((chars[i - 1] - 256) & 0xff);
159: else
160: P[i] = (byte) (chars[i - 1] & 0xff);
161: }
162: }
163:
164: /**
165: * Initializes the per-character jump table <code>charJump</code>
166: * as specified by the Boyer-Moore algorithm.
167: */
168: private final void computeJumps() {
169: charJump = new int[256];
170: for (int i = 0; i < 255; i++)
171: charJump[i] = m;
172: for (int k = 1; k <= m; k++)
173: charJump[P[k] + 128] = m - k;
174: }
175:
176: /**
177: * Computes a partial-match jump table that skips over
178: * partially matching suffixes.
179: */
180: private void computeMatchJumps() {
181: int k, q, qq, mm;
182: int[] back = new int[m + 2];
183:
184: matchJump = new int[m + 2];
185: mm = 2 * m;
186:
187: for (k = 1; k <= m; k++)
188: matchJump[k] = mm - k;
189: k = m;
190: q = m + 1;
191: while (k > 0) {
192: back[k] = q;
193: while ((q <= m) && (P[k] != P[q])) {
194: matchJump[q] = min(matchJump[q], m - k);
195: q = back[q];
196: }
197: k = k - 1;
198: q = q - 1;
199: }
200: for (k = 1; k <= q; k++) {
201: matchJump[k] = min(matchJump[k], m + q - k);
202: }
203: qq = back[q];
204: while (q <= m) {
205: while (q <= qq) {
206: matchJump[q] = min(matchJump[q], qq - q + m);
207: q = q + 1;
208: }
209: qq = back[qq];
210: }
211: }
212:
213: /**
214: * Returns the length of the pattern for this searcher.
215: *
216: * @return The search pattern length.
217: */
218: public int getPatternLength() {
219: return (m);
220: }
221:
222: /**
223: * Search for the previously pre-compiled pattern string in an
224: * array of bytes. This method uses the Boyer-Moore pattern
225: * search algorithm.
226: *
227: * @param byteString Array of bytes in which to search
228: * for the pattern.
229: * @return The array index where the pattern
230: * begins in the string, or <code>-1</code>
231: * if the pattern was not found.
232: */
233: public int search(byte[] byteString) {
234: return (search(byteString, 0, byteString.length));
235: }
236:
237: /**
238: * Search for the previously pre-compiled pattern string in an
239: * array of bytes. This method uses the Boyer-Moore pattern
240: * search algorithm.
241: *
242: * @param byteString Array of bytes in which to search
243: * for the pattern.
244: * @param offset The the index in <code>byteString</code>
245: * where the search is to begin.
246: * @param length The number of bytes to search in
247: * <code>byteString</code>.
248: * @return The array index where the pattern
249: * begins in the string, or <code>-1</code>
250: * if the pattern was not found.
251: */
252: public int search(byte[] byteString, int offset, int length) {
253: int j, k, len;
254: j = m + offset;
255: k = m;
256: len = min(byteString.length, offset + length);
257: while ((j <= len) && (k > 0)) {
258: if ((byteString[j - 1]) == P[k]) {
259: j = j - 1;
260: k = k - 1;
261: } else {
262: j = j
263: + max(charJump[byteString[j - 1] + 128],
264: matchJump[k]);
265: k = m;
266: }
267: }
268: if (k == 0)
269: return (j);
270: return (-1); // No match.
271: }
272: }
|