001: /*
002: * ShiftOrMismatches.java
003: *
004: * Created on 14.11.2003.
005: *
006: * eaio: StringSearch - high-performance pattern matching algorithms in Java
007: * Copyright (c) 2003, 2004 Johann Burkard (jb@eaio.com) http://eaio.com
008: *
009: * Permission is hereby granted, free of charge, to any person obtaining a copy
010: * of this software and associated documentation files (the "Software"), to deal
011: * in the Software without restriction, including without limitation the rights
012: * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
013: * copies of the Software, and to permit persons to whom the Software is
014: * furnished to do so, subject to the following conditions:
015: *
016: * The above copyright notice and this permission notice shall be included in
017: * all copies or substantial portions of the Software.
018: *
019: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
020: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
021: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
022: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
023: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
024: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
025: * SOFTWARE.
026: *
027: */
028: package com.eaio.stringsearch;
029:
030: /**
031: * An implementation of the Shift-Or algorithm with mismatches. Note that the
032: * pattern length may not be larger than 31 / ⌈ log<sub>2</sub> (k + 1)
033: * ⌉.
034: * <br><br>
035: * <table style="border: 1px solid #ccc" cellpadding="4">
036: * <tr>
037: * <td>Editing distance (k)</td>
038: * <td>Maximum pattern length</td>
039: * </tr>
040: * <tr>
041: * <td>0</td>
042: * <td>31</td>
043: * </tr>
044: * <tr>
045: * <td>1</td>
046: * <td>15</td>
047: * </tr>
048: * <tr>
049: * <td>2-3</td>
050: * <td>10</td>
051: * </tr>
052: * <tr>
053: * <td>4-5</td>
054: * <td>7</td>
055: * </tr>
056: * </table>
057: * <br><br>
058: * This algorithm is slower than {@link com.eaio.stringsearch.ShiftOr}. In
059: * future versions of this library, faster alternatives are likely to be added.
060: * <pre>
061: * Preprocessing: O(3n + ∑) time
062: *
063: * Searching : O(mn / log n) (worst case and average)
064: * </pre>
065: *
066: * @author <a href="mailto:burkard@ergosign.de">Johann Burkard</a>
067: * @version 1.2
068: * @see #processBytes(byte[], int)
069: * @see #processChars(char[], int)
070: * @see com.eaio.stringsearch.ShiftOr
071: */
072: public class ShiftOrMismatches extends MismatchSearch {
073:
074: private static final Object MISMATCH = new Object();
075: private static final Object MATCH = new Object();
076:
077: /**
078: * Constructor for ShiftOrMismatches. Note that it is not required to create
079: * multiple instances.
080: */
081: public ShiftOrMismatches() {
082: }
083:
084: /**
085: * @throws IllegalArgumentException if the pattern length is larger than 31 /
086: * ⌈ log<sub>2</sub> (k + 1) ⌉
087: * @see com.eaio.stringsearch.MismatchSearch#processBytes(byte[], int)
088: */
089: public Object processBytes(byte[] pattern, int k) {
090:
091: Object type = MISMATCH;
092:
093: if ((k << 1) > pattern.length) {
094: type = MATCH;
095: k = pattern.length - k;
096: }
097:
098: int b = clog2(k + 1) + 1;
099:
100: if (pattern.length > (31 / b)) {
101: throw new IllegalArgumentException();
102: }
103:
104: /* Preprocessing */
105:
106: int i;
107: int lim = k << ((pattern.length - 1) * b);
108: int ovmask = 0;
109:
110: for (i = 0; i < pattern.length; i++) {
111: ovmask = (ovmask << b) | (1 << (b - 1));
112: }
113:
114: int[] t = new int[256];
115:
116: /* Loop that nulls the array if type == MATCH removed */
117:
118: if (type == MISMATCH) {
119: lim += 1 << ((pattern.length - 1) * b);
120: for (i = 0; i < t.length; i++) {
121: t[i] = ovmask >> (b - 1);
122: }
123: }
124:
125: i = 1;
126: for (int p = 0; p < pattern.length; p++, i <<= b) {
127: if (type == MATCH) {
128: t[index(pattern[p])] += i;
129: } else {
130: t[index(pattern[p])] &= ~i;
131: }
132: }
133:
134: return new Object[] { t, type, new Integer(i - 1),
135: new Integer(ovmask), new Integer(b), new Integer(lim) };
136: }
137:
138: /**
139: * @throws IllegalArgumentException if the pattern length is larger than 31 /
140: * ⌈ log<sub>2</sub> (k + 1) ⌉
141: * @see com.eaio.stringsearch.MismatchSearch#processChars(char[], int)
142: */
143: public Object processChars(char[] pattern, int k) {
144:
145: Object type = MISMATCH;
146:
147: if ((k << 1) > pattern.length) {
148: type = MATCH;
149: k = pattern.length - k;
150: }
151:
152: int b = clog2(k + 1) + 1;
153:
154: if (pattern.length > (31 / b)) {
155: throw new IllegalArgumentException();
156: }
157:
158: /* Preprocessing */
159:
160: int i;
161: int lim = k << ((pattern.length - 1) * b);
162: int ovmask = 0;
163:
164: for (i = 0; i < pattern.length; i++) {
165: ovmask = (ovmask << b) | (1 << (b - 1));
166: }
167:
168: CharIntMap t;
169:
170: if (type == MATCH) {
171: t = createCharIntMap(pattern);
172: } else {
173: lim += 1 << ((pattern.length - 1) * b);
174: t = createCharIntMap(pattern, ovmask >> (b - 1));
175: }
176:
177: i = 1;
178: for (int p = 0; p < pattern.length; p++, i <<= b) {
179: if (type == MATCH) {
180: t.set(pattern[p], t.get(pattern[p]) + i);
181: } else {
182: t.set(pattern[p], t.get(pattern[p]) & ~i);
183: }
184: }
185:
186: return new Object[] { t, type, new Integer(i - 1),
187: new Integer(ovmask), new Integer(b), new Integer(lim) };
188:
189: }
190:
191: /**
192: * @see com.eaio.stringsearch.MismatchSearch#searchBytes(byte[], int, int,
193: * byte[], Object, int)
194: */
195: public int[] searchBytes(byte[] text, int textStart, int textEnd,
196: byte[] pattern, Object processed, int k) {
197:
198: Object[] o = (Object[]) processed;
199: int[] t = (int[]) o[0];
200: Object type = o[1];
201: int mask = ((Integer) o[2]).intValue();
202: int ovmask = ((Integer) o[3]).intValue();
203: int b = ((Integer) o[4]).intValue();
204: int lim = ((Integer) o[5]).intValue();
205:
206: int state, overflow;
207:
208: if (type == MATCH) {
209: state = 0;
210: overflow = 0;
211: } else {
212: state = mask & ~ovmask;
213: overflow = ovmask;
214: }
215:
216: for (int p = textStart; p < textEnd; p++) {
217: state = ((state << b) + t[index(text[p])]) & mask;
218: overflow = ((overflow << b) | (state & ovmask)) & mask;
219: state &= ~ovmask;
220: if (type == MATCH) {
221: if ((state | overflow) >= lim) {
222: return new int[] { p - pattern.length + 1,
223: pattern.length - k };
224: }
225: } else if ((state | overflow) < lim) {
226: return new int[] { p - pattern.length + 1,
227: (state >> (pattern.length - 1) * b) };
228: }
229: }
230:
231: return new int[] { -1, 0 };
232:
233: }
234:
235: /**
236: * @see com.eaio.stringsearch.MismatchSearch#searchChars(char[], int, int,
237: * char[], Object, int)
238: */
239: public int[] searchChars(char[] text, int textStart, int textEnd,
240: char[] pattern, Object processed, int k) {
241:
242: Object[] o = (Object[]) processed;
243: CharIntMap t = (CharIntMap) o[0];
244: Object type = o[1];
245: int mask = ((Integer) o[2]).intValue();
246: int ovmask = ((Integer) o[3]).intValue();
247: int b = ((Integer) o[4]).intValue();
248: int lim = ((Integer) o[5]).intValue();
249:
250: int state, overflow;
251:
252: if (type == MATCH) {
253: state = 0;
254: overflow = 0;
255: } else {
256: state = mask & ~ovmask;
257: overflow = ovmask;
258: }
259:
260: for (int p = textStart; p < textEnd; p++) {
261: state = ((state << b) + t.get(text[p])) & mask;
262: overflow = ((overflow << b) | (state & ovmask)) & mask;
263: state &= ~ovmask;
264: if (type == MATCH) {
265: if ((state | overflow) >= lim) {
266: return new int[] { p - pattern.length + 1,
267: pattern.length - k };
268: }
269: } else if ((state | overflow) < lim) {
270: return new int[] { p - pattern.length + 1,
271: (state >> (pattern.length - 1) * b) };
272: }
273: }
274:
275: return new int[] { -1, 0 };
276:
277: }
278:
279: /**
280: * Ceiling of log2(x).
281: *
282: * @param x x
283: * @return ⌈log2(x)⌉
284: */
285: private int clog2(int x) {
286: int i = 0;
287: while (x > (1 << i)) {
288: ++i;
289: }
290: return i;
291: }
292:
293: /**
294: * This algorithm is currently not using the native library. This method
295: * therefore always returns <code>false</code>.
296: *
297: * @see com.eaio.stringsearch.StringSearch#usesNative()
298: */
299: public boolean usesNative() {
300: return false;
301: }
302:
303: }
|