001: /*
002: * ShiftOr.java
003: *
004: * Created on 12.08.2003.
005: *
006: * eaio: StringSearch - high-performance pattern matching algorithms in Java
007: * Copyright (c) 2003, 2004 Johann Burkard (jb@eaio.com) http://eaio.com
008: *
009: * Permission is hereby granted, free of charge, to any person obtaining a copy
010: * of this software and associated documentation files (the "Software"), to deal
011: * in the Software without restriction, including without limitation the rights
012: * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
013: * copies of the Software, and to permit persons to whom the Software is
014: * furnished to do so, subject to the following conditions:
015: *
016: * The above copyright notice and this permission notice shall be included in
017: * all copies or substantial portions of the Software.
018: *
019: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
020: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
021: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
022: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
023: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
024: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
025: * SOFTWARE.
026: *
027: */
028: package com.eaio.stringsearch;
029:
030: /**
031: * An implementation of the Shift-Or algorithm by Ricardo Baeza-Yates and
032: * Gaston Gonnet as outlined in "A New Approach to Text Searching" (appeared in
033: * <em>Proceedings of the 12th International Conference on Research and
034: * Development in Datum Retrieval</em>). The Shift-Or algorithm is a
035: * bit-parallel algorithm.
036: * <br><br>
037: * The Shift-Or algorithm is not the fastest and by itself slower than String's
038: * <code>indexOf</code> method. It's usefulness comes from it's ability to
039: * support character classes and searching with errors at the same speed.
040: * <br><br>
041: * It's {@link #searchChars(char[], int, int, char[], Object)} method is
042: * extremely slow in the Sun Java Virtual Machines. If possible, the
043: * {@link #searchBytes(byte[], int, int, byte[], Object)} methods should be
044: * preferred. Because the main loop is also used by the
045: * {@link com.eaio.stringsearch.ShiftOrWildcards} class, the implementation
046: * cannot skip forward until the first character matches (as in the original
047: * algorithm).
048: * <br><br>
049: * This implementation currently limited to at most 31 characters because Java
050: * has no unsigned <code>int</code> type. An implementation that used
051: * <code>long</code> has proved to take twice the amount of time.
052: * <pre>
053: * Preprocessing: O(n + ∑) time
054: *
055: * Searching : O(mn / log n) (worst case and average)
056: * </pre>
057: *
058: * @see
059: * <a href="ftp://sunsite.dcc.uchile.cl/pub/users/rbaeza/papers/CACM92.ps.gz">
060: * ftp://sunsite.dcc.uchile.cl/pub/users/rbaeza/papers/CACM92.ps.gz
061: * </a>
062: * @see <a href="http://citeseer.nj.nec.com/50265.html">
063: * http://citeseer.nj.nec.com/50265.html
064: * </a>
065: * @author <a href="mailto:jb@eaio.com">Johann Burkard</a>
066: * @version 1.2
067: * @see com.eaio.stringsearch.ShiftOrWildcards
068: * @see com.eaio.stringsearch.ShiftOrClasses
069: * @see com.eaio.stringsearch.ShiftOrMismatches
070: */
071: public class ShiftOr extends StringSearch {
072:
073: /**
074: * Constructor for ShiftOr. Note that it is not required to create multiple
075: * instances.
076: */
077: public ShiftOr() {
078: }
079:
080: /**
081: * Pre-processing of the pattern. The pattern may not exceed 31 bytes in
082: * length. If it does, <b>only it's first 31 bytes</b> are processed which
083: * might lead to unexpected results. Returns an <code>int</code> array.
084: *
085: * @see com.eaio.stringsearch.StringSearch#processBytes(byte[])
086: */
087: public Object processBytes(byte[] pattern) {
088: int j = ~0;
089: int end = Math.min(pattern.length, 31);
090:
091: int[] t = new int[256];
092:
093: for (int i = 0; i < t.length; ++i) {
094: t[i] = j;
095: }
096:
097: for (int i = 0; i < end; ++i) {
098: t[index(pattern[i])] &= ~(1 << i);
099: }
100:
101: return t;
102: }
103:
104: /**
105: * Pre-processing of the pattern. The pattern may not exceed 31 characters in
106: * length. If it does, <b>only it's first 31 characters</b> are processed which
107: * might lead to unexpected results. Returns a {@link CharIntMap}.
108: *
109: * @param char[] the pattern
110: * @return an Object
111: * @see StringSearch#processChars(char[])
112: */
113: public Object processChars(char[] pattern) {
114: int end = Math.min(pattern.length, 31);
115:
116: CharIntMap m = createCharIntMap(pattern, ~0);
117:
118: for (int i = 0; i < end; ++i) {
119: m.set(pattern[i], m.get(pattern[i]) & ~(1 << i));
120: }
121:
122: return m;
123: }
124:
125: /**
126: * @see com.eaio.stringsearch.StringSearch#searchBytes(byte[], int, int,
127: * byte[], java.lang.Object)
128: */
129: public int searchBytes(byte[] text, int textStart, int textEnd,
130: byte[] pattern, Object processed) {
131:
132: if (StringSearch.useNative) {
133: if (processed instanceof int[]) {
134: return nativeSearchBytes(text, textStart, textEnd,
135: pattern, (int[]) processed, pattern.length);
136:
137: } else {
138: Object[] params = (Object[]) processed;
139: int[] t = (int[]) params[0];
140: int l = ((Integer) params[1]).intValue();
141: return nativeSearchBytes(text, textStart, textEnd,
142: pattern, t, l);
143: }
144: } else {
145: return javaSearchBytes(text, textStart, textEnd, pattern,
146: processed);
147: }
148:
149: }
150:
151: private int javaSearchBytes(byte[] text, int textStart,
152: int textEnd, byte[] pattern, Object processed) {
153:
154: int[] t;
155: int l = pattern.length;
156:
157: if (processed instanceof int[]) {
158: t = (int[]) processed;
159: } else {
160: Object[] params = (Object[]) processed;
161: t = (int[]) params[0];
162: l = ((Integer) params[1]).intValue();
163: }
164:
165: int lim = ~((1 << (l - 1)) - 1);
166: int state = ~0;
167: for (int i = textStart; i < textEnd; ++i) {
168: state = (state << 1) | t[index(text[i])];
169: if (state < lim) {
170: return i - l + 1;
171: }
172: }
173: return -1;
174:
175: }
176:
177: private native int nativeSearchBytes(byte[] text, int textStart,
178: int textEnd, byte[] pattern, int[] t, int l);
179:
180: /**
181: * @see com.eaio.stringsearch.StringSearch#searchChars(char[], int, int,
182: * char[], Object)
183: */
184: public int searchChars(char[] text, int textStart, int textEnd,
185: char[] pattern, Object processed) {
186:
187: CharIntMap m;
188: int l = pattern.length;
189:
190: if (processed instanceof CharIntMap) {
191: m = (CharIntMap) processed;
192: } else {
193: Object[] params = (Object[]) processed;
194: m = (CharIntMap) params[0];
195: l = ((Integer) params[1]).intValue();
196: }
197:
198: int lim = ~((1 << (l - 1)) - 1);
199: int state = ~0;
200: for (int i = textStart; i < textEnd; ++i) {
201: state = (state << 1) | m.get(text[i]);
202: if (state < lim) {
203: return i - l + 1;
204: }
205: }
206:
207: return -1;
208:
209: }
210:
211: }
|