001: /*
002: * ShiftOrClassesTest.java
003: *
004: * Created on 28.10.2003
005: *
006: * eaio: StringSearch - high-performance pattern matching algorithms in Java
007: * Copyright (c) 2003, 2004 Johann Burkard (jb@eaio.com) http://eaio.com
008: *
009: * Permission is hereby granted, free of charge, to any person obtaining a copy
010: * of this software and associated documentation files (the "Software"), to deal
011: * in the Software without restriction, including without limitation the rights
012: * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
013: * copies of the Software, and to permit persons to whom the Software is
014: * furnished to do so, subject to the following conditions:
015: *
016: * The above copyright notice and this permission notice shall be included in
017: * all copies or substantial portions of the Software.
018: *
019: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
020: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
021: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
022: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
023: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
024: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
025: * SOFTWARE.
026: *
027: */
028: package com.eaio.stringsearch;
029:
030: /**
031: * An implementation of the Shift-Or algorithm that supports character classes.
032: * The following character classes are supported:
033: * <br><br>
034: * <table style="border: 1px solid #ccc" cellpadding="4">
035: * <tr>
036: * <td><em>x</em></td>
037: * <td>a character from the Alphabet Σ</td>
038: * </tr>
039: * <tr>
040: * <td><em>?</em></td>
041: * <td>a "don't care" symbol which matches all symbols</td>
042: * </tr>
043: * <tr>
044: * <td>[<em>characters</em>]</td>
045: * <td>a class of characters where ranges (a-z, 0-9) are allowed</td>
046: * </tr>
047: * <tr>
048: * <td><em>^</em></td>
049: * <td>the negation of a class(^a, ^[abc], ^[c-h])</td>
050: * </tr>
051: * <tr>
052: * <td>\</td>
053: * <td>escapes the next character (\ must be written as \\ in Java).</td>
054: * </table>
055: * <br>
056: * Examples:
057: * <br><br>
058: * <ul>
059: * <li><code>mpeg</code> will obviously match mpeg, and not mpag</li>
060: * <li><code>mp?g</code> will match mpeg, mp7g, mpog, but not mpg (see
061: * {@link com.eaio.stringsearch.ShiftOrWildcards})</li>
062: * <li><code>mp^ag</code> will match mpeg, mpog, mpcg, but not mpag</li>
063: * <li><code>mpe^</code> will only match mpe^</li>
064: * <li><code>mp\^ag ("mp\\^ag" in Java)</code> will match mp^ag, but not
065: * mpeg</li>
066: * <li><code>mp[aeiou]g</code> will match mpeg, mpug, but not mptg</li>
067: * <li><code>mp^[a-k]g</code> will match mpog, mpzg, but not mpeg</li>
068: * <li><code>mp[u-a]g</code> will match mpeg, too</li>
069: * <li><code>mp^?g</code> will match mpeg</li>
070: * <li><code>mp^[]g</code> will match mpeg, too (negation of the empty
071: * class == all characters)</li>
072: * </ul>
073: * <pre>
074: * Preprocessing: O(2n + ∑) time
075: *
076: * Searching : O(mn / log n) (worst case and average)
077: * </pre>
078: *
079: * @see com.eaio.stringsearch.ShiftOr
080: * @author <a href="mailto:jb@eaio.de">Johann Burkard</a>
081: * @version 1.2
082: */
083: public class ShiftOrClasses extends ShiftOr {
084:
085: /**
086: * Constructor for ShiftOrClasses. Note that it is not required to create
087: * multiple instances.
088: */
089: public ShiftOrClasses() {
090: }
091:
092: /**
093: * @see ShiftOr#processBytes(byte[])
094: */
095: public Object processBytes(byte[] pattern) {
096: int j = ~0;
097:
098: int offset = 0;
099:
100: for (int i = 0; (i < pattern.length) && (offset < 31); i++) {
101: if (pattern[i] == '\\') {
102: ++i;
103: ++offset;
104: } else if (pattern[i] == '?') {
105: j -= 1 << offset++;
106: } else if (pattern[i] == '^' && i < pattern.length - 1
107: && offset < 31) {
108:
109: /* Negate a bit only if there are more characters coming up */
110:
111: if (pattern[i + 1] == '?') {
112: ++i;
113: j -= 1 << offset++;
114: }
115: j -= 1 << offset;
116:
117: } else if (pattern[i] == '[' && i < pattern.length - 1
118: && offset < 31) {
119:
120: /* Process patterns only if there are more characters */
121:
122: while (i < pattern.length /* && offset < 31 */
123: && pattern[i] != ']') {
124: if (pattern[i] == '\\') {
125: ++i;
126: }
127: ++i;
128: }
129: ++offset;
130:
131: } else {
132: ++offset;
133: }
134: }
135:
136: int[] t = new int[256];
137:
138: for (int i = 0; i < t.length; ++i) {
139: t[i] = j;
140: }
141:
142: offset = 0;
143:
144: boolean negate = false;
145:
146: for (int i = 0; (i < pattern.length) && (offset < 31); i++) {
147: if (pattern[i] == '\\') {
148: ++i;
149: if (i < pattern.length - 1 && offset < 31) {
150: t[index(pattern[i])] &= ~(1 << offset++);
151: }
152: } else if (pattern[i] == '?') {
153: for (int l = 0; l < t.length; ++l) {
154: t[l] &= ~(1 << offset);
155: }
156: ++offset;
157: } else if (pattern[i] == '^') {
158: if (i < pattern.length - 1 && offset < 31) {
159: byte next = pattern[i + 1];
160: if (next == '[') {
161: negate = true;
162: } else {
163: for (int l = 0; l < t.length; ++l) {
164: if (l != next) {
165: t[l] &= ~(1 << offset);
166: } else {
167: t[l] |= (1 << offset);
168: }
169: }
170: ++i;
171: ++offset;
172: }
173: } else {
174: t[index(pattern[i])] &= ~(1 << offset++);
175: }
176: } else if (pattern[i] == '[') {
177:
178: boolean end = false;
179:
180: if (i < pattern.length - 1 && offset < 31) {
181:
182: byte low, high;
183:
184: do {
185: low = pattern[++i];
186:
187: if (low == '\\') {
188: ++i;
189: if (i < pattern.length && offset < 31) {
190: low = pattern[i];
191: }
192: }
193:
194: if (negate) {
195: t[low] |= 1 << offset;
196: } else {
197: t[low] &= ~(1 << offset);
198: }
199:
200: if (i < pattern.length - 2 && offset < 31
201: && pattern[i + 1] == '-'
202: && pattern[i + 2] != ']') {
203:
204: i += 2;
205: high = pattern[i];
206:
207: /* handle [a-\\]] */
208:
209: if (high == '\\' && i < pattern.length - 1) {
210: high = pattern[++i];
211: }
212:
213: char highest = (char) Math.max(low, high);
214: char lowest = (char) Math.min(low, high);
215:
216: for (; lowest <= highest; ++lowest) {
217: if (negate) {
218: t[lowest] |= 1 << offset;
219: } else {
220: t[lowest] &= ~(1 << offset);
221: }
222: }
223:
224: }
225:
226: if (i < pattern.length - 1 && offset < 31) {
227: if (pattern[i + 1] == ']') {
228: end = true;
229: }
230: }
231:
232: } while (i < pattern.length - 1 && offset < 31
233: && low != ']' && !end);
234: ++i;
235: ++offset;
236: }
237:
238: else {
239: if (negate) {
240: t[index(pattern[i])] |= 1 << offset++;
241: } else {
242: t[index(pattern[i])] &= ~(1 << offset++);
243: }
244: }
245:
246: negate = false;
247:
248: } else {
249: t[index(pattern[i])] &= ~(1 << offset++);
250: }
251: }
252:
253: return new Object[] { t, new Integer(offset) };
254: }
255:
256: /**
257: * @see ShiftOr#processChars(char[])
258: */
259: public Object processChars(char[] pattern) {
260: int j = ~0;
261:
262: int offset = 0;
263:
264: for (int i = 0; (i < pattern.length) && (offset < 31); i++) {
265: if (pattern[i] == '\\') {
266: ++i;
267: ++offset;
268: } else if (pattern[i] == '?') {
269: j -= 1 << offset++;
270: } else if (pattern[i] == '^' && i < pattern.length - 1
271: && offset < 31) {
272:
273: /* Negate a bit only if there are more characters coming up */
274:
275: if (pattern[i + 1] == '?') {
276: ++i;
277: j -= 1 << offset++;
278: }
279: j -= 1 << offset;
280:
281: } else if (pattern[i] == '[' && i < pattern.length - 1
282: && offset < 31) {
283:
284: /* Process patterns only if there are more characters */
285:
286: while (i < pattern.length /* && offset < 31 */
287: && pattern[i] != ']') {
288: if (pattern[i] == '\\') {
289: ++i;
290: }
291: ++i;
292: }
293: ++offset;
294:
295: } else {
296: ++offset;
297: }
298: }
299:
300: CharIntMap m = createCharIntMap(pattern, j);
301:
302: offset = 0;
303:
304: boolean negate = false;
305:
306: for (int i = 0; (i < pattern.length) && (offset < 31); i++) {
307: if (pattern[i] == '\\') {
308: ++i;
309: if (i < pattern.length - 1 && offset < 31) {
310: m.set(pattern[i], m.get(pattern[i])
311: & ~(1 << offset++));
312: }
313: } else if (pattern[i] == '?') {
314: char h = m.getHighest();
315: for (char c = m.getLowest(); c < h; c++) {
316: m.set(c, m.get(c) & ~(1 << offset));
317: }
318: ++offset;
319: } else if (pattern[i] == '^') {
320: if (i < pattern.length - 1 && offset < 31) {
321: char next = pattern[i + 1];
322: if (next == '[') {
323: negate = true;
324: } else {
325: char h = m.getHighest();
326: for (char c = m.getLowest(); c < h; c++) {
327: if (c != next) {
328: m.set(c, m.get(c) & ~(1 << offset));
329: } else {
330: m.set(c, m.get(c) | (1 << offset));
331: }
332: }
333: ++i;
334: ++offset;
335: }
336: } else {
337: m.set(pattern[i], m.get(pattern[i])
338: & ~(1 << offset++));
339: }
340: } else if (pattern[i] == '[') {
341:
342: boolean end = false;
343:
344: if (i < pattern.length - 1 && offset < 31) {
345:
346: char low, high;
347:
348: do {
349: low = pattern[++i];
350:
351: if (low == '\\') {
352: ++i;
353: if (i < pattern.length && offset < 31) {
354: low = pattern[i];
355: }
356: }
357:
358: if (negate) {
359: m.set(low, m.get(low) | 1 << offset);
360: } else {
361: m.set(low, m.get(low) & ~(1 << offset));
362: }
363:
364: if (i < pattern.length - 2 && offset < 31
365: && pattern[i + 1] == '-'
366: && pattern[i + 2] != ']') {
367:
368: i += 2;
369: high = pattern[i];
370:
371: /* handle [a-\\]] */
372:
373: if (high == '\\' && i < pattern.length - 1) {
374: high = pattern[++i];
375: }
376:
377: char highest = (char) Math.max(low, high);
378: char lowest = (char) Math.min(low, high);
379:
380: for (; lowest <= highest; ++lowest) {
381: if (negate) {
382: m.set(lowest, m.get(lowest)
383: | 1 << offset);
384: } else {
385: m.set(lowest, m.get(lowest)
386: & ~(1 << offset));
387: }
388: }
389:
390: }
391:
392: if (i < pattern.length - 1 && offset < 31) {
393: if (pattern[i + 1] == ']') {
394: end = true;
395: }
396: }
397:
398: } while (i < pattern.length - 1 && offset < 31
399: && low != ']' && !end);
400: ++i;
401: ++offset;
402: }
403:
404: else {
405: if (negate) {
406: m.set(pattern[i], m.get(pattern[i])
407: | 1 << offset++);
408: } else {
409: m.set(pattern[i], m.get(pattern[i])
410: & ~(1 << offset++));
411: }
412: }
413:
414: negate = false;
415:
416: } else {
417: m.set(pattern[i], m.get(pattern[i]) & ~(1 << offset++));
418: }
419: }
420:
421: return new Object[] { m, new Integer(offset) };
422: }
423:
424: }
|