001: // cmp.Boyer.Boyer.java fast string search.
002:
003: package cmp.Boyer;
004:
005: /*
006: Fast string search (indexOf) using the Boyer-Moore
007: algorithm.
008:
009: use:
010: import cmp.Boyer.Boyer;
011: ...
012: Boyer b = new Boyer("dogcatwombat");
013: int where = b.indexOf("cat");
014: or
015: Boyer b = new Boyer();
016: b.setText("cowpighorse");
017: b.setPattern("pig");
018: int where = b.indexOf();
019: or
020: int where = Boyer.indexOf("dogcatwombat","cat");
021:
022: Boyer-Moore is about twice as fast as String.indexOf when
023: the string you are searching in is 2K or over and the
024: pattern you are searching for is 4 characters or longer.
025:
026: String.indexOf is particularly slow when the pattern begins
027: with a common letter such as "e". Boyer-Moore is fastest
028: when the pattern is long and composed only of uncommon
029: letters, e.g. "z" or "^". If you use a char[] instead of
030: String for your text to be searched, it will run an
031: additional 33% faster.
032:
033: You don't have to worry which is faster. Boyer automatically
034: reverts to String.indexOf when that would be faster.
035:
036:
037: */
038:
039: /**
040: * @author copyright (c) 1998-2000 Roedy Green of Canadian Mind Products
041: * may be copied and used freely for any purpose but military.
042: *
043: * Roedy Green
044: * Canadian Mind Products
045: * #208 - 525 Ninth Street
046: * New Westminster, BC Canada V3M 5T9
047: * tel: (604) 777-1804
048: * mailto:roedy@mindprod.com
049: * http://mindprod.com
050: *
051: * version 1.1 1999 January 10
052: * - use simple String.indexOf for short patterns and texts
053: * - lazy evaluation of skip[] array, to avoid work of calculating it.
054: * - more comments.
055: * - lenPat and lenText now local variables.
056: * - more efficient code to catch the degenerate cases of null and 0-length strings.
057: * - unravel main loop slightly to avoid extra charAt.
058: * - now throw NullPointerExceptions on null arguments.
059: * - also support searches of char arrays.
060: *
061: * version 1.0 1999 January 8
062: *
063: */
064:
065: /*
066: Futures:
067: - search given an InputStream
068: - search starting at a given offset
069: */
070:
071: import java.io.*;
072:
073: public class Boyer {
074:
075: private static final boolean debugging = false;
076:
077: private static final String EmbeddedCopyright = "Copyright 1999 Roedy Green, Canadian Mind Products, http://mindprod.com";
078:
079: /**
080: * Pattern length under which might as well use String.indexOf
081: */
082: private static final int breakEvenLenPat = 4;
083:
084: /**
085: * Text length under which might as well use String.indexOf
086: */
087: private static final int breakEvenLenText = 2048;
088:
089: /**
090: * what we search for
091: */
092: private String pattern;
093:
094: private String prevPattern;
095:
096: /**
097: * store pattern as a char array for efficient access.
098: */
099: private char[] pat;
100:
101: /**
102: * what we search in
103: */
104: private String text;
105:
106: /**
107: * what we search in, alternate form
108: */
109: private char[] textArray;
110:
111: /**
112: * how much we can skip to right
113: * based on letter we find in the text corresponding to the
114: * end of the pattern after we find a mismatch.
115: */
116: private int[] skip;
117:
118: /**
119: * default constructor. Must use setText and setPattern afterward.
120: */
121: public Boyer() {
122:
123: }
124:
125: /**
126: * constructor that also
127: * sets text to search in for subsequent indexOf searches.
128: * Must also provide a pattern before using indexOf with setPattern.
129: *
130: * @param text String to search in. may be "" but not null.
131: *
132: */
133: public Boyer(String text) {
134: setText(text);
135: }
136:
137: /**
138: * constructor that also
139: * sets text to search in for subsequent indexOf searches.
140: * Must also provide a pattern before using indexOf with setPattern.
141: *
142: * @param text char array to search in. may be 0-length but not null.
143: *
144: */
145: public Boyer(char[] text) {
146: setText(text);
147: }
148:
149: /**
150: * Set text to search in for subsequent indexOf searches.
151: *
152: * @param text String to search in. May be "" but not null.
153: */
154: public void setText(String text) {
155: if (text == null) {
156: throw new NullPointerException();
157: }
158: this .text = text;
159: this .textArray = null;
160: }
161:
162: /**
163: * Set pattern to use for subsequent indexOf searches.
164: *
165: * @param pattern String to search for. May be "" but not null..
166: */
167: public void setPattern(String pattern) {
168: if (pattern == null) {
169: throw new NullPointerException();
170: }
171: this .pattern = pattern;
172: }
173:
174: /**
175: * Calculate how many chars you can skip
176: * to the right if you find a mismatch.
177: * It depends on what character is at
178: * the end of the word when you find a mismatch.
179: * We must match the pattern, char by char, right to left.
180: * Only called after degenerate cases,
181: * e.g. null, zero-length and 1-length Pattern are eliminated.
182: */
183: private final void analysePattern() {
184: if (pattern.equals(prevPattern)) {
185: return;
186: }
187: int lenPat = pattern.length();
188: // get pattern in fast-to-access charArray form
189: pat = pattern.toCharArray();
190: // Calculate how many slots we can skip to the right
191: // depending on which char is at the end of the word
192: // when we find a match.
193: // Recycle old array if possible.
194: if (skip == null)
195: skip = new int[256];
196: for (int i = 0; i < 256; i++) {
197: skip[i] = lenPat;
198: } // end for
199:
200: for (int i = 0; i < lenPat - 1; i++) {
201: // The following line is the key to the whole algorithm.
202: // It also deals with repeating letters in the pattern.
203: // It works conservatively, considering only the last
204: // instance of repeating letter.
205: // We exclude the last letter of the pattern, because we are
206: // only concerned with what to do on a mismatch.
207: skip[pat[i] & 0xff] = lenPat - i - 1;
208: } // end for
209: prevPattern = pattern;
210: } // end analysePattern
211:
212: /**
213: * Search for given pattern in string.
214: *
215: * @param text String to search in. May be "" but not null.
216: *
217: * @param pattern String to search for. May be "" but not null.
218: *
219: * @return 0-based offset in text, just like String.indexOf.
220: * -1 means not found.
221: */
222: public static int indexOf(String text, String pattern) {
223: return new Boyer(text).indexOf(pattern);
224: } // end indexOf
225:
226: /**
227: * Search for given pattern in char array
228: *
229: * @param text char array to search in. May be "" but not null.
230: *
231: * @param pattern String to search for. May be "" but not null.
232: *
233: * @return 0-based offset in text, just like String.indexOf.
234: * -1 means not found.
235: */
236: public static int indexOf(char[] text, String pattern) {
237: return new Boyer(text).indexOf(pattern);
238: } // end indexOf
239:
240: /**
241: * Set text to search in for subsequent indexOf searches.
242: *
243: * @param text char array to search in. May be empty but not null.
244: */
245: public void setText(char[] text) {
246: if (text == null) {
247: throw new NullPointerException();
248: }
249: this .textArray = text;
250: this .text = null;
251: }
252:
253: /**
254: * Search for given pattern in string.
255: * Text must have been set previously by the constructor or setText.
256: *
257: * @param pattern String to search for. May be "" but not null.
258: *
259: * @return 0-based offset in text, just like String.indexOf.
260: * -1 means not found.
261: */
262: public int indexOf(String pattern) {
263: setPattern(pattern);
264: return indexOf();
265: } // end indexOF
266:
267: /**
268: * Search for given pattern in String or char array.
269: * Presume Pattern and Text have been previously set either
270: * with the constructor or with setText setPattern.
271: *
272: * @return 0-based offset in text, just like String.indexOf.
273: * -1 means not found.
274: */
275: public final int indexOf() {
276: if (text != null) {
277: return indexOfviaText();
278: } else {
279: return indexOfviaTextArray();
280: }
281:
282: } // end indexOf
283:
284: /**
285: * Search for given pattern in String.
286: * Presume Pattern and Text have been previously set either
287: * with the constructor or with setText setPattern.
288: *
289: * @return 0-based offset in text, just like String.indexOf.
290: * -1 means not found.
291: */
292: private final int indexOfviaText() {
293: // If either of text or pattern is null,
294: // we will throw a NullPointerException
295: int lenText = text.length();
296: int lenPat = pattern.length();
297:
298: // Deal with cases that don't rate the full
299: // Boyer-Moore treatment.
300: if (lenText <= breakEvenLenText || lenPat <= breakEvenLenPat) {
301: // this way we are consistent with
302: // String.indexOf for "".indexOf("")
303: // which is -1 in JDK 1.1
304: // and 0 in JDK 1.2. See Bug Parade entry 4096273.
305: // "".indexOf("abc") is always -1
306: return text.indexOf(pattern);
307: } // end if
308:
309: analysePattern();
310:
311: // At this point we have the pattern, and have skip[] calculated
312: // We are commited to calculated the indexOf via Boyer-Moore.
313:
314: // tforward works left to right through the text, skipping depending
315: // on what char it found in the text corresponding to the end of the pattern,
316: // not to the place of the mismatch.
317: char testChar = 0;
318: final int lastPatChar = pat[lenPat - 1];
319: outer: for (int tforward = lenPat - 1; tforward < lenText; tforward += skip[testChar & 0xff]) {
320: // compare working right to left through both pattern and text
321: testChar = text.charAt(tforward);
322: if (testChar != lastPatChar) {
323: continue outer;
324: }
325: // step back through pattern
326: // step back through text
327: inner: for (int tback = tforward - 1, pback = lenPat - 2; pback >= 0; tback--, pback--) {
328: if (text.charAt(tback) != pat[pback])
329: continue outer;
330: } // end inner for
331: // we stepped all the way back through the pattern comparing
332: // without finding a mismatch. We found it!
333: return tforward - lenPat + 1;
334: } // end outer for
335: // stepped through entire text without finding it.
336: return -1;
337: } // end indexOf
338:
339: /**
340: * Search for given pattern in charArray.
341: * presume Pattern and Text have been previously set either
342: * with the constructor or with setText setPattern.
343: *
344: * @return 0-based offset in text, just like String.indexOf.
345: * -1 means not found.
346: */
347: private final int indexOfviaTextArray() {
348: // If either of text or pattern is null,
349: // we will throw a NullPointerException
350: int lenText = textArray.length;
351: int lenPat = pattern.length();
352:
353: // Deal with cases that don't rate the full
354: // Boyer-Moore treatment.
355: if (lenText <= breakEvenLenText / 2
356: || lenPat <= breakEvenLenPat) {
357: // this way we are consistent with
358: // String.indexOf for "".indexOf("")
359: // which is -1 in JDK 1.1
360: // and 0 in JDK 1.2
361: // "".indexOf("abc") is always -1
362: return new String(textArray).indexOf(pattern);
363: } // end if
364:
365: analysePattern();
366:
367: // At this point we have the pattern, and have skip[] calculated
368: // We are commited to calculated the indexOf via Boyer-Moore.
369:
370: // tforward works left to right through the text, skipping depending
371: // on what char it found in the text corresponding to the end of the pattern,
372: // not to the place of the mismatch.
373: char testChar = 0;
374: final int lastPatChar = pat[lenPat - 1];
375: outer: for (int tforward = lenPat - 1; tforward < lenText; tforward += skip[testChar & 0xff]) {
376: // compare working right to left through both pattern and text
377: testChar = textArray[tforward];
378: if (testChar != lastPatChar) {
379: continue outer;
380: }
381: // step back through pattern
382: // step back through text
383: inner: for (int tback = tforward - 1, pback = lenPat - 2; pback >= 0; tback--, pback--) {
384: if (textArray[tback] != pat[pback])
385: continue outer;
386: } // end inner for
387: // we stepped all the way back through the pattern comparing
388: // without finding a mismatch. We found it!
389: return tforward - lenPat + 1;
390: } // end outer for
391: // stepped through entire text without finding it.
392: return -1;
393: } // end indexOf
394:
395: /**
396: * test harness
397: */
398: public static void main(String[] args) {
399: if (debugging) {
400: System.out.println(Boyer.indexOf("dogcatwombat", "cat"));
401: System.out.println("dogcatwombat".indexOf("cat"));
402: System.out.println(Boyer.indexOf("crtcamccmcarogcatwombat",
403: "cat"));
404: System.out
405: .println("crtcamccmcarogcatwombat".indexOf("cat"));
406: System.out.println(Boyer.indexOf("dogcatwombat", ""));
407: System.out.println("dogcatwombat".indexOf(""));
408: System.out.println(Boyer.indexOf("", ""));
409: System.out.println("".indexOf(""));
410: System.out.println(Boyer.indexOf("", "abcde"));
411: System.out.println("".indexOf("abcde"));
412: System.out.println(Boyer.indexOf("dogcatwombat", "cow"));
413: System.out.println("dogcatwombat".indexOf("cow"));
414:
415: try {
416:
417: // O P E N
418: // Any file of sample characters
419: File f = new File("C:/temp/temp.txt");
420: int size = (int) f.length();
421: FileReader fr;
422: fr = new FileReader(f);
423:
424: // R E A D
425: char[] ca = new char[size];
426: int charsRead = fr.read(ca);
427: String s = new String(ca);
428:
429: long start;
430: long stop;
431: int result = 0;
432:
433: start = System.currentTimeMillis();
434: for (int i = 0; i < 1000; i++) {
435: // Need to make different so optimiser will actually do
436: // the work repeatedly.
437: result = Boyer.indexOf(ca, "efficiency" + i % 10);
438: }
439: System.out.println("Boyer char[]" + result);
440: stop = System.currentTimeMillis();
441: System.out.println("Elapsed:" + (stop - start));
442:
443: // benchmark Boyer.indexOf
444: start = System.currentTimeMillis();
445: for (int i = 0; i < 1000; i++) {
446: // Need to make different so optimiser will actually do
447: // the work repeatedly.
448: result = Boyer.indexOf(s, "efficiency" + i % 10);
449: }
450: System.out.println("Boyer String" + result);
451: stop = System.currentTimeMillis();
452: System.out.println("Elapsed:" + (stop - start));
453:
454: // Benchmark String.IndexOf
455: start = System.currentTimeMillis();
456: for (int i = 0; i < 1000; i++) {
457: result = s.indexOf("efficiency" + i % 10);
458: }
459: System.out.println("String " + result);
460: stop = System.currentTimeMillis();
461: System.out.println("Elapsed:" + (stop - start));
462:
463: // C L O S E
464: fr.close();
465:
466: } catch (IOException e) {
467: return;
468: }
469:
470: } // end if debugging
471: } // end main
472: } // end class Boyer
|