001: /*
002: * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
003: * Boyer-Moore algorithm
004: * :tabSize=8:indentSize=8:noTabs=false:
005: * :folding=explicit:collapseFolds=1:
006: *
007: * Copyright (C) 1999, 2000 mike dillon
008: * Portions copyright (C) 2001 Tom Locke
009: * Portions copyright (C) 2001, 2002 Slava Pestov
010: *
011: * This program is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU General Public License
013: * as published by the Free Software Foundation; either version 2
014: * of the License, or any later version.
015: *
016: * This program is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU General Public License for more details.
020: *
021: * You should have received a copy of the GNU General Public License
022: * along with this program; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
024: */
025:
026: package org.gjt.sp.jedit.search;
027:
028: /**
029: * Implements literal search using the Boyer-Moore algorithm.
030: * @version $Id: BoyerMooreSearchMatcher.java 9481 2007-05-02 00:34:44Z k_satoda $
031: */
032: public class BoyerMooreSearchMatcher extends SearchMatcher {
033: //{{{ BoyerMooreSearchMatcher constructor
034: /**
035: * Creates a new string literal matcher.
036: * @param pattern the search pattern
037: * @param ignoreCase <code>true</code> if you want to ignore case
038: */
039: public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase) {
040: this .pattern = pattern.toCharArray();
041: if (ignoreCase) {
042: for (int i = 0; i < this .pattern.length; i++) {
043: this .pattern[i] = Character
044: .toUpperCase(this .pattern[i]);
045: }
046: }
047:
048: this .ignoreCase = ignoreCase;
049:
050: pattern_end = this .pattern.length - 1;
051: } //}}}
052:
053: //{{{ nextMatch() method
054: /**
055: * Returns the offset of the first match of the specified text
056: * within this matcher.
057: * @param text The text to search in
058: * @param start True if the start of the segment is the beginning of the
059: * buffer
060: * @param end True if the end of the segment is the end of the buffer
061: * @param firstTime If false and the search string matched at the start
062: * offset with length zero, automatically find next match
063: * @param reverse If true, searching will be performed in a backward
064: * direction.
065: * @return an array where the first element is the start offset
066: * of the match, and the second element is the end offset of
067: * the match
068: * @since jEdit 4.2pre4
069: */
070: public SearchMatcher.Match nextMatch(CharSequence text,
071: boolean start, boolean end, boolean firstTime,
072: boolean reverse) {
073: int pos = match(text, reverse);
074:
075: if (pos == -1) {
076: return null;
077: } else {
078: returnValue.start = pos;
079: returnValue.end = pos + pattern.length;
080: return returnValue;
081: }
082: } //}}}
083:
084: //{{{ match() method
085: /**
086: * a good introduction to the Boyer-Moore fast string matching
087: * algorithm may be found on Moore's website at:
088: *
089: * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
090: *
091: * @since jEdit 4.3pre5
092: */
093: public int match(CharSequence text, boolean reverse) {
094: //{{{
095: // lazily create skip and suffix arrays for either the
096: // search pattern, or the reversed search pattern
097: int[] skip, suffix;
098: if (reverse) {
099: if (back_skip == null) {
100: back_skip = generateSkipArray(true);
101: back_suffix = generateSuffixArray(true);
102: }
103: skip = back_skip;
104: suffix = back_suffix;
105: } else {
106: if (fwd_skip == null) {
107: fwd_skip = generateSkipArray(false);
108: fwd_suffix = generateSuffixArray(false);
109: }
110: skip = fwd_skip;
111: suffix = fwd_suffix;
112: } //}}}
113:
114: // position variable for pattern test position
115: int pos;
116:
117: // position variable for pattern start
118: int anchor = 0;
119:
120: // last possible start position of a match with this pattern;
121: // this is negative if the pattern is longer than the text
122: // causing the search loop below to immediately fail
123: //int last_anchor = reverseSearch
124: // ? offset + pattern.length - 1
125: // : length - pattern.length;
126:
127: char ch = 0;
128:
129: int bad_char;
130: int good_suffix;
131:
132: // the search works by starting the anchor (first character
133: // of the pattern) at the initial offset. as long as the
134: // anchor is far enough from the enough of the text for the
135: // pattern to match, and until the pattern matches, we
136: // compare the pattern to the text from the last character
137: // to the first character in reverse order. where a character
138: // in the pattern mismatches, we use the two heuristics
139: // based on the mismatch character and its position in the
140: // pattern to determine the furthest we can move the anchor
141: // without missing any potential pattern matches.
142: SEARCH: while (anchor + pattern_end < text.length()) {
143: for (pos = pattern_end; pos >= 0; --pos) {
144: ch = text.charAt(pos + anchor);
145: if (ignoreCase)
146: ch = Character.toUpperCase(ch);
147:
148: // pattern test
149: if ((reverse ? ch != pattern[pattern_end - pos]
150: : ch != pattern[pos])) {
151: // character mismatch, determine how many characters to skip
152:
153: // heuristic #1
154: bad_char = pos - skip[getSkipIndex(ch)];
155:
156: // heuristic #2
157: good_suffix = suffix[pos];
158:
159: // skip the greater of the two distances provided by the
160: // heuristics
161: int skip_index = (bad_char > good_suffix) ? bad_char
162: : good_suffix;
163: anchor += skip_index;
164:
165: // go back to the while loop
166: continue SEARCH;
167: }
168: }
169:
170: // MATCH: return the position of its first character
171: return anchor;
172: }
173:
174: // MISMATCH: return -1 as defined by API
175: return -1;
176: } //}}}
177:
178: //{{{ toString() method
179: public String toString() {
180: return "BoyerMooreSearchMatcher[" + new String(pattern) + ']';
181: } //}}}
182:
183: //{{{ Private members
184: private char[] pattern;
185: private int pattern_end;
186: private boolean ignoreCase;
187:
188: // Boyer-Moore member fields
189: private int[] fwd_skip;
190: private int[] fwd_suffix;
191: private int[] back_skip;
192: private int[] back_suffix;
193:
194: // Boyer-Moore helper methods
195:
196: //{{{ generateSkipArray() method
197: /*
198: * the 'skip' array is used to determine for each index in the
199: * hashed alphabet how many characters can be skipped if
200: * a mismatch occurs on a characater hashing to that index.
201: */
202: private int[] generateSkipArray(boolean reverse) {
203: // initialize the skip array to all zeros
204: int[] skip = new int[256];
205:
206: // leave the table cleanly-initialized for an empty pattern
207: if (pattern.length == 0)
208: return skip;
209:
210: int pos = 0;
211:
212: do {
213: skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
214: } while (++pos < pattern.length);
215:
216: return skip;
217: } //}}}
218:
219: //{{{ getSkipIndex() method
220: /*
221: * to avoid our skip table having a length of 2 ^ 16, we hash each
222: * character of the input into a character in the alphabet [\x00-\xFF]
223: * using the lower 8 bits of the character's value (resulting in
224: * a more reasonable skip table of length 2 ^ 8).
225: *
226: * the result of this is that more than one character can hash to the
227: * same index, but since the skip table encodes the position of
228: * occurence of the character furthest into the string with a particular
229: * index (whether or not it is the only character with that index), an
230: * index collision only means that that this heuristic will give a
231: * sub-optimal skip (i.e. a complete skip table could use the differences
232: * between colliding characters to maximal effect, at the expense of
233: * building a table that is over 2 orders of magnitude larger and very
234: * sparse).
235: */
236: private static final int getSkipIndex(char ch) {
237: return ch & 0x000000FF;
238: } //}}}
239:
240: //{{{ generateSuffixArray() method
241: /*
242: * XXX: hairy code that is basically just a functional(?) port of some
243: * other code i barely understood
244: */
245: private int[] generateSuffixArray(boolean reverse) {
246: int m = pattern.length;
247:
248: int j = m + 1;
249:
250: int[] suffix = new int[j];
251: int[] tmp = new int[j];
252: tmp[m] = j;
253:
254: for (int i = m; i > 0; --i) {
255: while (j <= m
256: && pattern[reverse ? pattern_end - i + 1 : i - 1] != pattern[reverse ? pattern_end
257: - j + 1
258: : j - 1]) {
259: if (suffix[j] == 0) {
260: suffix[j] = j - i;
261: }
262:
263: j = tmp[j];
264: }
265:
266: tmp[i - 1] = --j;
267: }
268:
269: int k = tmp[0];
270:
271: for (j = 0; j <= m; j++) {
272: // the code above builds a 1-indexed suffix array,
273: // but we shift it to be 0-indexed, ignoring the
274: // original 0-th element
275: if (j > 0) {
276: suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
277: }
278:
279: if (j == k) {
280: k = tmp[k];
281: }
282: }
283:
284: return suffix;
285: } //}}}
286:
287: //}}}
288: }
|