001: /**
002: * com.mckoi.database.PatternSearch 05 Sep 1998
003: *
004: * Mckoi SQL Database ( http://www.mckoi.com/database )
005: * Copyright (C) 2000, 2001, 2002 Diehl and Associates, Inc.
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License
009: * Version 2 as published by the Free Software Foundation.
010: *
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: * GNU General Public License Version 2 for more details.
015: *
016: * You should have received a copy of the GNU General Public License
017: * Version 2 along with this program; if not, write to the Free Software
018: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
019: *
020: * Change Log:
021: *
022: *
023: */package com.mckoi.database;
024:
025: import com.mckoi.database.global.Types;
026: import com.mckoi.util.IntegerVector;
027: import com.mckoi.util.BlockIntegerList;
028: import com.mckoi.util.IntegerIterator;
029:
030: /**
031: * This is a static class that performs the operations to do a pattern search
032: * on a given column of a table. The pattern syntax is very simple and follows
033: * that of the SQL standard.
034: * <p>
035: * It works as follows:
036: * The '%' character represents any sequence of characters.
037: * The '_' character represents some character.
038: * <p>
039: * Therefore, the pattern search 'Toby%' will find all rows that start with
040: * the string 'Toby' and end with any sequence of characters. The pattern
041: * 'T% Downer%' will find all names starting with T and containing 'Downer'
042: * somewhere in the end. The pattern '_at' will find all three letter words
043: * ending with 'at'.
044: * <p>
045: * NOTE: A 'ab%' type search is faster than a '%bc' type search. If the start
046: * of the search pattern is unknown then the entire contents of the column
047: * need to be accessed.
048: *
049: * @author Tobias Downer
050: */
051:
052: public final class PatternSearch {
053:
054: /**
055: * Statics for the tokens.
056: */
057: private final static char ZERO_OR_MORE_CHARS = '%';
058: private final static char ONE_CHAR = '_';
059:
060: public static boolean testSearch(String pattern, String expression,
061: boolean result) {
062: System.out.print("Pattern: ");
063: System.out.println("'" + pattern + "'");
064: System.out.print("Expression: ");
065: System.out.println("'" + expression + "'");
066:
067: boolean tested_as = fullPatternMatch(pattern, expression, '\\');
068: System.out.print("Result: ");
069: System.out.print(tested_as);
070: if (tested_as != result) {
071: System.out.println(" *** FAILED, Expected: " + result
072: + " ***");
073: } else {
074: System.out.println();
075: }
076: return tested_as;
077: }
078:
079: public static void main(String[] args) {
080: // Testing the SQL expression parser.
081: testSearch("", "abc", false);
082: testSearch("%", "abc", true);
083: testSearch("a%", "abc", true);
084: testSearch("ab%", "abc", true);
085: testSearch("abc%", "abc", true);
086: testSearch("abcd%", "abc", false);
087: testSearch("abcd", "abc", false);
088: testSearch("abc", "abc", true);
089: testSearch("ab", "abc", false);
090: testSearch("a", "abc", false);
091: testSearch("a_", "abc", false);
092: testSearch("ab_", "abc", true);
093: testSearch("abc_", "abc", false);
094: testSearch("a_c", "abc", true);
095: testSearch("a%bc", "abc", true);
096: testSearch("a%c", "abc", true);
097: testSearch("%c", "abc", true);
098: testSearch("a_bc", "abc", false);
099: testSearch("__c", "abc", true);
100: testSearch("a__", "abc", true);
101:
102: testSearch("a\\_\\_", "a__", true);
103: testSearch("a\\__", "a__", true);
104: testSearch("a\\__", "a_b", true);
105: testSearch("\\___", "_ab", true);
106: testSearch("\\_\\__", "_ab", false);
107: testSearch("\\_\\__", "__b", true);
108:
109: testSearch("\\%ab", "%ab", true);
110: testSearch("ab\\%", "ab%", true);
111: testSearch("cab\\%", "cab", false);
112: testSearch("cab%", "cab", true);
113:
114: }
115:
116: /**
117: * Returns true if the given character is a wild card (unknown).
118: */
119: private static boolean isWildCard(char ch) {
120: return (ch == ONE_CHAR || ch == ZERO_OR_MORE_CHARS);
121: }
122:
123: /**
124: * Matches a pattern against a string and returns true if it matches or
125: * false otherwise. This matches patterns that do not necessarily start
126: * with a wild card unlike the 'patternMatch' method.
127: */
128: public static boolean fullPatternMatch(String pattern,
129: final String str, char escape_char) {
130: StringBuffer start = new StringBuffer();
131: String rezt = null;
132: int len = pattern.length();
133: int i = 0;
134: boolean last_escape_char = false;
135: for (; i < len && rezt == null; ++i) {
136: char c = pattern.charAt(i);
137: if (last_escape_char) {
138: last_escape_char = false;
139: start.append(c);
140: } else if (c == escape_char) {
141: last_escape_char = true;
142: } else if (isWildCard(c)) {
143: rezt = pattern.substring(i);
144: } else {
145: start.append(c);
146: }
147: }
148:
149: if (rezt == null) {
150: rezt = "";
151: }
152:
153: String st = new String(start);
154:
155: // System.out.println("--");
156: // System.out.println(str);
157: // System.out.println(st);
158:
159: if (str.startsWith(st)) {
160: String str_rezt = str.substring(st.length()); // (i)
161:
162: if (rezt.length() > 0) {
163: return patternMatch(rezt, str_rezt, escape_char);
164: } else {
165: return str_rezt.length() == 0;
166: }
167: } else {
168: return false;
169: }
170:
171: }
172:
173: /**
174: * This is the pattern match recurrsive method. It recurses on each wildcard
175: * expression in the pattern which makes for slightly better efficiency than
176: * a character recurse algorithm. However, patterns such as "_%_a" will
177: * result in many recursive calls.
178: * <p>
179: * Returns true if the pattern matches.
180: * <p>
181: * NOTE: That "_%_" will be less efficient than "__%" and will produce the
182: * same result.
183: * NOTE: It requires that a wild card character is the first character in
184: * the expression.
185: * ISSUE: Pattern optimiser, we should optimise wild cards of type "%__" to
186: * "__%", or "%__%_%_%" to "____%". Optimised forms are identical in
187: * result and more efficient. This optimisation could be performed by the
188: * client during parsing of the LIKE statement.
189: * HACKING ISSUE: Badly formed wild cards may result in hogging of server
190: * side resources.
191: */
192: public static boolean patternMatch(String pattern,
193: String expression, char escape_char) {
194:
195: // Look at first character in pattern, if it's a ONE_CHAR wildcard then
196: // check expression and pattern match until next wild card.
197:
198: if (pattern.charAt(0) == ONE_CHAR) {
199:
200: // Else step through each character in pattern and see if it matches up
201: // with the expression until a wild card is found or the end is reached.
202: // When the end of the pattern is reached, 'finished' is set to true.
203:
204: int i = 1;
205: boolean finished = (i >= pattern.length() || expression
206: .length() < 1);
207: boolean last_was_escape = false;
208: int checked = 0;
209: while (!finished) {
210: char c = pattern.charAt(i);
211: if (!last_was_escape && c == escape_char) {
212: last_was_escape = true;
213: if (i >= expression.length()) {
214: return false;
215: }
216: ++i;
217: } else if (last_was_escape || !isWildCard(c)) {
218: last_was_escape = false;
219: // If expression and pattern character doesn't match or end of
220: // expression reached, search has failed.
221: if (i >= expression.length()
222: || c != expression.charAt(i)) {
223: return false;
224: }
225: ++i;
226: ++checked;
227: } else {
228: // found a wildcard, so recurse on this wildcard
229: return patternMatch(pattern.substring(i),
230: expression.substring(i), escape_char);
231: }
232:
233: finished = (i >= pattern.length());
234: }
235:
236: // The pattern length minus any escaped characters
237: int real_pattern_length = 0;
238: int sz = pattern.length();
239: for (int n = 0; n < sz; ++n) {
240: if (pattern.charAt(n) != escape_char) {
241: ++real_pattern_length;
242: } else {
243: ++n;
244: }
245: }
246:
247: // If pattern and expression lengths match then we have walked through
248: // the expression and found a match, otherwise no match.
249:
250: return real_pattern_length == expression.length();
251:
252: }
253:
254: // Therefore we are doing a ZERO_OR_MORE_CHARS wildcard check.
255:
256: // If the pattern is '%' (ie. pattern.length() == 1 because it's only 1
257: // character in length (the '%' character)) then it doesn't matter what the
258: // expression is, we have found a match.
259:
260: if (pattern.length() == 1) {
261: return true;
262: }
263:
264: // Look at following character in pattern, and extract all the characters
265: // before the next wild card.
266:
267: StringBuffer next_string = new StringBuffer();
268: int i = 1;
269: boolean finished = (i >= pattern.length());
270: boolean last_was_escape = false;
271: while (!finished) {
272: char next_char = pattern.charAt(i);
273: if (!last_was_escape && next_char == escape_char) {
274: last_was_escape = true;
275: ++i;
276: if (i >= pattern.length()) {
277: finished = true;
278: }
279: } else if (last_was_escape || !isWildCard(next_char)) {
280: last_was_escape = false;
281: next_string.append(next_char);
282: ++i;
283: if (i >= pattern.length()) {
284: finished = true;
285: }
286: } else {
287: finished = true;
288: }
289: }
290:
291: String find_string = new String(next_string);
292:
293: // Special case optimisation if we have found the end of the pattern, all
294: // we need to do is check if 'find_string' is on the end of the expression.
295: // eg. pattern = "%er", will have a 'find_string' of "er" and it is saying
296: // 'does the expression end with 'er''.
297:
298: if (i >= pattern.length()) {
299: return (expression.endsWith(find_string));
300: }
301:
302: // Otherwise we must have finished with another wild card.
303: // Try and find 'next_string' in the expression. If its found then
304: // recurse over the next pattern.
305:
306: int find_str_length = find_string.length();
307: int str_index = expression.indexOf(find_string, 0);
308:
309: while (str_index != -1) {
310:
311: boolean matched = patternMatch(pattern
312: .substring(1 + find_str_length), expression
313: .substring(str_index + find_str_length),
314: escape_char);
315:
316: if (matched) {
317: return true;
318: }
319:
320: str_index = expression.indexOf(find_string, str_index + 1);
321: }
322:
323: return false;
324:
325: }
326:
327: /**
328: * This is the search method. It requires a table to search, a column of the
329: * table, and a pattern. It returns the rows in the table that match the
330: * pattern if any. Pattern searching only works successfully on columns that
331: * are of type Types.DB_STRING.
332: * This works by first reducing the search to all cells that contain the
333: * first section of text. ie. pattern = "Toby% ___ner" will first reduce
334: * search to all rows between "Toby" and "Tobz". This makes for better
335: * efficiency.
336: */
337: static IntegerVector search(Table table, int column, String pattern) {
338: return search(table, column, pattern, '\\');
339: }
340:
341: /**
342: * This is the search method. It requires a table to search, a column of the
343: * table, and a pattern. It returns the rows in the table that match the
344: * pattern if any. Pattern searching only works successfully on columns that
345: * are of type Types.DB_STRING.
346: * This works by first reducing the search to all cells that contain the
347: * first section of text. ie. pattern = "Toby% ___ner" will first reduce
348: * search to all rows between "Toby" and "Tobz". This makes for better
349: * efficiency.
350: */
351: static IntegerVector search(Table table, int column,
352: String pattern, char escape_char) {
353:
354: // Get the type for the column
355: TType col_type = table.getDataTableDef().columnAt(column)
356: .getTType();
357:
358: // If the column type is not a string type then report an error.
359: if (!(col_type instanceof TStringType)) {
360: throw new Error("Unable to perform a pattern search "
361: + "on a non-String type column.");
362: }
363: TStringType col_string_type = (TStringType) col_type;
364:
365: // ---------- Pre Search ----------
366:
367: // First perform a 'pre-search' on the head of the pattern. Note that
368: // there may be no head in which case the entire column is searched which
369: // has more potential to be expensive than if there is a head.
370:
371: StringBuffer pre_pattern = new StringBuffer();
372: int i = 0;
373: boolean finished = i >= pattern.length();
374: boolean last_is_escape = false;
375:
376: while (!finished) {
377: char c = pattern.charAt(i);
378: if (last_is_escape) {
379: last_is_escape = true;
380: pre_pattern.append(c);
381: } else if (c == escape_char) {
382: last_is_escape = true;
383: } else if (!isWildCard(c)) {
384: pre_pattern.append(c);
385:
386: ++i;
387: if (i >= pattern.length()) {
388: finished = true;
389: }
390:
391: } else {
392: finished = true;
393: }
394: }
395:
396: // This is set with the remaining search.
397: String post_pattern;
398:
399: // This is our initial search row set. In the second stage, rows are
400: // eliminated from this vector.
401: IntegerVector search_case;
402:
403: if (i >= pattern.length()) {
404: // If the pattern has no 'wildcards' then just perform an EQUALS
405: // operation on the column and return the results.
406:
407: TObject cell = new TObject(col_type, pattern);
408: return table.selectRows(column, Operator.get("="), cell);
409:
410: // RETURN
411: } else if (pre_pattern.length() == 0
412: || col_string_type.getLocale() != null) {
413:
414: // No pre-pattern easy search :-(. This is either because there is no
415: // pre pattern (it starts with a wild-card) or the locale of the string
416: // is non-lexicographical. In either case, we need to select all from
417: // the column and brute force the search space.
418:
419: search_case = table.selectAll(column);
420: post_pattern = pattern;
421:
422: } else {
423:
424: // Criteria met: There is a pre_pattern, and the column locale is
425: // lexicographical.
426:
427: // Great, we can do an upper and lower bound search on our pre-search
428: // set. eg. search between 'Geoff' and 'Geofg' or 'Geoff ' and
429: // 'Geoff\33'
430:
431: String lower_bounds = new String(pre_pattern);
432: int next_char = pre_pattern.charAt(i - 1) + 1;
433: pre_pattern.setCharAt(i - 1, (char) next_char);
434: String upper_bounds = new String(pre_pattern);
435:
436: post_pattern = pattern.substring(i);
437:
438: TObject cell_lower = new TObject(col_type, lower_bounds);
439: TObject cell_upper = new TObject(col_type, upper_bounds);
440:
441: // Select rows between these two points.
442:
443: search_case = table.selectRows(column, cell_lower,
444: cell_upper);
445:
446: }
447:
448: // ---------- Post search ----------
449:
450: // [This optimization assumes that (NULL like '%' = true) which is incorrect]
451: // // EFFICIENCY: This is a special case efficiency case.
452: // // If 'post_pattern' is '%' then we have already found all the records in
453: // // our pattern.
454: //
455: // if (post_pattern.equals("%")) {
456: // return search_case;
457: // }
458:
459: int pre_index = i;
460:
461: // Now eliminate from our 'search_case' any cells that don't match our
462: // search pattern.
463: // Note that by this point 'post_pattern' will start with a wild card.
464: // This follows the specification for the 'patternMatch' method.
465: // EFFICIENCY: This is a brute force iterative search. Perhaps there is
466: // a faster way of handling this?
467:
468: BlockIntegerList i_list = new BlockIntegerList(search_case);
469: IntegerIterator iterator = i_list
470: .iterator(0, i_list.size() - 1);
471:
472: while (iterator.hasNext()) {
473:
474: // Get the expression (the contents of the cell at the given column, row)
475:
476: boolean pattern_matches = false;
477: TObject cell = table.getCellContents(column, iterator
478: .next());
479: // Null values doesn't match with anything
480: if (!cell.isNull()) {
481: String expression = cell.getObject().toString();
482: // We must remove the head of the string, which has already been
483: // found from the pre-search section.
484: expression = expression.substring(pre_index);
485: pattern_matches = patternMatch(post_pattern,
486: expression, escape_char);
487: }
488: if (!pattern_matches) {
489: // If pattern does not match then remove this row from the search.
490: iterator.remove();
491: }
492:
493: }
494:
495: return new IntegerVector(i_list);
496:
497: }
498:
499: // ---------- Matching against a regular expression ----------
500:
501: /**
502: * Matches a string against a regular expression pattern. We use the regex
503: * library as specified in the DatabaseSystem configuration.
504: */
505: static boolean regexMatch(TransactionSystem system, String pattern,
506: String value) {
507: // If the first character is a '/' then we assume it's a Perl style regular
508: // expression (eg. "/.*[0-9]+\/$/i")
509: if (pattern.startsWith("/")) {
510: int end = pattern.lastIndexOf('/');
511: String pat = pattern.substring(1, end);
512: String ops = pattern.substring(end + 1);
513: return system.getRegexLibrary().regexMatch(pat, ops, value);
514: } else {
515: // Otherwise it's a regular expression with no operators
516: return system.getRegexLibrary().regexMatch(pattern, "",
517: value);
518: }
519: }
520:
521: /**
522: * Matches a column of a table against a constant regular expression
523: * pattern. We use the regex library as specified in the DatabaseSystem
524: * configuration.
525: */
526: static IntegerVector regexSearch(Table table, int column,
527: String pattern) {
528: // If the first character is a '/' then we assume it's a Perl style regular
529: // expression (eg. "/.*[0-9]+\/$/i")
530: if (pattern.startsWith("/")) {
531: int end = pattern.lastIndexOf('/');
532: String pat = pattern.substring(1, end);
533: String ops = pattern.substring(end + 1);
534: return table.getDatabase().getSystem().getRegexLibrary()
535: .regexSearch(table, column, pat, ops);
536: } else {
537: // Otherwise it's a regular expression with no operators
538: return table.getDatabase().getSystem().getRegexLibrary()
539: .regexSearch(table, column, pattern, "");
540: }
541: }
542:
543: }
|