001: package org.sakaiproject.citation.util.impl;
002:
003: import java.util.HashSet;
004: import java.util.Set;
005: import java.util.StringTokenizer;
006:
007: public class InputStringParser implements
008: org.sakaiproject.citation.util.api.InputStringParser {
009: private static final java.util.Set<String> COMMON_WORDS = new java.util.HashSet<String>();
010: private static final String DOUBLE_QUOTE = "\"";
011:
012: // the parser switches between these two sets of delimiters
013: private static final String WHITESPACE_AND_QUOTES = " \t\r\n\"";
014: private static final String QUOTES_ONLY = "\"";
015:
016: // Common words against which searches will not be performed.
017: static {
018: COMMON_WORDS.add("a");
019: COMMON_WORDS.add("and");
020: COMMON_WORDS.add("be");
021: COMMON_WORDS.add("for");
022: COMMON_WORDS.add("from");
023: COMMON_WORDS.add("has");
024: COMMON_WORDS.add("i");
025: COMMON_WORDS.add("in");
026: COMMON_WORDS.add("is");
027: COMMON_WORDS.add("it");
028: COMMON_WORDS.add("of");
029: COMMON_WORDS.add("on");
030: COMMON_WORDS.add("to");
031: COMMON_WORDS.add("the");
032: COMMON_WORDS.add("not");
033: COMMON_WORDS.add("or");
034: }
035:
036: /**
037: * Parse keywords into a Set of Strings. This method recognizes phrases
038: * (marked using quotation marks) and drops common words (i.e. and, or, not,
039: * to, the, etc.) if they are not part of a phrase. Each element of the
040: * resulting Set will be a single term or a phrase.
041: *
042: * @param inputString the input a user has submitted (i.e. from an HTML
043: * input field in a form)
044: * @return a Set containing individual search terms or phrases or null if
045: * inputString is null or empty.
046: */
047: public Set<String> parseInputString(String inputString) {
048: if (inputString == null || inputString.trim().equals("")) {
049: return null;
050: }
051:
052: Set<String> result = new HashSet<String>();
053:
054: boolean returnTokens = true;
055: String currentDelim = WHITESPACE_AND_QUOTES;
056: StringTokenizer parser = new StringTokenizer(inputString,
057: currentDelim, returnTokens);
058:
059: String token = null;
060: while (parser.hasMoreTokens()) {
061: token = parser.nextToken(currentDelim);
062: if (!isDoubleQuote(token)) {
063: addNonTrivialWordToResult(token, result);
064: } else {
065: currentDelim = switchDelimiters(currentDelim);
066: }
067: }
068: return result;
069: }
070:
071: private static boolean isCommonWord(String searchTokenCandidate) {
072: return COMMON_WORDS.contains(searchTokenCandidate);
073: }
074:
075: private static boolean textHasContent(String text) {
076: return (text != null) && (!text.trim().equals(""));
077: }
078:
079: private static void addNonTrivialWordToResult(String token,
080: Set<String> result) {
081: if (textHasContent(token) && !isCommonWord(token.trim())) {
082: result.add(token.trim());
083: }
084: }
085:
086: private static boolean isDoubleQuote(String token) {
087: return token.equals(DOUBLE_QUOTE);
088: }
089:
090: private static String switchDelimiters(String currentDelim) {
091: String result = null;
092: if (currentDelim.equals(WHITESPACE_AND_QUOTES)) {
093: result = QUOTES_ONLY;
094: } else {
095: result = WHITESPACE_AND_QUOTES;
096: }
097: return result;
098: }
099:
100: }
|