001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
*/
package org.apache.solr.analysis;
017:
018: import org.apache.commons.io.IOUtils;
019: import org.apache.lucene.analysis.Token;
020: import org.apache.lucene.analysis.TokenStream;
021: import org.apache.solr.core.SolrException;
022:
023: import java.io.IOException;
024: import java.io.Reader;
025: import java.util.ArrayList;
026: import java.util.Iterator;
027: import java.util.List;
028: import java.util.Map;
029: import java.util.regex.Matcher;
030: import java.util.regex.Pattern;
031:
032: /**
033: * This tokenizer uses regex pattern matching to construct distinct tokens
034: * for the input stream. It takes two arguments: "pattern" and "group"
035: *
036: * "pattern" is the regular expression.
037: * "group" says which group to extract into tokens.
038: *
039: * group=-1 (the default) is equivalent to "split". In this case, the tokens will
040: * be equivalent to the output from:
041: *
042: * http://java.sun.com/j2se/1.4.2/docs/api/java/lang/String.html#split(java.lang.String)
043: *
044: * Using group >= 0 selects the matching group as the token. For example, if you have:
045: *
* pattern = '([^']+)'
047: * group = 0
048: * input = aaa 'bbb' 'ccc'
049: *
050: * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
051: * but using group=1, the output would be: bbb and ccc (no ' marks)
052: *
053: *
054: * @author ryan
055: * @since solr1.2
056: * @version $Id:$
057: */
058: public class PatternTokenizerFactory implements TokenizerFactory {
059: public static final String PATTERN = "pattern";
060: public static final String GROUP = "group";
061:
062: protected Map<String, String> args;
063: protected Pattern pattern;
064: protected int group;
065:
066: /**
067: * Require a configured pattern
068: */
069: public void init(Map<String, String> args) {
070: this .args = args;
071: String regex = args.get(PATTERN);
072: if (regex == null) {
073: throw new SolrException(
074: SolrException.ErrorCode.SERVER_ERROR,
075: "missing required argument: " + PATTERN);
076: }
077: int flags = 0; // TODO? -- read flags from config CASE_INSENSITIVE, etc
078: pattern = Pattern.compile(regex, flags);
079:
080: group = -1; // use 'split'
081: String g = args.get(GROUP);
082: if (g != null) {
083: try {
084: group = Integer.parseInt(g);
085: } catch (Exception ex) {
086: throw new SolrException(
087: SolrException.ErrorCode.SERVER_ERROR,
088: "invalid group argument: " + g);
089: }
090: }
091: }
092:
093: /**
094: * The arguments passed to init()
095: */
096: public Map<String, String> getArgs() {
097: return this .args;
098: }
099:
100: /**
101: * Split the input using configured pattern
102: */
103: public TokenStream create(Reader input) {
104: try {
105: // Read the input into a single string
106: String str = IOUtils.toString(input);
107:
108: Matcher matcher = pattern.matcher(str);
109: List<Token> tokens = (group < 0) ? split(matcher, str)
110: : group(matcher, str, group);
111:
112: final Iterator<Token> iter = tokens.iterator();
113: return new TokenStream() {
114: @Override
115: public Token next() throws IOException {
116: if (iter.hasNext()) {
117: return iter.next();
118: }
119: return null;
120: }
121: };
122: } catch (IOException ex) {
123: throw new SolrException(
124: SolrException.ErrorCode.SERVER_ERROR, ex);
125: }
126: }
127:
128: /**
129: * This behaves just like String.split( ), but returns a list of Tokens
130: * rather then an array of strings
131: */
132: public static List<Token> split(Matcher matcher, String input) {
133: int index = 0;
134: int lastNonEmptySize = Integer.MAX_VALUE;
135: ArrayList<Token> matchList = new ArrayList<Token>();
136:
137: // Add segments before each match found
138: while (matcher.find()) {
139: String match = input.subSequence(index, matcher.start())
140: .toString();
141: matchList.add(new Token(match, index, matcher.start()));
142: index = matcher.end();
143: if (match.length() > 0) {
144: lastNonEmptySize = matchList.size();
145: }
146: }
147:
148: // If no match is found, return the full string
149: if (index == 0) {
150: matchList.add(new Token(input, 0, input.length()));
151: } else {
152: String match = input.subSequence(index, input.length())
153: .toString();
154: matchList.add(new Token(match, index, input.length()));
155: if (match.length() > 0) {
156: lastNonEmptySize = matchList.size();
157: }
158: }
159:
160: // Don't use trailing empty strings. This behavior matches String.split();
161: if (lastNonEmptySize < matchList.size()) {
162: return matchList.subList(0, lastNonEmptySize);
163: }
164: return matchList;
165: }
166:
167: /**
168: * Create tokens from the matches in a matcher
169: */
170: public static List<Token> group(Matcher matcher, String input,
171: int group) {
172: ArrayList<Token> matchList = new ArrayList<Token>();
173: while (matcher.find()) {
174: Token t = new Token(matcher.group(group), matcher
175: .start(group), matcher.end(group));
176: matchList.add(t);
177: }
178: return matchList;
179: }
180: }
|