001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.solr.analysis;
017:
018: import org.apache.lucene.analysis.TokenStream;
019: import org.apache.solr.core.Config;
020: import org.apache.solr.core.SolrCore;
021: import org.apache.solr.util.StrUtils;
022:
023: import java.io.IOException;
024: import java.util.ArrayList;
025: import java.util.List;
026: import java.util.Map;
027:
028: /**
029: * @author yonik
030: * @version $Id: SynonymFilterFactory.java 472574 2006-11-08 18:25:52Z yonik $
031: */
032: public class SynonymFilterFactory extends BaseTokenFilterFactory {
033: public void init(Map<String, String> args) {
034: super .init(args);
035: String synonyms = args.get("synonyms");
036:
037: ignoreCase = getBoolean("ignoreCase", false);
038: expand = getBoolean("expand", true);
039:
040: if (synonyms != null) {
041: List<String> wlist = null;
042: try {
043: wlist = Config.getLines(synonyms);
044: } catch (IOException e) {
045: throw new RuntimeException(e);
046: }
047: synMap = new SynonymMap();
048: parseRules(wlist, synMap, "=>", ",", ignoreCase, expand);
049: if (wlist.size() <= 20) {
050: SolrCore.log.fine("SynonymMap " + synonyms + ":"
051: + synMap);
052: }
053: }
054:
055: }
056:
057: private SynonymMap synMap;
058: private boolean ignoreCase;
059: private boolean expand;
060:
061: private static void parseRules(List<String> rules, SynonymMap map,
062: String mappingSep, String synSep, boolean ignoreCase,
063: boolean expansion) {
064: int count = 0;
065: for (String rule : rules) {
066: // To use regexes, we need an expression that specifies an odd number of chars.
067: // This can't really be done with string.split(), and since we need to
068: // do unescaping at some point anyway, we wouldn't be saving any effort
069: // by using regexes.
070:
071: List<String> mapping = StrUtils.splitSmart(rule,
072: mappingSep, false);
073:
074: List<List<String>> source;
075: List<List<String>> target;
076:
077: if (mapping.size() > 2) {
078: throw new RuntimeException("Invalid Synonym Rule:"
079: + rule);
080: } else if (mapping.size() == 2) {
081: source = getSynList(mapping.get(0), synSep);
082: target = getSynList(mapping.get(1), synSep);
083: } else {
084: source = getSynList(mapping.get(0), synSep);
085: if (expansion) {
086: // expand to all arguments
087: target = source;
088: } else {
089: // reduce to first argument
090: target = new ArrayList<List<String>>(1);
091: target.add(source.get(0));
092: }
093: }
094:
095: boolean includeOrig = false;
096: for (List<String> fromToks : source) {
097: count++;
098: for (List<String> toToks : target) {
099: map.add(ignoreCase ? StrUtils.toLower(fromToks)
100: : fromToks, SynonymMap.makeTokens(toToks),
101: includeOrig, true);
102: }
103: }
104: }
105: }
106:
107: // a , b c , d e f => [[a],[b,c],[d,e,f]]
108: private static List<List<String>> getSynList(String str,
109: String separator) {
110: List<String> strList = StrUtils.splitSmart(str, separator,
111: false);
112: // now split on whitespace to get a list of token strings
113: List<List<String>> synList = new ArrayList<List<String>>();
114: for (String toks : strList) {
115: List<String> tokList = StrUtils.splitWS(toks, true);
116: synList.add(tokList);
117: }
118: return synList;
119: }
120:
121: public TokenStream create(TokenStream input) {
122: return new SynonymFilter(input, synMap, ignoreCase);
123: }
124:
125: }
|