001: /*
002: * Copyright 2004-2006 the original author or authors.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.compass.core.lucene.engine.analyzer;
018:
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.compass.core.config.CompassConfigurable;
import org.compass.core.config.CompassSettings;
import org.compass.core.engine.SearchEngineException;
import org.compass.core.lucene.LuceneEnvironment;
import org.compass.core.util.ClassUtils;
import org.compass.core.util.StringUtils;
033:
034: /**
035: * @author kimchy
036: */
037: public class DefaultLuceneAnalyzerFactory implements
038: LuceneAnalyzerFactory {
039:
040: private static final Log log = LogFactory
041: .getLog(DefaultLuceneAnalyzerFactory.class);
042:
043: private static final Set extednedAnalyzers;
044:
045: private static final Set coreAnalyzers;
046:
047: static {
048: coreAnalyzers = new HashSet();
049: coreAnalyzers
050: .add(LuceneEnvironment.Analyzer.CoreTypes.WHITESPACE);
051: coreAnalyzers
052: .add(LuceneEnvironment.Analyzer.CoreTypes.STANDARD);
053: coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.SIMPLE);
054: coreAnalyzers.add(LuceneEnvironment.Analyzer.CoreTypes.STOP);
055:
056: extednedAnalyzers = new HashSet();
057: extednedAnalyzers
058: .add(LuceneEnvironment.Analyzer.ExtendedTypes.BRAZILIAN);
059: extednedAnalyzers
060: .add(LuceneEnvironment.Analyzer.ExtendedTypes.CJK);
061: extednedAnalyzers
062: .add(LuceneEnvironment.Analyzer.ExtendedTypes.CHINESE);
063: extednedAnalyzers
064: .add(LuceneEnvironment.Analyzer.ExtendedTypes.CZECH);
065: extednedAnalyzers
066: .add(LuceneEnvironment.Analyzer.ExtendedTypes.GERMAN);
067: extednedAnalyzers
068: .add(LuceneEnvironment.Analyzer.ExtendedTypes.GREEK);
069: extednedAnalyzers
070: .add(LuceneEnvironment.Analyzer.ExtendedTypes.FRENCH);
071: extednedAnalyzers
072: .add(LuceneEnvironment.Analyzer.ExtendedTypes.DUTCH);
073: extednedAnalyzers
074: .add(LuceneEnvironment.Analyzer.ExtendedTypes.RUSSIAN);
075: }
076:
077: public Analyzer createAnalyzer(String analyzerName,
078: CompassSettings settings) throws SearchEngineException {
079: Analyzer analyzer;
080: String analyzerSetting = settings.getSetting(
081: LuceneEnvironment.Analyzer.TYPE,
082: LuceneEnvironment.Analyzer.CoreTypes.STANDARD);
083: if (log.isDebugEnabled()) {
084: log.debug("Analyzer [" + analyzerName
085: + "] uses Lucene analyzer [" + analyzerSetting
086: + "]");
087: }
088: if (coreAnalyzers.contains(analyzerSetting.toLowerCase())) {
089: AnalyzerBuilderDelegate analyzerBuilderDelegate = new CoreAnalyzerBuilderDelegate();
090: analyzer = analyzerBuilderDelegate.buildAnalyzer(
091: analyzerName, settings, this );
092: } else if (LuceneEnvironment.Analyzer.Snowball.SNOWBALL
093: .equalsIgnoreCase(analyzerSetting)) {
094: AnalyzerBuilderDelegate analyzerBuilderDelegate = new SnowballAnalyzerBuilderDelegate();
095: analyzer = analyzerBuilderDelegate.buildAnalyzer(
096: analyzerName, settings, this );
097: } else if (extednedAnalyzers.contains(analyzerSetting
098: .toLowerCase())) {
099: AnalyzerBuilderDelegate analyzerBuilderDelegate = new ExtendedAnalyzerBuilderDelegate();
100: analyzer = analyzerBuilderDelegate.buildAnalyzer(
101: analyzerName, settings, this );
102: } else {
103: // the analyzer must be a fully qualified class, try to instansiate
104: try {
105: analyzer = (Analyzer) ClassUtils.forName(
106: analyzerSetting, settings.getClassLoader())
107: .newInstance();
108: } catch (Exception e) {
109: throw new SearchEngineException(
110: "Cannot instantiate Lucene Analyzer ["
111: + analyzerSetting
112: + "] for analyzer ["
113: + analyzerName
114: + "]. Please verify the analyzer setting at ["
115: + LuceneEnvironment.Analyzer.TYPE + "]",
116: e);
117: }
118: if (analyzer instanceof CompassConfigurable) {
119: ((CompassConfigurable) analyzer).configure(settings);
120: }
121: }
122: return analyzer;
123: }
124:
125: public String[] parseStopWords(String analyzerName,
126: CompassSettings settings, String[] defaultStopWords) {
127: String stopWords = settings
128: .getSetting(LuceneEnvironment.Analyzer.STOPWORDS);
129: if (stopWords == null) {
130: if (log.isTraceEnabled()) {
131: log
132: .trace("Anayzer ["
133: + analyzerName
134: + "] uses default stop words ["
135: + StringUtils
136: .arrayToCommaDelimitedString(defaultStopWords)
137: + "]");
138: }
139: return defaultStopWords;
140: }
141: boolean addStopWords = false;
142: if (stopWords.startsWith("+")) {
143: addStopWords = true;
144: stopWords = stopWords.substring(1);
145: }
146: StringTokenizer st = new StringTokenizer(stopWords, ",");
147: ArrayList listStopWords = new ArrayList();
148: while (st.hasMoreTokens()) {
149: String stopword = st.nextToken().trim();
150: if (StringUtils.hasLength(stopword)) {
151: listStopWords.add(stopword);
152: }
153: }
154: String[] arrStopWords = (String[]) listStopWords
155: .toArray(new String[listStopWords.size()]);
156:
157: if (addStopWords) {
158: if (log.isTraceEnabled()) {
159: log
160: .trace("Analyzer ["
161: + analyzerName
162: + "] uses default stop words ["
163: + StringUtils
164: .arrayToCommaDelimitedString(defaultStopWords)
165: + "]");
166: log
167: .trace("Analyzer ["
168: + analyzerName
169: + "] and uses user stop words ["
170: + StringUtils
171: .arrayToCommaDelimitedString(arrStopWords)
172: + "]");
173: }
174: String[] tempStopWords = arrStopWords;
175: arrStopWords = new String[tempStopWords.length
176: + defaultStopWords.length];
177: System.arraycopy(defaultStopWords, 0, arrStopWords, 0,
178: defaultStopWords.length);
179: System.arraycopy(tempStopWords, 0, arrStopWords,
180: defaultStopWords.length, tempStopWords.length);
181: } else {
182: if (log.isTraceEnabled()) {
183: log
184: .trace("Analyzer ["
185: + analyzerName
186: + "] uses user stop words ["
187: + StringUtils
188: .arrayToCommaDelimitedString(arrStopWords)
189: + "]");
190: }
191: }
192: return arrStopWords;
193: }
194: }
|