001: /*
002: * Copyright 2004-2006 the original author or authors.
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: */
016:
017: package org.compass.core.lucene.engine;
018:
019: import java.io.IOException;
020: import java.io.StringReader;
021: import java.util.ArrayList;
022: import java.util.List;
023:
024: import org.apache.lucene.analysis.Analyzer;
025: import org.apache.lucene.analysis.TokenStream;
026: import org.apache.lucene.index.IndexReader;
027: import org.apache.lucene.index.TermFreqVector;
028: import org.apache.lucene.index.TermPositionVector;
029: import org.apache.lucene.search.Query;
030: import org.apache.lucene.search.highlight.Fragmenter;
031: import org.apache.lucene.search.highlight.Highlighter;
032: import org.apache.lucene.search.highlight.QueryScorer;
033: import org.apache.lucene.search.highlight.Scorer;
034: import org.apache.lucene.search.highlight.TokenSources;
035: import org.compass.core.CompassHighlighter;
036: import org.compass.core.Resource;
037: import org.compass.core.engine.SearchEngineException;
038: import org.compass.core.engine.SearchEngineHighlighter;
039: import org.compass.core.lucene.LuceneEnvironment;
040: import org.compass.core.lucene.LuceneResource;
041: import org.compass.core.lucene.engine.analyzer.LuceneAnalyzerManager;
042: import org.compass.core.lucene.engine.highlighter.LuceneHighlighterManager;
043: import org.compass.core.lucene.engine.highlighter.LuceneHighlighterSettings;
044:
045: /**
046: * @author kimchy
047: */
048: public class LuceneSearchEngineHighlighter implements
049: SearchEngineHighlighter, LuceneDelegatedClose {
050:
051: private IndexReader indexReader;
052:
053: private boolean closed;
054:
055: private Query query;
056:
057: private LuceneHighlighterSettings highlighterSettings;
058:
059: private LuceneAnalyzerManager analyzerManager;
060:
061: private LuceneHighlighterManager highlighterManager;
062:
063: private int maxNumFragments = -1;
064:
065: private Analyzer analyzer;
066:
067: private String separator;
068:
069: private int maxBytesToAnalyze = -1;
070:
071: private CompassHighlighter.TextTokenizer textTokenizer;
072:
073: public LuceneSearchEngineHighlighter(Query query,
074: IndexReader indexReader, LuceneSearchEngine searchEngine)
075: throws SearchEngineException {
076: this .indexReader = indexReader;
077: this .highlighterManager = searchEngine.getSearchEngineFactory()
078: .getHighlighterManager();
079: this .highlighterSettings = highlighterManager
080: .getDefaultHighlighterSettings();
081:
082: this .analyzerManager = searchEngine.getSearchEngineFactory()
083: .getAnalyzerManager();
084:
085: if (highlighterSettings.isRewriteQuery()) {
086: try {
087: this .query = query.rewrite(indexReader);
088: } catch (IOException e) {
089: throw new SearchEngineException(
090: "Failed to rewrite query [" + query
091: + "] for highlighter", e);
092: }
093: }
094:
095: clear();
096: }
097:
098: public SearchEngineHighlighter clear() {
099: analyzer = analyzerManager.getDefaultAnalyzer();
100: maxNumFragments = -1;
101: separator = null;
102: maxBytesToAnalyze = -1;
103: return this ;
104: }
105:
106: public SearchEngineHighlighter setMaxNumFragments(
107: int maxNumFragments) throws SearchEngineException {
108: this .maxNumFragments = maxNumFragments;
109: return this ;
110: }
111:
112: public SearchEngineHighlighter setMaxBytesToAnalyze(
113: int maxBytesToAnalyze) throws SearchEngineException {
114: this .maxBytesToAnalyze = maxBytesToAnalyze;
115: return this ;
116: }
117:
118: public SearchEngineHighlighter setAnalyzer(String analyzerName)
119: throws SearchEngineException {
120: this .analyzer = analyzerManager
121: .getAnalyzerMustExist(analyzerName);
122: return this ;
123: }
124:
125: public SearchEngineHighlighter setAnalyzer(Resource resource)
126: throws SearchEngineException {
127: this .analyzer = analyzerManager.getAnalyzerByResource(resource);
128: return this ;
129: }
130:
131: public SearchEngineHighlighter setHighlighter(String highlighterName)
132: throws SearchEngineException {
133: this .highlighterSettings = highlighterManager
134: .getHighlighterSettingsMustExists(highlighterName);
135: return this ;
136: }
137:
138: public SearchEngineHighlighter setSeparator(String separator)
139: throws SearchEngineException {
140: this .separator = separator;
141: return this ;
142: }
143:
144: public SearchEngineHighlighter setTextTokenizer(
145: CompassHighlighter.TextTokenizer textTokenizer)
146: throws SearchEngineException {
147: this .textTokenizer = textTokenizer;
148: return this ;
149: }
150:
151: public String fragment(Resource resource, String propertyName)
152: throws SearchEngineException {
153: return fragment(resource, propertyName, getTextFromResource(
154: resource, propertyName));
155: }
156:
157: public String fragment(Resource resource, String propertyName,
158: String text) throws SearchEngineException {
159:
160: Highlighter highlighter = createHighlighter(propertyName);
161: TokenStream tokenStream = createTokenStream(resource,
162: propertyName, text);
163:
164: try {
165: return highlighter.getBestFragment(tokenStream, text);
166: } catch (IOException e) {
167: throw new SearchEngineException(
168: "Failed to highlight fragments for alias ["
169: + resource.getAlias() + "] and property ["
170: + propertyName + "]");
171: }
172: }
173:
174: public String[] fragments(Resource resource, String propertyName)
175: throws SearchEngineException {
176: return fragments(resource, propertyName, getTextFromResource(
177: resource, propertyName));
178: }
179:
180: public String[] fragments(Resource resource, String propertyName,
181: String text) throws SearchEngineException {
182: Highlighter highlighter = createHighlighter(propertyName);
183: TokenStream tokenStream = createTokenStream(resource,
184: propertyName, text);
185: try {
186: return highlighter.getBestFragments(tokenStream, text,
187: getMaxNumFragments());
188: } catch (IOException e) {
189: throw new SearchEngineException(
190: "Failed to highlight fragments for alias ["
191: + resource.getAlias() + "] and property ["
192: + propertyName + "]");
193: }
194: }
195:
196: public String fragmentsWithSeparator(Resource resource,
197: String propertyName) throws SearchEngineException {
198: return fragmentsWithSeparator(resource, propertyName,
199: getTextFromResource(resource, propertyName));
200: }
201:
202: public String fragmentsWithSeparator(Resource resource,
203: String propertyName, String text)
204: throws SearchEngineException {
205: Highlighter highlighter = createHighlighter(propertyName);
206: TokenStream tokenStream = createTokenStream(resource,
207: propertyName, text);
208: try {
209: String actualSeparator = getActualSeparator();
210: return highlighter.getBestFragments(tokenStream, text,
211: getMaxNumFragments(), actualSeparator);
212: } catch (IOException e) {
213: throw new SearchEngineException(
214: "Failed to highlight fragments for alias ["
215: + resource.getAlias() + "] and property ["
216: + propertyName + "]");
217: }
218: }
219:
220: public String[] multiResourceFragment(Resource resource,
221: String propertyName) throws SearchEngineException {
222: return multiResourceFragment(resource, propertyName,
223: getTextsFromResource(resource, propertyName));
224: }
225:
226: public String[] multiResourceFragment(Resource resource,
227: String propertyName, String[] texts)
228: throws SearchEngineException {
229: List fragmentList = new ArrayList();
230: Highlighter highlighter = createHighlighter(propertyName);
231: for (int i = 0; i < texts.length; i++) {
232: String text = texts[i];
233: if (text != null && text.length() > 0) {
234: //TokenStream tokenStream = createTokenStream(resource, propertyName, text);
235: // We have to re-analyze one field value at a time
236: TokenStream tokenStream = analyzer.tokenStream(
237: propertyName, new StringReader(text));
238: try {
239: String fragment = highlighter.getBestFragment(
240: tokenStream, text);
241: if (fragment != null && fragment.length() > 0) {
242: fragmentList.add(fragment);
243: }
244: } catch (IOException e) {
245: throw new SearchEngineException(
246: "Failed to highlight fragments for alias ["
247: + resource.getAlias()
248: + "] and property [" + propertyName
249: + "]");
250: }
251: }
252: }
253: return (String[]) fragmentList.toArray(new String[fragmentList
254: .size()]);
255: }
256:
257: public String multiResourceFragmentWithSeparator(Resource resource,
258: String propertyName) throws SearchEngineException {
259: return multiResourceFragmentWithSeparator(resource,
260: propertyName, getTextsFromResource(resource,
261: propertyName));
262: }
263:
264: public String multiResourceFragmentWithSeparator(Resource resource,
265: String propertyName, String[] texts)
266: throws SearchEngineException {
267: String[] fragments = multiResourceFragment(resource,
268: propertyName, texts);
269: String actualSeparator = getActualSeparator();
270: StringBuffer fragment = new StringBuffer();
271: if (fragments.length > 0) {
272: for (int i = 0; i < (fragments.length - 1); i++) {
273: fragment.append(fragments[i]);
274: fragment.append(actualSeparator);
275: }
276: fragment.append(fragments[fragments.length - 1]);
277: }
278: return fragment.toString();
279: }
280:
281: protected TokenStream createTokenStream(Resource resource,
282: String propertyName, String text)
283: throws SearchEngineException {
284: CompassHighlighter.TextTokenizer actualTextTokenizer = highlighterSettings
285: .getTextTokenizer();
286: if (textTokenizer != null) {
287: actualTextTokenizer = textTokenizer;
288: }
289: if (actualTextTokenizer == CompassHighlighter.TextTokenizer.AUTO) {
290: TokenStream tokenStream = createTokenStreamFromTermPositions(
291: resource, propertyName);
292: if (tokenStream == null) {
293: tokenStream = analyzer.tokenStream(propertyName,
294: new StringReader(text));
295: }
296: return tokenStream;
297: } else if (actualTextTokenizer == CompassHighlighter.TextTokenizer.ANALYZER) {
298: return analyzer.tokenStream(propertyName, new StringReader(
299: text));
300: } else if (actualTextTokenizer == CompassHighlighter.TextTokenizer.TERM_VECTOR) {
301: TokenStream tokenStream = createTokenStreamFromTermPositions(
302: resource, propertyName);
303: if (tokenStream == null) {
304: throw new SearchEngineException(
305: "Highlighter configured/set to use term vector, but no term vector is available");
306: }
307: return tokenStream;
308: }
309: throw new SearchEngineException(
310: "No handling for text tokenizer ["
311: + actualTextTokenizer + "]");
312: }
313:
314: protected TokenStream createTokenStreamFromTermPositions(
315: Resource resource, String propertyName)
316: throws SearchEngineException {
317: int docId = ((LuceneResource) resource).getDocNum();
318: TermFreqVector tfv;
319: try {
320: tfv = indexReader.getTermFreqVector(docId, propertyName);
321: } catch (IOException e) {
322: throw new SearchEngineException(
323: "Failed to read term vector info", e);
324: }
325: if (tfv != null) {
326: if (tfv instanceof TermPositionVector) {
327: return TokenSources
328: .getTokenStream((TermPositionVector) tfv);
329: }
330: }
331: return null;
332: }
333:
334: protected Highlighter createHighlighter(String propertyName)
335: throws SearchEngineException {
336: Highlighter highlighter = new Highlighter(highlighterSettings
337: .getFormatter(), highlighterSettings.getEncoder(),
338: createScorer(propertyName));
339: Fragmenter f = highlighterSettings.getFragmenter();
340: highlighter.setTextFragmenter(f);
341: if (maxBytesToAnalyze == -1) {
342: highlighter.setMaxDocBytesToAnalyze(highlighterSettings
343: .getMaxBytesToAnalyze());
344: } else {
345: highlighter.setMaxDocBytesToAnalyze(maxBytesToAnalyze);
346: }
347: return highlighter;
348: }
349:
350: protected Scorer createScorer(String propertyName)
351: throws SearchEngineException {
352: if (highlighterSettings.isComputeIdf()) {
353: if (propertyName == null) {
354: throw new SearchEngineException(
355: "When using a formatter that requires idf or setting the ["
356: + LuceneEnvironment.Highlighter.COMPUTE_IDF
357: + "] setting, a resource property name must be provided");
358: }
359: return new QueryScorer(query, indexReader, propertyName);
360: }
361: return new QueryScorer(query);
362: }
363:
364: private String getTextFromResource(Resource resource,
365: String propertyName) {
366: String text = resource.getValue(propertyName);
367: if (text == null) {
368: throw new SearchEngineException(
369: "No text is stored for property [" + propertyName
370: + "] and alias [" + resource.getAlias()
371: + "]");
372: }
373: return text;
374: }
375:
376: private String[] getTextsFromResource(Resource resource,
377: String propertyName) {
378: String[] texts = resource.getValues(propertyName);
379: if (texts == null || texts.length == 0) {
380: throw new SearchEngineException(
381: "No texts are stored for property [" + propertyName
382: + "] and alias [" + resource.getAlias()
383: + "]");
384: }
385: return texts;
386: }
387:
388: private int getMaxNumFragments() {
389: if (maxNumFragments == -1) {
390: return highlighterSettings.getMaxNumFragments();
391: }
392: return maxNumFragments;
393: }
394:
395: private String getActualSeparator() {
396: String actualSeparator = separator;
397: if (actualSeparator == null) {
398: actualSeparator = highlighterSettings.getSeparator();
399: }
400: return actualSeparator;
401: }
402:
403: public void close() throws SearchEngineException {
404: if (closed) {
405: return;
406: }
407: closed = true;
408: }
409:
410: }
|