001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2005 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.io.IOException;
020: import java.io.StringReader;
021: import java.util.ArrayList;
022: import java.util.Iterator;
023:
024: import org.apache.lucene.analysis.Analyzer;
025: import org.apache.lucene.analysis.TokenStream;
026: import org.apache.lucene.util.PriorityQueue;
027:
028: /**
029: * Class used to markup highlighted terms found in the best sections of a text,
030: * using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
031: * {@link Encoder} and tokenizers.
032: *
033: * @author mark@searcharea.co.uk
034: */
035: public class Highlighter {
036:
037: public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;
038:
039: private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
040:
041: private Formatter formatter;
042:
043: private Encoder encoder;
044:
045: private Fragmenter textFragmenter = new SimpleFragmenter();
046:
047: private Scorer fragmentScorer = null;
048:
049: public Highlighter(Scorer fragmentScorer) {
050: this (new SimpleHTMLFormatter(), fragmentScorer);
051: }
052:
053: public Highlighter(Formatter formatter, Scorer fragmentScorer) {
054: this (formatter, new DefaultEncoder(), fragmentScorer);
055: }
056:
057: public Highlighter(Formatter formatter, Encoder encoder,
058: Scorer fragmentScorer) {
059: this .formatter = formatter;
060: this .encoder = encoder;
061: this .fragmentScorer = fragmentScorer;
062: }
063:
064: /**
065: * Highlights chosen terms in a text, extracting the most relevant section.
066: * This is a convenience method that calls
067: * {@link #getBestFragment(TokenStream, String)}
068: *
069: * @param analyzer
070: * the analyzer that will be used to split <code>text</code> into
071: * chunks
072: * @param text
073: * text to highlight terms in
074: * @param fieldName
075: * Name of field used to influence analyzer's tokenization policy
076: * @return highlighted text fragment or null if no terms found
077: */
078: public final String getBestFragment(Analyzer analyzer,
079: String fieldName, String text) throws IOException {
080: TokenStream tokenStream = analyzer.tokenStream(fieldName,
081: new StringReader(text));
082: return getBestFragment(tokenStream, text);
083: }
084:
085: /**
086: * Highlights chosen terms in a text, extracting the most relevant section.
087: * The document text is analysed in chunks to record hit statistics across
088: * the document. After accumulating stats, the fragment with the highest
089: * score is returned
090: *
091: * @param tokenStream
092: * a stream of tokens identified in the text parameter, including
093: * offset information. This is typically produced by an analyzer
094: * re-parsing a document's text. Some work may be done on retrieving
095: * TokenStreams more efficently by adding support for storing
096: * original text position data in the Lucene index but this support
097: * is not currently available (as of Lucene 1.4 rc2).
098: * @param text
099: * text to highlight terms in
100: * @return highlighted text fragment or null if no terms found
101: */
102: public final String getBestFragment(TokenStream tokenStream,
103: String text) throws IOException {
104: String[] results = getBestFragments(tokenStream, text, 1);
105: if (results.length > 0) {
106: return results[0];
107: }
108: return null;
109: }
110:
111: /**
112: * Highlights chosen terms in a text, extracting the most relevant sections.
113: * This is a convenience method that calls
114: * {@link #getBestFragments(TokenStream, String, int)}
115: *
116: * @param analyzer
117: * the analyzer that will be used to split <code>text</code> into
118: * chunks
119: * @param text
120: * text to highlight terms in
121: * @param maxNumFragments
122: * the maximum number of fragments.
123: * @return highlighted text fragments (between 0 and maxNumFragments number
124: * of fragments)
125: */
126: public final String[] getBestFragments(Analyzer analyzer,
127: String text, int maxNumFragments) throws IOException {
128: TokenStream tokenStream = analyzer.tokenStream("field",
129: new StringReader(text));
130: return getBestFragments(tokenStream, text, maxNumFragments);
131: }
132:
133: /**
134: * Highlights chosen terms in a text, extracting the most relevant sections.
135: * The document text is analysed in chunks to record hit statistics across
136: * the document. After accumulating stats, the fragments with the highest
137: * scores are returned as an array of strings in order of score (contiguous
138: * fragments are merged into one in their original order to improve
139: * readability)
140: *
141: * @param text
142: * text to highlight terms in
143: * @param maxNumFragments
144: * the maximum number of fragments.
145: * @return highlighted text fragments (between 0 and maxNumFragments number
146: * of fragments)
147: */
148: public final String[] getBestFragments(TokenStream tokenStream,
149: String text, int maxNumFragments) throws IOException {
150: maxNumFragments = Math.max(1, maxNumFragments); // sanity check
151:
152: TextFragment[] frag = getBestTextFragments(tokenStream, text,
153: true, maxNumFragments);
154:
155: // Get text
156: ArrayList fragTexts = new ArrayList();
157: for (int i = 0; i < frag.length; i++) {
158: if ((frag[i] != null) && (frag[i].getScore() > 0)) {
159: fragTexts.add(frag[i].toString());
160: }
161: }
162: return (String[]) fragTexts.toArray(new String[0]);
163: }
164:
165: /**
166: * Low level api to get the most relevant (formatted) sections of the
167: * document. This method has been made public to allow visibility of score
168: * information held in TextFragment objects. Thanks to Jason Calabrese for
169: * help in redefining the interface.
170: *
171: * @param tokenStream
172: * @param text
173: * @param maxNumFragments
174: * @param mergeContiguousFragments
175: * @throws IOException
176: */
177: public final TextFragment[] getBestTextFragments(
178: TokenStream tokenStream, String text,
179: boolean mergeContiguousFragments, int maxNumFragments)
180: throws IOException {
181: ArrayList docFrags = new ArrayList();
182: StringBuffer newText = new StringBuffer();
183:
184: TextFragment currentFrag = new TextFragment(newText, newText
185: .length(), docFrags.size());
186: fragmentScorer.startFragment(currentFrag);
187: docFrags.add(currentFrag);
188:
189: FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
190:
191: try {
192: org.apache.lucene.analysis.Token token;
193: String tokenText;
194: int startOffset;
195: int endOffset;
196: int lastEndOffset = 0;
197: textFragmenter.start(text);
198:
199: TokenGroup tokenGroup = new TokenGroup();
200:
201: while ((token = tokenStream.next()) != null) {
202: if ((tokenGroup.numTokens > 0)
203: && (tokenGroup.isDistinct(token))) {
204: // the current token is distinct from previous tokens -
205: // markup the cached token group info
206: startOffset = tokenGroup.startOffset;
207: endOffset = tokenGroup.endOffset;
208: tokenText = text.substring(startOffset, endOffset);
209: String markedUpText = formatter.highlightTerm(
210: encoder.encodeText(tokenText), tokenGroup);
211: // store any whitespace etc from between this and last group
212: if (startOffset > lastEndOffset)
213: newText
214: .append(encoder.encodeText(text
215: .substring(lastEndOffset,
216: startOffset)));
217: newText.append(markedUpText);
218: lastEndOffset = endOffset;
219: tokenGroup.clear();
220:
221: // check if current token marks the start of a new fragment
222: if (textFragmenter.isNewFragment(token)) {
223: currentFrag.setScore(fragmentScorer
224: .getFragmentScore());
225: // record stats for a new fragment
226: currentFrag.textEndPos = newText.length();
227: currentFrag = new TextFragment(newText, newText
228: .length(), docFrags.size());
229: fragmentScorer.startFragment(currentFrag);
230: docFrags.add(currentFrag);
231: }
232: }
233:
234: tokenGroup.addToken(token, fragmentScorer
235: .getTokenScore(token));
236:
237: if (lastEndOffset > maxDocBytesToAnalyze) {
238: break;
239: }
240: }
241: currentFrag.setScore(fragmentScorer.getFragmentScore());
242:
243: if (tokenGroup.numTokens > 0) {
244: // flush the accumulated text (same code as in above loop)
245: startOffset = tokenGroup.startOffset;
246: endOffset = tokenGroup.endOffset;
247: tokenText = text.substring(startOffset, endOffset);
248: String markedUpText = formatter.highlightTerm(encoder
249: .encodeText(tokenText), tokenGroup);
250: // store any whitespace etc from between this and last group
251: if (startOffset > lastEndOffset)
252: newText.append(encoder.encodeText(text.substring(
253: lastEndOffset, startOffset)));
254: newText.append(markedUpText);
255: lastEndOffset = endOffset;
256: }
257:
258: // append text after end of last token
259: // if (lastEndOffset < text.length())
260: // newText.append(encoder.encodeText(text.substring(lastEndOffset)));
261:
262: currentFrag.textEndPos = newText.length();
263:
264: // sort the most relevant sections of the text
265: for (Iterator i = docFrags.iterator(); i.hasNext();) {
266: currentFrag = (TextFragment) i.next();
267:
268: // If you are running with a version of Lucene before 11th Sept
269: // 03
270: // you do not have PriorityQueue.insert() - so uncomment the
271: // code below
272: /*
273: * if (currentFrag.getScore() >= minScore) {
274: * fragQueue.put(currentFrag); if (fragQueue.size() >
275: * maxNumFragments) { // if hit queue overfull fragQueue.pop(); //
276: * remove lowest in hit queue minScore = ((TextFragment)
277: * fragQueue.top()).getScore(); // reset minScore } }
278: */
279: // The above code caused a problem as a result of Christoph
280: // Goller's 11th Sept 03
281: // fix to PriorityQueue. The correct method to use here is the
282: // new "insert" method
283: // USE ABOVE CODE IF THIS DOES NOT COMPILE!
284: fragQueue.insert(currentFrag);
285: }
286:
287: // return the most relevant fragments
288: TextFragment frag[] = new TextFragment[fragQueue.size()];
289: for (int i = frag.length - 1; i >= 0; i--) {
290: frag[i] = (TextFragment) fragQueue.pop();
291: }
292:
293: // merge any contiguous fragments to improve readability
294: if (mergeContiguousFragments) {
295: mergeContiguousFragments(frag);
296: ArrayList fragTexts = new ArrayList();
297: for (int i = 0; i < frag.length; i++) {
298: if ((frag[i] != null) && (frag[i].getScore() > 0)) {
299: fragTexts.add(frag[i]);
300: }
301: }
302: frag = (TextFragment[]) fragTexts
303: .toArray(new TextFragment[0]);
304: }
305:
306: return frag;
307:
308: } finally {
309: if (tokenStream != null) {
310: try {
311: tokenStream.close();
312: } catch (Exception e) {
313: }
314: }
315: }
316: }
317:
318: /**
319: * Improves readability of a score-sorted list of TextFragments by merging
320: * any fragments that were contiguous in the original text into one larger
321: * fragment with the correct order. This will leave a "null" in the array
322: * entry for the lesser scored fragment.
323: *
324: * @param frag
325: * An array of document fragments in descending score
326: */
327: private void mergeContiguousFragments(TextFragment[] frag) {
328: boolean mergingStillBeingDone;
329: if (frag.length > 1)
330: do {
331: mergingStillBeingDone = false; // initialise loop control flag
332: // for each fragment, scan other frags looking for contiguous blocks
333: for (int i = 0; i < frag.length; i++) {
334: if (frag[i] == null) {
335: continue;
336: }
337: // merge any contiguous blocks
338: for (int x = 0; x < frag.length; x++) {
339: if (frag[x] == null) {
340: continue;
341: }
342: if (frag[i] == null) {
343: break;
344: }
345: TextFragment frag1 = null;
346: TextFragment frag2 = null;
347: int frag1Num = 0;
348: int frag2Num = 0;
349: int bestScoringFragNum;
350: int worstScoringFragNum;
351: // if blocks are contiguous....
352: if (frag[i].follows(frag[x])) {
353: frag1 = frag[x];
354: frag1Num = x;
355: frag2 = frag[i];
356: frag2Num = i;
357: } else if (frag[x].follows(frag[i])) {
358: frag1 = frag[i];
359: frag1Num = i;
360: frag2 = frag[x];
361: frag2Num = x;
362: }
363: // merging required..
364: if (frag1 != null) {
365: if (frag1.getScore() > frag2.getScore()) {
366: bestScoringFragNum = frag1Num;
367: worstScoringFragNum = frag2Num;
368: } else {
369: bestScoringFragNum = frag2Num;
370: worstScoringFragNum = frag1Num;
371: }
372: frag1.merge(frag2);
373: frag[worstScoringFragNum] = null;
374: mergingStillBeingDone = true;
375: frag[bestScoringFragNum] = frag1;
376: }
377: }
378: }
379: } while (mergingStillBeingDone);
380: }
381:
382: /**
383: * Highlights terms in the text , extracting the most relevant sections and
384: * concatenating the chosen fragments with a separator (typically "...").
385: * The document text is analysed in chunks to record hit statistics across
386: * the document. After accumulating stats, the fragments with the highest
387: * scores are returned in order as "separator" delimited strings.
388: *
389: * @param text
390: * text to highlight terms in
391: * @param maxNumFragments
392: * the maximum number of fragments.
393: * @param separator
394: * the separator used to intersperse the document fragments
395: * (typically "...")
396: * @return highlighted text
397: */
398: public final String getBestFragments(TokenStream tokenStream,
399: String text, int maxNumFragments, String separator)
400: throws IOException {
401: String sections[] = getBestFragments(tokenStream, text,
402: maxNumFragments);
403: StringBuffer result = new StringBuffer();
404: for (int i = 0; i < sections.length; i++) {
405: if (i > 0) {
406: result.append(separator);
407: }
408: result.append(sections[i]);
409: }
410: return result.toString();
411: }
412:
413: /**
414: * @return the maximum number of bytes to be tokenized per doc
415: */
416: public int getMaxDocBytesToAnalyze() {
417: return maxDocBytesToAnalyze;
418: }
419:
420: /**
421: * @param byteCount
422: * the maximum number of bytes to be tokenized per doc (This can
423: * improve performance with large documents)
424: */
425: public void setMaxDocBytesToAnalyze(int byteCount) {
426: maxDocBytesToAnalyze = byteCount;
427: }
428:
429: /**
430: */
431: public Fragmenter getTextFragmenter() {
432: return textFragmenter;
433: }
434:
435: /**
436: * @param fragmenter
437: */
438: public void setTextFragmenter(Fragmenter fragmenter) {
439: textFragmenter = fragmenter;
440: }
441:
442: /**
443: * @return Object used to score each text fragment
444: */
445: public Scorer getFragmentScorer() {
446: return fragmentScorer;
447: }
448:
449: /**
450: * @param scorer
451: */
452: public void setFragmentScorer(Scorer scorer) {
453: fragmentScorer = scorer;
454: }
455:
456: public Encoder getEncoder() {
457: return encoder;
458: }
459:
460: public void setEncoder(Encoder encoder) {
461: this .encoder = encoder;
462: }
463: }
464:
465: class FragmentQueue extends PriorityQueue {
466: public FragmentQueue(int size) {
467: initialize(size);
468: }
469:
470: public final boolean lessThan(Object a, Object b) {
471: TextFragment fragA = (TextFragment) a;
472: TextFragment fragB = (TextFragment) b;
473: if (fragA.getScore() == fragB.getScore())
474: return fragA.fragNum > fragB.fragNum;
475: else
476: return fragA.getScore() < fragB.getScore();
477: }
478: }
|