001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Copyright 2002-2005 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: import java.io.IOException;
020: import java.io.StringReader;
021: import java.util.ArrayList;
022: import java.util.Iterator;
023:
024: import org.apache.lucene.analysis.Analyzer;
025: import org.apache.lucene.analysis.TokenStream;
026: import org.apache.lucene.util.PriorityQueue;
027:
028: /**
029: * Class used to markup highlighted terms found in the best sections of a
030: * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
031: * {@link Encoder} and tokenizers.
032: * @author mark@searcharea.co.uk
033: */
034: public class Highlighter {
035:
036: public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;
037: private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
038: private Formatter formatter;
039: private Encoder encoder;
040: private Fragmenter textFragmenter = new SimpleFragmenter();
041: private Scorer fragmentScorer = null;
042:
043: public Highlighter(Scorer fragmentScorer) {
044: this (new SimpleHTMLFormatter(), fragmentScorer);
045: }
046:
047: public Highlighter(Formatter formatter, Scorer fragmentScorer) {
048: this (formatter, new DefaultEncoder(), fragmentScorer);
049: }
050:
051: public Highlighter(Formatter formatter, Encoder encoder,
052: Scorer fragmentScorer) {
053: this .formatter = formatter;
054: this .encoder = encoder;
055: this .fragmentScorer = fragmentScorer;
056: }
057:
058: /**
059: * Highlights chosen terms in a text, extracting the most relevant section.
060: * This is a convenience method that calls
061: * {@link #getBestFragment(TokenStream, String)}
062: *
063: * @param analyzer the analyzer that will be used to split <code>text</code>
064: * into chunks
065: * @param text text to highlight terms in
066: * @param fieldName Name of field used to influence analyzer's tokenization policy
067: *
068: * @return highlighted text fragment or null if no terms found
069: */
070: public final String getBestFragment(Analyzer analyzer,
071: String fieldName, String text) throws IOException {
072: TokenStream tokenStream = analyzer.tokenStream(fieldName,
073: new StringReader(text));
074: return getBestFragment(tokenStream, text);
075: }
076:
077: /**
078: * Highlights chosen terms in a text, extracting the most relevant section.
079: * The document text is analysed in chunks to record hit statistics
080: * across the document. After accumulating stats, the fragment with the highest score
081: * is returned
082: *
083: * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
084: * This is typically produced by an analyzer re-parsing a document's
085: * text. Some work may be done on retrieving TokenStreams more efficently
086: * by adding support for storing original text position data in the Lucene
087: * index but this support is not currently available (as of Lucene 1.4 rc2).
088: * @param text text to highlight terms in
089: *
090: * @return highlighted text fragment or null if no terms found
091: */
092: public final String getBestFragment(TokenStream tokenStream,
093: String text) throws IOException {
094: String[] results = getBestFragments(tokenStream, text, 1);
095: if (results.length > 0) {
096: return results[0];
097: }
098: return null;
099: }
100:
101: /**
102: * Highlights chosen terms in a text, extracting the most relevant sections.
103: * This is a convenience method that calls
104: * {@link #getBestFragments(TokenStream, String, int)}
105: *
106: * @param analyzer the analyzer that will be used to split <code>text</code>
107: * into chunks
108: * @param text text to highlight terms in
109: * @param maxNumFragments the maximum number of fragments.
110: * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
111: * method of the same name that takes a fieldname.
112: * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
113: */
114: public final String[] getBestFragments(Analyzer analyzer,
115: String text, int maxNumFragments) throws IOException {
116: TokenStream tokenStream = analyzer.tokenStream("field",
117: new StringReader(text));
118: return getBestFragments(tokenStream, text, maxNumFragments);
119: }
120:
121: /**
122: * Highlights chosen terms in a text, extracting the most relevant sections.
123: * This is a convenience method that calls
124: * {@link #getBestFragments(TokenStream, String, int)}
125: *
126: * @param analyzer the analyzer that will be used to split <code>text</code>
127: * into chunks
128: * @param fieldName the name of the field being highlighted (used by analyzer)
129: * @param text text to highlight terms in
130: * @param maxNumFragments the maximum number of fragments.
131: *
132: * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
133: */
134: public final String[] getBestFragments(Analyzer analyzer,
135: String fieldName, String text, int maxNumFragments)
136: throws IOException {
137: TokenStream tokenStream = analyzer.tokenStream(fieldName,
138: new StringReader(text));
139: return getBestFragments(tokenStream, text, maxNumFragments);
140: }
141:
142: /**
143: * Highlights chosen terms in a text, extracting the most relevant sections.
144: * The document text is analysed in chunks to record hit statistics
145: * across the document. After accumulating stats, the fragments with the highest scores
146: * are returned as an array of strings in order of score (contiguous fragments are merged into
147: * one in their original order to improve readability)
148: *
149: * @param text text to highlight terms in
150: * @param maxNumFragments the maximum number of fragments.
151: *
152: * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
153: */
154: public final String[] getBestFragments(TokenStream tokenStream,
155: String text, int maxNumFragments) throws IOException {
156: maxNumFragments = Math.max(1, maxNumFragments); //sanity check
157:
158: TextFragment[] frag = getBestTextFragments(tokenStream, text,
159: true, maxNumFragments);
160:
161: //Get text
162: ArrayList fragTexts = new ArrayList();
163: for (int i = 0; i < frag.length; i++) {
164: if ((frag[i] != null) && (frag[i].getScore() > 0)) {
165: fragTexts.add(frag[i].toString());
166: }
167: }
168: return (String[]) fragTexts.toArray(new String[0]);
169: }
170:
171: /**
172: * Low level api to get the most relevant (formatted) sections of the document.
173: * This method has been made public to allow visibility of score information held in TextFragment objects.
174: * Thanks to Jason Calabrese for help in redefining the interface.
175: * @param tokenStream
176: * @param text
177: * @param maxNumFragments
178: * @param mergeContiguousFragments
179: * @throws IOException
180: */
181: public final TextFragment[] getBestTextFragments(
182: TokenStream tokenStream, String text,
183: boolean mergeContiguousFragments, int maxNumFragments)
184: throws IOException {
185: ArrayList docFrags = new ArrayList();
186: StringBuffer newText = new StringBuffer();
187:
188: TextFragment currentFrag = new TextFragment(newText, newText
189: .length(), docFrags.size());
190: fragmentScorer.startFragment(currentFrag);
191: docFrags.add(currentFrag);
192:
193: FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
194:
195: try {
196: org.apache.lucene.analysis.Token token;
197: String tokenText;
198: int startOffset;
199: int endOffset;
200: int lastEndOffset = 0;
201: textFragmenter.start(text);
202:
203: TokenGroup tokenGroup = new TokenGroup();
204:
205: while ((token = tokenStream.next()) != null) {
206: if ((tokenGroup.numTokens > 0)
207: && (tokenGroup.isDistinct(token))) {
208: //the current token is distinct from previous tokens -
209: // markup the cached token group info
210: startOffset = tokenGroup.startOffset;
211: endOffset = tokenGroup.endOffset;
212: tokenText = text.substring(startOffset, endOffset);
213: String markedUpText = formatter.highlightTerm(
214: encoder.encodeText(tokenText), tokenGroup);
215: //store any whitespace etc from between this and last group
216: if (startOffset > lastEndOffset)
217: newText
218: .append(encoder.encodeText(text
219: .substring(lastEndOffset,
220: startOffset)));
221: newText.append(markedUpText);
222: lastEndOffset = endOffset;
223: tokenGroup.clear();
224:
225: //check if current token marks the start of a new fragment
226: if (textFragmenter.isNewFragment(token)) {
227: currentFrag.setScore(fragmentScorer
228: .getFragmentScore());
229: //record stats for a new fragment
230: currentFrag.textEndPos = newText.length();
231: currentFrag = new TextFragment(newText, newText
232: .length(), docFrags.size());
233: fragmentScorer.startFragment(currentFrag);
234: docFrags.add(currentFrag);
235: }
236: }
237:
238: tokenGroup.addToken(token, fragmentScorer
239: .getTokenScore(token));
240:
241: if (lastEndOffset > maxDocBytesToAnalyze) {
242: break;
243: }
244: }
245: currentFrag.setScore(fragmentScorer.getFragmentScore());
246:
247: if (tokenGroup.numTokens > 0) {
248: //flush the accumulated text (same code as in above loop)
249: startOffset = tokenGroup.startOffset;
250: endOffset = tokenGroup.endOffset;
251: tokenText = text.substring(startOffset, endOffset);
252: String markedUpText = formatter.highlightTerm(encoder
253: .encodeText(tokenText), tokenGroup);
254: //store any whitespace etc from between this and last group
255: if (startOffset > lastEndOffset)
256: newText.append(encoder.encodeText(text.substring(
257: lastEndOffset, startOffset)));
258: newText.append(markedUpText);
259: lastEndOffset = endOffset;
260: }
261:
262: // append text after end of last token
263: // if (lastEndOffset < text.length())
264: // newText.append(encoder.encodeText(text.substring(lastEndOffset)));
265:
266: currentFrag.textEndPos = newText.length();
267:
268: //sort the most relevant sections of the text
269: for (Iterator i = docFrags.iterator(); i.hasNext();) {
270: currentFrag = (TextFragment) i.next();
271:
272: //If you are running with a version of Lucene before 11th Sept 03
273: // you do not have PriorityQueue.insert() - so uncomment the code below
274: /*
275: if (currentFrag.getScore() >= minScore)
276: {
277: fragQueue.put(currentFrag);
278: if (fragQueue.size() > maxNumFragments)
279: { // if hit queue overfull
280: fragQueue.pop(); // remove lowest in hit queue
281: minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
282: }
283:
284:
285: }
286: */
287: //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
288: //fix to PriorityQueue. The correct method to use here is the new "insert" method
289: // USE ABOVE CODE IF THIS DOES NOT COMPILE!
290: fragQueue.insert(currentFrag);
291: }
292:
293: //return the most relevant fragments
294: TextFragment frag[] = new TextFragment[fragQueue.size()];
295: for (int i = frag.length - 1; i >= 0; i--) {
296: frag[i] = (TextFragment) fragQueue.pop();
297: }
298:
299: //merge any contiguous fragments to improve readability
300: if (mergeContiguousFragments) {
301: mergeContiguousFragments(frag);
302: ArrayList fragTexts = new ArrayList();
303: for (int i = 0; i < frag.length; i++) {
304: if ((frag[i] != null) && (frag[i].getScore() > 0)) {
305: fragTexts.add(frag[i]);
306: }
307: }
308: frag = (TextFragment[]) fragTexts
309: .toArray(new TextFragment[0]);
310: }
311:
312: return frag;
313:
314: } finally {
315: if (tokenStream != null) {
316: try {
317: tokenStream.close();
318: } catch (Exception e) {
319: }
320: }
321: }
322: }
323:
324: /** Improves readability of a score-sorted list of TextFragments by merging any fragments
325: * that were contiguous in the original text into one larger fragment with the correct order.
326: * This will leave a "null" in the array entry for the lesser scored fragment.
327: *
328: * @param frag An array of document fragments in descending score
329: */
330: private void mergeContiguousFragments(TextFragment[] frag) {
331: boolean mergingStillBeingDone;
332: if (frag.length > 1)
333: do {
334: mergingStillBeingDone = false; //initialise loop control flag
335: //for each fragment, scan other frags looking for contiguous blocks
336: for (int i = 0; i < frag.length; i++) {
337: if (frag[i] == null) {
338: continue;
339: }
340: //merge any contiguous blocks
341: for (int x = 0; x < frag.length; x++) {
342: if (frag[x] == null) {
343: continue;
344: }
345: if (frag[i] == null) {
346: break;
347: }
348: TextFragment frag1 = null;
349: TextFragment frag2 = null;
350: int frag1Num = 0;
351: int frag2Num = 0;
352: int bestScoringFragNum;
353: int worstScoringFragNum;
354: //if blocks are contiguous....
355: if (frag[i].follows(frag[x])) {
356: frag1 = frag[x];
357: frag1Num = x;
358: frag2 = frag[i];
359: frag2Num = i;
360: } else if (frag[x].follows(frag[i])) {
361: frag1 = frag[i];
362: frag1Num = i;
363: frag2 = frag[x];
364: frag2Num = x;
365: }
366: //merging required..
367: if (frag1 != null) {
368: if (frag1.getScore() > frag2.getScore()) {
369: bestScoringFragNum = frag1Num;
370: worstScoringFragNum = frag2Num;
371: } else {
372: bestScoringFragNum = frag2Num;
373: worstScoringFragNum = frag1Num;
374: }
375: frag1.merge(frag2);
376: frag[worstScoringFragNum] = null;
377: mergingStillBeingDone = true;
378: frag[bestScoringFragNum] = frag1;
379: }
380: }
381: }
382: } while (mergingStillBeingDone);
383: }
384:
385: /**
386: * Highlights terms in the text , extracting the most relevant sections
387: * and concatenating the chosen fragments with a separator (typically "...").
388: * The document text is analysed in chunks to record hit statistics
389: * across the document. After accumulating stats, the fragments with the highest scores
390: * are returned in order as "separator" delimited strings.
391: *
392: * @param text text to highlight terms in
393: * @param maxNumFragments the maximum number of fragments.
394: * @param separator the separator used to intersperse the document fragments (typically "...")
395: *
396: * @return highlighted text
397: */
398: public final String getBestFragments(TokenStream tokenStream,
399: String text, int maxNumFragments, String separator)
400: throws IOException {
401: String sections[] = getBestFragments(tokenStream, text,
402: maxNumFragments);
403: StringBuffer result = new StringBuffer();
404: for (int i = 0; i < sections.length; i++) {
405: if (i > 0) {
406: result.append(separator);
407: }
408: result.append(sections[i]);
409: }
410: return result.toString();
411: }
412:
413: /**
414: * @return the maximum number of bytes to be tokenized per doc
415: */
416: public int getMaxDocBytesToAnalyze() {
417: return maxDocBytesToAnalyze;
418: }
419:
420: /**
421: * @param byteCount the maximum number of bytes to be tokenized per doc
422: * (This can improve performance with large documents)
423: */
424: public void setMaxDocBytesToAnalyze(int byteCount) {
425: maxDocBytesToAnalyze = byteCount;
426: }
427:
428: /**
429: */
430: public Fragmenter getTextFragmenter() {
431: return textFragmenter;
432: }
433:
434: /**
435: * @param fragmenter
436: */
437: public void setTextFragmenter(Fragmenter fragmenter) {
438: textFragmenter = fragmenter;
439: }
440:
441: /**
442: * @return Object used to score each text fragment
443: */
444: public Scorer getFragmentScorer() {
445: return fragmentScorer;
446: }
447:
448: /**
449: * @param scorer
450: */
451: public void setFragmentScorer(Scorer scorer) {
452: fragmentScorer = scorer;
453: }
454:
455: public Encoder getEncoder() {
456: return encoder;
457: }
458:
459: public void setEncoder(Encoder encoder) {
460: this .encoder = encoder;
461: }
462: }
463:
464: class FragmentQueue extends PriorityQueue {
465: public FragmentQueue(int size) {
466: initialize(size);
467: }
468:
469: public final boolean lessThan(Object a, Object b) {
470: TextFragment fragA = (TextFragment) a;
471: TextFragment fragB = (TextFragment) b;
472: if (fragA.getScore() == fragB.getScore())
473: return fragA.fragNum > fragB.fragNum;
474: else
475: return fragA.getScore() < fragB.getScore();
476: }
477: }
|