001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.IOException;
021: import java.io.StringReader;
022: import java.util.ArrayList;
023: import java.util.Iterator;
024:
025: import org.apache.lucene.analysis.Analyzer;
026: import org.apache.lucene.analysis.Token;
027: import org.apache.lucene.analysis.TokenStream;
028: import org.apache.lucene.util.PriorityQueue;
029:
/**
 * Class used to markup highlighted terms found in the best sections of a
 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
 * {@link Encoder} and tokenizers.
 * @author mark@searcharea.co.uk
 */
public class Highlighter {

    /** Default cap on the amount of document text analyzed per call (50KB of character offsets). */
    public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;
    // Tokens whose start offset is at or beyond this limit are skipped (see getBestTextFragments).
    private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
    // Produces the markup wrapped around matched terms (e.g. <B>...</B>).
    private Formatter formatter;
    // Encodes raw (non-highlighted) document text before it is appended to output.
    private Encoder encoder;
    // Chooses fragment boundaries while tokens are consumed; fixed-size fragments by default.
    private Fragmenter textFragmenter = new SimpleFragmenter();
    // Scores individual tokens and accumulates a score per fragment.
    private Scorer fragmentScorer = null;

    /**
     * Creates a highlighter that marks terms up with a {@link SimpleHTMLFormatter}.
     * @param fragmentScorer used to score tokens and fragments
     */
    public Highlighter(Scorer fragmentScorer) {
        this (new SimpleHTMLFormatter(), fragmentScorer);
    }

    /**
     * Creates a highlighter that encodes plain text with a {@link DefaultEncoder}.
     * @param formatter used to mark up matched terms
     * @param fragmentScorer used to score tokens and fragments
     */
    public Highlighter(Formatter formatter, Scorer fragmentScorer) {
        this (formatter, new DefaultEncoder(), fragmentScorer);
    }

    /**
     * Fully-configurable constructor.
     * @param formatter used to mark up matched terms
     * @param encoder used to encode non-highlighted text for output
     * @param fragmentScorer used to score tokens and fragments
     */
    public Highlighter(Formatter formatter, Encoder encoder,
            Scorer fragmentScorer) {
        this .formatter = formatter;
        this .encoder = encoder;
        this .fragmentScorer = fragmentScorer;
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant section.
     * This is a convenience method that calls
     * {@link #getBestFragment(TokenStream, String)}
     *
     * @param analyzer the analyzer that will be used to split <code>text</code>
     * into chunks
     * @param fieldName Name of field used to influence analyzer's tokenization policy
     * @param text text to highlight terms in
     *
     * @return highlighted text fragment or null if no terms found
     * @throws IOException if tokenization fails
     */
    public final String getBestFragment(Analyzer analyzer,
            String fieldName, String text) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream(fieldName,
                new StringReader(text));
        return getBestFragment(tokenStream, text);
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant section.
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragment with the highest score
     * is returned
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
     * This is typically produced by an analyzer re-parsing a document's
     * text. Some work may be done on retrieving TokenStreams more efficently
     * by adding support for storing original text position data in the Lucene
     * index but this support is not currently available (as of Lucene 1.4 rc2).
     * @param text text to highlight terms in
     *
     * @return highlighted text fragment or null if no terms found
     * @throws IOException if tokenization fails
     */
    public final String getBestFragment(TokenStream tokenStream,
            String text) throws IOException {
        // Delegate to the multi-fragment variant and keep only the top result.
        String[] results = getBestFragments(tokenStream, text, 1);
        if (results.length > 0) {
            return results[0];
        }
        return null;
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * This is a convenience method that calls
     * {@link #getBestFragments(TokenStream, String, int)}
     *
     * @param analyzer the analyzer that will be used to split <code>text</code>
     * into chunks
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments.
     * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the
     * method of the same name that takes a fieldname.
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     * @throws IOException if tokenization fails
     */
    public final String[] getBestFragments(Analyzer analyzer,
            String text, int maxNumFragments) throws IOException {
        // NOTE: the literal field name "field" is the reason this method is deprecated.
        TokenStream tokenStream = analyzer.tokenStream("field",
                new StringReader(text));
        return getBestFragments(tokenStream, text, maxNumFragments);
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * This is a convenience method that calls
     * {@link #getBestFragments(TokenStream, String, int)}
     *
     * @param analyzer the analyzer that will be used to split <code>text</code>
     * into chunks
     * @param fieldName the name of the field being highlighted (used by analyzer)
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments.
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     * @throws IOException if tokenization fails
     */
    public final String[] getBestFragments(Analyzer analyzer,
            String fieldName, String text, int maxNumFragments)
            throws IOException {
        TokenStream tokenStream = analyzer.tokenStream(fieldName,
                new StringReader(text));
        return getBestFragments(tokenStream, text, maxNumFragments);
    }

    /**
     * Highlights chosen terms in a text, extracting the most relevant sections.
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragments with the highest scores
     * are returned as an array of strings in order of score (contiguous fragments are merged into
     * one in their original order to improve readability)
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset information
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments.
     *
     * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
     * @throws IOException if tokenization fails
     */
    public final String[] getBestFragments(TokenStream tokenStream,
            String text, int maxNumFragments) throws IOException {
        maxNumFragments = Math.max(1, maxNumFragments); //sanity check

        TextFragment[] frag = getBestTextFragments(tokenStream, text,
                true, maxNumFragments);

        //Get text: keep only fragments that actually scored (i.e. contained hits)
        ArrayList fragTexts = new ArrayList();
        for (int i = 0; i < frag.length; i++) {
            if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                fragTexts.add(frag[i].toString());
            }
        }
        return (String[]) fragTexts.toArray(new String[0]);
    }

    /**
     * Low level api to get the most relevant (formatted) sections of the document.
     * This method has been made public to allow visibility of score information held in TextFragment objects.
     * Thanks to Jason Calabrese for help in redefining the interface.
     * @param tokenStream a stream of tokens identified in the text parameter, including offset information
     * @param text text to highlight terms in
     * @param mergeContiguousFragments if true, fragments adjacent in the original text are merged
     * @param maxNumFragments the maximum number of fragments to return
     * @return the best-scoring fragments, highest score first (nulls possible when merging is off)
     * @throws IOException if tokenization fails
     */
    public final TextFragment[] getBestTextFragments(
            TokenStream tokenStream, String text,
            boolean mergeContiguousFragments, int maxNumFragments)
            throws IOException {
        ArrayList docFrags = new ArrayList();
        // All fragments share this buffer; each TextFragment records its start/end positions in it.
        StringBuffer newText = new StringBuffer();

        TextFragment currentFrag = new TextFragment(newText, newText
                .length(), docFrags.size());
        fragmentScorer.startFragment(currentFrag);
        docFrags.add(currentFrag);

        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

        try {
            org.apache.lucene.analysis.Token token;
            String tokenText;
            int startOffset;
            int endOffset;
            // End offset (in the original text) of the last group already copied to newText.
            int lastEndOffset = 0;
            textFragmenter.start(text);

            // Accumulates overlapping/same-position tokens so they are marked up as one unit.
            TokenGroup tokenGroup = new TokenGroup();
            token = tokenStream.next();
            while ((token != null)
                    && (token.startOffset() < maxDocBytesToAnalyze)) {
                if ((tokenGroup.numTokens > 0)
                        && (tokenGroup.isDistinct(token))) {
                    //the current token is distinct from previous tokens -
                    // markup the cached token group info
                    startOffset = tokenGroup.matchStartOffset;
                    endOffset = tokenGroup.matchEndOffset;
                    tokenText = text.substring(startOffset, endOffset);
                    String markedUpText = formatter.highlightTerm(
                            encoder.encodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                        newText
                                .append(encoder.encodeText(text
                                        .substring(lastEndOffset,
                                                startOffset)));
                    newText.append(markedUpText);
                    // max() guards against overlapping groups rewinding the copy position
                    lastEndOffset = Math.max(endOffset, lastEndOffset);
                    tokenGroup.clear();

                    //check if current token marks the start of a new fragment
                    if (textFragmenter.isNewFragment(token)) {
                        currentFrag.setScore(fragmentScorer
                                .getFragmentScore());
                        //record stats for a new fragment
                        currentFrag.textEndPos = newText.length();
                        currentFrag = new TextFragment(newText, newText
                                .length(), docFrags.size());
                        fragmentScorer.startFragment(currentFrag);
                        docFrags.add(currentFrag);
                    }
                }

                tokenGroup.addToken(token, fragmentScorer
                        .getTokenScore(token));

                // if(lastEndOffset>maxDocBytesToAnalyze)
                // {
                // break;
                // }
                token = tokenStream.next();
            }
            // Close off the last (possibly only) fragment's score.
            currentFrag.setScore(fragmentScorer.getFragmentScore());

            if (tokenGroup.numTokens > 0) {
                //flush the accumulated text (same code as in above loop)
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder
                        .encodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(
                            lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(lastEndOffset, endOffset);
            }

            //Test what remains of the original text beyond the point where we stopped analyzing
            // NOTE: if the document exceeds maxDocBytesToAnalyze, the unanalyzed tail is
            // deliberately dropped rather than appended.
            if (
            // if there is text beyond the last token considered..
            (lastEndOffset < text.length()) &&
            // and that text is not too large...
                    (text.length() < maxDocBytesToAnalyze)) {
                //append it to the last fragment
                newText.append(encoder.encodeText(text
                        .substring(lastEndOffset)));
            }

            currentFrag.textEndPos = newText.length();

            //sort the most relevant sections of the text
            for (Iterator i = docFrags.iterator(); i.hasNext();) {
                currentFrag = (TextFragment) i.next();

                //If you are running with a version of Lucene before 11th Sept 03
                // you do not have PriorityQueue.insert() - so uncomment the code below
                /*
                if (currentFrag.getScore() >= minScore)
                {
                    fragQueue.put(currentFrag);
                    if (fragQueue.size() > maxNumFragments)
                    { // if hit queue overfull
                        fragQueue.pop(); // remove lowest in hit queue
                        minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                    }


                }
                */
                //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                //fix to PriorityQueue. The correct method to use here is the new "insert" method
                // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                // insert() keeps only the maxNumFragments best entries, evicting the worst.
                fragQueue.insert(currentFrag);
            }

            //return the most relevant fragments
            // Pop from the queue (lowest score first) into the array back-to-front,
            // so frag[0] ends up with the highest score.
            TextFragment frag[] = new TextFragment[fragQueue.size()];
            for (int i = frag.length - 1; i >= 0; i--) {
                frag[i] = (TextFragment) fragQueue.pop();
            }

            //merge any contiguous fragments to improve readability
            if (mergeContiguousFragments) {
                mergeContiguousFragments(frag);
                // Merging nulls out absorbed entries; compact the array and drop zero-score frags.
                ArrayList fragTexts = new ArrayList();
                for (int i = 0; i < frag.length; i++) {
                    if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                        fragTexts.add(frag[i]);
                    }
                }
                frag = (TextFragment[]) fragTexts
                        .toArray(new TextFragment[0]);
            }

            return frag;

        } finally {
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (Exception e) {
                    // best-effort close; original analysis result is still returned
                }
            }
        }
    }

    /** Improves readability of a score-sorted list of TextFragments by merging any fragments
     * that were contiguous in the original text into one larger fragment with the correct order.
     * This will leave a "null" in the array entry for the lesser scored fragment.
     *
     * @param frag An array of document fragments in descending score
     */
    private void mergeContiguousFragments(TextFragment[] frag) {
        boolean mergingStillBeingDone;
        if (frag.length > 1)
            // Repeat until a full pass makes no merge: a merge may create a new
            // fragment that is itself contiguous with another.
            do {
                mergingStillBeingDone = false; //initialise loop control flag
                //for each fragment, scan other frags looking for contiguous blocks
                for (int i = 0; i < frag.length; i++) {
                    if (frag[i] == null) {
                        continue;
                    }
                    //merge any contiguous blocks
                    for (int x = 0; x < frag.length; x++) {
                        if (frag[x] == null) {
                            continue;
                        }
                        if (frag[i] == null) {
                            // frag[i] was absorbed by an earlier merge in this inner loop
                            break;
                        }
                        TextFragment frag1 = null;
                        TextFragment frag2 = null;
                        int frag1Num = 0;
                        int frag2Num = 0;
                        int bestScoringFragNum;
                        int worstScoringFragNum;
                        //if blocks are contiguous....
                        // (frag1 is always the earlier fragment in the text, frag2 the one that follows it)
                        if (frag[i].follows(frag[x])) {
                            frag1 = frag[x];
                            frag1Num = x;
                            frag2 = frag[i];
                            frag2Num = i;
                        } else if (frag[x].follows(frag[i])) {
                            frag1 = frag[i];
                            frag1Num = i;
                            frag2 = frag[x];
                            frag2Num = x;
                        }
                        //merging required..
                        if (frag1 != null) {
                            if (frag1.getScore() > frag2.getScore()) {
                                bestScoringFragNum = frag1Num;
                                worstScoringFragNum = frag2Num;
                            } else {
                                bestScoringFragNum = frag2Num;
                                worstScoringFragNum = frag1Num;
                            }
                            // The merged text lives in frag1; keep it in the better-scoring slot
                            // and null the other so callers can skip it.
                            frag1.merge(frag2);
                            frag[worstScoringFragNum] = null;
                            mergingStillBeingDone = true;
                            frag[bestScoringFragNum] = frag1;
                        }
                    }
                }
            } while (mergingStillBeingDone);
    }

    /**
     * Highlights terms in the text , extracting the most relevant sections
     * and concatenating the chosen fragments with a separator (typically "...").
     * The document text is analysed in chunks to record hit statistics
     * across the document. After accumulating stats, the fragments with the highest scores
     * are returned in order as "separator" delimited strings.
     *
     * @param tokenStream a stream of tokens identified in the text parameter, including offset information
     * @param text text to highlight terms in
     * @param maxNumFragments the maximum number of fragments.
     * @param separator the separator used to intersperse the document fragments (typically "...")
     *
     * @return highlighted text
     * @throws IOException if tokenization fails
     */
    public final String getBestFragments(TokenStream tokenStream,
            String text, int maxNumFragments, String separator)
            throws IOException {
        String sections[] = getBestFragments(tokenStream, text,
                maxNumFragments);
        StringBuffer result = new StringBuffer();
        for (int i = 0; i < sections.length; i++) {
            if (i > 0) {
                result.append(separator);
            }
            result.append(sections[i]);
        }
        return result.toString();
    }

    /**
     * @return the maximum number of bytes to be tokenized per doc
     */
    public int getMaxDocBytesToAnalyze() {
        return maxDocBytesToAnalyze;
    }

    /**
     * @param byteCount the maximum number of bytes to be tokenized per doc
     * (This can improve performance with large documents)
     */
    public void setMaxDocBytesToAnalyze(int byteCount) {
        maxDocBytesToAnalyze = byteCount;
    }

    /**
     * @return the {@link Fragmenter} used to divide the text into candidate fragments
     */
    public Fragmenter getTextFragmenter() {
        return textFragmenter;
    }

    /**
     * @param fragmenter the {@link Fragmenter} used to divide the text into candidate fragments
     */
    public void setTextFragmenter(Fragmenter fragmenter) {
        textFragmenter = fragmenter;
    }

    /**
     * @return Object used to score each text fragment
     */
    public Scorer getFragmentScorer() {
        return fragmentScorer;
    }

    /**
     * @param scorer Object used to score each text fragment
     */
    public void setFragmentScorer(Scorer scorer) {
        fragmentScorer = scorer;
    }

    /**
     * @return the {@link Encoder} used to escape non-highlighted text
     */
    public Encoder getEncoder() {
        return encoder;
    }

    /**
     * @param encoder the {@link Encoder} used to escape non-highlighted text
     */
    public void setEncoder(Encoder encoder) {
        this .encoder = encoder;
    }
}
475:
476: class FragmentQueue extends PriorityQueue {
477: public FragmentQueue(int size) {
478: initialize(size);
479: }
480:
481: public final boolean lessThan(Object a, Object b) {
482: TextFragment fragA = (TextFragment) a;
483: TextFragment fragB = (TextFragment) b;
484: if (fragA.getScore() == fragB.getScore())
485: return fragA.fragNum > fragB.fragNum;
486: else
487: return fragA.getScore() < fragB.getScore();
488: }
489: }
|