001: /*
Lucene-Highlighting - Lucene utilities to highlight terms in texts
003: Copyright (C) 2001 Maik Schreiber
004: This library is free software; you can redistribute it and/or modify it
005: under the terms of the GNU Lesser General Public License as published by
006: the Free Software Foundation; either version 2.1 of the License, or
007: (at your option) any later version.
008: This library is distributed in the hope that it will be useful, but
009: WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
010: or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
011: License for more details.
012: You should have received a copy of the GNU Lesser General Public
013: License along with this library; if not, write to the Free Software
014: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
015: */
016: package vqwiki.utils.lucene;
017:
018: import org.apache.log4j.Logger;
019: import org.apache.lucene.analysis.Analyzer;
020: import org.apache.lucene.analysis.TokenStream;
021: import org.apache.lucene.index.Term;
022: import org.apache.lucene.search.*;
023:
024: import java.io.IOException;
025: import java.io.Reader;
026: import java.io.StringReader;
027: import java.util.HashSet;
028:
029: /**
030: * Contains miscellaneous utility methods for use with Lucene.
031: *
032: * @version $Id: LuceneTools.java 365 2003-10-05 05:07:32Z garethc $
033: * @author Maik Schreiber (mailto: bZ@iq-computing.de)
034: */
035: public final class LuceneTools {
036: /**
037: * the log4j category/logger for this class
038: */
039: private final static Logger log = Logger
040: .getLogger(LuceneTools.class.getName());
041:
042: /**
043: * LuceneTools must not be instantiated directly.
044: */
045: private LuceneTools() {
046: }
047:
048: /**
049: * TODO: Document this method.
050: *
051: * @param text TODO: Document this parameter.
052: * @param position TODO: Document this parameter.
053: * @return TODO: Document the result.
054: */
055: public static int findAfter(String text, int position) {
056: return findAfter(text, position, 15);
057: }
058:
059: /**
060: * TODO: Document this method.
061: *
062: * @param text TODO: Document this parameter.
063: * @param position TODO: Document this parameter.
064: * @param howmany TODO: Document this parameter.
065: * @return TODO: Document the result.
066: */
067: public static int findAfter(String text, int position, int howmany) {
068: int counter = 0;
069: int foundPos = -1;
070: int lastcharwidth = 1;
071: // first find a valid character
072: while ((position > 0) && (position < text.length())
073: && text.charAt(position) == ' ') {
074: position++;
075: }
076:
077: while ((counter <= howmany) && (position > 0)
078: && (position < text.length())) {
079: lastcharwidth = 1;
080: if (text.charAt(position) == '\r') {
081: break;
082: }
083: if (text.charAt(position) == '\t') {
084: break;
085: }
086: if (text.charAt(position) == '\u00A0') {
087: break;
088: }
089: if ((text.charAt(position) == ' ')
090: || ((position + 5) < text.length()
091: && text.charAt(position) == '&'
092: && text.charAt(position + 1) == 'n'
093: && text.charAt(position + 2) == 'b'
094: && text.charAt(position + 3) == 's'
095: && text.charAt(position + 4) == 'p' && text
096: .charAt(position + 5) == ';')) {
097: if ((!(((position + 2) < text.length())
098: && (text.charAt(position + 2) == ' ')
099: && (text.charAt(position + 1) >= 'A') && (text
100: .charAt(position + 1) <= 'Z')))
101: || (text.charAt(position) != ' ')) {
102: counter++;
103: }
104: if (text.charAt(position) != ' ') {
105: position += 5;
106: lastcharwidth = 6;
107: }
108: }
109:
110: position++;
111:
112: }
113: position -= lastcharwidth;
114: return position;
115: }
116:
117: /**
118: * TODO: Document this method.
119: *
120: * @param text TODO: Document this parameter.
121: * @param position TODO: Document this parameter.
122: * @return TODO: Document the result.
123: */
124: public static int findBefore(String text, int position) {
125: int counter = 0;
126: int foundPos = -1;
127: int lastspacePos = 0;
128:
129: // first find a valid character
130: /*while ( (position>0) && (position < text.length()) && text.charAt(position) == ' ')
131: {
132: position++;
133: }*/
134: while ((counter < 16) && (position > 0)
135: && (position < text.length())) {
136: position--;
137: if (text.charAt(position) == '\n') {
138: break;
139: }
140: if (text.charAt(position) == '\t') {
141: break;
142: }
143: if (text.charAt(position) == '\u00A0') {
144: break;
145: }
146: if (text.charAt(position) == ' ') {
147: if (!(((position - 2) >= 0)
148: && (text.charAt(position - 2) == ' ')
149: && (text.charAt(position - 1) >= 'A') && (text
150: .charAt(position - 1) <= 'Z'))) {
151: counter++;
152: }
153: }
154: }
155: if (text.charAt(position) == ' ') {
156: position++;
157: }
158: //log.debug("Returning position " + position);
159: return position;
160: }
161:
162: /**
163: * Extracts all term texts of a given Query. Term texts will be returned in lower-case.
164: *
165: * @param query Query to extract term texts from
166: * @param terms HashSet where extracted term texts should be put into (Elements: String)
167: * @param prohibited <code>true</code> to extract "prohibited" terms, too
168: * @exception IOException TODO: Document this exception.
169: */
170: public final static void getTerms(Query query, HashSet terms,
171: boolean prohibited) throws IOException {
172: if (query instanceof BooleanQuery) {
173: getTermsFromBooleanQuery((BooleanQuery) query, terms,
174: prohibited);
175: } else if (query instanceof PhraseQuery) {
176: getTermsFromPhraseQuery((PhraseQuery) query, terms);
177: } else if (query instanceof TermQuery) {
178: getTermsFromTermQuery((TermQuery) query, terms);
179: } else if (query instanceof PrefixQuery) {
180: getTermsFromPrefixQuery((PrefixQuery) query, terms,
181: prohibited);
182: } else if (query instanceof RangeQuery) {
183: getTermsFromRangeQuery((RangeQuery) query, terms,
184: prohibited);
185: } else if (query instanceof MultiTermQuery) {
186: getTermsFromMultiTermQuery((MultiTermQuery) query, terms,
187: prohibited);
188: }
189: }
190:
191: /**
192: * Extracts all term texts of a given BooleanQuery. Term texts will be returned in lower-case.
193: *
194: * @param query BooleanQuery to extract term texts from
195: * @param terms HashSet where extracted term texts should be put into (Elements: String)
196: * @param prohibited <code>true</code> to extract "prohibited" terms, too
197: * @exception IOException TODO: Document this exception.
198: */
199: private final static void getTermsFromBooleanQuery(
200: BooleanQuery query, HashSet terms, boolean prohibited)
201: throws IOException {
202: BooleanClause[] queryClauses = query.getClauses();
203: int i;
204:
205: for (i = 0; i < queryClauses.length; i++) {
206: if (prohibited || !queryClauses[i].prohibited) {
207: getTerms(queryClauses[i].query, terms, prohibited);
208: }
209: }
210: }
211:
212: /**
213: * Extracts all term texts of a given MultiTermQuery. Term texts will be returned in lower-case.
214: *
215: * @param query MultiTermQuery to extract term texts from
216: * @param terms HashSet where extracted term texts should be put into (Elements: String)
217: * @param prohibited <code>true</code> to extract "prohibited" terms, too
218: * @exception IOException TODO: Document this exception.
219: */
220: private final static void getTermsFromMultiTermQuery(
221: MultiTermQuery query, HashSet terms, boolean prohibited)
222: throws IOException {
223: getTerms(query.getQuery(), terms, prohibited);
224: }
225:
226: /**
227: * Extracts all term texts of a given PhraseQuery. Term texts will be returned in lower-case.
228: *
229: * @param query PhraseQuery to extract term texts from
230: * @param terms HashSet where extracted term texts should be put into (Elements: String)
231: */
232: private final static void getTermsFromPhraseQuery(
233: PhraseQuery query, HashSet terms) {
234: Term[] queryTerms = query.getTerms();
235: int i;
236:
237: for (i = 0; i < queryTerms.length; i++) {
238: terms.add(getTermsFromTerm(queryTerms[i]));
239: }
240: }
241:
242: /**
243: * Extracts all term texts of a given PrefixQuery. Term texts will be returned in lower-case.
244: *
245: * @param query PrefixQuery to extract term texts from
246: * @param terms HashSet where extracted term texts should be put into (Elements: String)
247: * @param prohibited <code>true</code> to extract "prohibited" terms, too
248: * @exception IOException TODO: Document this exception.
249: */
250: private final static void getTermsFromPrefixQuery(
251: PrefixQuery query, HashSet terms, boolean prohibited)
252: throws IOException {
253: getTerms(query.getQuery(), terms, prohibited);
254: }
255:
256: /**
257: * Extracts all term texts of a given RangeQuery. Term texts will be returned in lower-case.
258: *
259: * @param query RangeQuery to extract term texts from
260: * @param terms HashSet where extracted term texts should be put into (Elements: String)
261: * @param prohibited <code>true</code> to extract "prohibited" terms, too
262: * @exception IOException TODO: Document this exception.
263: */
264: private final static void getTermsFromRangeQuery(RangeQuery query,
265: HashSet terms, boolean prohibited) throws IOException {
266: getTerms(query.getQuery(), terms, prohibited);
267: }
268:
269: /**
270: * Extracts the term of a given Term. The term will be returned in lower-case.
271: *
272: * @param term Term to extract term from
273: * @return the Term's term text
274: */
275: private final static String getTermsFromTerm(Term term) {
276: return term.text().toLowerCase();
277: }
278:
279: /**
280: * Extracts all term texts of a given TermQuery. Term texts will be returned in lower-case.
281: *
282: * @param query TermQuery to extract term texts from
283: * @param terms HashSet where extracted term texts should be put into (Elements: String)
284: */
285: private final static void getTermsFromTermQuery(TermQuery query,
286: HashSet terms) {
287: terms.add(getTermsFromTerm(query.getTerm()));
288: }
289:
290: /**
291: * TODO: Document this method.
292: *
293: * @param term TODO: Document this parameter.
294: * @return TODO: Document the result.
295: */
296: public static String highlightTerm(String term) {
297: return "<B style=\"color:black;background-color:#ffff66\">"
298: + term + "</B>";
299: }
300:
301: /**
302: * Highlights a text in accordance to a given query.
303: *
304: * @param text text to highlight terms in
305: * @param highlighter TermHighlighter to use to highlight terms in the text
306: * @param query Query which contains the terms to be highlighted in the text
307: * @param analyzer Analyzer used to construct the Query
308: * @return highlighted text
309: * @exception IOException TODO: Document this exception.
310: */
311: public final static String highlightTerms(String text,
312: TermHighlighter highlighter, Query query, Analyzer analyzer)
313: throws IOException {
314: StringBuffer newText = new StringBuffer();
315: TokenStream stream = null;
316:
317: try {
318: HashSet terms = new HashSet();
319: org.apache.lucene.analysis.Token token;
320: String tokenText;
321: int startOffset;
322: int endOffset;
323: int lastEndOffset = 0;
324:
325: // get terms in query
326: getTerms(query, terms, false);
327:
328: boolean foundBodyStart = false;
329:
330: stream = analyzer.tokenStream(null, new StringReader(text));
331: while ((token = stream.next()) != null) {
332: if (!token.termText().equalsIgnoreCase("body")
333: && !foundBodyStart) {
334: continue;
335: } else {
336: if (!foundBodyStart) {
337: token = stream.next();
338: }
339: foundBodyStart = true;
340: }
341:
342: startOffset = token.startOffset();
343: endOffset = token.endOffset();
344: tokenText = text.substring(startOffset, endOffset);
345:
346: // append text between end of last token (or beginning of text) and start of current token
347: if (startOffset > lastEndOffset) {
348: newText.append(text.substring(lastEndOffset,
349: startOffset));
350: }
351:
352: // does query contain current token?
353: if (terms.contains(token.termText())) {
354: newText.append(highlightTerm(tokenText));
355: } else {
356: newText.append(tokenText);
357: }
358:
359: lastEndOffset = endOffset;
360: }
361:
362: // append text after end of last token
363: if (lastEndOffset < text.length()) {
364: newText.append(text.substring(lastEndOffset));
365: }
366:
367: return newText.toString();
368: } finally {
369: if (stream != null) {
370: try {
371: stream.close();
372: } catch (Exception e) {
373: }
374: }
375: }
376: }
377:
378: /**
379: * Give the text, which is before the highlighted text, the highlighted text
380: * and the text, which is afterwards
381: *
382: * @param text The source text
383: * @param query The query containing the string searched for
384: * @param analyzer Some analyzer
385: * @return An array of 3 strings, with the ten words before (as pos 0), the keyword (as pos 1) and the ten words after (as pos 2)
386: * @throws IOException The stream can throw an IOException
387: */
388: public final static String[] outputHits(String text, Query query,
389: Analyzer[] analyzer) throws IOException {
390: HTMLParser htmlparser = new HTMLParser(new StringReader(text));
391:
392: Reader in = htmlparser.getReader();
393: StringBuffer buffer = new StringBuffer();
394:
395: int ch;
396:
397: while ((ch = in.read()) > -1) {
398: buffer.append((char) ch);
399: }
400:
401: in.close();
402:
403: String cleanText = buffer.toString();
404:
405: TokenStream stream = null;
406:
407: String[] result = new String[3];
408: result[0] = "";
409: result[1] = "";
410: result[2] = "";
411:
412: try {
413: HashSet terms = new HashSet();
414: org.apache.lucene.analysis.Token token;
415: String tokenText;
416: int startOffset;
417: int endOffset;
418: int tenBeforeOffset;
419: int tenAfterOffset;
420:
421: // get terms in query
422: LuceneTools.getTerms(query, terms, false);
423: log.debug("Terms: " + terms);
424:
425: for (int i = 0; i < analyzer.length; i++) {
426: stream = analyzer[i].tokenStream("content",
427: new java.io.StringReader(cleanText));
428:
429: while ((token = stream.next()) != null) {
430: startOffset = token.startOffset();
431: endOffset = token.endOffset();
432: tokenText = cleanText.substring(startOffset,
433: endOffset);
434:
435: // does query contain current token?
436: if (terms.contains(token.termText())) {
437: // find 10 words before this position
438: tenBeforeOffset = LuceneTools.findBefore(
439: cleanText, startOffset);
440:
441: if ((tenBeforeOffset != startOffset)
442: && (startOffset > tenBeforeOffset)
443: && (tenBeforeOffset != -1)) {
444: //log.debug("Before: " + tenBeforeOffset + " / " + startOffset );
445: result[0] = cleanText.substring(
446: tenBeforeOffset, startOffset);
447: }
448:
449: result[1] = tokenText;
450:
451: // find 10 words after this position
452: tenAfterOffset = LuceneTools.findAfter(
453: cleanText, endOffset);
454:
455: if ((tenAfterOffset != endOffset)
456: && (endOffset < tenAfterOffset)
457: && (tenAfterOffset != -1)) {
458: //log.debug("After: " + endOffset + " / " + tenAfterOffset);
459: result[2] = cleanText.substring(endOffset,
460: tenAfterOffset + 1).trim();
461: }
462:
463: stream.close();
464: return result;
465: }
466: }
467: }
468: return result;
469: } finally {
470: if (stream != null) {
471: try {
472: stream.close();
473: } catch (Exception e) {
474: ;
475: }
476: }
477: }
478: }
479:
480: }
|