001: package org.apache.lucene.search.highlight;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import java.io.ByteArrayInputStream;
021: import java.io.IOException;
022: import java.io.Reader;
023: import java.io.StringReader;
024: import java.util.*;
025:
026: import javax.xml.parsers.DocumentBuilder;
027: import javax.xml.parsers.DocumentBuilderFactory;
028:
029: import junit.framework.TestCase;
030:
031: import org.apache.lucene.analysis.*;
032: import org.apache.lucene.analysis.standard.StandardAnalyzer;
033: import org.apache.lucene.document.Document;
034: import org.apache.lucene.document.Field;
035: import org.apache.lucene.index.IndexReader;
036: import org.apache.lucene.index.IndexWriter;
037: import org.apache.lucene.index.Term;
038: import org.apache.lucene.queryParser.ParseException;
039: import org.apache.lucene.queryParser.QueryParser;
040: import org.apache.lucene.search.FilteredQuery;
041: import org.apache.lucene.search.Hits;
042: import org.apache.lucene.search.IndexSearcher;
043: import org.apache.lucene.search.MultiSearcher;
044: import org.apache.lucene.search.PhraseQuery;
045: import org.apache.lucene.search.Query;
046: import org.apache.lucene.search.RangeFilter;
047: import org.apache.lucene.search.Searcher;
048: import org.apache.lucene.search.TermQuery;
049: import org.apache.lucene.search.spans.SpanNearQuery;
050: import org.apache.lucene.search.spans.SpanQuery;
051: import org.apache.lucene.search.spans.SpanTermQuery;
052: import org.apache.lucene.store.RAMDirectory;
053: import org.w3c.dom.Element;
054: import org.w3c.dom.NodeList;
055:
056: /**
057: * JUnit Test for Highlighter class.
058: * @author mark@searcharea.co.uk
059: */
060: public class HighlighterTest extends TestCase implements Formatter {
061: private IndexReader reader;
062: private static final String FIELD_NAME = "contents";
063: private Query query;
064: RAMDirectory ramDir;
065: public Searcher searcher = null;
066: public Hits hits = null;
067: int numHighlights = 0;
068: Analyzer analyzer = new StandardAnalyzer();
069:
070: String texts[] = {
071: "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
072: "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
073: "JFK has been shot", "John Kennedy has been shot",
074: "This text has a typo in referring to Keneddy" };
075:
076: /**
077: * Constructor for HighlightExtractorTest.
078: * @param arg0
079: */
080: public HighlighterTest(String arg0) {
081: super (arg0);
082: }
083:
084: public void testSimpleHighlighter() throws Exception {
085: doSearching("Kennedy");
086: Highlighter highlighter = new Highlighter(
087: new QueryScorer(query));
088: highlighter.setTextFragmenter(new SimpleFragmenter(40));
089: int maxNumFragmentsRequired = 2;
090: for (int i = 0; i < hits.length(); i++) {
091: String text = hits.doc(i).get(FIELD_NAME);
092: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
093: new StringReader(text));
094:
095: String result = highlighter.getBestFragments(tokenStream,
096: text, maxNumFragmentsRequired, "...");
097: System.out.println("\t" + result);
098: }
099: //Not sure we can assert anything here - just running to check we dont throw any exceptions
100: }
101:
102: public void testGetBestFragmentsSimpleQuery() throws Exception {
103: doSearching("Kennedy");
104: doStandardHighlights();
105: assertTrue("Failed to find correct number of highlights "
106: + numHighlights + " found", numHighlights == 4);
107: }
108:
109: public void testGetFuzzyFragments() throws Exception {
110: doSearching("Kinnedy~");
111: doStandardHighlights();
112: assertTrue("Failed to find correct number of highlights "
113: + numHighlights + " found", numHighlights == 5);
114: }
115:
116: public void testGetWildCardFragments() throws Exception {
117: doSearching("K?nnedy");
118: doStandardHighlights();
119: assertTrue("Failed to find correct number of highlights "
120: + numHighlights + " found", numHighlights == 4);
121: }
122:
123: public void testGetMidWildCardFragments() throws Exception {
124: doSearching("K*dy");
125: doStandardHighlights();
126: assertTrue("Failed to find correct number of highlights "
127: + numHighlights + " found", numHighlights == 5);
128: }
129:
130: public void testGetRangeFragments() throws Exception {
131: String queryString = FIELD_NAME + ":[kannedy TO kznnedy]";
132:
133: //Need to explicitly set the QueryParser property to use RangeQuery rather than RangeFilters
134: QueryParser parser = new QueryParser(FIELD_NAME,
135: new StandardAnalyzer());
136: parser.setUseOldRangeQuery(true);
137: query = parser.parse(queryString);
138: doSearching(query);
139:
140: doStandardHighlights();
141: assertTrue("Failed to find correct number of highlights "
142: + numHighlights + " found", numHighlights == 5);
143: }
144:
145: public void testGetBestFragmentsPhrase() throws Exception {
146: doSearching("\"John Kennedy\"");
147: doStandardHighlights();
148: //Currently highlights "John" and "Kennedy" separately
149: assertTrue("Failed to find correct number of highlights "
150: + numHighlights + " found", numHighlights == 2);
151: }
152:
153: public void testGetBestFragmentsSpan() throws Exception {
154: SpanQuery clauses[] = {
155: new SpanTermQuery(new Term("contents", "john")),
156: new SpanTermQuery(new Term("contents", "kennedy")), };
157:
158: SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
159: doSearching(snq);
160: doStandardHighlights();
161: //Currently highlights "John" and "Kennedy" separately
162: assertTrue("Failed to find correct number of highlights "
163: + numHighlights + " found", numHighlights == 2);
164: }
165:
166: public void testOffByOne() throws IOException {
167: TermQuery query = new TermQuery(new Term("data", "help"));
168: Highlighter hg = new Highlighter(new SimpleHTMLFormatter(),
169: new QueryScorer(query));
170: hg.setTextFragmenter(new NullFragmenter());
171:
172: String match = null;
173: match = hg.getBestFragment(new StandardAnalyzer(), "data",
174: "help me [54-65]");
175: assertEquals("<B>help</B> me [54-65]", match);
176: }
177:
178: public void testGetBestFragmentsFilteredQuery() throws Exception {
179: RangeFilter rf = new RangeFilter("contents", "john", "john",
180: true, true);
181: SpanQuery clauses[] = {
182: new SpanTermQuery(new Term("contents", "john")),
183: new SpanTermQuery(new Term("contents", "kennedy")), };
184: SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
185: FilteredQuery fq = new FilteredQuery(snq, rf);
186:
187: doSearching(fq);
188: doStandardHighlights();
189: //Currently highlights "John" and "Kennedy" separately
190: assertTrue("Failed to find correct number of highlights "
191: + numHighlights + " found", numHighlights == 2);
192: }
193:
194: public void testGetBestFragmentsFilteredPhraseQuery()
195: throws Exception {
196: RangeFilter rf = new RangeFilter("contents", "john", "john",
197: true, true);
198: PhraseQuery pq = new PhraseQuery();
199: pq.add(new Term("contents", "john"));
200: pq.add(new Term("contents", "kennedy"));
201: FilteredQuery fq = new FilteredQuery(pq, rf);
202:
203: doSearching(fq);
204: doStandardHighlights();
205: //Currently highlights "John" and "Kennedy" separately
206: assertTrue("Failed to find correct number of highlights "
207: + numHighlights + " found", numHighlights == 2);
208: }
209:
210: public void testGetBestFragmentsMultiTerm() throws Exception {
211: doSearching("John Kenn*");
212: doStandardHighlights();
213: assertTrue("Failed to find correct number of highlights "
214: + numHighlights + " found", numHighlights == 5);
215: }
216:
217: public void testGetBestFragmentsWithOr() throws Exception {
218: doSearching("JFK OR Kennedy");
219: doStandardHighlights();
220: assertTrue("Failed to find correct number of highlights "
221: + numHighlights + " found", numHighlights == 5);
222: }
223:
224: public void testGetBestSingleFragment() throws Exception {
225: doSearching("Kennedy");
226: Highlighter highlighter = new Highlighter(this ,
227: new QueryScorer(query));
228: highlighter.setTextFragmenter(new SimpleFragmenter(40));
229:
230: for (int i = 0; i < hits.length(); i++) {
231: String text = hits.doc(i).get(FIELD_NAME);
232: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
233: new StringReader(text));
234: String result = highlighter.getBestFragment(tokenStream,
235: text);
236: System.out.println("\t" + result);
237: }
238: assertTrue("Failed to find correct number of highlights "
239: + numHighlights + " found", numHighlights == 4);
240:
241: numHighlights = 0;
242: for (int i = 0; i < hits.length(); i++) {
243: String text = hits.doc(i).get(FIELD_NAME);
244: highlighter.getBestFragment(analyzer, FIELD_NAME, text);
245: }
246: assertTrue("Failed to find correct number of highlights "
247: + numHighlights + " found", numHighlights == 4);
248:
249: numHighlights = 0;
250: for (int i = 0; i < hits.length(); i++) {
251: String text = hits.doc(i).get(FIELD_NAME);
252: highlighter
253: .getBestFragments(analyzer, FIELD_NAME, text, 10);
254: }
255: assertTrue("Failed to find correct number of highlights "
256: + numHighlights + " found", numHighlights == 4);
257:
258: }
259:
260: public void testGetBestSingleFragmentWithWeights() throws Exception {
261: WeightedTerm[] wTerms = new WeightedTerm[2];
262: wTerms[0] = new WeightedTerm(10f, "hello");
263: wTerms[1] = new WeightedTerm(1f, "kennedy");
264: Highlighter highlighter = new Highlighter(new QueryScorer(
265: wTerms));
266: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
267: new StringReader(texts[0]));
268: highlighter.setTextFragmenter(new SimpleFragmenter(2));
269:
270: String result = highlighter.getBestFragment(tokenStream,
271: texts[0]).trim();
272: assertTrue(
273: "Failed to find best section using weighted terms. Found: ["
274: + result + "]", "<B>Hello</B>".equals(result));
275:
276: //readjust weights
277: wTerms[1].setWeight(50f);
278: tokenStream = analyzer.tokenStream(FIELD_NAME,
279: new StringReader(texts[0]));
280: highlighter = new Highlighter(new QueryScorer(wTerms));
281: highlighter.setTextFragmenter(new SimpleFragmenter(2));
282:
283: result = highlighter.getBestFragment(tokenStream, texts[0])
284: .trim();
285: assertTrue(
286: "Failed to find best section using weighted terms. Found: "
287: + result, "<B>kennedy</B>".equals(result));
288: }
289:
290: // tests a "complex" analyzer that produces multiple
291: // overlapping tokens
292: public void testOverlapAnalyzer() throws Exception {
293: HashMap synonyms = new HashMap();
294: synonyms.put("football", "soccer,footie");
295: Analyzer analyzer = new SynonymAnalyzer(synonyms);
296: String srchkey = "football";
297:
298: String s = "football-soccer in the euro 2004 footie competition";
299: QueryParser parser = new QueryParser("bookid", analyzer);
300: Query query = parser.parse(srchkey);
301:
302: Highlighter highlighter = new Highlighter(
303: new QueryScorer(query));
304: TokenStream tokenStream = analyzer.tokenStream(null,
305: new StringReader(s));
306: // Get 3 best fragments and seperate with a "..."
307: String result = highlighter.getBestFragments(tokenStream, s, 3,
308: "...");
309: String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
310: assertTrue("overlapping analyzer should handle highlights OK",
311: expectedResult.equals(result));
312: }
313:
314: public void testGetSimpleHighlight() throws Exception {
315: doSearching("Kennedy");
316: Highlighter highlighter = new Highlighter(this ,
317: new QueryScorer(query));
318:
319: for (int i = 0; i < hits.length(); i++) {
320: String text = hits.doc(i).get(FIELD_NAME);
321: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
322: new StringReader(text));
323:
324: String result = highlighter.getBestFragment(tokenStream,
325: text);
326: System.out.println("\t" + result);
327: }
328: assertTrue("Failed to find correct number of highlights "
329: + numHighlights + " found", numHighlights == 4);
330: }
331:
332: public void testGetTextFragments() throws Exception {
333: doSearching("Kennedy");
334: Highlighter highlighter = new Highlighter(this ,
335: new QueryScorer(query));
336: highlighter.setTextFragmenter(new SimpleFragmenter(20));
337:
338: for (int i = 0; i < hits.length(); i++) {
339: String text = hits.doc(i).get(FIELD_NAME);
340: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
341: new StringReader(text));
342:
343: String stringResults[] = highlighter.getBestFragments(
344: tokenStream, text, 10);
345:
346: tokenStream = analyzer.tokenStream(FIELD_NAME,
347: new StringReader(text));
348: TextFragment fragmentResults[] = highlighter
349: .getBestTextFragments(tokenStream, text, true, 10);
350:
351: assertTrue(
352: "Failed to find correct number of text Fragments: "
353: + fragmentResults.length + " vs "
354: + stringResults.length,
355: fragmentResults.length == stringResults.length);
356: for (int j = 0; j < stringResults.length; j++) {
357: System.out.println(fragmentResults[j]);
358: assertTrue("Failed to find same text Fragments: "
359: + fragmentResults[j] + " found",
360: fragmentResults[j].toString().equals(
361: stringResults[j]));
362:
363: }
364:
365: }
366: }
367:
368: public void testMaxSizeHighlight() throws Exception {
369: doSearching("meat");
370: Highlighter highlighter = new Highlighter(this ,
371: new QueryScorer(query));
372: highlighter.setMaxDocBytesToAnalyze(30);
373: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
374: new StringReader(texts[0]));
375: highlighter.getBestFragment(tokenStream, texts[0]);
376: assertTrue(
377: "Setting MaxDocBytesToAnalyze should have prevented "
378: + "us from finding matches for this record: "
379: + numHighlights + " found", numHighlights == 0);
380: }
381:
382: public void testMaxSizeHighlightTruncates() throws IOException {
383: String goodWord = "goodtoken";
384: String stopWords[] = { "stoppedtoken" };
385:
386: TermQuery query = new TermQuery(new Term("data", goodWord));
387: SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
388: Highlighter hg = new Highlighter(fm, new QueryScorer(query));
389: hg.setTextFragmenter(new NullFragmenter());
390:
391: String match = null;
392: StringBuffer sb = new StringBuffer();
393: sb.append(goodWord);
394: for (int i = 0; i < 10000; i++) {
395: sb.append(" ");
396: sb.append(stopWords[0]);
397: }
398:
399: hg.setMaxDocBytesToAnalyze(100);
400: match = hg.getBestFragment(new StandardAnalyzer(stopWords),
401: "data", sb.toString());
402: assertTrue(
403: "Matched text should be no more than 100 chars in length ",
404: match.length() < hg.getMaxDocBytesToAnalyze());
405:
406: //add another tokenized word to the overrall length - but set way beyond
407: //the length of text under consideration (after a large slug of stop words + whitespace)
408: sb.append(" ");
409: sb.append(goodWord);
410: match = hg.getBestFragment(new StandardAnalyzer(stopWords),
411: "data", sb.toString());
412: assertTrue(
413: "Matched text should be no more than 100 chars in length ",
414: match.length() < hg.getMaxDocBytesToAnalyze());
415:
416: }
417:
418: public void testUnRewrittenQuery() throws IOException,
419: ParseException {
420: //test to show how rewritten query can still be used
421: searcher = new IndexSearcher(ramDir);
422: Analyzer analyzer = new StandardAnalyzer();
423:
424: QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
425: Query query = parser.parse("JF? or Kenned*");
426: System.out.println("Searching with primitive query");
427: //forget to set this and...
428: //query=query.rewrite(reader);
429: Hits hits = searcher.search(query);
430:
431: //create an instance of the highlighter with the tags used to surround highlighted text
432: // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer());
433: Highlighter highlighter = new Highlighter(this ,
434: new QueryScorer(query));
435:
436: highlighter.setTextFragmenter(new SimpleFragmenter(40));
437:
438: int maxNumFragmentsRequired = 3;
439:
440: for (int i = 0; i < hits.length(); i++) {
441: String text = hits.doc(i).get(FIELD_NAME);
442: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
443: new StringReader(text));
444:
445: String highlightedText = highlighter.getBestFragments(
446: tokenStream, text, maxNumFragmentsRequired, "...");
447: System.out.println(highlightedText);
448: }
449: //We expect to have zero highlights if the query is multi-terms and is not rewritten!
450: assertTrue("Failed to find correct number of highlights "
451: + numHighlights + " found", numHighlights == 0);
452: }
453:
454: public void testNoFragments() throws Exception {
455: doSearching("AnInvalidQueryWhichShouldYieldNoResults");
456: Highlighter highlighter = new Highlighter(this ,
457: new QueryScorer(query));
458:
459: for (int i = 0; i < texts.length; i++) {
460: String text = texts[i];
461: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
462: new StringReader(text));
463:
464: String result = highlighter.getBestFragment(tokenStream,
465: text);
466: assertNull(
467: "The highlight result should be null for text with no query terms",
468: result);
469: }
470: }
471:
472: /**
473: * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
474: * @throws Exception
475: */
476: public void testEncoding() throws Exception {
477: String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
478: //run the highlighter on the raw content (scorer does not score any tokens for
479: // highlighting but scores a single fragment for selection
480: Highlighter highlighter = new Highlighter(this ,
481: new SimpleHTMLEncoder(), new Scorer() {
482: public void startFragment(TextFragment newFragment) {
483: }
484:
485: public float getTokenScore(Token token) {
486: return 0;
487: }
488:
489: public float getFragmentScore() {
490: return 1;
491: }
492: });
493: highlighter.setTextFragmenter(new SimpleFragmenter(2000));
494: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
495: new StringReader(rawDocContent));
496:
497: String encodedSnippet = highlighter.getBestFragments(
498: tokenStream, rawDocContent, 1, "");
499: //An ugly bit of XML creation:
500: String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
501: + "<!DOCTYPE html\n"
502: + "PUBLIC \"//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
503: + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
504: + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
505: + "<head>\n" + "<title>My Test HTML Document</title>\n"
506: + "</head>\n" + "<body>\n" + "<h2>" + encodedSnippet
507: + "</h2>\n" + "</body>\n" + "</html>";
508: //now an ugly built of XML parsing to test the snippet is encoded OK
509: DocumentBuilderFactory dbf = DocumentBuilderFactory
510: .newInstance();
511: DocumentBuilder db = dbf.newDocumentBuilder();
512: org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(
513: xhtml.getBytes()));
514: Element root = doc.getDocumentElement();
515: NodeList nodes = root.getElementsByTagName("body");
516: Element body = (Element) nodes.item(0);
517: nodes = body.getElementsByTagName("h2");
518: Element h2 = (Element) nodes.item(0);
519: String decodedSnippet = h2.getFirstChild().getNodeValue();
520: assertEquals("XHTML Encoding should have worked:",
521: rawDocContent, decodedSnippet);
522: }
523:
524: public void testMultiSearcher() throws Exception {
525: //setup index 1
526: RAMDirectory ramDir1 = new RAMDirectory();
527: IndexWriter writer1 = new IndexWriter(ramDir1,
528: new StandardAnalyzer(), true);
529: Document d = new Document();
530: Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES,
531: Field.Index.TOKENIZED);
532: d.add(f);
533: writer1.addDocument(d);
534: writer1.optimize();
535: writer1.close();
536: IndexReader reader1 = IndexReader.open(ramDir1);
537:
538: //setup index 2
539: RAMDirectory ramDir2 = new RAMDirectory();
540: IndexWriter writer2 = new IndexWriter(ramDir2,
541: new StandardAnalyzer(), true);
542: d = new Document();
543: f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES,
544: Field.Index.TOKENIZED);
545: d.add(f);
546: writer2.addDocument(d);
547: writer2.optimize();
548: writer2.close();
549: IndexReader reader2 = IndexReader.open(ramDir2);
550:
551: IndexSearcher searchers[] = new IndexSearcher[2];
552: searchers[0] = new IndexSearcher(ramDir1);
553: searchers[1] = new IndexSearcher(ramDir2);
554: MultiSearcher multiSearcher = new MultiSearcher(searchers);
555: QueryParser parser = new QueryParser(FIELD_NAME,
556: new StandardAnalyzer());
557: query = parser.parse("multi*");
558: System.out.println("Searching for: "
559: + query.toString(FIELD_NAME));
560: //at this point the multisearcher calls combine(query[])
561: hits = multiSearcher.search(query);
562:
563: //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
564: Query expandedQueries[] = new Query[2];
565: expandedQueries[0] = query.rewrite(reader1);
566: expandedQueries[1] = query.rewrite(reader2);
567: query = query.combine(expandedQueries);
568:
569: //create an instance of the highlighter with the tags used to surround highlighted text
570: Highlighter highlighter = new Highlighter(this ,
571: new QueryScorer(query));
572:
573: for (int i = 0; i < hits.length(); i++) {
574: String text = hits.doc(i).get(FIELD_NAME);
575: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
576: new StringReader(text));
577: String highlightedText = highlighter.getBestFragment(
578: tokenStream, text);
579: System.out.println(highlightedText);
580: }
581: assertTrue("Failed to find correct number of highlights "
582: + numHighlights + " found", numHighlights == 2);
583:
584: }
585:
586: public void testFieldSpecificHighlighting() throws IOException,
587: ParseException {
588: String docMainText = "fred is one of the people";
589: QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
590: Query query = parser.parse("fred category:people");
591:
592: //highlighting respects fieldnames used in query
593: QueryScorer fieldSpecificScorer = new QueryScorer(query,
594: "contents");
595: Highlighter fieldSpecificHighlighter = new Highlighter(
596: new SimpleHTMLFormatter(), fieldSpecificScorer);
597: fieldSpecificHighlighter
598: .setTextFragmenter(new NullFragmenter());
599: String result = fieldSpecificHighlighter.getBestFragment(
600: analyzer, FIELD_NAME, docMainText);
601: assertEquals("Should match", result,
602: "<B>fred</B> is one of the people");
603:
604: //highlighting does not respect fieldnames used in query
605: QueryScorer fieldInSpecificScorer = new QueryScorer(query);
606: Highlighter fieldInSpecificHighlighter = new Highlighter(
607: new SimpleHTMLFormatter(), fieldInSpecificScorer);
608: fieldInSpecificHighlighter
609: .setTextFragmenter(new NullFragmenter());
610: result = fieldInSpecificHighlighter.getBestFragment(analyzer,
611: FIELD_NAME, docMainText);
612: assertEquals("Should match", result,
613: "<B>fred</B> is one of the <B>people</B>");
614:
615: reader.close();
616:
617: }
618:
619: protected TokenStream getTS2() {
620: //String s = "Hi-Speed10 foo";
621: return new TokenStream() {
622: Iterator iter;
623: List lst;
624: {
625: lst = new ArrayList();
626: Token t;
627: t = new Token("hi", 0, 2);
628: lst.add(t);
629: t = new Token("hispeed", 0, 8);
630: lst.add(t);
631: t = new Token("speed", 3, 8);
632: t.setPositionIncrement(0);
633: lst.add(t);
634: t = new Token("10", 8, 10);
635: lst.add(t);
636: t = new Token("foo", 11, 14);
637: lst.add(t);
638: iter = lst.iterator();
639: }
640:
641: public Token next() throws IOException {
642: return iter.hasNext() ? (Token) iter.next() : null;
643: }
644: };
645: }
646:
647: // same token-stream as above, but the bigger token comes first this time
648: protected TokenStream getTS2a() {
649: //String s = "Hi-Speed10 foo";
650: return new TokenStream() {
651: Iterator iter;
652: List lst;
653: {
654: lst = new ArrayList();
655: Token t;
656: t = new Token("hispeed", 0, 8);
657: lst.add(t);
658: t = new Token("hi", 0, 2);
659: t.setPositionIncrement(0);
660: lst.add(t);
661: t = new Token("speed", 3, 8);
662: lst.add(t);
663: t = new Token("10", 8, 10);
664: lst.add(t);
665: t = new Token("foo", 11, 14);
666: lst.add(t);
667: iter = lst.iterator();
668: }
669:
670: public Token next() throws IOException {
671: return iter.hasNext() ? (Token) iter.next() : null;
672: }
673: };
674: }
675:
676: public void testOverlapAnalyzer2() throws Exception {
677:
678: String s = "Hi-Speed10 foo";
679:
680: Query query;
681: Highlighter highlighter;
682: String result;
683:
684: query = new QueryParser("text", new WhitespaceAnalyzer())
685: .parse("foo");
686: highlighter = new Highlighter(new QueryScorer(query));
687: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
688: assertEquals("Hi-Speed10 <B>foo</B>", result);
689:
690: query = new QueryParser("text", new WhitespaceAnalyzer())
691: .parse("10");
692: highlighter = new Highlighter(new QueryScorer(query));
693: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
694: assertEquals("Hi-Speed<B>10</B> foo", result);
695:
696: query = new QueryParser("text", new WhitespaceAnalyzer())
697: .parse("hi");
698: highlighter = new Highlighter(new QueryScorer(query));
699: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
700: assertEquals("<B>Hi</B>-Speed10 foo", result);
701:
702: query = new QueryParser("text", new WhitespaceAnalyzer())
703: .parse("speed");
704: highlighter = new Highlighter(new QueryScorer(query));
705: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
706: assertEquals("Hi-<B>Speed</B>10 foo", result);
707:
708: query = new QueryParser("text", new WhitespaceAnalyzer())
709: .parse("hispeed");
710: highlighter = new Highlighter(new QueryScorer(query));
711: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
712: assertEquals("<B>Hi-Speed</B>10 foo", result);
713:
714: query = new QueryParser("text", new WhitespaceAnalyzer())
715: .parse("hi speed");
716: highlighter = new Highlighter(new QueryScorer(query));
717: result = highlighter.getBestFragments(getTS2(), s, 3, "...");
718: assertEquals("<B>Hi-Speed</B>10 foo", result);
719:
720: /////////////////// same tests, just put the bigger overlapping token first
721: query = new QueryParser("text", new WhitespaceAnalyzer())
722: .parse("foo");
723: highlighter = new Highlighter(new QueryScorer(query));
724: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
725: assertEquals("Hi-Speed10 <B>foo</B>", result);
726:
727: query = new QueryParser("text", new WhitespaceAnalyzer())
728: .parse("10");
729: highlighter = new Highlighter(new QueryScorer(query));
730: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
731: assertEquals("Hi-Speed<B>10</B> foo", result);
732:
733: query = new QueryParser("text", new WhitespaceAnalyzer())
734: .parse("hi");
735: highlighter = new Highlighter(new QueryScorer(query));
736: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
737: assertEquals("<B>Hi</B>-Speed10 foo", result);
738:
739: query = new QueryParser("text", new WhitespaceAnalyzer())
740: .parse("speed");
741: highlighter = new Highlighter(new QueryScorer(query));
742: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
743: assertEquals("Hi-<B>Speed</B>10 foo", result);
744:
745: query = new QueryParser("text", new WhitespaceAnalyzer())
746: .parse("hispeed");
747: highlighter = new Highlighter(new QueryScorer(query));
748: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
749: assertEquals("<B>Hi-Speed</B>10 foo", result);
750:
751: query = new QueryParser("text", new WhitespaceAnalyzer())
752: .parse("hi speed");
753: highlighter = new Highlighter(new QueryScorer(query));
754: result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
755: assertEquals("<B>Hi-Speed</B>10 foo", result);
756: }
757:
758: /*
759:
760: public void testBigramAnalyzer() throws IOException, ParseException
761: {
762: //test to ensure analyzers with none-consecutive start/end offsets
763: //dont double-highlight text
764: //setup index 1
765: RAMDirectory ramDir = new RAMDirectory();
766: Analyzer bigramAnalyzer=new CJKAnalyzer();
767: IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);
768: Document d = new Document();
769: Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
770: d.add(f);
771: writer.addDocument(d);
772: writer.close();
773: IndexReader reader = IndexReader.open(ramDir);
774:
775: IndexSearcher searcher=new IndexSearcher(reader);
776: query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
777: System.out.println("Searching for: " + query.toString(FIELD_NAME));
778: hits = searcher.search(query);
779:
780: Highlighter highlighter =
781: new Highlighter(this,new QueryFragmentScorer(query));
782:
783: for (int i = 0; i < hits.length(); i++)
784: {
785: String text = hits.doc(i).get(FIELD_NAME);
786: TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
787: String highlightedText = highlighter.getBestFragment(tokenStream,text);
788: System.out.println(highlightedText);
789: }
790:
791: }
792: */
793:
794: public String highlightTerm(String originalText, TokenGroup group) {
795: if (group.getTotalScore() <= 0) {
796: return originalText;
797: }
798: numHighlights++; //update stats used in assertions
799: return "<b>" + originalText + "</b>";
800: }
801:
802: public void doSearching(String queryString) throws Exception {
803: QueryParser parser = new QueryParser(FIELD_NAME,
804: new StandardAnalyzer());
805: query = parser.parse(queryString);
806: doSearching(query);
807: }
808:
809: public void doSearching(Query unReWrittenQuery) throws Exception {
810: searcher = new IndexSearcher(ramDir);
811: //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
812: query = unReWrittenQuery.rewrite(reader);
813: System.out.println("Searching for: "
814: + query.toString(FIELD_NAME));
815: hits = searcher.search(query);
816: }
817:
818: void doStandardHighlights() throws Exception {
819: Highlighter highlighter = new Highlighter(this ,
820: new QueryScorer(query));
821: highlighter.setTextFragmenter(new SimpleFragmenter(20));
822: for (int i = 0; i < hits.length(); i++) {
823: String text = hits.doc(i).get(FIELD_NAME);
824: int maxNumFragmentsRequired = 2;
825: String fragmentSeparator = "...";
826: TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
827: new StringReader(text));
828:
829: String result = highlighter.getBestFragments(tokenStream,
830: text, maxNumFragmentsRequired, fragmentSeparator);
831: System.out.println("\t" + result);
832: }
833: }
834:
835: /*
836: * @see TestCase#setUp()
837: */
838: protected void setUp() throws Exception {
839: ramDir = new RAMDirectory();
840: IndexWriter writer = new IndexWriter(ramDir,
841: new StandardAnalyzer(), true);
842: for (int i = 0; i < texts.length; i++) {
843: addDoc(writer, texts[i]);
844: }
845:
846: writer.optimize();
847: writer.close();
848: reader = IndexReader.open(ramDir);
849: numHighlights = 0;
850: }
851:
852: private void addDoc(IndexWriter writer, String text)
853: throws IOException {
854: Document d = new Document();
855: Field f = new Field(FIELD_NAME, text, Field.Store.YES,
856: Field.Index.TOKENIZED);
857: d.add(f);
858: writer.addDocument(d);
859:
860: }
861:
862: /*
863: * @see TestCase#tearDown()
864: */
865: protected void tearDown() throws Exception {
866: super .tearDown();
867: }
868:
869: }
870:
871: //===================================================================
872: //========== BEGIN TEST SUPPORTING CLASSES
873: //========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE
874: //========== MADE MORE GENERALLY USEFUL.
875: // TODO - make synonyms all interchangeable with each other and produce
876: // a version that does hyponyms - the "is a specialised type of ...."
877: // so that car = audi, bmw and volkswagen but bmw != audi so different
878: // behaviour to synonyms
879: //===================================================================
880:
881: class SynonymAnalyzer extends Analyzer {
882: private Map synonyms;
883:
884: public SynonymAnalyzer(Map synonyms) {
885: this .synonyms = synonyms;
886: }
887:
888: /* (non-Javadoc)
889: * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
890: */
891: public TokenStream tokenStream(String arg0, Reader arg1) {
892: return new SynonymTokenizer(new LowerCaseTokenizer(arg1),
893: synonyms);
894: }
895: }
896:
897: /**
898: * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
899: * @author MAHarwood
900: */
901: class SynonymTokenizer extends TokenStream {
902: private TokenStream realStream;
903: private Token currentRealToken = null;
904: private Map synonyms;
905: StringTokenizer st = null;
906:
907: public SynonymTokenizer(TokenStream realStream, Map synonyms) {
908: this .realStream = realStream;
909: this .synonyms = synonyms;
910: }
911:
912: public Token next() throws IOException {
913: if (currentRealToken == null) {
914: Token nextRealToken = realStream.next();
915: if (nextRealToken == null) {
916: return null;
917: }
918: String expansions = (String) synonyms.get(nextRealToken
919: .termText());
920: if (expansions == null) {
921: return nextRealToken;
922: }
923: st = new StringTokenizer(expansions, ",");
924: if (st.hasMoreTokens()) {
925: currentRealToken = nextRealToken;
926: }
927: return currentRealToken;
928: } else {
929: String nextExpandedValue = st.nextToken();
930: Token expandedToken = new Token(nextExpandedValue,
931: currentRealToken.startOffset(), currentRealToken
932: .endOffset());
933: expandedToken.setPositionIncrement(0);
934: if (!st.hasMoreTokens()) {
935: currentRealToken = null;
936: st = null;
937: }
938: return expandedToken;
939: }
940: }
941:
942: }
|