Source Code Cross Referenced: HighlighterTest.java (org.apache.lucene.search.highlight, Lucene contrib highlighter)


package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import junit.framework.TestCase;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * JUnit Test for Highlighter class.
 * @author mark@searcharea.co.uk
 */
public class HighlighterTest extends TestCase implements Formatter {
    private IndexReader reader;
    private static final String FIELD_NAME = "contents";
    private Query query;
    RAMDirectory ramDir;
    public Searcher searcher = null;
    public Hits hits = null;
    int numHighlights = 0;
    Analyzer analyzer = new StandardAnalyzer();

    String texts[] = {
            "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
            "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
            "JFK has been shot", "John Kennedy has been shot",
            "This text has a typo in referring to Keneddy" };

    /**
     * Constructor for HighlighterTest.
     * @param arg0
     */
    public HighlighterTest(String arg0) {
        super(arg0);
    }

    public void testSimpleHighlighter() throws Exception {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(40));
        int maxNumFragmentsRequired = 2;
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragments(tokenStream, text,
                    maxNumFragmentsRequired, "...");
            System.out.println("\t" + result);
        }
        // Not sure we can assert anything here - just running to check we don't throw any exceptions
    }
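
    // Note: the single-argument Highlighter constructor uses a default
    // SimpleHTMLFormatter, so the fragments printed above wrap matches in
    // <B>...</B>, e.g. (illustrative, not asserted):
    //   "...really here which says <B>kennedy</B> has been shot"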

    public void testGetBestFragmentsSimpleQuery() throws Exception {
        doSearching("Kennedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);
    }

    public void testGetFuzzyFragments() throws Exception {
        doSearching("Kinnedy~");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 5);
    }

    public void testGetWildCardFragments() throws Exception {
        doSearching("K?nnedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);
    }

    public void testGetMidWildCardFragments() throws Exception {
        doSearching("K*dy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 5);
    }

    public void testGetRangeFragments() throws Exception {
        String queryString = FIELD_NAME + ":[kannedy TO kznnedy]";

        // Need to explicitly set the QueryParser property to use RangeQuery
        // rather than RangeFilters
        QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
        parser.setUseOldRangeQuery(true);
        query = parser.parse(queryString);
        doSearching(query);

        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 5);
    }
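
    // Why setUseOldRangeQuery(true) matters here (a hedged explanation): without
    // it, this era of QueryParser builds a ConstantScoreRangeQuery, which does not
    // rewrite to concrete index terms, so the QueryScorer would have no terms to
    // extract and nothing would be highlighted.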

    public void testGetBestFragmentsPhrase() throws Exception {
        doSearching("\"John Kennedy\"");
        doStandardHighlights();
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 2);
    }

    public void testGetBestFragmentsSpan() throws Exception {
        SpanQuery clauses[] = {
                new SpanTermQuery(new Term("contents", "john")),
                new SpanTermQuery(new Term("contents", "kennedy")), };

        SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
        doSearching(snq);
        doStandardHighlights();
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 2);
    }

    public void testOffByOne() throws IOException {
        TermQuery query = new TermQuery(new Term("data", "help"));
        Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        hg.setTextFragmenter(new NullFragmenter());

        String match = hg.getBestFragment(new StandardAnalyzer(), "data", "help me [54-65]");
        assertEquals("<B>help</B> me [54-65]", match);
    }

    public void testGetBestFragmentsFilteredQuery() throws Exception {
        RangeFilter rf = new RangeFilter("contents", "john", "john", true, true);
        SpanQuery clauses[] = {
                new SpanTermQuery(new Term("contents", "john")),
                new SpanTermQuery(new Term("contents", "kennedy")), };
        SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
        FilteredQuery fq = new FilteredQuery(snq, rf);

        doSearching(fq);
        doStandardHighlights();
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 2);
    }

    public void testGetBestFragmentsFilteredPhraseQuery() throws Exception {
        RangeFilter rf = new RangeFilter("contents", "john", "john", true, true);
        PhraseQuery pq = new PhraseQuery();
        pq.add(new Term("contents", "john"));
        pq.add(new Term("contents", "kennedy"));
        FilteredQuery fq = new FilteredQuery(pq, rf);

        doSearching(fq);
        doStandardHighlights();
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 2);
    }

    public void testGetBestFragmentsMultiTerm() throws Exception {
        doSearching("John Kenn*");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 5);
    }

    public void testGetBestFragmentsWithOr() throws Exception {
        doSearching("JFK OR Kennedy");
        doStandardHighlights();
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 5);
    }

    public void testGetBestSingleFragment() throws Exception {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(40));

        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
            String result = highlighter.getBestFragment(tokenStream, text);
            System.out.println("\t" + result);
        }
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);

        numHighlights = 0;
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            highlighter.getBestFragment(analyzer, FIELD_NAME, text);
        }
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);

        numHighlights = 0;
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
        }
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);
    }
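
    // The three loops above exercise three convenience overloads that funnel into
    // the same fragmenting logic: getBestFragment(TokenStream, String),
    // getBestFragment(Analyzer, String, String) and
    // getBestFragments(Analyzer, String, String, int). Each pass re-tokenizes the
    // stored text, which is why numHighlights is reset between passes.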

    public void testGetBestSingleFragmentWithWeights() throws Exception {
        WeightedTerm[] wTerms = new WeightedTerm[2];
        wTerms[0] = new WeightedTerm(10f, "hello");
        wTerms[1] = new WeightedTerm(1f, "kennedy");
        Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        highlighter.setTextFragmenter(new SimpleFragmenter(2));

        String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
        assertTrue("Failed to find best section using weighted terms. Found: ["
                + result + "]", "<B>Hello</B>".equals(result));

        // readjust weights
        wTerms[1].setWeight(50f);
        tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        highlighter = new Highlighter(new QueryScorer(wTerms));
        highlighter.setTextFragmenter(new SimpleFragmenter(2));

        result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
        assertTrue("Failed to find best section using weighted terms. Found: "
                + result, "<B>kennedy</B>".equals(result));
    }

    // tests a "complex" analyzer that produces multiple
    // overlapping tokens
    public void testOverlapAnalyzer() throws Exception {
        HashMap synonyms = new HashMap();
        synonyms.put("football", "soccer,footie");
        Analyzer analyzer = new SynonymAnalyzer(synonyms);
        String srchkey = "football";

        String s = "football-soccer in the euro 2004 footie competition";
        QueryParser parser = new QueryParser("bookid", analyzer);
        Query query = parser.parse(srchkey);

        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s));
        // Get 3 best fragments and separate with a "..."
        String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
        String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
        assertTrue("overlapping analyzer should handle highlights OK",
                expectedResult.equals(result));
    }
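
    // Why all three surface forms are highlighted above: SynonymAnalyzer (defined
    // at the bottom of this file) emits "soccer" and "footie" stacked at the same
    // position as "football" (positionIncrement == 0), so the query parser sees
    // three tokens for the single search word and builds a query that matches any
    // of them wherever it occurs in the text.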

    public void testGetSimpleHighlight() throws Exception {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragment(tokenStream, text);
            System.out.println("\t" + result);
        }
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 4);
    }

    public void testGetTextFragments() throws Exception {
        doSearching("Kennedy");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(20));

        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10);

            tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
            TextFragment fragmentResults[] = highlighter.getBestTextFragments(
                    tokenStream, text, true, 10);

            assertTrue("Failed to find correct number of text Fragments: "
                    + fragmentResults.length + " vs " + stringResults.length,
                    fragmentResults.length == stringResults.length);
            for (int j = 0; j < stringResults.length; j++) {
                System.out.println(fragmentResults[j]);
                assertTrue("Failed to find same text Fragments: "
                        + fragmentResults[j] + " found",
                        fragmentResults[j].toString().equals(stringResults[j]));
            }
        }
    }

    public void testMaxSizeHighlight() throws Exception {
        doSearching("meat");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setMaxDocBytesToAnalyze(30);
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
        highlighter.getBestFragment(tokenStream, texts[0]);
        assertTrue("Setting MaxDocBytesToAnalyze should have prevented "
                + "us from finding matches for this record: "
                + numHighlights + " found", numHighlights == 0);
    }

    public void testMaxSizeHighlightTruncates() throws IOException {
        String goodWord = "goodtoken";
        String stopWords[] = { "stoppedtoken" };

        TermQuery query = new TermQuery(new Term("data", goodWord));
        SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
        Highlighter hg = new Highlighter(fm, new QueryScorer(query));
        hg.setTextFragmenter(new NullFragmenter());

        StringBuffer sb = new StringBuffer();
        sb.append(goodWord);
        for (int i = 0; i < 10000; i++) {
            sb.append(" ");
            sb.append(stopWords[0]);
        }

        hg.setMaxDocBytesToAnalyze(100);
        String match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", sb.toString());
        assertTrue("Matched text should be no more than 100 chars in length",
                match.length() < hg.getMaxDocBytesToAnalyze());

        // add another tokenized word to the overall length - but set way beyond
        // the length of text under consideration (after a large slug of stop words + whitespace)
        sb.append(" ");
        sb.append(goodWord);
        match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", sb.toString());
        assertTrue("Matched text should be no more than 100 chars in length",
                match.length() < hg.getMaxDocBytesToAnalyze());
    }
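
    // Note on the two assertions above: maxDocBytesToAnalyze bounds how much of
    // the input the highlighter tokenizes, so the second "goodtoken", appended
    // well past the 100-byte limit, is never seen and cannot stretch the returned
    // fragment.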

    public void testUnRewrittenQuery() throws IOException, ParseException {
        // test to show that an un-rewritten multi-term query produces no highlights
        searcher = new IndexSearcher(ramDir);
        Analyzer analyzer = new StandardAnalyzer();

        QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
        Query query = parser.parse("JF? or Kenned*");
        System.out.println("Searching with primitive query");
        // deliberately "forget" to set this and...
        // query = query.rewrite(reader);
        Hits hits = searcher.search(query);

        // create an instance of the highlighter with the tags used to surround highlighted text
        // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer());
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

        highlighter.setTextFragmenter(new SimpleFragmenter(40));

        int maxNumFragmentsRequired = 3;

        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String highlightedText = highlighter.getBestFragments(tokenStream,
                    text, maxNumFragmentsRequired, "...");
            System.out.println(highlightedText);
        }
        // We expect zero highlights if the query is a multi-term query and has not been rewritten!
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 0);
    }

    public void testNoFragments() throws Exception {
        doSearching("AnInvalidQueryWhichShouldYieldNoResults");
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < texts.length; i++) {
            String text = texts[i];
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragment(tokenStream, text);
            assertNull("The highlight result should be null for text with no query terms",
                    result);
        }
    }

    /**
     * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
     * @throws Exception
     */
    public void testEncoding() throws Exception {
        String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
        // run the highlighter on the raw content (the scorer does not score any
        // tokens for highlighting but scores a single fragment for selection)
        Highlighter highlighter = new Highlighter(this,
                new SimpleHTMLEncoder(), new Scorer() {
                    public void startFragment(TextFragment newFragment) {
                    }

                    public float getTokenScore(Token token) {
                        return 0;
                    }

                    public float getFragmentScore() {
                        return 1;
                    }
                });
        highlighter.setTextFragmenter(new SimpleFragmenter(2000));
        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
                new StringReader(rawDocContent));

        String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, "");
        // An ugly bit of XML creation:
        String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                + "<!DOCTYPE html\n"
                + "PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
                + "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
                + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
                + "<head>\n" + "<title>My Test HTML Document</title>\n"
                + "</head>\n" + "<body>\n" + "<h2>" + encodedSnippet
                + "</h2>\n" + "</body>\n" + "</html>";
        // now an ugly bit of XML parsing to test that the snippet is encoded OK
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder db = dbf.newDocumentBuilder();
        org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes()));
        Element root = doc.getDocumentElement();
        NodeList nodes = root.getElementsByTagName("body");
        Element body = (Element) nodes.item(0);
        nodes = body.getElementsByTagName("h2");
        Element h2 = (Element) nodes.item(0);
        String decodedSnippet = h2.getFirstChild().getNodeValue();
        assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet);
    }
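
    // What the round trip above demonstrates: SimpleHTMLEncoder escapes the
    // markup-significant characters in the snippet (&, <, >, quotes), so embedding
    // it in XHTML is safe; parsing the document back and reading the <h2> text
    // node recovers the original raw content exactly.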

    public void testMultiSearcher() throws Exception {
        // setup index 1
        RAMDirectory ramDir1 = new RAMDirectory();
        IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
        Document d = new Document();
        Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED);
        d.add(f);
        writer1.addDocument(d);
        writer1.optimize();
        writer1.close();
        IndexReader reader1 = IndexReader.open(ramDir1);

        // setup index 2
        RAMDirectory ramDir2 = new RAMDirectory();
        IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
        d = new Document();
        f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED);
        d.add(f);
        writer2.addDocument(d);
        writer2.optimize();
        writer2.close();
        IndexReader reader2 = IndexReader.open(ramDir2);

        IndexSearcher searchers[] = new IndexSearcher[2];
        searchers[0] = new IndexSearcher(ramDir1);
        searchers[1] = new IndexSearcher(ramDir2);
        MultiSearcher multiSearcher = new MultiSearcher(searchers);
        QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
        query = parser.parse("multi*");
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        // at this point the multisearcher calls combine(query[])
        hits = multiSearcher.search(query);

        // query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
        Query expandedQueries[] = new Query[2];
        expandedQueries[0] = query.rewrite(reader1);
        expandedQueries[1] = query.rewrite(reader2);
        query = query.combine(expandedQueries);

        // create an instance of the highlighter with the tags used to surround highlighted text
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));

        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
            String highlightedText = highlighter.getBestFragment(tokenStream, text);
            System.out.println(highlightedText);
        }
        assertTrue("Failed to find correct number of highlights "
                + numHighlights + " found", numHighlights == 2);
    }
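
    // The rewrite/combine dance above is the key step: "multi*" is rewritten
    // against each reader separately (yielding the concrete terms multiOne and
    // multiTwo) and Query.combine() merges the results, so the QueryScorer sees
    // the expanded terms from both indexes and can highlight hits from either
    // searcher.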

    public void testFieldSpecificHighlighting() throws IOException, ParseException {
        String docMainText = "fred is one of the people";
        QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
        Query query = parser.parse("fred category:people");

        // highlighting respects fieldnames used in query
        QueryScorer fieldSpecificScorer = new QueryScorer(query, "contents");
        Highlighter fieldSpecificHighlighter = new Highlighter(
                new SimpleHTMLFormatter(), fieldSpecificScorer);
        fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
        String result = fieldSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
        assertEquals("Should match", result, "<B>fred</B> is one of the people");

        // highlighting does not respect fieldnames used in query
        QueryScorer fieldInSpecificScorer = new QueryScorer(query);
        Highlighter fieldInSpecificHighlighter = new Highlighter(
                new SimpleHTMLFormatter(), fieldInSpecificScorer);
        fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
        result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
        assertEquals("Should match", result, "<B>fred</B> is one of the <B>people</B>");

        reader.close();
    }

    protected TokenStream getTS2() {
        // String s = "Hi-Speed10 foo";
        return new TokenStream() {
            Iterator iter;
            List lst;
            {
                lst = new ArrayList();
                Token t;
                t = new Token("hi", 0, 2);
                lst.add(t);
                t = new Token("hispeed", 0, 8);
                lst.add(t);
                t = new Token("speed", 3, 8);
                t.setPositionIncrement(0);
                lst.add(t);
                t = new Token("10", 8, 10);
                lst.add(t);
                t = new Token("foo", 11, 14);
                lst.add(t);
                iter = lst.iterator();
            }

            public Token next() throws IOException {
                return iter.hasNext() ? (Token) iter.next() : null;
            }
        };
    }
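
    // The hand-built stream above simulates an analyzer that emits overlapping
    // tokens for "Hi-Speed10 foo": Token(term, startOffset, endOffset) records the
    // character span, and setPositionIncrement(0) stacks "speed" on the same
    // position as "hispeed", the way a decompounding or synonym filter would.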

    // same token-stream as above, but the bigger token comes first this time
    protected TokenStream getTS2a() {
        // String s = "Hi-Speed10 foo";
        return new TokenStream() {
            Iterator iter;
            List lst;
            {
                lst = new ArrayList();
                Token t;
                t = new Token("hispeed", 0, 8);
                lst.add(t);
                t = new Token("hi", 0, 2);
                t.setPositionIncrement(0);
                lst.add(t);
                t = new Token("speed", 3, 8);
                lst.add(t);
                t = new Token("10", 8, 10);
                lst.add(t);
                t = new Token("foo", 11, 14);
                lst.add(t);
                iter = lst.iterator();
            }

            public Token next() throws IOException {
                return iter.hasNext() ? (Token) iter.next() : null;
            }
        };
    }

    public void testOverlapAnalyzer2() throws Exception {

        String s = "Hi-Speed10 foo";

        Query query;
        Highlighter highlighter;
        String result;

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("Hi-Speed10 <B>foo</B>", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("Hi-Speed<B>10</B> foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("<B>Hi</B>-Speed10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("Hi-<B>Speed</B>10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("<B>Hi-Speed</B>10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2(), s, 3, "...");
        assertEquals("<B>Hi-Speed</B>10 foo", result);

        /////////////////// same tests, just put the bigger overlapping token first
        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("Hi-Speed10 <B>foo</B>", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("Hi-Speed<B>10</B> foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("<B>Hi</B>-Speed10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("Hi-<B>Speed</B>10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("<B>Hi-Speed</B>10 foo", result);

        query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed");
        highlighter = new Highlighter(new QueryScorer(query));
        result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
        assertEquals("<B>Hi-Speed</B>10 foo", result);
    }

    /*
     public void testBigramAnalyzer() throws IOException, ParseException
     {
     //test to ensure analyzers with non-consecutive start/end offsets
     //don't double-highlight text
     //setup index 1
     RAMDirectory ramDir = new RAMDirectory();
     Analyzer bigramAnalyzer=new CJKAnalyzer();
     IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);
     Document d = new Document();
     Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
     d.add(f);
     writer.addDocument(d);
     writer.close();
     IndexReader reader = IndexReader.open(ramDir);

     IndexSearcher searcher=new IndexSearcher(reader);
     query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
     System.out.println("Searching for: " + query.toString(FIELD_NAME));
     hits = searcher.search(query);

     Highlighter highlighter =
     new Highlighter(this,new QueryFragmentScorer(query));

     for (int i = 0; i < hits.length(); i++)
     {
     String text = hits.doc(i).get(FIELD_NAME);
     TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
     String highlightedText = highlighter.getBestFragment(tokenStream,text);
     System.out.println(highlightedText);
     }

     }
     */

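    /**
     * Formatter implementation (the test class registers itself as the
     * formatter): wraps each token group that scored in &lt;b&gt; tags and counts
     * the highlights so the tests can assert on numHighlights.
     */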
    public String highlightTerm(String originalText, TokenGroup group) {
        if (group.getTotalScore() <= 0) {
            return originalText;
        }
        numHighlights++; // update stats used in assertions
        return "<b>" + originalText + "</b>";
    }

    public void doSearching(String queryString) throws Exception {
        QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
        query = parser.parse(queryString);
        doSearching(query);
    }

    public void doSearching(Query unReWrittenQuery) throws Exception {
        searcher = new IndexSearcher(ramDir);
        // for any multi-term queries to work (prefix, wildcard, range, fuzzy etc)
        // you must use a rewritten query!
        query = unReWrittenQuery.rewrite(reader);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        hits = searcher.search(query);
    }
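
    // A minimal sketch of why the rewrite matters (hypothetical query, not part of
    // the tests): a multi-term query only carries its pattern until rewritten, so
    // the QueryScorer has no concrete terms to extract from it.
    //
    //   Query raw = new PrefixQuery(new Term(FIELD_NAME, "kenn")); // matches kenn*
    //   Query expanded = raw.rewrite(reader); // a BooleanQuery over actual index terms
    //   // new QueryScorer(expanded) can now extract "kennedy" etc. for highlighting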

    void doStandardHighlights() throws Exception {
        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(20));
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 2;
            String fragmentSeparator = "...";
            TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

            String result = highlighter.getBestFragments(tokenStream, text,
                    maxNumFragmentsRequired, fragmentSeparator);
            System.out.println("\t" + result);
        }
    }

    /*
     * @see TestCase#setUp()
     */
    protected void setUp() throws Exception {
        ramDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true);
        for (int i = 0; i < texts.length; i++) {
            addDoc(writer, texts[i]);
        }

        writer.optimize();
        writer.close();
        reader = IndexReader.open(ramDir);
        numHighlights = 0;
    }

    private void addDoc(IndexWriter writer, String text) throws IOException {
        Document d = new Document();
        Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.TOKENIZED);
        d.add(f);
        writer.addDocument(d);
    }

    /*
     * @see TestCase#tearDown()
     */
    protected void tearDown() throws Exception {
        super.tearDown();
    }

}

//===================================================================
//========== BEGIN TEST SUPPORTING CLASSES
//========== THESE LOOK LIKE THEY COULD, WITH SOME MORE EFFORT,
//========== BE MADE MORE GENERALLY USEFUL.
// TODO - make synonyms all interchangeable with each other and produce
// a version that does hyponyms - the "is a specialised type of ...."
// relationship, so that car = audi, bmw and volkswagen but bmw != audi:
// different behaviour to synonyms
//===================================================================

class SynonymAnalyzer extends Analyzer {
    private Map synonyms;

    public SynonymAnalyzer(Map synonyms) {
        this.synonyms = synonyms;
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
     */
    public TokenStream tokenStream(String arg0, Reader arg1) {
        return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
    }
}

/**
 * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
 * @author MAHarwood
 */
class SynonymTokenizer extends TokenStream {
    private TokenStream realStream;
    private Token currentRealToken = null;
    private Map synonyms;
    StringTokenizer st = null;

    public SynonymTokenizer(TokenStream realStream, Map synonyms) {
        this.realStream = realStream;
        this.synonyms = synonyms;
    }

    public Token next() throws IOException {
        if (currentRealToken == null) {
            Token nextRealToken = realStream.next();
            if (nextRealToken == null) {
                return null;
            }
            String expansions = (String) synonyms.get(nextRealToken.termText());
            if (expansions == null) {
                return nextRealToken;
            }
            st = new StringTokenizer(expansions, ",");
            if (st.hasMoreTokens()) {
                currentRealToken = nextRealToken;
            }
            // return the real token first; its expansions (if any) follow with
            // positionIncrement 0 on subsequent calls
            return nextRealToken;
        } else {
            String nextExpandedValue = st.nextToken();
            Token expandedToken = new Token(nextExpandedValue,
                    currentRealToken.startOffset(), currentRealToken.endOffset());
            expandedToken.setPositionIncrement(0);
            if (!st.hasMoreTokens()) {
                currentRealToken = null;
                st = null;
            }
            return expandedToken;
        }
    }
}
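
// Usage sketch for the two support classes (illustrative only; mirrors
// testOverlapAnalyzer above):
//
//   Map syns = new HashMap();
//   syns.put("football", "soccer,footie");
//   TokenStream ts = new SynonymAnalyzer(syns).tokenStream(null, new StringReader("football"));
//   // next() yields: football(0,8), then soccer and footie stacked at the same
//   // position (positionIncrement == 0) with the original token's offsets.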