/*
    JSPWiki - a JSP-based WikiWiki clone.

    Copyright (C) 2005 Janne Jalkanen (Janne.Jalkanen@iki.fi)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.ecyrd.jspwiki.search;

import java.io.*;
import java.util.*;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.ecyrd.jspwiki.*;
import com.ecyrd.jspwiki.attachment.Attachment;
import com.ecyrd.jspwiki.attachment.AttachmentManager;
import com.ecyrd.jspwiki.parser.MarkupParser;
import com.ecyrd.jspwiki.providers.ProviderException;
import com.ecyrd.jspwiki.providers.WikiPageProvider;
import com.ecyrd.jspwiki.util.ClassUtil;
import com.ecyrd.jspwiki.util.WatchDog;
import com.ecyrd.jspwiki.util.WikiBackgroundThread;

/**
 * Search provider that uses Apache Lucene to index wiki pages and attachments
 * and to perform full-text searches against the resulting index.
 *
 * @author Arent-Jan Banck
 * @since 2.2.21.
 */
public class LuceneSearchProvider implements SearchProvider {
    protected static final Logger log = Logger.getLogger(LuceneSearchProvider.class);

    private WikiEngine m_engine;

    // Lucene properties.

    /** Which analyzer to use.  Default is StandardAnalyzer. */
    public static final String PROP_LUCENE_ANALYZER = "jspwiki.lucene.analyzer";

    private static final String PROP_LUCENE_INDEXDELAY = "jspwiki.lucene.indexdelay";
    private static final String PROP_LUCENE_INITIALDELAY = "jspwiki.lucene.initialdelay";

    private String m_analyzerClass = "org.apache.lucene.analysis.standard.StandardAnalyzer";
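
    // Illustrative jspwiki.properties snippet for these settings.  The keys are the
    // constants above; the values shown merely repeat the compiled-in defaults (the
    // analyzer class above and LuceneUpdater's delay defaults further below), so this
    // is a sketch rather than a recommended configuration:
    //
    //   jspwiki.lucene.analyzer     = org.apache.lucene.analysis.standard.StandardAnalyzer
    //   jspwiki.lucene.initialdelay = 60
    //   jspwiki.lucene.indexdelay   = 1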

    private static final String LUCENE_DIR = "lucene";

    /**
     * Number of page updates before we optimize the index.
     */
    public static final int LUCENE_OPTIMIZE_COUNT = 10;
    protected static final String LUCENE_ID = "id";
    protected static final String LUCENE_PAGE_CONTENTS = "contents";
    protected static final String LUCENE_AUTHOR = "author";
    protected static final String LUCENE_ATTACHMENTS = "attachment";
    protected static final String LUCENE_PAGE_NAME = "name";

    private String m_luceneDirectory = null;
    private int m_updateCount = 0;
    protected Vector m_updates = new Vector(); // Vector because multi-threaded.

    /** Maximum number of fragments from search matches. */
    private static final int MAX_FRAGMENTS = 3;

    private static String c_punctuationSpaces =
            StringUtils.repeat(" ", MarkupParser.PUNCTUATION_CHARS_ALLOWED.length());

    /**
     * {@inheritDoc}
     */
    public void initialize(WikiEngine engine, Properties props)
            throws NoRequiredPropertyException, IOException {
        m_engine = engine;

        m_luceneDirectory = engine.getWorkDir() + File.separator + LUCENE_DIR;

        int initialDelay = TextUtil.getIntegerProperty(props,
                PROP_LUCENE_INITIALDELAY, LuceneUpdater.INITIAL_DELAY);
        int indexDelay = TextUtil.getIntegerProperty(props,
                PROP_LUCENE_INDEXDELAY, LuceneUpdater.INDEX_DELAY);

        m_analyzerClass = TextUtil.getStringProperty(props,
                PROP_LUCENE_ANALYZER, m_analyzerClass);

        // FIXME: Just to be simple for now, we will do full reindex
        // only if no files are in lucene directory.

        File dir = new File(m_luceneDirectory);

        log.info("Lucene enabled, cache will be in: " + dir.getAbsolutePath());

        try {
            if (!dir.exists()) {
                dir.mkdirs();
            }

            if (!dir.exists() || !dir.canWrite() || !dir.canRead()) {
                log.error("Cannot write to Lucene directory, disabling Lucene: "
                        + dir.getAbsolutePath());
                throw new IOException("Invalid Lucene directory.");
            }

            String[] filelist = dir.list();

            if (filelist == null) {
                throw new IOException("Invalid Lucene directory: cannot produce listing: "
                        + dir.getAbsolutePath());
            }
        } catch (IOException e) {
            log.error("Problem while creating Lucene index - not using Lucene.", e);
        }

        // Start the Lucene update thread, which waits first
        // for a little while before starting to go through
        // the Lucene "pages that need updating".
        LuceneUpdater updater = new LuceneUpdater(m_engine, this, initialDelay, indexDelay);
        updater.start();
    }

    /**
     * Returns the handling engine.
     *
     * @return Current WikiEngine
     */
    protected WikiEngine getEngine() {
        return m_engine;
    }

    /**
     * Performs a full Lucene reindex, if necessary.
     *
     * @throws IOException If there's a problem during indexing
     */
    protected void doFullLuceneReindex() throws IOException {
        File dir = new File(m_luceneDirectory);

        String[] filelist = dir.list();

        if (filelist == null) {
            throw new IOException("Invalid Lucene directory: cannot produce listing: "
                    + dir.getAbsolutePath());
        }

        try {
            if (filelist.length == 0) {
                //
                //  No files? Reindex!
                //
                Date start = new Date();
                IndexWriter writer = null;

                log.info("Starting Lucene reindexing, this can take a couple of minutes...");

                //
                //  Do lock recovery, in case JSPWiki was shut down forcibly.
                //
                Directory luceneDir = FSDirectory.getDirectory(dir, false);

                if (IndexReader.isLocked(luceneDir)) {
                    log.info("JSPWiki was shut down while Lucene was indexing - unlocking now.");
                    IndexReader.unlock(luceneDir);
                }

                try {
                    writer = new IndexWriter(m_luceneDirectory, getLuceneAnalyzer(), true);
                    Collection allPages = m_engine.getPageManager().getAllPages();

                    for (Iterator iterator = allPages.iterator(); iterator.hasNext();) {
                        WikiPage page = (WikiPage) iterator.next();
                        String text = m_engine.getPageManager().getPageText(page.getName(),
                                WikiProvider.LATEST_VERSION);
                        luceneIndexPage(page, text, writer);
                    }

                    Collection allAttachments = m_engine.getAttachmentManager().getAllAttachments();
                    for (Iterator iterator = allAttachments.iterator(); iterator.hasNext();) {
                        Attachment att = (Attachment) iterator.next();
                        String text = getAttachmentContent(att.getName(), WikiProvider.LATEST_VERSION);
                        luceneIndexPage(att, text, writer);
                    }

                    writer.optimize();
                } finally {
                    try {
                        if (writer != null)
                            writer.close();
                    } catch (IOException e) {
                        // Nothing we can do if closing the writer fails; ignore.
                    }
                }

                Date end = new Date();
                log.info("Full Lucene index finished in "
                        + (end.getTime() - start.getTime()) + " milliseconds.");
            } else {
                log.info("Files found in Lucene directory, not reindexing.");
            }
        } catch (NoClassDefFoundError e) {
            log.info("Lucene libraries do not exist - not using Lucene.");
        } catch (IOException e) {
            log.error("Problem while creating Lucene index - not using Lucene.", e);
        } catch (ProviderException e) {
            log.error("Problem reading pages while creating Lucene index (JSPWiki won't start).", e);
            throw new IllegalArgumentException("unable to create Lucene index");
        } catch (ClassNotFoundException e) {
            log.error("Illegal Analyzer specified:", e);
        } catch (Exception e) {
            log.error("Unable to start Lucene", e);
        }
    }

    /**
     * Fetches the attachment content from the repository.
     * Content is flat text that can be used for indexing/searching or display.
     *
     * @param attachmentName Name of the attachment.
     * @param version Version of the attachment to fetch.
     * @return the content of the attachment as a String, or null if it could not be loaded
     */
    protected String getAttachmentContent(String attachmentName, int version) {
        AttachmentManager mgr = m_engine.getAttachmentManager();

        try {
            Attachment att = mgr.getAttachmentInfo(attachmentName, version);
            // FIXME: Find out why sometimes att is null
            if (att != null) {
                return getAttachmentContent(att);
            }
        } catch (ProviderException e) {
            log.error("Attachment cannot be loaded", e);
        }
        // Something was wrong; no result is returned.
        return null;
    }

    /**
     * @param att Attachment to get content for.  The filename extension is used to
     *            determine the type of the attachment.
     * @return String representing the content of the file, or null if the attachment
     *         is not a supported text type or cannot be read.
     * FIXME This is a very simple implementation for some text-based attachments, mainly
     *       used for testing.  It should be replaced by/moved to attachment search providers
     *       or some other pluggable way to search attachments.
     */
    protected String getAttachmentContent(Attachment att) {
        AttachmentManager mgr = m_engine.getAttachmentManager();
        // FIXME: Add attachment plugin structure

        String filename = att.getFileName();

        if (filename.endsWith(".txt") || filename.endsWith(".xml")
                || filename.endsWith(".ini") || filename.endsWith(".html")) {
            InputStream attStream;

            try {
                attStream = mgr.getAttachmentStream(att);

                StringWriter sout = new StringWriter();
                FileUtil.copyContents(new InputStreamReader(attStream), sout);

                attStream.close();
                sout.close();

                return sout.toString();
            } catch (ProviderException e) {
                log.error("Attachment cannot be loaded", e);
                return null;
            } catch (IOException e) {
                log.error("Attachment cannot be loaded", e);
                return null;
            }
        }

        return null;
    }

    /**
     * Updates the Lucene index for a single page.
     *
     * @param page The WikiPage to update.
     * @param text The page text to index.
     */
    protected synchronized void updateLuceneIndex(WikiPage page, String text) {
        IndexWriter writer = null;

        log.debug("Updating Lucene index for page '" + page.getName() + "'...");

        try {
            pageRemoved(page);

            // Now add back the new version.
            writer = new IndexWriter(m_luceneDirectory, getLuceneAnalyzer(), false);
            luceneIndexPage(page, text, writer);
            m_updateCount++;
            if (m_updateCount >= LUCENE_OPTIMIZE_COUNT) {
                writer.optimize();
                m_updateCount = 0;
            }
        } catch (IOException e) {
            log.error("Unable to update page '" + page.getName() + "' in Lucene index", e);
        } catch (Exception e) {
            log.error("Unexpected Lucene exception - please check configuration!", e);
        } finally {
            try {
                if (writer != null)
                    writer.close();
            } catch (IOException e) {
                // Nothing we can do if closing the writer fails; ignore.
            }
        }

        log.debug("Done updating Lucene index for page '" + page.getName() + "'.");
    }

    /**
     * Instantiates the Analyzer class configured via {@link #PROP_LUCENE_ANALYZER}.
     */
    private Analyzer getLuceneAnalyzer() throws ClassNotFoundException,
            InstantiationException, IllegalAccessException {
        Class clazz = ClassUtil.findClass("", m_analyzerClass);
        Analyzer analyzer = (Analyzer) clazz.newInstance();
        return analyzer;
    }

    /**
     * Indexes a page using the given IndexWriter.
     *
     * @param page WikiPage to index
     * @param text Page text to index
     * @param writer The Lucene IndexWriter to use for indexing
     * @return the created index Document
     * @throws IOException If there's an indexing problem
     */
    protected Document luceneIndexPage(WikiPage page, String text, IndexWriter writer)
            throws IOException {
        // Make a new, empty document.
        Document doc = new Document();

        if (text == null)
            return doc;

        // Raw name is the keyword we'll use to refer to this document for updates.
        Field field = new Field(LUCENE_ID, page.getName(),
                Field.Store.YES, Field.Index.UN_TOKENIZED);
        doc.add(field);

        // Body text.  It is stored in the doc for search contexts.
        field = new Field(LUCENE_PAGE_CONTENTS, text, Field.Store.YES,
                Field.Index.TOKENIZED, Field.TermVector.NO);
        doc.add(field);

        // Allow searching by page name, both beautified and raw.
        String unTokenizedTitle = StringUtils.replaceChars(page.getName(),
                MarkupParser.PUNCTUATION_CHARS_ALLOWED, c_punctuationSpaces);

        field = new Field(LUCENE_PAGE_NAME,
                TextUtil.beautifyString(page.getName()) + " " + unTokenizedTitle,
                Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
        doc.add(field);

        // Allow searching by author name.
        if (page.getAuthor() != null) {
            field = new Field(LUCENE_AUTHOR, page.getAuthor(),
                    Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
            doc.add(field);
        }

        // Now add the names of the attachments of this page.
        try {
            Collection attachments = m_engine.getAttachmentManager().listAttachments(page);
            String attachmentNames = "";

            for (Iterator it = attachments.iterator(); it.hasNext();) {
                Attachment att = (Attachment) it.next();
                attachmentNames += att.getName() + ";";
            }
            field = new Field(LUCENE_ATTACHMENTS, attachmentNames,
                    Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
            doc.add(field);

        } catch (ProviderException e) {
            // Unable to read attachments.
            log.error("Failed to get attachments for page", e);
        }
        writer.addDocument(doc);

        return doc;
    }

    /**
     * {@inheritDoc}
     */
    public void pageRemoved(WikiPage page) {
        try {
            // Must first remove existing version of page.
            IndexReader reader = IndexReader.open(m_luceneDirectory);
            reader.deleteDocuments(new Term(LUCENE_ID, page.getName()));
            reader.close();
        } catch (IOException e) {
            log.error("Unable to remove page '" + page.getName() + "' from Lucene index", e);
        }
    }

    /**
     * Adds a page-text pair to the Lucene update queue.  Always safe to call.
     *
     * @param page WikiPage to add to the update queue.
     */
    public void reindexPage(WikiPage page) {
        if (page != null) {
            String text;

            // TODO: Consider whether this would be better done in the updater thread itself.

            if (page instanceof Attachment) {
                text = getAttachmentContent((Attachment) page);
            } else {
                text = m_engine.getPureText(page);
            }

            if (text != null) {
                // Add work item to m_updates queue.
                Object[] pair = new Object[2];
                pair[0] = page;
                pair[1] = text;
                m_updates.add(pair);
                log.debug("Scheduling page " + page.getName() + " for index update");
            }
        }
    }

    /**
     * {@inheritDoc}
     */
    public Collection findPages(String query) throws ProviderException {
        return findPages(query, FLAG_CONTEXTS);
    }

    /**
     * Create contexts also.  Generating contexts can be expensive,
     * so they're not on by default.
     */
    public static final int FLAG_CONTEXTS = 0x01;

    /**
     * Searches pages using a particular combination of flags.
     *
     * @param query The query to perform, in Lucene query language
     * @param flags A set of flags
     * @return A Collection of SearchResult instances
     * @throws ProviderException if there is a problem with the backend
     */
    public Collection findPages(String query, int flags) throws ProviderException {
        Searcher searcher = null;
        ArrayList list = null;
        Highlighter highlighter = null;

        try {
            String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME,
                    LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
            QueryParser qp = new MultiFieldQueryParser(queryfields, getLuceneAnalyzer());

            //QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
            Query luceneQuery = qp.parse(query);

            if ((flags & FLAG_CONTEXTS) != 0) {
                highlighter = new Highlighter(
                        new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                        new SimpleHTMLEncoder(),
                        new QueryScorer(luceneQuery));
            }

            try {
                searcher = new IndexSearcher(m_luceneDirectory);
            } catch (Exception ex) {
                log.info("Lucene not yet ready; indexing not started", ex);
                return null;
            }

            Hits hits = searcher.search(luceneQuery);

            list = new ArrayList(hits.length());
            for (int curr = 0; curr < hits.length(); curr++) {
                Document doc = hits.doc(curr);
                String pageName = doc.get(LUCENE_ID);
                WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

                if (page != null) {
                    if (page instanceof Attachment) {
                        // Currently attachments don't look nice on the search-results page.
                        // When the search results are cleaned up, this can be enabled again.
                    }

                    int score = (int) (hits.score(curr) * 100);

                    // Get highlighted search contexts.
                    String text = doc.get(LUCENE_PAGE_CONTENTS);

                    String[] fragments = new String[0];
                    if (text != null && highlighter != null) {
                        TokenStream tokenStream = getLuceneAnalyzer()
                                .tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
                        fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
                    }

                    SearchResult result = new SearchResultImpl(page, score, fragments);
                    list.add(result);
                } else {
                    log.error("Lucene found a result page '" + pageName
                            + "' that could not be loaded, removing from Lucene cache");
                    pageRemoved(new WikiPage(m_engine, pageName));
                }
            }
        } catch (IOException e) {
            log.error("Failed during Lucene search", e);
        } catch (InstantiationException e) {
            log.error("Unable to get a Lucene analyzer", e);
        } catch (IllegalAccessException e) {
            log.error("Unable to get a Lucene analyzer", e);
        } catch (ClassNotFoundException e) {
            log.error("Specified Lucene analyzer does not exist", e);
        } catch (ParseException e) {
            log.info("Broken query; cannot parse", e);

            throw new ProviderException("You have entered a query Lucene cannot process: "
                    + e.getMessage());
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    // Nothing we can do if closing the searcher fails; ignore.
                }
            }
        }

        return list;
    }
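
    // Illustrative use of findPages(); the query string and variable names below are
    // hypothetical, and only the SearchResult accessors and FLAG_CONTEXTS come from
    // this class.  Note that null is returned while the index is not yet ready:
    //
    //   Collection results = provider.findPages("wiki AND search", FLAG_CONTEXTS);
    //   for (Iterator i = results.iterator(); i.hasNext();) {
    //       SearchResult r = (SearchResult) i.next();
    //       System.out.println(r.getPage().getName() + " (" + r.getScore() + ")");
    //   }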

    /**
     * {@inheritDoc}
     */
    public String getProviderInfo() {
        return "LuceneSearchProvider";
    }

    /**
     * Updater thread that updates Lucene indexes.
     */
    private static final class LuceneUpdater extends WikiBackgroundThread {
        protected static final int INDEX_DELAY = 1;
        protected static final int INITIAL_DELAY = 60;
        private final LuceneSearchProvider m_provider;

        private int m_initialDelay;

        private WatchDog m_watchdog;

        private LuceneUpdater(WikiEngine engine, LuceneSearchProvider provider,
                int initialDelay, int indexDelay) {
            super(engine, indexDelay);
            m_provider = provider;
            m_initialDelay = initialDelay;
            setName("JSPWiki Lucene Indexer");
        }

        public void startupTask() throws Exception {
            m_watchdog = getEngine().getCurrentWatchDog();

            // Sleep initially...
            try {
                Thread.sleep(m_initialDelay * 1000L);
            } catch (InterruptedException e) {
                throw new InternalWikiException("Interrupted while waiting to start.");
            }

            m_watchdog.enterState("Full reindex");
            // Reindex everything.
            m_provider.doFullLuceneReindex();
            m_watchdog.exitState();
        }

        public void backgroundTask() throws Exception {
            m_watchdog.enterState("Emptying index queue", 60);

            synchronized (m_provider.m_updates) {
                while (m_provider.m_updates.size() > 0) {
                    Object[] pair = (Object[]) m_provider.m_updates.remove(0);
                    WikiPage page = (WikiPage) pair[0];
                    String text = (String) pair[1];
                    m_provider.updateLuceneIndex(page, text);
                }
            }

            m_watchdog.exitState();
        }

    }

    // FIXME: This class is dumb; needs to have a better implementation.
    private static class SearchResultImpl implements SearchResult {
        private WikiPage m_page;
        private int m_score;
        private String[] m_contexts;

        public SearchResultImpl(WikiPage page, int score, String[] contexts) {
            m_page = page;
            m_score = score;
            m_contexts = contexts;
        }

        public WikiPage getPage() {
            return m_page;
        }

        /* (non-Javadoc)
         * @see com.ecyrd.jspwiki.SearchResult#getScore()
         */
        public int getScore() {
            return m_score;
        }

        public String[] getContexts() {
            return m_contexts;
        }
    }
}