001: /**
002: * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
003: *
004: * This program is free software; you can redistribute it and/or modify
005: * it under the terms of the latest version of the GNU Lesser General
006: * Public License as published by the Free Software Foundation;
007: *
008: * This program is distributed in the hope that it will be useful,
009: * but WITHOUT ANY WARRANTY; without even the implied warranty of
010: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
011: * GNU Lesser General Public License for more details.
012: *
013: * You should have received a copy of the GNU Lesser General Public License
014: * along with this program (LICENSE.txt); if not, write to the Free Software
015: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package org.jamwiki.search;
017:
018: import java.io.File;
019: import java.io.StringReader;
020: import java.util.Collection;
021: import java.util.Iterator;
022: import java.util.Vector;
023: import org.apache.commons.lang.StringEscapeUtils;
024: import org.apache.commons.lang.StringUtils;
025: import org.apache.lucene.analysis.TokenStream;
026: import org.apache.lucene.analysis.KeywordAnalyzer;
027: import org.apache.lucene.analysis.standard.StandardAnalyzer;
028: import org.apache.lucene.document.Document;
029: import org.apache.lucene.document.Field;
030: import org.apache.lucene.index.IndexWriter;
031: import org.apache.lucene.index.Term;
032: import org.apache.lucene.queryParser.QueryParser;
033: import org.apache.lucene.search.BooleanQuery;
034: import org.apache.lucene.search.Hits;
035: import org.apache.lucene.search.IndexSearcher;
036: import org.apache.lucene.search.PhraseQuery;
037: import org.apache.lucene.search.Query;
038: import org.apache.lucene.search.BooleanClause.Occur;
039: import org.apache.lucene.search.highlight.Highlighter;
040: import org.apache.lucene.search.highlight.QueryScorer;
041: import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
042: import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
043: import org.apache.lucene.store.FSDirectory;
044: import org.jamwiki.Environment;
045: import org.jamwiki.SearchEngine;
046: import org.jamwiki.WikiBase;
047: import org.jamwiki.model.Topic;
048: import org.jamwiki.model.VirtualWiki;
049: import org.jamwiki.parser.ParserOutput;
050: import org.jamwiki.parser.ParserUtil;
051: import org.jamwiki.utils.WikiLogger;
052:
053: /**
054: * An implementation of {@link org.jamwiki.search.SearchEngine} that uses
055: * <a href="http://lucene.apache.org/java/">Lucene</a> to perform searches of
056: * Wiki content.
057: */
058: public class LuceneSearchEngine implements SearchEngine {
059:
060: /** Where to log to */
061: private static final WikiLogger logger = WikiLogger
062: .getLogger(LuceneSearchEngine.class.getName());
063: /** Directory for search index files */
064: private static final String SEARCH_DIR = "search";
065: /** Id stored with documents to indicate the searchable topic name */
066: private static final String ITYPE_TOPIC = "topic";
067: /** Id stored with documents to indicate the searchable content. */
068: private static final String ITYPE_CONTENT = "content";
069: /** Id stored with documents to indicate the raw Wiki markup */
070: private static final String ITYPE_CONTENT_PLAIN = "content_plain";
071: /** Id stored with documents to indicate the topic name. */
072: private static final String ITYPE_TOPIC_PLAIN = "topic_plain";
073: /** Id stored with the document to indicate the search names of topics linked from the page. */
074: private static final String ITYPE_TOPIC_LINK = "topic_link";
075:
076: /**
077: * Add a topic to the search index.
078: *
079: * @param topic The Topic object that is to be added to the index.
080: * @param links A collection containing the topic names for all topics that link
081: * to the current topic.
082: */
083: public synchronized void addToIndex(Topic topic, Collection links) {
084: String virtualWiki = topic.getVirtualWiki();
085: String topicName = topic.getName();
086: IndexWriter writer = null;
087: try {
088: FSDirectory directory = FSDirectory
089: .getDirectory(getSearchIndexPath(virtualWiki));
090: // FIXME - move synchronization to the writer instance for this directory
091: try {
092: writer = new IndexWriter(directory,
093: new StandardAnalyzer(), false);
094: KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
095: writer.optimize();
096: Document standardDocument = createStandardDocument(topic);
097: writer.addDocument(standardDocument);
098: Document keywordDocument = createKeywordDocument(topic,
099: links);
100: writer.addDocument(keywordDocument, keywordAnalyzer);
101: } finally {
102: try {
103: if (writer != null) {
104: writer.optimize();
105: }
106: } catch (Exception e) {
107: }
108: try {
109: if (writer != null) {
110: writer.close();
111: }
112: } catch (Exception e) {
113: }
114: }
115: directory.close();
116: } catch (Exception e) {
117: logger.severe("Exception while adding topic " + topicName,
118: e);
119: }
120: }
121:
122: /**
123: * Create a basic Lucene document to add to the index that does treats
124: * the topic content as a single keyword and does not tokenize it.
125: */
126: private Document createKeywordDocument(Topic topic, Collection links)
127: throws Exception {
128: String topicContent = topic.getTopicContent();
129: if (topicContent == null) {
130: topicContent = "";
131: }
132: Document doc = new Document();
133: // store topic name for later retrieval
134: doc.add(new Field(ITYPE_TOPIC_PLAIN, topic.getName(),
135: Field.Store.YES, Field.Index.UN_TOKENIZED));
136: if (links == null) {
137: links = new Vector();
138: }
139: // index topic links for search purposes
140: for (Iterator iter = links.iterator(); iter.hasNext();) {
141: String linkTopic = (String) iter.next();
142: doc.add(new Field(ITYPE_TOPIC_LINK, linkTopic,
143: Field.Store.NO, Field.Index.UN_TOKENIZED));
144: }
145: return doc;
146: }
147:
148: /**
149: * Create a basic Lucene document to add to the index. This document
150: * is suitable to be parsed with the StandardAnalyzer.
151: */
152: private Document createStandardDocument(Topic topic)
153: throws Exception {
154: String topicContent = topic.getTopicContent();
155: if (topicContent == null) {
156: topicContent = "";
157: }
158: Document doc = new Document();
159: // store topic name and content for later retrieval
160: doc.add(new Field(ITYPE_TOPIC_PLAIN, topic.getName(),
161: Field.Store.YES, Field.Index.UN_TOKENIZED));
162: doc.add(new Field(ITYPE_CONTENT_PLAIN, topicContent,
163: Field.Store.YES, Field.Index.NO));
164: // index topic name and content for search purposes
165: doc.add(new Field(ITYPE_TOPIC,
166: new StringReader(topic.getName())));
167: doc
168: .add(new Field(ITYPE_CONTENT, new StringReader(
169: topicContent)));
170: return doc;
171: }
172:
173: /**
174: * Remove a topic from the search index.
175: *
176: * @param topic The topic object that is to be removed from the index.
177: */
178: public synchronized void deleteFromIndex(Topic topic) {
179: String virtualWiki = topic.getVirtualWiki();
180: String topicName = topic.getName();
181: IndexWriter writer = null;
182: try {
183: FSDirectory directory = FSDirectory
184: .getDirectory(getSearchIndexPath(virtualWiki));
185: // delete the current document
186: // FIXME - move synchronization to the writer instance for this directory
187: try {
188: writer = new IndexWriter(directory,
189: new StandardAnalyzer(), false);
190: writer.deleteDocuments(new Term(ITYPE_TOPIC_PLAIN,
191: topicName));
192: } finally {
193: if (writer != null) {
194: try {
195: writer.close();
196: } catch (Exception e) {
197: }
198: }
199: }
200: directory.close();
201: } catch (Exception e) {
202: logger.severe("Exception while adding topic " + topicName,
203: e);
204: }
205: }
206:
207: /**
208: * Find all documents that link to a specified topic.
209: *
210: * @param virtualWiki The virtual wiki for the topic.
211: * @param topicName The name of the topic.
212: * @return A collection of SearchResultEntry objects for all documents that
213: * link to the topic.
214: */
215: public Collection findLinkedTo(String virtualWiki, String topicName) {
216: Collection results = new Vector();
217: IndexSearcher searcher = null;
218: try {
219: PhraseQuery query = new PhraseQuery();
220: Term term = new Term(ITYPE_TOPIC_LINK, topicName);
221: query.add(term);
222: searcher = new IndexSearcher(FSDirectory
223: .getDirectory(getSearchIndexPath(virtualWiki)));
224: // actually perform the search
225: Hits hits = searcher.search(query);
226: for (int i = 0; i < hits.length(); i++) {
227: SearchResultEntry result = new SearchResultEntry();
228: result.setRanking(hits.score(i));
229: result.setTopic(hits.doc(i).get(ITYPE_TOPIC_PLAIN));
230: results.add(result);
231: }
232: } catch (Exception e) {
233: logger.severe("Exception while searching for " + topicName,
234: e);
235: } finally {
236: if (searcher != null) {
237: try {
238: searcher.close();
239: } catch (Exception e) {
240: }
241: }
242: }
243: return results;
244: }
245:
246: /**
247: * Find all documents that contain a specific search term, ordered by relevance.
248: * This method supports all Lucene search query syntax.
249: *
250: * @param virtualWiki The virtual wiki for the topic.
251: * @param text The search term being searched for.
252: * @return A collection of SearchResultEntry objects for all documents that
253: * contain the search term.
254: */
255: public Collection findResults(String virtualWiki, String text) {
256: StandardAnalyzer analyzer = new StandardAnalyzer();
257: Collection results = new Vector();
258: logger.fine("search text: " + text);
259: IndexSearcher searcher = null;
260: try {
261: BooleanQuery query = new BooleanQuery();
262: QueryParser qp;
263: qp = new QueryParser(ITYPE_TOPIC, analyzer);
264: query.add(qp.parse(text), Occur.SHOULD);
265: qp = new QueryParser(ITYPE_CONTENT, analyzer);
266: query.add(qp.parse(text), Occur.SHOULD);
267: searcher = new IndexSearcher(FSDirectory
268: .getDirectory(getSearchIndexPath(virtualWiki)));
269: // rewrite the query to expand it - required for wildcards to work with highlighter
270: Query rewrittenQuery = searcher.rewrite(query);
271: // actually perform the search
272: Hits hits = searcher.search(rewrittenQuery);
273: Highlighter highlighter = new Highlighter(
274: new SimpleHTMLFormatter(
275: "<span class=\"highlight\">", "</span>"),
276: new SimpleHTMLEncoder(), new QueryScorer(
277: rewrittenQuery));
278: for (int i = 0; i < hits.length(); i++) {
279: String summary = retrieveResultSummary(hits.doc(i),
280: highlighter, analyzer);
281: SearchResultEntry result = new SearchResultEntry();
282: result.setRanking(hits.score(i));
283: result.setTopic(hits.doc(i).get(ITYPE_TOPIC_PLAIN));
284: result.setSummary(summary);
285: results.add(result);
286: }
287: } catch (Exception e) {
288: logger.severe("Exception while searching for " + text, e);
289: } finally {
290: if (searcher != null) {
291: try {
292: searcher.close();
293: } catch (Exception e) {
294: }
295: }
296: }
297: return results;
298: }
299:
300: /**
301: * Get the path, which holds all index files
302: */
303: private String getSearchIndexPath(String virtualWiki) {
304: File parent = new File(Environment
305: .getValue(Environment.PROP_BASE_FILE_DIR), SEARCH_DIR);
306: try {
307: if (System.getProperty("org.apache.lucene.lockdir") == null) {
308: // set the Lucene lock directory. this defaults to java.io.tmpdir,
309: // which may not be writable on some systems.
310: System.setProperty("org.apache.lucene.lockdir", parent
311: .getPath());
312: }
313: } catch (Exception e) {
314: // probably a security exception
315: logger
316: .warning("Unable to specify Lucene lock directory, default will be used: "
317: + e.getMessage());
318: }
319: File child = new File(parent.getPath(), "index" + virtualWiki
320: + File.separator);
321: if (!child.exists()) {
322: child.mkdirs();
323: IndexWriter writer = null;
324: try {
325: // create the search instance
326: FSDirectory directory = FSDirectory
327: .getDirectory(getSearchIndexPath(virtualWiki));
328: writer = new IndexWriter(directory,
329: new StandardAnalyzer(), true);
330: directory.close();
331: } catch (Exception e) {
332: logger.severe("Unable to create search instance "
333: + child.getPath(), e);
334: } finally {
335: try {
336: if (writer != null) {
337: writer.close();
338: }
339: } catch (Exception e) {
340: logger.severe("Exception during close", e);
341: }
342: }
343: }
344: return child.getPath();
345: }
346:
347: /**
348: * Refresh the current search index by re-visiting all topic pages.
349: *
350: * @throws Exception Thrown if any error occurs while re-indexing the Wiki.
351: */
352: public synchronized void refreshIndex() throws Exception {
353: Collection allWikis = WikiBase.getDataHandler()
354: .getVirtualWikiList(null);
355: Topic topic;
356: for (Iterator iterator = allWikis.iterator(); iterator
357: .hasNext();) {
358: long start = System.currentTimeMillis();
359: int count = 0;
360: VirtualWiki virtualWiki = (VirtualWiki) iterator.next();
361: FSDirectory directory = FSDirectory.getDirectory(this
362: .getSearchIndexPath(virtualWiki.getName()));
363: KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
364: IndexWriter writer = null;
365: // FIXME - move synchronization to the writer instance for this directory
366: try {
367: writer = new IndexWriter(directory,
368: new StandardAnalyzer(), true);
369: Collection topicNames = WikiBase.getDataHandler()
370: .getAllTopicNames(virtualWiki.getName());
371: for (Iterator iter = topicNames.iterator(); iter
372: .hasNext();) {
373: String topicName = (String) iter.next();
374: topic = WikiBase.getDataHandler().lookupTopic(
375: virtualWiki.getName(), topicName, false,
376: null);
377: Document standardDocument = createStandardDocument(topic);
378: writer.addDocument(standardDocument);
379: // FIXME - parsing all documents will be intolerably slow with even a
380: // moderately large Wiki
381: ParserOutput parserOutput = ParserUtil
382: .parserOutput(topic.getTopicContent(),
383: virtualWiki.getName(), topicName);
384: Document keywordDocument = createKeywordDocument(
385: topic, parserOutput.getLinks());
386: writer
387: .addDocument(keywordDocument,
388: keywordAnalyzer);
389: count++;
390: }
391: } catch (Exception ex) {
392: logger.severe("Failure while refreshing search index",
393: ex);
394: } finally {
395: try {
396: if (writer != null) {
397: writer.optimize();
398: }
399: } catch (Exception e) {
400: logger.severe("Exception during optimize", e);
401: }
402: try {
403: if (writer != null) {
404: writer.close();
405: }
406: } catch (Exception e) {
407: logger.severe("Exception during close", e);
408: }
409: }
410: directory.close();
411: logger.info("Rebuilt search index for "
412: + virtualWiki.getName() + " (" + count
413: + " documents) in "
414: + ((System.currentTimeMillis() - start) / 1000.000)
415: + " seconds");
416: }
417: }
418:
419: /**
420: *
421: */
422: private String retrieveResultSummary(Document document,
423: Highlighter highlighter, StandardAnalyzer analyzer)
424: throws Exception {
425: String content = document.get(ITYPE_CONTENT_PLAIN);
426: TokenStream tokenStream = analyzer.tokenStream(
427: ITYPE_CONTENT_PLAIN, new StringReader(content));
428: String summary = highlighter.getBestFragments(tokenStream,
429: content, 3, "...");
430: if (StringUtils.isBlank(summary)
431: && !StringUtils.isBlank(content)) {
432: summary = StringEscapeUtils.escapeHtml(content.substring(0,
433: Math.min(200, content.length())));
434: if (Math.min(200, content.length()) == 200) {
435: summary += "...";
436: }
437: }
438: return summary;
439: }
440: }
|