001: /**
002: * LibreSource
003: * Copyright (C) 2004-2008 Artenum SARL / INRIA
004: * http://www.libresource.org - contact@artenum.com
005: *
006: * This file is part of the LibreSource software,
007: * which can be used and distributed under license conditions.
008: * The license conditions are provided in the LICENSE.TXT file
009: * at the root path of the packaging that enclose this file.
010: * More information can be found at
011: * - http://dev.libresource.org/home/license
012: *
013: * Initial authors :
014: *
015: * Guillaume Bort / INRIA
016: * Francois Charoy / Universite Nancy 2
017: * Julien Forest / Artenum
018: * Claude Godart / Universite Henry Poincare
019: * Florent Jouille / INRIA
020: * Sebastien Jourdain / INRIA / Artenum
021: * Yves Lerumeur / Artenum
022: * Pascal Molli / Universite Henry Poincare
023: * Gerald Oster / INRIA
024: * Mariarosa Penzi / Artenum
025: * Gerard Sookahet / Artenum
026: * Raphael Tani / INRIA
027: *
028: * Contributors :
029: *
030: * Stephane Bagnier / Artenum
031: * Amadou Dia / Artenum-IUP Blois
032: * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
033: */package org.libresource.search;
034:
035: import org.apache.lucene.analysis.Analyzer;
036: import org.apache.lucene.index.IndexReader;
037: import org.apache.lucene.index.IndexWriter;
038: import org.apache.lucene.index.Term;
039: import org.apache.lucene.queryParser.QueryParser;
040: import org.apache.lucene.search.Hits;
041: import org.apache.lucene.search.IndexSearcher;
042: import org.apache.lucene.search.Query;
043: import org.apache.lucene.search.Searcher;
044: import org.apache.lucene.search.highlight.QueryHighlightExtractor;
045: import org.apache.lucene.store.Directory;
046: import org.apache.lucene.store.FSDirectory;
047:
048: import org.libresource.Libresource;
049: import org.libresource.LibresourceException;
050:
051: import org.libresource.kernel.KernelConstants;
052: import org.libresource.kernel.interfaces.KernelService;
053:
054: import java.io.File;
055: import java.io.IOException;
056:
057: import java.net.URI;
058:
059: /**
060: * LibreSource
061: *
062: * @author <a href="mailto:bort@loria.fr">Guillaume Bort </a>- <a
063: * href="http://www.inria.fr">INRIA Lorraine </a>
064: */
065: public class LibresourceSearch {
066: private static LibresourceSearch instance;
067: private Analyzer analyzer;
068: private Directory indexDirectory;
069: private IndexWriter writer;
070:
071: // highlighter config
072: private int highlightFragmentSizeInBytes = 80;
073: private int maxNumFragmentsRequired = 4;
074: private String fragmentSeparator = "...";
075:
076: private LibresourceSearch() throws Exception {
077: KernelService kernelService = (KernelService) Libresource
078: .getService(KernelConstants.SERVICE);
079: String indexName = kernelService.getIndexationDataDir();
080: analyzer = new LibresourceAnalyzer();
081:
082: File indexDir = new File(indexName);
083:
084: if (indexDir.exists() && indexDir.isDirectory()) {
085: indexDirectory = FSDirectory.getDirectory(indexName);
086: } else {
087: indexDir.mkdirs();
088: indexDirectory = FSDirectory.getDirectory(indexName);
089:
090: // Build the index
091: writer = new IndexWriter(indexDirectory, analyzer, true);
092: writer.close();
093: }
094: }
095:
096: public static LibresourceSearch getInstance()
097: throws LibresourceException {
098: try {
099: if (instance == null) {
100: instance = new LibresourceSearch();
101: }
102:
103: return instance;
104: } catch (Exception e) {
105: throw new LibresourceException(e);
106: }
107: }
108:
109: public void index(LibresourceResourceDocument document)
110: throws LibresourceException {
111: try {
112: synchronized (this ) {
113: // System.out.println("Indexing " + document);
114: IndexWriter writer = null;
115:
116: try {
117: writer = new IndexWriter(indexDirectory, analyzer,
118: false);
119: writer.addDocument(document.getDocument());
120: writer.optimize();
121: } finally {
122: writer.close();
123: }
124: }
125: } catch (IOException e) {
126: throw new LibresourceException("Can't index a document", e);
127: }
128: }
129:
130: public void remove(URI uri) throws LibresourceException {
131: try {
132: synchronized (this ) {
133: // System.out.println("Removing Document " +
134: // normalizeURIForIndexation(uri));
135: Term term = new Term("URI",
136: normalizeURIForIndexation(uri));
137: IndexReader reader = IndexReader.open(indexDirectory);
138: reader.deleteDocuments(term);
139: reader.close();
140: }
141: } catch (IOException e) {
142: throw new LibresourceException("Can't remove a document", e);
143: }
144: }
145:
146: public void reindex(LibresourceResourceDocument document)
147: throws LibresourceException {
148: remove(document.getUri());
149: index(document);
150: }
151:
152: public LibresourceSearchResult[] search(String uriPattern,
153: String service, String resource, String queryString)
154: throws LibresourceException {
155: try {
156: // BooleanQuery.setMaxClauseCount(1300);
157: queryString = StringHelper.clarify(queryString);
158: queryString = "(" + queryString + ")";
159:
160: QueryParser queryParser = new QueryParser("CONTENT",
161: analyzer);
162: queryParser.parse(queryString);
163:
164: Query query = queryParser.parse(queryString);
165:
166: Searcher searcher = new IndexSearcher(indexDirectory);
167: IndexReader reader = IndexReader.open(indexDirectory);
168: Hits hits = null;
169:
170: LibresourceQueryFilter uriFilter = new LibresourceQueryFilter(
171: normalizeURIForIndexation(new URI(uriPattern)),
172: resource, service);
173: hits = searcher.search(query, uriFilter);
174:
175: LibresourceSearchResult[] libresourceSearchResults = new LibresourceSearchResult[hits
176: .length()];
177: QueryHighlightExtractor highlighter = new QueryHighlightExtractor(
178: query.rewrite(reader), analyzer, "<highlighted>",
179: "</highlighted>");
180: KernelService kernelService = (KernelService) Libresource
181: .getService(KernelConstants.SERVICE);
182:
183: for (int i = 0; i < hits.length(); i++) {
184: URI uri = new URI(hits.doc(i).getField("URI")
185: .stringValue());
186: float score = hits.score(i);
187: String content = hits.doc(i).get("CONTENT");
188: String highlightedText = highlighter.getBestFragments(
189: content, highlightFragmentSizeInBytes,
190: maxNumFragmentsRequired, fragmentSeparator);
191: String name = hits.doc(i).get("NAME");
192: String type = hits.doc(i).get("SERVICE") + "/"
193: + hits.doc(i).get("TYPE");
194: libresourceSearchResults[i] = new LibresourceSearchResult(
195: kernelService.normalizeURI(uri), score, name,
196: highlightedText, type);
197: }
198:
199: reader.close();
200: searcher.close();
201:
202: return libresourceSearchResults;
203: } catch (Exception e) {
204: throw new LibresourceException("Can't search "
205: + queryString, e);
206: }
207: }
208:
209: public String highlight(String text, String queryString)
210: throws LibresourceException {
211: try {
212: queryString = StringHelper.clarify(queryString);
213:
214: QueryParser queryParser = new QueryParser("CONTENT",
215: analyzer);
216: queryParser.parse(queryString);
217:
218: Query query = queryParser.parse(queryString);
219:
220: IndexReader reader = IndexReader.open(indexDirectory);
221: QueryHighlightExtractor highlighter = new QueryHighlightExtractor(
222: query.rewrite(reader), analyzer, "<highlighted>",
223: "</highlighted>");
224:
225: return highlighter.highlightText(text);
226: } catch (Exception e) {
227: throw new LibresourceException("Can't highlight "
228: + queryString, e);
229: }
230: }
231:
232: protected static String normalizeURIForIndexation(URI uri) {
233: String path = uri.normalize().getPath();
234:
235: if (!path.startsWith("/")) {
236: path = "/" + path;
237: }
238:
239: if (path.endsWith("/")) {
240: path = path.substring(0, path.length() - 1);
241: }
242:
243: return path;
244: }
245:
246: public void flush() throws Exception {
247: IndexWriter writer = new IndexWriter(indexDirectory, analyzer,
248: true);
249: writer.close();
250: }
251: }
|