001: package org.contineo.core.searchengine.crawler;
002:
003: import java.io.File;
004: import java.io.IOException;
005: import java.util.Locale;
006:
007: import org.apache.commons.lang.StringUtils;
008: import org.apache.commons.logging.Log;
009: import org.apache.commons.logging.LogFactory;
010: import org.apache.lucene.analysis.Analyzer;
011: import org.apache.lucene.document.Document;
012: import org.apache.lucene.index.IndexReader;
013: import org.apache.lucene.index.IndexWriter;
014: import org.apache.lucene.index.MultiReader;
015: import org.apache.lucene.index.Term;
016: import org.apache.lucene.store.FSDirectory;
017: import org.contineo.core.searchengine.util.SquareSimilarity;
018: import org.contineo.core.text.AnalyzeText;
019: import org.contineo.core.text.parser.Parser;
020: import org.contineo.core.text.parser.ParserFactory;
021: import org.contineo.util.config.SettingsConfig;
022:
023: /**
024: * Class for indexing files and maintaining indexes.
025: *
026: * @author Michael Scholz, Marco Meschieri
027: */
028: public class Indexer {
029:
030: protected static Log log = LogFactory.getLog(Indexer.class);
031:
032: private SettingsConfig settingsConfig;
033:
034: private Indexer() {
035: }
036:
037: public void setSettingsConfig(SettingsConfig settingsConfig) {
038: this .settingsConfig = settingsConfig;
039: }
040:
041: public synchronized int addFile(File file,
042: org.contineo.core.document.Document document,
043: StringBuffer content, String language) throws Exception {
044:
045: String name = file.getName();
046: int testversion = -1;
047: int result = -1;
048: name = name.substring(name.lastIndexOf(".") + 1);
049:
050: try {
051: testversion = Integer.parseInt(name);
052: } catch (Exception e) {
053: }
054:
055: if (testversion == -1) {
056: LuceneDocument lDoc = new LuceneDocument(document);
057:
058: try {
059: log.info("addFile: " + file.toString());
060:
061: Document doc = lDoc.getDocument(file, content);
062: result = addDocument(doc, language);
063: } catch (Exception e) {
064: log.error("Exception addFile: "
065: + e.getLocalizedMessage(), e);
066: }
067:
068: try {
069: AnalyzeText aText = new AnalyzeText();
070: aText.storeTerms(document.getMenuId(), content
071: .toString(), language);
072: } catch (Exception e) {
073: log.error("Exception analyzing File: "
074: + e.getLocalizedMessage(), e);
075: }
076: }
077:
078: return result;
079: }
080:
081: /**
082: * Adds a LuceneDocument to the index.
083: */
084: private int addDocument(Document doc, String language) {
085: String path = settingsConfig.getValue("indexdir");
086: Analyzer analyzer = LuceneAnalyzerFactory.getAnalyzer(language);
087: String dir = new Locale(language).getDisplayLanguage(
088: Locale.ENGLISH).toLowerCase();
089:
090: if (!path.endsWith(File.pathSeparator)) {
091: path += "/";
092: }
093:
094: path += dir + "/";
095:
096: IndexWriter writer = null;
097: try {
098: writer = new IndexWriter(path, analyzer, false);
099: writer.setSimilarity(new SquareSimilarity());
100: writer.addDocument(doc);
101: writer.optimize();
102: return writer.docCount() - 1;
103: } catch (Exception e) {
104: log.error("Exception adding Document to Lucene index: "
105: + path + ", " + e.getMessage(), e);
106: return -1;
107: } finally {
108: if (writer != null)
109: try {
110: writer.close();
111: } catch (Exception e) {
112: log.error("Error closing index: " + path + ", "
113: + e.getMessage(), e);
114: return -1;
115: }
116: }
117: }
118:
119: /**
120: * Adds all documents of a given directory to the index of the search
121: * engine.
122: *
123: * @param file Path of the directory.
124: * @param doc The document that we want to add
125: * @throws Exception
126: */
127: public synchronized void addDirectory(File file,
128: org.contineo.core.document.Document doc) throws Exception {
129:
130: if (file.isDirectory()) {
131: String[] subitems = file.list();
132:
133: for (int i = 0; i < subitems.length; i++) {
134: addDirectory(new File(file, subitems[i]), doc);
135: }
136: } else {
137: try {
138: Parser parser = ParserFactory.getParser(file);
139: if (parser == null) {
140: return;
141: }
142:
143: StringBuffer content = parser.getContent();
144:
145: String language = doc.getLanguage();
146: if (StringUtils.isEmpty(language)) {
147: language = "en";
148: }
149:
150: if (log.isInfoEnabled()) {
151: log.info("addDirectory 0" + doc.getDocId() + " "
152: + doc.getDocName() + " "
153: + doc.getDocVersion() + " "
154: + doc.getDocDate() + " "
155: + doc.getDocPublisher() + " "
156: + doc.getDocStatus() + " "
157: + doc.getSource() + " "
158: + doc.getSourceAuthor());
159: }
160: addFile(file, doc, content, language);
161: } catch (Exception e) {
162: log.error("addDirectory " + e.getMessage(), e);
163: }
164: }
165: }
166:
167: /**
168: * Launch optimization on a single Lucene Index identified by the language
169: */
170: protected synchronized void optimize(String language) {
171: String path = settingsConfig.getValue("indexdir");
172:
173: if (!path.endsWith(File.pathSeparator)) {
174: path += "/";
175: }
176:
177: try {
178: String pathLang = new Locale(language).getDisplayLanguage(
179: Locale.ENGLISH).toLowerCase();
180: String completePath = path + pathLang + "/";
181: Analyzer analyzer = LuceneAnalyzerFactory
182: .getAnalyzer(language);
183: IndexWriter writer = new IndexWriter(completePath,
184: analyzer, false);
185: writer.optimize();
186: writer.close();
187: } catch (Exception e) {
188: log.error("optimize " + e.getMessage(), e);
189: }
190: }
191:
192: /**
193: * Launch optimization on all the Lucene Indexes
194: */
195: public synchronized void optimize() {
196: log.warn("Started optimization for all indexes");
197:
198: String path = settingsConfig.getValue("indexdir");
199: String[] languages = new String[] { "en", "de", "fr", "es",
200: "it" };
201:
202: if (!path.endsWith(File.pathSeparator)) {
203: path += "/";
204: }
205:
206: try {
207: String pathLang = null;
208: String completePath = null;
209: Analyzer analyzer = null;
210: for (int i = 0; i < languages.length; i++) {
211: analyzer = LuceneAnalyzerFactory
212: .getAnalyzer(languages[i]);
213: pathLang = new Locale(languages[i]).getDisplayLanguage(
214: Locale.ENGLISH).toLowerCase();
215: completePath = path + pathLang + "/";
216: IndexWriter writer = new IndexWriter(completePath,
217: analyzer, false);
218: writer.optimize();
219: writer.close();
220: }
221: } catch (Exception e) {
222: log.error("optimize " + e.getMessage(), e);
223: }
224:
225: log.warn("Finished optimization for all indexes");
226: }
227:
228: /**
229: * Deletes the entries of a document in the index of the search engine.
230: *
231: * @param menuId - MenuID of the document.
232: * @param language - Language of the document.
233: */
234: public synchronized void deleteFile(String menuId, String language) {
235: String path = settingsConfig.getValue("indexdir");
236:
237: if (!path.endsWith(File.pathSeparator)) {
238: path += "/";
239: }
240:
241: String pathLang = new Locale(language).getDisplayLanguage(
242: Locale.ENGLISH).toLowerCase();
243: String completePath = path + pathLang + "/";
244:
245: try {
246: IndexReader reader = IndexReader.open(completePath);
247: reader.deleteDocuments(new Term("menuId", menuId));
248: reader.close();
249: optimize(language);
250: } catch (IOException ioe) {
251: log.error("deleteFile " + ioe.getMessage(), ioe);
252: }
253: }
254:
255: public Document getDocument(int luceneid) {
256: String path = settingsConfig.getValue("indexdir");
257:
258: if (!path.endsWith(File.pathSeparator)) {
259: path += "/";
260: }
261:
262: try {
263: IndexReader enreader = IndexReader.open(path + "english/");
264: IndexReader frreader = IndexReader.open(path + "french/");
265: IndexReader dereader = IndexReader.open(path + "german/");
266: IndexReader esreader = IndexReader.open(path + "spanish/");
267: IndexReader itreader = IndexReader.open(path + "italian/");
268: IndexReader[] readers = new IndexReader[] { enreader,
269: frreader, dereader, esreader, itreader };
270: MultiReader reader = new MultiReader(readers);
271: Document doc = reader.document(luceneid);
272: reader.close();
273: return doc;
274: } catch (Exception e) {
275: log.error("getDocument " + e.getMessage(), e);
276:
277: return null;
278: }
279: }
280:
281: /**
282: * This method can unlock a locked index.
283: */
284: public synchronized void unlock() {
285: String path = settingsConfig.getValue("indexdir");
286:
287: if (!path.endsWith(File.pathSeparator)) {
288: path += "/";
289: }
290:
291: try {
292: FSDirectory enfsdir = FSDirectory.getDirectory(path
293: + "english/", false);
294: IndexReader reader = IndexReader.open(enfsdir);
295: IndexReader.unlock(enfsdir);
296: reader.close();
297:
298: FSDirectory frfsdir = FSDirectory.getDirectory(path
299: + "french/", false);
300: reader = IndexReader.open(frfsdir);
301: IndexReader.unlock(frfsdir);
302: reader.close();
303:
304: FSDirectory defsdir = FSDirectory.getDirectory(path
305: + "german/", false);
306: reader = IndexReader.open(defsdir);
307: IndexReader.unlock(defsdir);
308: reader.close();
309:
310: FSDirectory esfsdir = FSDirectory.getDirectory(path
311: + "spanish/", false);
312: reader = IndexReader.open(esfsdir);
313: IndexReader.unlock(esfsdir);
314: reader.close();
315:
316: FSDirectory itfsdir = FSDirectory.getDirectory(path
317: + "italian/", false);
318: reader = IndexReader.open(itfsdir);
319: IndexReader.unlock(itfsdir);
320: reader.close();
321: } catch (Exception e) {
322: log.error("unlock " + e.getMessage(), e);
323: }
324: }
325:
326: public boolean isLocked() {
327: boolean result = false;
328: String path = settingsConfig.getValue("indexdir");
329:
330: if (!path.endsWith(File.pathSeparator)) {
331: path += "/";
332: }
333:
334: try {
335: FSDirectory enfsdir = FSDirectory.getDirectory(path
336: + "english/", false);
337: IndexReader reader = IndexReader.open(enfsdir);
338:
339: if (IndexReader.isLocked(enfsdir)) {
340: result = true;
341: }
342:
343: reader.close();
344:
345: FSDirectory frfsdir = FSDirectory.getDirectory(path
346: + "french/", false);
347: reader = IndexReader.open(frfsdir);
348:
349: if (IndexReader.isLocked(frfsdir)) {
350: result = true;
351: }
352:
353: reader.close();
354:
355: FSDirectory defsdir = FSDirectory.getDirectory(path
356: + "german/", false);
357: reader = IndexReader.open(defsdir);
358:
359: if (IndexReader.isLocked(defsdir)) {
360: result = true;
361: }
362:
363: reader.close();
364:
365: FSDirectory esfsdir = FSDirectory.getDirectory(path
366: + "spanish/", false);
367: reader = IndexReader.open(esfsdir);
368: if (IndexReader.isLocked(esfsdir)) {
369: result = true;
370: }
371: reader.close();
372:
373: FSDirectory itfsdir = FSDirectory.getDirectory(path
374: + "italian/", false);
375: reader = IndexReader.open(itfsdir);
376: if (IndexReader.isLocked(itfsdir)) {
377: result = true;
378: }
379: reader.close();
380:
381: } catch (Exception e) {
382: log.error("isLocked " + e.getMessage(), e);
383: }
384:
385: return result;
386: }
387:
388: /**
389: * Returns the number of indexed documents in all indexes. Used for
390: * statistical output.
391: */
392: public int getCount() {
393: int count = 0;
394: String path = settingsConfig.getValue("indexdir");
395:
396: if (!path.endsWith(File.pathSeparator)) {
397: path += "/";
398: }
399:
400: try {
401: IndexReader enreader = IndexReader.open(path + "english/");
402: IndexReader frreader = IndexReader.open(path + "french/");
403: IndexReader dereader = IndexReader.open(path + "german/");
404: IndexReader esreader = IndexReader.open(path + "spanish/");
405: IndexReader itreader = IndexReader.open(path + "italian/");
406: count = enreader.numDocs();
407: count += frreader.numDocs();
408: count += dereader.numDocs();
409: count += esreader.numDocs();
410: count += itreader.numDocs();
411: enreader.close();
412: frreader.close();
413: dereader.close();
414: esreader.close();
415: itreader.close();
416: } catch (Exception e) {
417: log.error("getCount " + e.getMessage(), e);
418: }
419:
420: return count;
421: }
422:
423: /**
424: * Create all indexes (one per language)
425: */
426: public void createIndexes() {
427: try {
428: // Instantiate all index writers(one per language)
429: String path = settingsConfig.getValue("indexdir");
430: new IndexWriter(path + "english/", LuceneAnalyzerFactory
431: .getAnalyzer("en"), true);
432: new IndexWriter(path + "french/", LuceneAnalyzerFactory
433: .getAnalyzer("fr"), true);
434: new IndexWriter(path + "german/", LuceneAnalyzerFactory
435: .getAnalyzer("de"), true);
436: new IndexWriter(path + "spanish/", LuceneAnalyzerFactory
437: .getAnalyzer("es"), true);
438: new IndexWriter(path + "italian/", LuceneAnalyzerFactory
439: .getAnalyzer("it"), true);
440: } catch (Exception e) {
441: log.error("createIndexes " + e.getMessage(), e);
442: }
443: }
444: }
|