001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.core;
023:
024: import java.io.File;
025: import java.io.IOException;
026: import java.text.DateFormat;
027: import java.text.SimpleDateFormat;
028: import java.util.Date;
029:
030: import org.apache.commons.logging.Log;
031: import org.apache.commons.logging.LogFactory;
032:
033: import org.apache.lucene.document.Document;
034: import org.apache.lucene.document.Field;
035: import org.apache.lucene.index.IndexWriter;
036:
037: import org.springframework.util.StringUtils;
038: import org.zilverline.service.CollectionManagerImpl;
039: import org.zilverline.util.FileUtils;
040: import org.zilverline.util.StopWatch;
041:
042: /**
043: * A Collection is a number of documents in a directory that are indexed together.
044: *
045: * @author Michael Franken
046: * @version $Revision: 1.19 $
047: */
048: public class FileSystemCollection extends AbstractCollection {
049: /** logger for Commons logging. */
050: private static Log log = LogFactory
051: .getLog(FileSystemCollection.class);
052:
053: /**
054: * Default Constructor setting all fields to non null defaults.
055: */
056: public FileSystemCollection() {
057: name = "";
058: url = "";
059: description = "";
060: numberOfDocs = 0;
061: version = 0;
062: lastIndexed = null;
063: existsOnDisk = false;
064: keepCache = false;
065: isKeepCacheSet = false;
066: // other constructor stuff should appear here first ...
067: log.debug("in constructor - initializing...");
068: }
069:
070: /**
071: * Sets existsOnDisk based on whether the collection (contentDir) actually (now) sits on disk.
072: *
073: * @todo the whole existsOnDisk construction is a little funny, refactor some time
074: */
075: protected void setExistsOnDisk() {
076: if (contentDir == null) {
077: existsOnDisk = false;
078: } else {
079: existsOnDisk = contentDir.isDirectory();
080: }
081: }
082:
083: /**
084: * Gets the origin from where this collection's documents can be retrieved.
085: *
086: * @return location such as e:/docs or InBox
087: */
088: public final String getRoot() {
089: if (getContentDir() == null) {
090: return "-";
091: }
092: return getContentDir().getAbsolutePath();
093: }
094:
095: /**
096: * Prints Collection as String for logging.
097: *
098: * @return pretty formatted information about the collection
099: */
100: public final String toString() {
101: return "Collection(" + id + "), with name: " + name
102: + ",\n\t\tdescription: " + description
103: + ",\n\t\tcontentDir: " + contentDir + ",\n\t\turl: "
104: + url + ",\n\t\texistsOnDisk: " + existsOnDisk
105: + ",\n\t\tindexDir: " + indexDir + ",\n\t\tcacheDir: "
106: + cacheDir + ",\n\t\tcacheUrl: " + cacheUrl
107: + ",\n\t\tanalyzer: " + analyzer + ",\n\t\tkeepCache: "
108: + keepCache + ",\n\t\tisKeepCacheSet: "
109: + isKeepCacheSet + ",\n\t\tnumberOfDocs: "
110: + numberOfDocs + ",\n\t\tmanager: " + manager
111: + ",\n\t\tlastIndexed: " + lastIndexed;
112: // +
113: // ",\n\t\tmd5DocumentCache:
114: // " + md5DocumentCache +
115: // "\n\n";
116: }
117:
118: /**
119: * Index the given Collection.
120: *
121: * @param fullIndex indicated whether a full or incremental index should be created
122: * @throws IndexException if the Collections can not be indexed
123: */
124: public final void index(final boolean fullIndex)
125: throws IndexException {
126: log.info("Starting creation of index of "
127: + this .getContentDir());
128:
129: IndexWriter writer = null;
130:
131: try {
132: // record start time
133: StopWatch watch = new StopWatch();
134:
135: watch.start();
136:
137: // make sure the index exists
138: File indexDirectory = this .getIndexDirWithManagerDefaults();
139:
140: // reindex if the index is not there or invalid
141: int currentNumberOfDocs = 0;
142: boolean mustReindex = fullIndex;
143: if (!this .isIndexValid()) {
144: mustReindex = true;
145: indexDirectory.mkdirs();
146: } else {
147: currentNumberOfDocs = getNumberOfDocs();
148: }
149:
150: // create an index(writer)
151: writer = new IndexWriter(indexDirectory, this
152: .createAnalyzer(), mustReindex);
153: // see whether there are specific indexing settings in manager
154: if (manager.getMergeFactor() != null) {
155: writer.setMergeFactor(manager.getMergeFactor()
156: .intValue());
157: }
158: if (manager.getMinMergeDocs() != null) {
159: writer.setMaxBufferedDocs(manager.getMinMergeDocs()
160: .intValue());
161: }
162:
163: if (manager.getMaxMergeDocs() != null) {
164: writer.setMaxMergeDocs(manager.getMaxMergeDocs()
165: .intValue());
166: }
167:
168: resetCache(fullIndex);
169:
170: // prepare Index parameters
171: IndexCommand ic = new IndexCommand();
172:
173: ic.setWriter(writer);
174: ic.setCollection(this );
175: ic.setFile(this .getContentDir());
176: ic.setInZip(false);
177: ic.setStart(true);
178:
179: // and start indexing
180: this .indexDocs(ic);
181: log.debug("Optimizing index of " + this .getContentDir());
182: writer.optimize();
183:
184: // update the info of this collection
185: this .init();
186:
187: // record end time and report duration of indexing
188: watch.stop();
189: log.info("Indexed "
190: + (writer.docCount() - currentNumberOfDocs)
191: + " new documents in " + watch.elapsedTime());
192: } catch (IOException e) {
193: throw new IndexException("Error indexing '"
194: + this .getName()
195: + "'. Possibly unable to remove old index", e);
196: } catch (Exception e) {
197: throw new IndexException("Error indexing '"
198: + this .getName() + "'", e);
199: } finally {
200: if (writer != null) {
201: try {
202: writer.close();
203: } catch (IOException e1) {
204: // assume the index is made, just can't close, so don't
205: // rethrow, just log
206: log.error("Error closing index for "
207: + this .getName(), e1);
208: }
209: }
210: }
211:
212: }
213:
    /**
     * Adds a single File to this collection's index.
     *
     * <p>When the existing index is missing or invalid, a fresh index is
     * created first. The manager's merge settings are applied to the writer,
     * the file is indexed (recursively for directories and archives), the
     * index is optimized and the collection info is refreshed.</p>
     *
     * @param theFile the file to add to the index
     * @throws IndexException if the file can not be indexed
     */
    // TODO: this really looks like the previous method: refactor!
    public final void indexFile(final File theFile)
            throws IndexException {
        log.info("Adding File " + theFile + " to collection " + name);

        IndexWriter writer = null;

        try {
            // record start time so the duration can be reported afterwards
            StopWatch watch = new StopWatch();

            watch.start();

            // make sure the index exists
            File indexDirectory = this.getIndexDirWithManagerDefaults();

            int currentNumberOfDocs = getNumberOfDocs();

            boolean reindex = false;
            if (!isIndexValid()) {
                log.debug("Index for " + name
                        + " is not valid, create a new one");
                reindex = true;
            }

            // create an index(writer); 'reindex' makes Lucene start a fresh index
            writer = new IndexWriter(indexDirectory, this
                    .createAnalyzer(), reindex);
            // see whether there are specific indexing settings in manager
            if (manager.getMergeFactor() != null) {
                writer.setMergeFactor(manager.getMergeFactor()
                        .intValue());
            }
            if (manager.getMinMergeDocs() != null) {
                writer.setMaxBufferedDocs(manager.getMinMergeDocs()
                        .intValue());
            }

            if (manager.getMaxMergeDocs() != null) {
                writer.setMaxMergeDocs(manager.getMaxMergeDocs()
                        .intValue());
            }

            // prepare the command that drives the recursive indexing
            IndexCommand ic = new IndexCommand();

            ic.setWriter(writer);
            ic.setCollection(this);
            ic.setFile(theFile);
            ic.setInZip(false);
            ic.setStart(true);

            // and start indexing
            this.indexDocs(ic);
            log.debug("Optimizing index of " + this.getContentDir());
            writer.optimize();

            // update the info of this collection
            this.init();

            // record end time and report duration of indexing
            watch.stop();
            log.info("Indexed "
                    + (writer.docCount() - currentNumberOfDocs)
                    + " new documents in " + watch.elapsedTime());
        } catch (IOException e) {
            throw new IndexException("Error indexing '"
                    + this.getName()
                    + "'. Possibly unable to remove old index", e);
        } catch (Exception e) {
            throw new IndexException("Error indexing '"
                    + this.getName() + "'", e);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e1) {
                    // assume the index is made, just can't close, so don't
                    // rethrow, just log
                    log.error("Error closing index for "
                            + this.getName(), e1);
                }
            }
        }

    }
306:
    /**
     * Reads a File from IndexCommand (a directory, 'straight' file or an archive) and creates an index for all files recursively.
     *
     * <p>
     * now supports pdf, rtf, html, txt, rar, zip, chm and doc formats.
     * </p>
     *
     * @param ic IndexCommand holding the writer, the collection and the current file
     *
     * @throws IndexException when Indexing stops
     */
    private void indexDocs(final IndexCommand ic) throws IndexException {
        // honor a pending stop request before descending any further
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug("indexDocs: document #" + ic.getWriter().docCount()
                + ": " + ic);
        if (ic.getFile().isDirectory()) {
            // directories are recursed into; symbolic links are skipped
            if (!FileUtils.isLink(ic.getFile())) {
                indexDirectory(ic);
            } else {
                log.warn("Skipping symbolic link: "
                        + ic.getFile().getAbsolutePath());
            }
        } else {
            // handle composed docs first based on file extension, lookup in the manager.getArchiveHandler() whether
            // this is an archive
            // TODO refactor this together with straight file
            String extension = FileUtils.getExtension(ic.getFile());

            if ((manager.getArchiveHandler() != null)
                    && manager.getArchiveHandler().canUnPack(extension)) {
                indexArchive(ic, extension);
            } else {
                // handle straight files; again, symbolic links are skipped
                if (ic.getFile().isFile()) {
                    if (!FileUtils.isLink(ic.getFile())) {
                        indexStraightFile(ic);
                    } else {
                        log.warn("Skipping symbolic link: "
                                + ic.getFile().getAbsolutePath());
                    }
                } else {
                    // neither a directory nor a regular file: ignore it
                    log.debug("not a normal file: "
                            + ic.getFile().getName());
                }
            }
        }
    }
357:
    /**
     * Indexes a single 'straight' (non-archive) file: parses it, adds the
     * resulting Document to the index and records the file's MD5 hash in the
     * collection's cache so duplicates are skipped.
     *
     * @param ic IndexCommand holding the writer, the collection and the file
     * @throws IndexException when the document can not be added to the index
     */
    private void indexStraightFile(final IndexCommand ic)
            throws IndexException {
        log.debug(ic.getFile() + " is a straight file");
        // record the name to present the document under: its own name, or
        // its entry name when it came out of an archive
        if (!ic.isInZip()) {
            ic.setRealName(ic.getFile().getName());
        } else {
            ic.setZipName(ic.getFile().getName());
        }
        // do we support this kind of file?
        if (manager.getFactory().canExtract(ic.getFile())
                || manager.getFactory().isDefaultFileinfo()) {
            // get the hash for this file
            String hash = FileUtils.getMD5Hash(ic.getFile());
            // if we can't get a hash, just set it to a non null value,
            // so at least the indexing continues
            if (hash == null) {
                hash = "unknown";
            }
            // Check whether this file has been added already
            if (!ic.getCollection().getMd5DocumentCache()
                    .contains(hash)) {
                // new document, handle it
                ic.setHash(hash);
                Document doc = parse(ic);
                if (doc != null) {
                    if (log.isDebugEnabled()) {
                        log.debug("Indexcommand: " + ic);
                    }
                    // add the document to the index(writer)
                    try {
                        ic.getWriter().addDocument(doc);

                        // add the hash to hashtable if not "unknown" or
                        // empty
                        if (!"unknown".equals(hash)
                                && (hash.length() > 0)) {
                            boolean result = ic.getCollection()
                                    .getMd5DocumentCache().add(hash);

                            if (result) {
                                log.debug("Hash added for document: "
                                        + ic.getFile());
                            } else {
                                log.warn("No Hash added for document: "
                                        + ic.getFile());
                            }
                        }

                        log.info("document #"
                                + ic.getWriter().docCount() + ": "
                                + ic.getFile().getName()
                                + " added to index");
                    } catch (IOException e) {
                        throw new IndexException(
                                "Error adding document '"
                                        + ic.getFile().getName()
                                        + "' to Index", e);
                    }
                }
            } else {
                log.info("skipping duplicate document: "
                        + ic.getFile().getName());

                // if this document is in the cache, we may remove it
                if (FileUtils.isIn(ic.getFile(), ic.getCollection()
                        .getCacheDirWithManagerDefaults())) {
                    if (ic.getFile().delete()) {
                        log.debug("Removed: " + ic.getFile()
                                + " from cache.");
                    }
                }
            }
        } else {
            // no extractor supports this extension and no default applies
            log.debug("skipping unsupported document: "
                    + ic.getFile().getName());
        }
    }
439:
    /**
     * Indexes an archive: adds a stub Document (name and hash only) for the
     * archive itself, unpacks it (via java's zip support or an external
     * command from the archive handler) and recursively indexes the unpacked
     * contents. The unpacked directory is removed afterwards unless the
     * collection keeps its cache.
     *
     * @param ic IndexCommand holding the writer, the collection and the archive
     * @param extension the archive's file extension, used to pick the unpacker
     * @throws IndexException when the archive can not be added to the index
     */
    private void indexArchive(final IndexCommand ic, String extension)
            throws IndexException {
        // honor a pending stop request before unpacking anything
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        // we have an archive
        log.debug(ic.getFile() + " is an archive");
        // add the document with just its name and hash to the collection as well, so that we can cache it
        // for incremental indexing
        String hash = FileUtils.getMD5Hash(ic.getFile());
        // if we can't get a hash, just set it to a non null value, so at least the indexing continues
        if (hash == null) {
            hash = "unknown";
        }
        // Check whether this file has been added already
        if (!ic.getCollection().getMd5DocumentCache().contains(hash)) {
            try {
                // add the document with just its name and hash
                Document doc = new Document();
                doc.add(new Field("hash", hash, Field.Store.YES,
                        Field.Index.UN_TOKENIZED));
                doc.add(new Field("name", ic.getRealName(),
                        Field.Store.YES, Field.Index.TOKENIZED));
                ic.getWriter().addDocument(doc);
                log.debug("Archive " + ic.getFile()
                        + " added to collection");
                File dir = null;
                if (!StringUtils.hasText(manager.getArchiveHandler()
                        .getUnArchiveCommand(extension))) {
                    // no external unpack command configured: this is a zip,
                    // handle with java's zip capabilities
                    log.debug(ic.getFile() + " is a zip file");
                    dir = CollectionManagerImpl.unZip(ic.getFile(), ic
                            .getCollection());
                } else {
                    log.debug(ic.getFile()
                            + " is a external archive file");
                    dir = manager.unPack(ic.getFile(), ic
                            .getCollection());
                }

                // recurse into the unpacked directory with a copy of the
                // command so the caller's state is not clobbered
                IndexCommand localIc = new IndexCommand(ic);
                if (ic.isInZip()) {
                    // archive nested inside another archive: extend the zip path
                    // ic.setZipPath(ic.getZipPath() +
                    // ic.getFile().getName() + "::/");
                    localIc.setZipPath(ic.getZipPath() + dir.getName()
                            + "/");
                    localIc.setStart(true);
                } else {
                    // top-level archive: record its name and mark that we are
                    // now inside a zip
                    localIc.setRealName(ic.getFile().getName());
                    localIc.setInZip(true);
                    localIc.setStart(false);
                }

                localIc.setFile(dir);
                indexDocs(localIc);
                // remove dir since it is temporary
                if (!ic.getCollection()
                        .isKeepCacheWithManagerDefaults()) {
                    FileUtils.removeDir(dir);
                }

                // add the hash to hashtable if not "unknown" or empty
                if (!"unknown".equals(hash) && (hash.length() > 0)) {
                    boolean result = ic.getCollection()
                            .getMd5DocumentCache().add(hash);
                    if (result) {
                        log.debug("Hash added for document: "
                                + ic.getFile());
                    } else {
                        log.warn("No Hash added for document: "
                                + ic.getFile());
                    }
                }
            } catch (IOException e) {
                throw new IndexException("Error adding document '"
                        + ic.getFile().getName() + "' to Index", e);
            }
        } else {
            log.info("skipping duplicate archive: "
                    + ic.getFile().getName());
        }
    }
529:
    /**
     * Recursively indexes all entries of a directory. While descending, the
     * command's realPath (on disk) or zipPath (inside an archive) is extended
     * with the directory name — except for the starting directory itself,
     * which only clears the 'start' flag.
     *
     * @param ic IndexCommand holding the writer, the collection and the directory
     * @throws IndexException when indexing one of the entries fails
     */
    private void indexDirectory(final IndexCommand ic)
            throws IndexException {
        // honor a pending stop request before recursing
        if (stopRequested) {
            log.info("Indexing stops, due to request");
            return;
        }
        log.debug(ic.getFile() + " is a directory");
        // recurse
        String[] files = ic.getFile().list();
        // I've seen list return null, so be carefull, guess dir names too long for OS
        if (files == null) {
            log.warn("Something funny with '" + ic.getFile()
                    + "'. Name or path too long?");
            log.warn("Could not access '" + ic.getFile()
                    + "' for indexing. Skipping this directory.");
        } else {

            log.debug(ic.getFile() + " is a directory with "
                    + files.length + " docs");
            if (!ic.isInZip()) {
                // is this the first directory
                if (!ic.isStart()) {
                    ic.setRealPath(ic.getRealPath()
                            + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            } else {
                // same bookkeeping, but for the path inside the archive
                if (!ic.isStart()) {
                    ic.setZipPath(ic.getZipPath()
                            + ic.getFile().getName() + "/");
                } else {
                    ic.setStart(false);
                }
            }

            // Index the files using a new IndexCommand that's a copy of the current one
            // except with the new File, don't use the current since status will be overridden
            // when backtracking from recursion
            for (int i = 0; i < files.length; i++) {
                IndexCommand localIc = new IndexCommand(ic);
                localIc.setFile(new File(ic.getFile(), files[i]));
                indexDocs(localIc);
            }
        }
    }
580:
    /**
     * Makes a document for a File, by parsing the contents and metadata provided by {@link IndexCommand}.
     *
     * @param ic IndexCommand containing all parameters for parsing.
     *
     * @return Document with parsed content, or null if unknown format, or empty content.
     */
    private Document parse(final IndexCommand ic) {
        log.debug("Parsing " + ic.getFile().getName());

        // Extract relevant info from the file by first getting the relevant
        // Extractor
        Extractor ext = manager.getFactory().createExtractor(
                ic.getFile());

        if (ext == null) {
            // no extractor available for this file type
            log.debug("Skipping " + ic.getFile().getName());

            return null;
        }

        ParsedFileInfo fileInfo = ext.extractInfo(ic.getFile());

        if (fileInfo != null) {
            // make a new, empty document
            if (log.isDebugEnabled()) {
                log.debug("Creating new Document with ParsedFileInfo: "
                        + fileInfo);
            }

            Document doc = new Document();

            // Add all collection info: names and paths under which the
            // document can be located and presented
            doc.add(new Field("name", ic.getRealName(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("path", ic.getRealPath(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipPath", ic.getZipPath(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("zipName", ic.getZipName(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc
                    .add(new Field("collection", ic.getCollection()
                            .getName(), Field.Store.YES,
                            Field.Index.TOKENIZED));

            // Add all file info; the contents field is built from a Reader
            // (indexed but not stored)
            if (fileInfo.getReader() != null) {
                doc.add(new Field("contents", fileInfo.getReader()));
            }
            doc.add(new Field("summary", fileInfo.getSummary(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("title", fileInfo.getTitle(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("type", fileInfo.getType(),
                    Field.Store.YES, Field.Index.TOKENIZED));
            if (fileInfo.getISBN() != null) {
                doc.add(new Field("isbn", fileInfo.getISBN(),
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            // store date as yyyyMMdd
            DateFormat df = new SimpleDateFormat("yyyyMMdd");
            String dfString = df.format(new Date(fileInfo
                    .getModificationDate()));

            doc.add(new Field("modified", dfString, Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            doc.add(new Field("size",
                    Long.toString(fileInfo.getSize()), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));
            doc.add(new Field("hash", ic.getHash(), Field.Store.YES,
                    Field.Index.UN_TOKENIZED));

            if (log.isDebugEnabled()) {
                log.debug("Parsed " + doc);
            }

            return doc;
        } else {
            // the extractor recognized the file but produced no info
            log
                    .warn("Extractor does not return any ParsedFileInfo for: "
                            + ic.getFile().getName());
        }
        return null;
    }
667:
668: }
|