001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.extractors;
023:
024: import java.io.File;
025: import java.io.IOException;
026: import java.io.Reader;
027:
028: import org.apache.commons.logging.Log;
029: import org.apache.commons.logging.LogFactory;
030:
031: import org.springframework.util.StringUtils;
032:
033: import org.zilverline.core.Extractor;
034: import org.zilverline.core.ParsedFileInfo;
035: import org.zilverline.util.FileUtils;
036: import org.zilverline.util.Utils;
037:
038: /**
039: * Abstract baseclass of extractors. Extractors extract all relevant info from a File, and return the info in a ParsedFileInfo
040: * Object.
041: *
042: * @author Michael Franken
043: * @version $Revision: 1.20 $
044: *
045: * @see org.zilverline.core.ParsedFileInfo
046: */
047: public abstract class AbstractExtractor implements Extractor {
048: /** default size of summary extracted from the file. */
049: private static final int SUMMARY_SIZE = 200;
050:
051: /**
052: * logger for Commons logging. This is non-static final protected, such that it defines a log for all subclasses too.
053: */
054: protected final Log log = LogFactory.getLog(getClass().getName());
055:
056: private final static Log log2 = LogFactory
057: .getLog(AbstractExtractor.class);
058:
059: /** default size of summary extracted from the file. */
060: private ParsedFileInfo fileInfo = new ParsedFileInfo();
061:
062: /**
063: * Set the file and all file related information of the document, such as length and modification date.
064: *
065: * @param f The file that is being parsed
066: */
067: public final void setFile(final File f) {
068: fileInfo.setFile(f);
069: fileInfo.setSize(f.length());
070: fileInfo.setModificationDate(f.lastModified());
071: }
072:
073: /**
074: * Set the type of the document.
075: *
076: * @param type such as EXCEL, PDF
077: */
078: public final void setType(final String type) {
079: fileInfo.setType(type);
080: }
081:
082: /**
083: * Set the author of the document.
084: *
085: * @param author the author
086: */
087: public final void setAuthor(final String author) {
088: fileInfo.setAuthor(author);
089: }
090:
091: /**
092: * Set the isbn number of the document.
093: *
094: * @param ISBN the ISBN number
095: */
096: public final void setISBN(final String ISBN) {
097: fileInfo.setISBN(ISBN);
098: }
099:
100: /**
101: * Set the title of the document.
102: *
103: * @param title the title
104: */
105: public final void setTitle(final String title) {
106: fileInfo.setTitle(title);
107: }
108:
109: /**
110: * Set the size of the document.
111: *
112: * @param size the size in bytes
113: */
114: public final void setSize(final long size) {
115: fileInfo.setSize(size);
116: }
117:
118: /**
119: * Set the modificationDate of the document.
120: *
121: * @param modificationDate the modificationDate in milliseconds since January 1, 1970, 00:00:00 GMT
122: */
123: public final void setModificationDate(final long modificationDate) {
124: fileInfo.setModificationDate(modificationDate);
125: }
126:
127: /**
128: * Set the creationDate of the document.
129: *
130: * @param creationDate the creationDate in milliseconds since January 1, 1970, 00:00:00 GMT
131: */
132: public final void setCreationDate(final long creationDate) {
133: fileInfo.setCreationDate(creationDate);
134: }
135:
136: /**
137: * Set the summary of the document.
138: *
139: * @param summary the summary
140: */
141: public final void setSummary(final String summary) {
142: fileInfo.setSummary(summary);
143: }
144:
145: /**
146: * Extract the content from the given file. As a side effect other attributes of ParsedFileInfo may be set too.
147: *
148: * Implementations should catch all checked exceptions, sensibly, And close all resources.
149: *
150: * @param f The file to extract the content from.
151: *
152: * @return Reader containing text-only content
153: */
154: public abstract Reader getContent(final File f);
155:
156: /**
157: * This method extracts all relevant info of the file as an ParsedFileInfo object. Uses getContent as callback.
158: *
159: * @param f the File to extract content from
160: *
161: * @return ParsedFileInfo the object containing relevant info of the provided file
162: */
163: public final ParsedFileInfo extractInfo(final File f) {
164: if (f == null) {
165: log
166: .warn("Something went terribly wrong, file = null, returning null ");
167: return null;
168: }
169: try {
170: setFile(f);
171:
172: Reader reader = getContent(f);
173: fileInfo.setReader(reader);
174: // get the summary from the reader
175: if (reader != null) {
176: String summary = fileInfo.getSummary();
177:
178: if (!StringUtils.hasText(summary)) {
179: char[] sumChars = new char[SUMMARY_SIZE];
180: int numChars = 0;
181: try {
182: if (reader.markSupported()) {
183: reader.mark(SUMMARY_SIZE);
184: numChars = reader.read(sumChars);
185: reader.reset();
186: }
187: if (numChars > 0) {
188: summary = new String(sumChars, 0, numChars);
189: }
190: if (log.isDebugEnabled()) {
191: log.debug("Summary extracted from reader: "
192: + summary);
193: }
194: setSummary(getSummaryFromContent(summary));
195: } catch (IOException e) {
196: log.warn(
197: "Error extracting summary form reader",
198: e);
199: }
200: }
201: }
202: // Set the title if there's none yet
203: if (!StringUtils.hasLength(fileInfo.getTitle())) {
204: fileInfo.setTitle(FileUtils.getBasename(f));
205: }
206: } catch (Exception e) {
207: // here we don't throw any, since we do not want to interrupt the indexing process
208: log.warn("Unexpected Error extracting content from "
209: + f.getName(), e);
210: } catch (OutOfMemoryError e) {
211: // this happens with very, very large Documents
212: log
213: .error(
214: "Very Serious Error. Out of Memory for very large documents: "
215: + f.getName()
216: + ", try increasing your JVM heap size: for example, start your server with option '-Xmx128m'."
217: + " Skipping file.", e);
218: } catch (Throwable e) {
219: log.error(
220: "Very Serious Error while extracting contents from: "
221: + f.getName(), e);
222: }
223:
224: return fileInfo;
225: }
226:
227: /**
228: * Get a ISBN number from the given text.
229: *
230: * @param text the plain text, can be null
231: * @return a valid ISBNnumber (10 characters without -) or else ""
232: */
233: public static String getISBNFromContent(final String text) {
234: if (text == null) {
235: return "";
236: }
237: // ISBN:0764543857
238: String ISBNnumber = "";
239: int j;
240: // does text contain ISBN or isbn?
241: if (((j = text.indexOf("ISBN")) != -1)
242: || (j = text.indexOf("isbn")) != -1) {
243: // look 25 characters forward
244: ISBNnumber = text.substring(j, j + 25);
245: // remove ISBN.. (all text until first number)
246: ISBNnumber = ISBNnumber.replaceFirst("[\\D]+", "");
247: // remove all non-valid ISBN characters (0-9xX and - seem valid), remove - as well
248: ISBNnumber = ISBNnumber.replaceAll("[^0-9xX]", "");
249: if (ISBNnumber.length() > 10) {
250: ISBNnumber = ISBNnumber.substring(0, 10);
251: }
252: log2.debug("possible ISBN found: " + ISBNnumber);
253: if (!Utils.isValidISBNNumber(ISBNnumber)) {
254: return "";
255: }
256: }
257: return ISBNnumber;
258: }
259:
260: /**
261: * Get a summary from the given text.
262: *
263: * @param text the plain text, can be null
264: * @return the summary
265: */
266: public static String getSummaryFromContent(final String text) {
267: if (!StringUtils.hasText(text)) {
268: return "";
269: }
270: // alternative: just the first characters:
271: String summary = text.substring(0, Math.min(text.length(),
272: SUMMARY_SIZE));
273: // SimpleSummariser sum = new SimpleSummariser();
274: // get two representative lines
275: // String summary = sum.summarise(text, 2);
276: // return with minimal whitespace
277: return summary.replaceAll("\\s+", " ");
278: }
279: }
|