001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.extractors;
023:
024: import java.io.ByteArrayOutputStream;
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.FileNotFoundException;
028: import java.io.FileReader;
029: import java.io.IOException;
030: import java.io.InputStream;
031: import java.io.OutputStream;
032: import java.io.Reader;
033:
034: /**
035: * This class extracts text from text files.
036: *
037: * @author Michael Franken
038: * @version $Revision: 1.19 $
039: */
040: public class TextExtractor extends AbstractExtractor {
041:
042: /**
043: * Get the reader form this file, and set the type and summary while we're at it.
044: *
045: * @param f the file containing the text
046: * @return the reader containing the plain text.
047: */
048: public final Reader getContent(final File f) {
049: setType("TEXT");
050:
051: Reader reader = null;
052: FileInputStream fis = null;
053: try {
054: reader = new FileReader(f);
055: fis = new FileInputStream(f);
056: String content = getContent(fis);
057: setSummary(getSummaryFromContent(content));
058: setISBN(getISBNFromContent(content));
059: } catch (FileNotFoundException e) {
060: log.warn("Can't extract contents and summary from "
061: + f.getName(), e);
062: } finally {
063: if (fis != null) {
064: try {
065: fis.close();
066: } catch (IOException e1) {
067: log.error("Can't file input stream from "
068: + f.getName(), e1);
069: }
070: }
071: }
072: return reader;
073: }
074:
075: /**
076: * Extract the content from the given InputStream.
077: *
078: * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
079: */
080: public final String getContent(final InputStream is) {
081: int k;
082: int aBuffSize = 512;
083: byte[] buff = new byte[aBuffSize];
084: OutputStream out = new ByteArrayOutputStream(aBuffSize);
085: try {
086: while ((k = is.read(buff)) != -1) {
087: out.write(buff, 0, k);
088: }
089: return out.toString();
090: } catch (Exception e) {
091: log.warn("Can't extract contents stream ", e);
092: } finally {
093: if (out != null) {
094: try {
095: out.close();
096: } catch (IOException e1) {
097: log.error("Can't close output stream any more", e1);
098: }
099: }
100: }
101:
102: return "";
103: }
104: }
|