001: /*
002: * Copyright 2003-2004 Michael Franken, Zilverline.
003: *
004: * The contents of this file, or the files included with this file, are subject to
005: * the current version of ZILVERLINE Collaborative Source License for the
006: * Zilverline Search Engine (the "License"); You may not use this file except in
007: * compliance with the License.
008: *
009: * You may obtain a copy of the License at
010: *
011: * http://www.zilverline.org.
012: *
013: * See the License for the rights, obligations and
014: * limitations governing use of the contents of the file.
015: *
016: * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017: * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018: * copyrights in the portions it created. All Rights Reserved.
019: *
020: */
021:
022: package org.zilverline.extractors;
023:
024: import java.io.File;
025: import java.io.FileInputStream;
026: import java.io.FileNotFoundException;
027: import java.io.IOException;
028: import java.io.InputStream;
029: import java.io.Reader;
030: import java.io.StringReader;
031:
032: import javax.swing.text.BadLocationException;
033: import javax.swing.text.Document;
034: import javax.swing.text.rtf.RTFEditorKit;
035:
036: /**
037: * This class extracts text from RTF files by using the javax.swing.text.rtf library.
038: *
039: * @author Michael Franken
040: * @version $Revision: 1.15 $
041: */
042: public class RTFExtractor extends AbstractExtractor {
043: /**
044: * Extract the content from the given RTF file. As a side effect the type and summary are set too.
045: *
046: * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
047: */
048: public final Reader getContent(final File f) {
049: setType("RTF");
050:
051: Reader reader = null;
052: FileInputStream fis = null;
053: try {
054: fis = new FileInputStream(f);
055: RTFEditorKit kit = new RTFEditorKit();
056: Document doc = kit.createDefaultDocument();
057:
058: kit.read(fis, doc, 0);
059:
060: String plainText = doc.getText(0, doc.getLength());
061:
062: reader = new StringReader(plainText);
063:
064: setSummary(getSummaryFromContent(plainText));
065: setISBN(getISBNFromContent(plainText));
066: } catch (FileNotFoundException e) {
067: log.warn("Can't open file: " + f.getName(), e);
068: } catch (IOException e) {
069: log.warn("Can't extract contents for: " + f.getName(), e);
070: } catch (BadLocationException e) {
071: log.warn("Can't extract contents for: " + f.getName(), e);
072: } finally {
073: if (fis != null) {
074: try {
075: fis.close();
076: } catch (IOException e1) {
077: log.warn("Can not close inputstream for "
078: + f.getName(), e1);
079: }
080: }
081: }
082:
083: return reader;
084: }
085:
086: /**
087: * Extract the content from the given RTF file. As a side effect the type and summary are set too.
088: *
089: * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
090: */
091: public final String getContent(final InputStream is) {
092: try {
093: RTFEditorKit kit = new RTFEditorKit();
094: Document doc = kit.createDefaultDocument();
095:
096: kit.read(is, doc, 0);
097:
098: return doc.getText(0, doc.getLength());
099:
100: } catch (IOException e) {
101: log.warn("Can't extract contents", e);
102: } catch (BadLocationException e) {
103: log.warn("Can't extract contents", e);
104: }
105:
106: return "";
107: }
108: }
|