001: //rtfParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 16.05.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.rtf;
045:
046: import java.io.InputStream;
047: import java.util.Hashtable;
048:
049: import javax.swing.text.DefaultStyledDocument;
050: import javax.swing.text.rtf.RTFEditorKit;
051:
052: import de.anomic.plasma.plasmaParserDocument;
053: import de.anomic.plasma.parser.AbstractParser;
054: import de.anomic.plasma.parser.Parser;
055: import de.anomic.plasma.parser.ParserException;
056: import de.anomic.yacy.yacyURL;
057:
058: public class rtfParser extends AbstractParser implements Parser {
059:
060: /**
061: * a list of mime types that are supported by this parser class
062: * @see #getSupportedMimeTypes()
063: */
064: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
065: static {
066: SUPPORTED_MIME_TYPES.put("application/rtf", "rtf");
067: SUPPORTED_MIME_TYPES.put("text/rtf", "rtf");
068: }
069:
070: /**
071: * a list of library names that are needed by this parser
072: * @see Parser#getLibxDependences()
073: */
074: private static final String[] LIBX_DEPENDENCIES = new String[] {};
075:
076: public rtfParser() {
077: super (LIBX_DEPENDENCIES);
078: this .parserName = "Rich Text Format Parser";
079: }
080:
081: public plasmaParserDocument parse(yacyURL location,
082: String mimeType, String charset, InputStream source)
083: throws ParserException, InterruptedException {
084:
085: try {
086: DefaultStyledDocument doc = new DefaultStyledDocument();
087:
088: RTFEditorKit theRtfEditorKit = new RTFEditorKit();
089: theRtfEditorKit.read(source, doc, 0);
090:
091: String bodyText = doc.getText(0, doc.getLength());
092:
093: plasmaParserDocument theDoc = new plasmaParserDocument(
094: location, mimeType, "UTF-8", null, ((bodyText
095: .length() > 80) ? bodyText.substring(0, 80)
096: : bodyText.trim()).replaceAll("\r\n", " ")
097: .replaceAll("\n", " ")
098: .replaceAll("\r", " ")
099: .replaceAll("\t", " "), "", // TODO: AUTHOR
100: null, null, bodyText.getBytes("UTF-8"), null, null);
101:
102: return theDoc;
103: } catch (Exception e) {
104: if (e instanceof InterruptedException)
105: throw (InterruptedException) e;
106: if (e instanceof ParserException)
107: throw (ParserException) e;
108:
109: throw new ParserException(
110: "Unexpected error while parsing rtf resource."
111: + e.getMessage(), location);
112: }
113: }
114:
115: public Hashtable<String, String> getSupportedMimeTypes() {
116: return rtfParser.SUPPORTED_MIME_TYPES;
117: }
118:
119: public void reset() {
120: // Nothing todo here at the moment
121: super.reset();
122: }
123:
124: }
|