001: //docParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 24.04.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.doc;
045:
046: import java.io.InputStream;
047: import java.util.Hashtable;
048:
049: import org.textmining.text.extraction.WordExtractor;
050:
051: import de.anomic.plasma.plasmaParserDocument;
052: import de.anomic.plasma.parser.AbstractParser;
053: import de.anomic.plasma.parser.Parser;
054: import de.anomic.plasma.parser.ParserException;
055: import de.anomic.yacy.yacyURL;
056:
057: public class docParser extends AbstractParser implements Parser {
058:
059: /**
060: * a list of mime types that are supported by this parser class
061: * @see #getSupportedMimeTypes()
062: */
063: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
064: static {
065: SUPPORTED_MIME_TYPES.put("application/msword", "doc");
066: }
067:
068: /**
069: * a list of library names that are needed by this parser
070: * @see Parser#getLibxDependences()
071: */
072: private static final String[] LIBX_DEPENDENCIES = new String[] { "tm-extractors-0.4.jar" };
073:
074: public docParser() {
075: super (LIBX_DEPENDENCIES);
076: this .parserName = "Word Document Parser";
077: }
078:
079: public plasmaParserDocument parse(yacyURL location,
080: String mimeType, String charset, InputStream source)
081: throws ParserException, InterruptedException {
082:
083: try {
084: WordExtractor extractor = new WordExtractor();
085: String contents = extractor.extractText(source);
086:
087: plasmaParserDocument theDoc = new plasmaParserDocument(
088: location, mimeType, "UTF-8", null, ((contents
089: .length() > 80) ? contents.substring(0, 80)
090: : contents.trim()).replaceAll("\r\n", " ")
091: .replaceAll("\n", " ")
092: .replaceAll("\r", " ")
093: .replaceAll("\t", " "), "", // TODO: AUTHOR
094: null, null, contents.getBytes("UTF-8"), null, null);
095:
096: return theDoc;
097: } catch (Exception e) {
098: if (e instanceof InterruptedException)
099: throw (InterruptedException) e;
100: if (e instanceof ParserException)
101: throw (ParserException) e;
102:
103: throw new ParserException(
104: "Unexpected error while parsing doc file. "
105: + e.getMessage(), location);
106: }
107: }
108:
109: public java.util.Hashtable<String, String> getSupportedMimeTypes() {
110: return docParser.SUPPORTED_MIME_TYPES;
111: }
112:
113: public void reset() {
114: // Nothing todo here at the moment
115: super.reset();
116: }
117:
118: }
|