001: //pptParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007:
008: //this file is contributed by Tim Riemann
009: //last major change: 10.09.2006
010:
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015:
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020:
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024:
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036:
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.ppt;
045:
046: import java.io.BufferedInputStream;
047: import java.io.InputStream;
048: import java.util.Hashtable;
049:
050: import org.apache.poi.hslf.extractor.PowerPointExtractor;
051:
052: import de.anomic.plasma.plasmaParserDocument;
053: import de.anomic.plasma.parser.AbstractParser;
054: import de.anomic.plasma.parser.Parser;
055: import de.anomic.plasma.parser.ParserException;
056: import de.anomic.yacy.yacyURL;
057:
058: public class pptParser extends AbstractParser implements Parser {
059:
060: /**
061: * a list of mime types that are supported by this parser class
062: * @see #getSupportedMimeTypes()
063: */
064: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
065: static {
066: SUPPORTED_MIME_TYPES.put("application/mspowerpoint", "ppt,pps");
067: SUPPORTED_MIME_TYPES.put("application/powerpoint", "ppt,pps");
068: SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",
069: "ppt,pps");
070: }
071:
072: /**
073: * a list of library names that are needed by this parser
074: * @see Parser#getLibxDependences()
075: */
076: private static final String[] LIBX_DEPENDENCIES = new String[] {
077: "poi-3.0-alpha2-20060616.jar",
078: "poi-scratchpad-3.0-alpha2-20060616.jar" };
079:
080: public pptParser() {
081: super (LIBX_DEPENDENCIES);
082: this .parserName = "Microsoft Powerpoint Parser";
083: this .parserVersionNr = "0.1";
084: }
085:
086: /*
087: * parses the source documents and returns a plasmaParserDocument containing
088: * all extracted information about the parsed document
089: */
090: public plasmaParserDocument parse(yacyURL location,
091: String mimeType, String charset, InputStream source)
092: throws ParserException, InterruptedException {
093: try {
094: /*
095: * create new PowerPointExtractor and extract text and notes
096: * of the document
097: */
098: PowerPointExtractor pptExtractor = new PowerPointExtractor(
099: new BufferedInputStream(source));
100: String contents = pptExtractor.getText(true, true);
101:
102: /*
103: * create the plasmaParserDocument for the database
104: * and set shortText and bodyText properly
105: */
106: plasmaParserDocument theDoc = new plasmaParserDocument(
107: location, mimeType, "UTF-8", null, ((contents
108: .length() > 80) ? contents.substring(0, 80)
109: : contents.trim()).replaceAll("\r\n", " ")
110: .replaceAll("\n", " ")
111: .replaceAll("\r", " ")
112: .replaceAll("\t", " "), "", // TODO: AUTHOR
113: null, null, contents.getBytes("UTF-8"), null, null);
114: return theDoc;
115: } catch (Exception e) {
116: if (e instanceof InterruptedException)
117: throw (InterruptedException) e;
118:
119: /*
120: * an unexpected error occurred, log it and throw a ParserException
121: */
122: String errorMsg = "Unable to parse the ppt document '"
123: + location + "':" + e.getMessage();
124: this .theLogger.logSevere(errorMsg);
125: throw new ParserException(errorMsg, location);
126: }
127: }
128:
129: public Hashtable<String, String> getSupportedMimeTypes() {
130: return SUPPORTED_MIME_TYPES;
131: }
132:
133: public void reset() {
134: //nothing to do
135: super.reset();
136: }
137: }
|