001: //psParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2007
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 15.09.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.ps;
045:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Hashtable;

import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL;
061:
062: public class psParser extends AbstractParser implements Parser {
063:
064: /**
065: * a list of mime types that are supported by this parser class
066: * @see #getSupportedMimeTypes()
067: */
068: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
069: static {
070: SUPPORTED_MIME_TYPES.put("application/postscript", "ps");
071: SUPPORTED_MIME_TYPES.put("text/postscript", "ps");
072: }
073:
074: /**
075: * a list of library names that are needed by this parser
076: * @see Parser#getLibxDependences()
077: */
078: private static final String[] LIBX_DEPENDENCIES = new String[] {};
079:
080: private static Object modeScan = new Object();
081: private static boolean modeScanDone = false;
082: private static String parserMode = "java";
083:
084: public psParser() {
085: super (LIBX_DEPENDENCIES);
086: this .parserName = "PostScript Document Parser";
087: if (!modeScanDone)
088: synchronized (modeScan) {
089: if (testForPs2Ascii())
090: parserMode = "ps2ascii";
091: else
092: parserMode = "java";
093: modeScanDone = true;
094: }
095: }
096:
097: public Hashtable<String, String> getSupportedMimeTypes() {
098: return SUPPORTED_MIME_TYPES;
099: }
100:
101: public boolean testForPs2Ascii() {
102: try {
103: String procOutputLine = null;
104: StringBuffer procOutput = new StringBuffer();
105:
106: Process ps2asciiProc = Runtime.getRuntime().exec(
107: new String[] { "ps2ascii", "--version" });
108: BufferedReader stdOut = new BufferedReader(
109: new InputStreamReader(ps2asciiProc.getInputStream()));
110: while ((procOutputLine = stdOut.readLine()) != null) {
111: procOutput.append(procOutputLine).append(", ");
112: }
113: int returnCode = ps2asciiProc.waitFor();
114: return (returnCode == 0);
115: } catch (Exception e) {
116: if (this .theLogger != null)
117: this .theLogger
118: .logInfo("ps2ascii not found. Switching to java parser mode.");
119: return false;
120: }
121: }
122:
123: public plasmaParserDocument parse(yacyURL location,
124: String mimeType, String charset, File sourceFile)
125: throws ParserException, InterruptedException {
126:
127: File outputFile = null;
128: try {
129: // creating a temp file for the output
130: outputFile = super .createTempFile("ascii.txt");
131:
132: // decide with parser mode to use
133: if (parserMode.equals("ps2ascii")) {
134: parseUsingPS2ascii(sourceFile, outputFile);
135: } else {
136: parseUsingJava(sourceFile, outputFile);
137: }
138:
139: // return result
140: plasmaParserDocument theDoc = new plasmaParserDocument(
141: location, mimeType, "UTF-8", null, null, "", null,
142: null, outputFile, null, null);
143:
144: return theDoc;
145: } catch (Exception e) {
146: if (e instanceof InterruptedException)
147: throw (InterruptedException) e;
148: if (e instanceof ParserException)
149: throw (ParserException) e;
150:
151: // delete temp file
152: if (outputFile != null)
153: outputFile.delete();
154:
155: // throw exception
156: throw new ParserException(
157: "Unexpected error while parsing ps file. "
158: + e.getMessage(), location);
159: }
160: }
161:
162: public void parseUsingJava(File inputFile, File outputFile)
163: throws Exception {
164:
165: BufferedReader reader = null;
166: BufferedWriter writer = null;
167: try {
168: reader = new BufferedReader(new FileReader(inputFile));
169: writer = new BufferedWriter(new FileWriter(outputFile));
170:
171: String versionInfoLine = reader.readLine();
172: String version = versionInfoLine.substring(versionInfoLine
173: .length() - 3);
174:
175: int ichar = 0;
176: boolean isComment = false;
177: boolean isText = false;
178:
179: if (version.startsWith("2")) {
180: boolean isConnector = false;
181:
182: while ((ichar = reader.read()) > 0) {
183: if (isConnector) {
184: if (ichar < 108) {
185: writer.write(' ');
186: }
187: isConnector = false;
188: } else if (ichar == '%') {
189: isComment = true;
190: } else if (ichar == '\n' && isComment) {
191: isComment = false;
192: } else if (ichar == ')' && isText) {
193: isConnector = true;
194: isText = false;
195: } else if (isText) {
196: writer.write((char) ichar);
197: } else if (ichar == '(' && !isComment) {
198: isText = true;
199: }
200: }
201:
202: } else if (version.startsWith("3")) {
203: StringBuffer stmt = new StringBuffer();
204: boolean isBMP = false;
205: boolean isStore = false;
206: int store = 0;
207:
208: while ((ichar = reader.read()) > 0) {
209: if (ichar == '%') {
210: isComment = true;
211: } else if (ichar == '\n' && isComment) {
212: isComment = false;
213: } else if (ichar == ')' && isText) {
214: isText = false;
215: } else if (isText && !isBMP) {
216: writer.write((char) ichar);
217: } else if (ichar == '(' && !isComment && !isBMP) {
218: isText = true;
219: } else if (isStore) {
220: if (store == 9 || ichar == ' ' || ichar == 10) {
221: isStore = false;
222: store = 0;
223: if (stmt.toString().equals("BEGINBITM")) {
224: isText = false;
225: isBMP = true;
226: } else if (stmt.toString().equals(
227: "ENDBITMAP")) {
228: isBMP = false;
229: }
230: stmt.delete(0, stmt.length());
231: } else {
232: stmt.append((char) ichar);
233: store++;
234: }
235: } else if (!isComment && !isStore
236: && (ichar == 66 || ichar == 69)) {
237: isStore = true;
238: stmt.append((char) ichar);
239: store++;
240: }
241: }
242: } else {
243: throw new Exception("Unsupported Postscript version '"
244: + version + "'.");
245: }
246: } finally {
247: if (reader != null)
248: try {
249: reader.close();
250: } catch (Exception e) {/* */
251: }
252: if (writer != null)
253: try {
254: writer.close();
255: } catch (Exception e) {/* */
256: }
257: }
258:
259: }
260:
261: /**
262: * This function requires the ghostscript-library
263: * @param inputFile
264: * @param outputFile
265: * @throws Exception
266: */
267: private void parseUsingPS2ascii(File inputFile, File outputFile)
268: throws Exception {
269: int execCode = 0;
270: StringBuffer procErr = null;
271: try {
272: String procOutputLine = null;
273: StringBuffer procOut = new StringBuffer();
274: procErr = new StringBuffer();
275:
276: Process ps2asciiProc = Runtime.getRuntime().exec(
277: new String[] { "ps2ascii",
278: inputFile.getAbsolutePath(),
279: outputFile.getAbsolutePath() });
280: BufferedReader stdOut = new BufferedReader(
281: new InputStreamReader(ps2asciiProc.getInputStream()));
282: BufferedReader stdErr = new BufferedReader(
283: new InputStreamReader(ps2asciiProc.getErrorStream()));
284: while ((procOutputLine = stdOut.readLine()) != null) {
285: procOut.append(procOutputLine);
286: }
287: while ((procOutputLine = stdErr.readLine()) != null) {
288: procErr.append(procOutputLine);
289: }
290: execCode = ps2asciiProc.waitFor();
291: } catch (Exception e) {
292: String errorMsg = "Unable to convert ps to ascii. "
293: + e.getMessage();
294: this .theLogger.logSevere(errorMsg);
295: throw new Exception(errorMsg);
296: }
297:
298: if (execCode != 0)
299: throw new Exception(
300: "Unable to convert ps to ascii. ps2ascii returned statuscode "
301: + execCode + "\n" + procErr.toString());
302: }
303:
304: public void reset() {
305: // Nothing todo here at the moment
306: super .reset();
307: }
308:
309: public plasmaParserDocument parse(yacyURL location,
310: String mimeType, String charset, InputStream source)
311: throws ParserException, InterruptedException {
312:
313: File tempFile = null;
314: try {
315: // creating a tempfile
316: tempFile = super .createTempFile("temp.ps");
317: tempFile.deleteOnExit();
318:
319: // copying inputstream into file
320: serverFileUtils.copy(source, tempFile);
321:
322: // parsing the file
323: return parse(location, mimeType, charset, tempFile);
324: } catch (Exception e) {
325: if (e instanceof InterruptedException)
326: throw (InterruptedException) e;
327: if (e instanceof ParserException)
328: throw (ParserException) e;
329:
330: throw new ParserException("Unable to parse the ps file. "
331: + e.getMessage(), location, e);
332: } finally {
333: if (tempFile != null)
334: try {
335: tempFile.delete();
336: } catch (Exception e) {/* */
337: }
338: }
339: }
340:
341: }
|