001: //AbstractParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file was contributed by Martin Thelian
009: //last major change: $LastChangedDate$ by $LastChangedBy$
010: //Revision: $LastChangedRevision$
011: //
012: //This program is free software; you can redistribute it and/or modify
013: //it under the terms of the GNU General Public License as published by
014: //the Free Software Foundation; either version 2 of the License, or
015: //(at your option) any later version.
016: //
017: //This program is distributed in the hope that it will be useful,
018: //but WITHOUT ANY WARRANTY; without even the implied warranty of
019: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: //GNU General Public License for more details.
021: //
022: //You should have received a copy of the GNU General Public License
023: //along with this program; if not, write to the Free Software
024: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: //
026: //Using this software in any meaning (reading, learning, copying, compiling,
027: //running) means that you agree that the Author(s) is (are) not responsible
028: //for cost, loss of data or any harm that may be caused directly or indirectly
029: //by usage of this softare or this documentation. The usage of this software
030: //is on your own risk. The installation and usage (starting/running) of this
031: //software may allow other people or application to access your computer and
032: //any attached devices and is highly dependent on the configuration of the
033: //software which must be done by the user of the software; the author(s) is
034: //(are) also not responsible for proper configuration and usage of the
035: //software, even if provoked by documentation provided together with
036: //the software.
037: //
038: //Any changes to this file according to the GPL as documented in the file
039: //gpl.txt aside this file in the shipment you received can be done to the
040: //lines that follows this copyright notice here, but changes must not be
041: //done inside the copyright notive above. A re-distribution must contain
042: //the intact and unchanged copyright notice.
043: //Contributions and changes to the program code must be marked as such.
044:
045: package de.anomic.plasma.parser;
046:
047: import java.io.BufferedInputStream;
048: import java.io.ByteArrayInputStream;
049: import java.io.File;
050: import java.io.FileInputStream;
051: import java.io.FileNotFoundException;
052: import java.io.IOException;
053: import java.io.InputStream;
054:
055: import de.anomic.plasma.plasmaParser;
056: import de.anomic.plasma.plasmaParserDocument;
057: import de.anomic.server.serverThread;
058: import de.anomic.server.logging.serverLog;
059: import de.anomic.yacy.yacyURL;
060:
061: /**
062: * New classes implementing the {@link de.anomic.plasma.parser.Parser} interface
063: * can extend this class to inherit all functions already implemented in this class.
064: * @author Martin Thelian
065: * @version $LastChangedRevision$ / $LastChangedDate$
066: */
067: public abstract class AbstractParser implements Parser {
068:
069: /**
070: * a list of library names that are needed by this parser
071: */
072: protected String[] libxDependencies = null;
073:
074: /**
075: * the logger class that should be used by the parser module for logging
076: * purposes.
077: */
078: protected serverLog theLogger = null;
079:
080: /**
081: * Version number of the parser
082: */
083: protected String parserVersionNr = "0.1";
084:
085: /**
086: * Parser name
087: */
088: protected String parserName = this .getClass().getName();
089:
090: /**
091: * The source file file size in bytes if the source document was passed
092: * in as file
093: */
094: protected long contentLength = -1;
095:
096: /**
097: * The Constructor of this class.
098: */
099: public AbstractParser(String[] libxDependencies) {
100: super ();
101: this .libxDependencies = libxDependencies;
102: }
103:
104: /**
105: * Set the content length of the source file.
106: * This value is needed by some parsers to decide
107: * if the parsed text could be hold in memory
108: */
109: public void setContentLength(long length) {
110: this .contentLength = length;
111: }
112:
113: /**
114: * Check if the parser was interrupted.
115: * @throws InterruptedException if the parser was interrupted
116: */
117: public static final void checkInterruption()
118: throws InterruptedException {
119: Thread currentThread = Thread.currentThread();
120: if ((currentThread instanceof serverThread)
121: && ((serverThread) currentThread).shutdownInProgress())
122: throw new InterruptedException("Shutdown in progress ...");
123: if (currentThread.isInterrupted())
124: throw new InterruptedException("Shutdown in progress ...");
125: }
126:
127: public final File createTempFile(String name) throws IOException {
128: String parserClassName = this .getClass().getName();
129: int idx = parserClassName.lastIndexOf(".");
130: if (idx != -1) {
131: parserClassName = parserClassName.substring(idx + 1);
132: }
133:
134: // getting the file extension
135: idx = name.lastIndexOf("/");
136: String fileName = (idx != -1) ? name.substring(idx + 1) : name;
137:
138: idx = fileName.lastIndexOf(".");
139: String fileExt = (idx > -1) ? fileName.substring(idx + 1) : "";
140:
141: // creates the temp file
142: File tempFile = File.createTempFile(parserClassName + "_"
143: + ((idx > -1) ? fileName.substring(0, idx) : fileName),
144: (fileExt.length() > 0) ? "." + fileExt : fileExt);
145: return tempFile;
146: }
147:
148: public int parseDir(yacyURL location, String prefix, File dir,
149: plasmaParserDocument doc) throws ParserException,
150: InterruptedException, IOException {
151: if (!dir.isDirectory())
152: throw new ParserException("tried to parse ordinary file "
153: + dir + " as directory", location);
154:
155: String[] files = dir.list();
156: int result = 0;
157: for (int i = 0; i < files.length; i++) {
158: checkInterruption();
159: File file = new File(dir, files[i]);
160: this .theLogger.logFine("parsing file " + location + "#"
161: + file + " in archive...");
162: if (file.isDirectory()) {
163: result += parseDir(location, prefix, file, doc);
164: } else
165: try {
166: yacyURL url = yacyURL.newURL(location, "/"
167: + prefix
168: + "/"
169: // XXX: workaround for relative paths within document
170: + file.getPath().substring(
171: file.getPath().indexOf(
172: File.separatorChar) + 1)
173: + "/" + file.getName());
174: plasmaParserDocument subdoc = new plasmaParser()
175: .parseSource(
176: url,
177: plasmaParser
178: .getMimeTypeByFileExt(files[i]
179: .substring(files[i]
180: .indexOf('.') + 1)),
181: null, file);
182: // TODO: change anchors back to use '#' after archive name
183: doc.addSubDocument(subdoc);
184: subdoc.close();
185: result++;
186: } catch (ParserException e) {
187: this .theLogger.logInfo("unable to parse file "
188: + file + " in " + location + ", skipping");
189: }
190: }
191: return result;
192: }
193:
194: /**
195: * Parsing a document available as byte array.
196: * @param location the origin of the document
197: * @param mimeType the mimetype of the document
198: * @param charset the supposed charset of the document or <code>null</code> if unkown
199: * @param source the content byte array
200: * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
201: * and some additional metadata.
202: * @throws ParserException if the content could not be parsed properly
203: *
204: * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, byte[])
205: */
206: public plasmaParserDocument parse(yacyURL location,
207: String mimeType, String charset, byte[] source)
208: throws ParserException, InterruptedException {
209: ByteArrayInputStream contentInputStream = null;
210: try {
211: // convert the byte array into a stream
212: contentInputStream = new ByteArrayInputStream(source);
213:
214: // parse the stream
215: return this .parse(location, mimeType, charset,
216: contentInputStream);
217: } finally {
218: if (contentInputStream != null) {
219: try {
220: contentInputStream.close();
221: contentInputStream = null;
222: } catch (Exception e) { /* ignore this */
223: }
224: }
225: }
226: }
227:
228: /**
229: * Parsing a document stored in a {@link File}
230: * @param location the origin of the document
231: * @param mimeType the mimetype of the document
232: * @param charset the supposed charset of the document or <code>null</code> if unkown
233: * @param sourceFile the file containing the content of the document
234: * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
235: * and some additional metadata.
236: * @throws ParserException if the content could not be parsed properly
237: *
238: * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
239: */
240: public plasmaParserDocument parse(yacyURL location,
241: String mimeType, String charset, File sourceFile)
242: throws ParserException, InterruptedException {
243: BufferedInputStream contentInputStream = null;
244: try {
245: // getting the file size of the document
246: this .contentLength = sourceFile.length();
247:
248: // create a stream from the file
249: contentInputStream = new BufferedInputStream(
250: new FileInputStream(sourceFile));
251:
252: // parse the stream
253: return this .parse(location, mimeType, charset,
254: contentInputStream);
255: } catch (FileNotFoundException e) {
256: throw new ParserException(
257: "Unexpected error while parsing file. "
258: + e.getMessage(), location);
259: } finally {
260: if (contentInputStream != null)
261: try {
262: contentInputStream.close();
263: } catch (Exception e) {/* ignore this */
264: }
265: }
266: }
267:
268: /**
269: * Parsing a document available as {@link InputStream}
270: * @param location the origin of the document
271: * @param mimeType the mimetype of the document
272: * @param charset the supposed charset of the document or <code>null</code> if unkown
273: * @param source the {@link InputStream} containing the document content
274: * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
275: * and some additional metadata.
276: * @throws ParserException if the content could not be parsed properly
277: *
278: * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
279: */
280: public abstract plasmaParserDocument parse(yacyURL location,
281: String mimeType, String charset, InputStream source)
282: throws ParserException, InterruptedException;
283:
284: /**
285: * @return Returns a list of library names that are needed by this parser
286: * @see de.anomic.plasma.parser.Parser#getLibxDependences()
287: */
288: public String[] getLibxDependences() {
289: return this .libxDependencies;
290: }
291:
292: /**
293: * Setting the logger that should be used by this parser class ...
294: */
295: public void setLogger(serverLog log) {
296: this .theLogger = log;
297: }
298:
299: /**
300: * Returns the version number of the parser
301: * @return parser version number
302: */
303: public String getVersion() {
304: return this .parserVersionNr;
305: }
306:
307: /**
308: * Return the name of the parser
309: */
310: public String getName() {
311: return this .parserName;
312: }
313:
314: public void reset() {
315: this .contentLength = -1;
316: }
317: }
|