001: //tarParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 16.05.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.tar;
045:
046: import java.io.BufferedOutputStream;
047: import java.io.File;
048: import java.io.FileOutputStream;
049: import java.io.InputStream;
050: import java.io.OutputStream;
051: import java.util.Arrays;
052: import java.util.HashMap;
053: import java.util.Hashtable;
054: import java.util.LinkedList;
055: import java.util.Map;
056: import java.util.TreeSet;
057: import java.util.zip.GZIPInputStream;
058:
059: import com.ice.tar.TarEntry;
060: import com.ice.tar.TarInputStream;
061:
062: import de.anomic.htmlFilter.htmlFilterImageEntry;
063: import de.anomic.plasma.plasmaParser;
064: import de.anomic.plasma.plasmaParserDocument;
065: import de.anomic.plasma.parser.AbstractParser;
066: import de.anomic.plasma.parser.Parser;
067: import de.anomic.plasma.parser.ParserException;
068: import de.anomic.server.serverByteBuffer;
069: import de.anomic.server.serverFileUtils;
070: import de.anomic.yacy.yacyURL;
071:
072: public class tarParser extends AbstractParser implements Parser {
073:
074: /**
075: * a list of mime types that are supported by this parser class
076: * @see #getSupportedMimeTypes()
077: */
078: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
079: static {
080: SUPPORTED_MIME_TYPES.put("application/x-tar", "tar");
081: SUPPORTED_MIME_TYPES.put("application/tar", "tar");
082: }
083:
084: /**
085: * a list of library names that are needed by this parser
086: * @see Parser#getLibxDependences()
087: */
088: private static final String[] LIBX_DEPENDENCIES = new String[] {
089: // "tar.jar"
090: };
091:
092: public tarParser() {
093: super (LIBX_DEPENDENCIES);
094: this .parserName = "Tape Archive File Parser";
095: }
096:
097: public Hashtable<String, String> getSupportedMimeTypes() {
098: return SUPPORTED_MIME_TYPES;
099: }
100:
101: public plasmaParserDocument parse(yacyURL location,
102: String mimeType, String charset, InputStream source)
103: throws ParserException, InterruptedException {
104:
105: long docTextLength = 0;
106: OutputStream docText = null;
107: File outputFile = null;
108: plasmaParserDocument subDoc = null;
109: try {
110: if ((this .contentLength == -1)
111: || (this .contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
112: outputFile = File.createTempFile("zipParser", ".tmp");
113: docText = new BufferedOutputStream(
114: new FileOutputStream(outputFile));
115: } else {
116: docText = new serverByteBuffer();
117: }
118:
119: // creating a new parser class to parse the unzipped content
120: plasmaParser theParser = new plasmaParser();
121:
122: /*
123: * If the mimeType was not reported correcly by the webserve we
124: * have to decompress it first
125: */
126: String ext = plasmaParser.getFileExt(location)
127: .toLowerCase();
128: if (ext.equals("gz") || ext.equals("tgz")) {
129: source = new GZIPInputStream(source);
130: }
131:
132: // TODO: what about bzip ....
133:
134: StringBuffer docKeywords = new StringBuffer();
135: StringBuffer docLongTitle = new StringBuffer();
136: LinkedList<String> docSections = new LinkedList<String>();
137: StringBuffer docAbstrct = new StringBuffer();
138:
139: Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
140: TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
141:
142: // looping through the contained files
143: TarEntry entry;
144: TarInputStream tin = new TarInputStream(source);
145: while ((entry = tin.getNextEntry()) != null) {
146: // check for interruption
147: checkInterruption();
148:
149: // skip directories
150: if (entry.isDirectory())
151: continue;
152:
153: // Get the short entry name
154: String entryName = entry.getName();
155:
156: // getting the entry file extension
157: int idx = entryName.lastIndexOf(".");
158: String entryExt = (idx > -1) ? entryName
159: .substring(idx + 1) : "";
160:
161: // trying to determine the mimeType per file extension
162: String entryMime = plasmaParser
163: .getMimeTypeByFileExt(entryExt);
164:
165: // getting the entry content
166: File subDocTempFile = null;
167: try {
168: // create the temp file
169: subDocTempFile = createTempFile(entryName);
170:
171: // copy the data into the file
172: serverFileUtils.copy(tin, subDocTempFile, entry
173: .getSize());
174:
175: // check for interruption
176: checkInterruption();
177:
178: // parsing the content
179: subDoc = theParser.parseSource(yacyURL.newURL(
180: location, "#" + entryName), entryMime,
181: null, subDocTempFile);
182: } catch (ParserException e) {
183: this .theLogger
184: .logInfo("Unable to parse tar file entry '"
185: + entryName + "'. "
186: + e.getMessage());
187: } finally {
188: if (subDocTempFile != null)
189: try {
190: subDocTempFile.delete();
191: } catch (Exception ex) {/* ignore this */
192: }
193: }
194: if (subDoc == null)
195: continue;
196:
197: // merging all documents together
198: if (docKeywords.length() > 0)
199: docKeywords.append(",");
200: docKeywords.append(subDoc.dc_subject(','));
201:
202: if (docLongTitle.length() > 0)
203: docLongTitle.append("\n");
204: docLongTitle.append(subDoc.dc_title());
205:
206: docSections.addAll(Arrays.asList(subDoc
207: .getSectionTitles()));
208:
209: if (docAbstrct.length() > 0)
210: docAbstrct.append("\n");
211: docAbstrct.append(subDoc.dc_description());
212:
213: if (subDoc.getTextLength() > 0) {
214: if (docTextLength > 0)
215: docText.write('\n');
216: docTextLength += serverFileUtils.copy(subDoc
217: .getText(), docText);
218: }
219:
220: docAnchors.putAll(subDoc.getAnchors());
221: docImages.addAll(subDoc.getImages());
222:
223: // release subdocument
224: subDoc.close();
225: subDoc = null;
226: }
227:
228: plasmaParserDocument result = null;
229:
230: if (docText instanceof serverByteBuffer) {
231: result = new plasmaParserDocument(
232: location,
233: mimeType,
234: null,
235: docKeywords.toString().split(" |,"),
236: docLongTitle.toString(),
237: "", // TODO: AUTHOR
238: (String[]) docSections
239: .toArray(new String[docSections.size()]),
240: docAbstrct.toString(),
241: ((serverByteBuffer) docText).getBytes(),
242: docAnchors, docImages);
243: } else {
244: result = new plasmaParserDocument(
245: location,
246: mimeType,
247: null,
248: docKeywords.toString().split(" |,"),
249: docLongTitle.toString(),
250: "", // TODO: AUTHOR
251: (String[]) docSections
252: .toArray(new String[docSections.size()]),
253: docAbstrct.toString(), outputFile, docAnchors,
254: docImages);
255: }
256:
257: return result;
258: } catch (Exception e) {
259: if (e instanceof InterruptedException)
260: throw (InterruptedException) e;
261: if (e instanceof ParserException)
262: throw (ParserException) e;
263:
264: if (subDoc != null)
265: subDoc.close();
266:
267: // close the writer
268: if (docText != null)
269: try {
270: docText.close();
271: } catch (Exception ex) {/* ignore this */
272: }
273:
274: // delete the file
275: if (outputFile != null)
276: try {
277: outputFile.delete();
278: } catch (Exception ex) {/* ignore this */
279: }
280:
281: throw new ParserException(
282: "Unexpected error while parsing tar resource. "
283: + e.getMessage(), location);
284: }
285: }
286:
287: public void reset() {
288: // Nothing todo here at the moment
289: super.reset();
290: }
291: }
|