001: //zipParser.java
002: //------------------------
003: //part of YaCy
004: //(C) by Michael Peter Christen; mc@anomic.de
005: //first published on http://www.anomic.de
006: //Frankfurt, Germany, 2005
007: //
008: //this file is contributed by Martin Thelian
009: //last major change: 16.05.2005
010: //
011: //This program is free software; you can redistribute it and/or modify
012: //it under the terms of the GNU General Public License as published by
013: //the Free Software Foundation; either version 2 of the License, or
014: //(at your option) any later version.
015: //
016: //This program is distributed in the hope that it will be useful,
017: //but WITHOUT ANY WARRANTY; without even the implied warranty of
018: //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: //GNU General Public License for more details.
020: //
021: //You should have received a copy of the GNU General Public License
022: //along with this program; if not, write to the Free Software
023: //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: //Using this software in any meaning (reading, learning, copying, compiling,
026: //running) means that you agree that the Author(s) is (are) not responsible
027: //for cost, loss of data or any harm that may be caused directly or indirectly
028: //by usage of this softare or this documentation. The usage of this software
029: //is on your own risk. The installation and usage (starting/running) of this
030: //software may allow other people or application to access your computer and
031: //any attached devices and is highly dependent on the configuration of the
032: //software which must be done by the user of the software; the author(s) is
033: //(are) also not responsible for proper configuration and usage of the
034: //software, even if provoked by documentation provided together with
035: //the software.
036: //
037: //Any changes to this file according to the GPL as documented in the file
038: //gpl.txt aside this file in the shipment you received can be done to the
039: //lines that follows this copyright notice here, but changes must not be
040: //done inside the copyright notive above. A re-distribution must contain
041: //the intact and unchanged copyright notice.
042: //Contributions and changes to the program code must be marked as such.
043:
044: package de.anomic.plasma.parser.zip;
045:
046: import java.io.BufferedOutputStream;
047: import java.io.File;
048: import java.io.FileOutputStream;
049: import java.io.InputStream;
050: import java.io.OutputStream;
051: import java.util.Arrays;
052: import java.util.HashMap;
053: import java.util.Hashtable;
054: import java.util.LinkedList;
055: import java.util.Map;
056: import java.util.TreeSet;
057: import java.util.zip.ZipEntry;
058: import java.util.zip.ZipInputStream;
059:
060: import de.anomic.htmlFilter.htmlFilterImageEntry;
061: import de.anomic.plasma.plasmaParser;
062: import de.anomic.plasma.plasmaParserDocument;
063: import de.anomic.plasma.parser.AbstractParser;
064: import de.anomic.plasma.parser.Parser;
065: import de.anomic.plasma.parser.ParserException;
066: import de.anomic.server.serverByteBuffer;
067: import de.anomic.server.serverFileUtils;
068: import de.anomic.yacy.yacyURL;
069:
070: public class zipParser extends AbstractParser implements Parser {
071:
072: /**
073: * a list of mime types that are supported by this parser class
074: * @see #getSupportedMimeTypes()
075: */
076: public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
077: static {
078: SUPPORTED_MIME_TYPES.put("application/zip", "zip");
079: SUPPORTED_MIME_TYPES.put("application/x-zip", "zip");
080: SUPPORTED_MIME_TYPES.put("application/x-zip-compressed", "zip");
081: SUPPORTED_MIME_TYPES.put("application/java-archive", "jar");
082: }
083:
084: /**
085: * a list of library names that are needed by this parser
086: * @see Parser#getLibxDependences()
087: */
088: private static final String[] LIBX_DEPENDENCIES = new String[] {};
089:
090: public zipParser() {
091: super (LIBX_DEPENDENCIES);
092: this .parserName = "Compressed Archive File Parser";
093: }
094:
095: public Hashtable<String, String> getSupportedMimeTypes() {
096: return SUPPORTED_MIME_TYPES;
097: }
098:
099: public plasmaParserDocument parse(yacyURL location,
100: String mimeType, String charset, InputStream source)
101: throws ParserException, InterruptedException {
102:
103: long docTextLength = 0;
104: OutputStream docText = null;
105: File outputFile = null;
106: plasmaParserDocument subDoc = null;
107: try {
108: if ((this .contentLength == -1)
109: || (this .contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
110: outputFile = File.createTempFile("zipParser", ".tmp");
111: docText = new BufferedOutputStream(
112: new FileOutputStream(outputFile));
113: } else {
114: docText = new serverByteBuffer();
115: }
116:
117: StringBuffer docKeywords = new StringBuffer();
118: StringBuffer docLongTitle = new StringBuffer();
119: LinkedList<String> docSections = new LinkedList<String>();
120: StringBuffer docAbstrct = new StringBuffer();
121: Map<yacyURL, String> docAnchors = new HashMap<yacyURL, String>();
122: TreeSet<htmlFilterImageEntry> docImages = new TreeSet<htmlFilterImageEntry>();
123:
124: // creating a new parser class to parse the unzipped content
125: plasmaParser theParser = new plasmaParser();
126:
127: // looping through the contained files
128: ZipEntry entry;
129: ZipInputStream zippedContent = new ZipInputStream(source);
130: while ((entry = zippedContent.getNextEntry()) != null) {
131: // check for interruption
132: checkInterruption();
133:
134: // skip directories
135: if (entry.isDirectory())
136: continue;
137:
138: // Get the entry name
139: String entryName = entry.getName();
140: int idx = entryName.lastIndexOf(".");
141:
142: // getting the file extension
143: String entryExt = (idx > -1) ? entryName
144: .substring(idx + 1) : "";
145:
146: // trying to determine the mimeType per file extension
147: String entryMime = plasmaParser
148: .getMimeTypeByFileExt(entryExt);
149:
150: // parsing the content
151: File subDocTempFile = null;
152: try {
153: // create the temp file
154: subDocTempFile = createTempFile(entryName);
155:
156: // copy the data into the file
157: serverFileUtils.copy(zippedContent, subDocTempFile,
158: entry.getSize());
159:
160: // parsing the zip file entry
161: subDoc = theParser.parseSource(yacyURL.newURL(
162: location, "#" + entryName), entryMime,
163: null, subDocTempFile);
164: } catch (ParserException e) {
165: this .theLogger
166: .logInfo("Unable to parse zip file entry '"
167: + entryName + "'. "
168: + e.getMessage());
169: } finally {
170: if (subDocTempFile != null)
171: try {
172: subDocTempFile.delete();
173: } catch (Exception ex) {/* ignore this */
174: }
175: }
176: if (subDoc == null)
177: continue;
178:
179: // merging all documents together
180: if (docKeywords.length() > 0)
181: docKeywords.append(",");
182: docKeywords.append(subDoc.dc_subject(','));
183:
184: if (docLongTitle.length() > 0)
185: docLongTitle.append("\n");
186: docLongTitle.append(subDoc.dc_title());
187:
188: docSections.addAll(Arrays.asList(subDoc
189: .getSectionTitles()));
190:
191: if (docAbstrct.length() > 0)
192: docAbstrct.append("\n");
193: docAbstrct.append(subDoc.dc_description());
194:
195: if (subDoc.getTextLength() > 0) {
196: if (docTextLength > 0)
197: docText.write('\n');
198: docTextLength += serverFileUtils.copy(subDoc
199: .getText(), docText);
200: }
201:
202: docAnchors.putAll(subDoc.getAnchors());
203: docImages.addAll(subDoc.getImages());
204:
205: // release subdocument
206: subDoc.close();
207: subDoc = null;
208: }
209:
210: plasmaParserDocument result = null;
211:
212: if (docText instanceof serverByteBuffer) {
213: result = new plasmaParserDocument(
214: location,
215: mimeType,
216: null,
217: docKeywords.toString().split(" |,"),
218: docLongTitle.toString(),
219: "", // TODO: AUTHOR
220: (String[]) docSections
221: .toArray(new String[docSections.size()]),
222: docAbstrct.toString(),
223: ((serverByteBuffer) docText).getBytes(),
224: docAnchors, docImages);
225: } else {
226: result = new plasmaParserDocument(
227: location,
228: mimeType,
229: null,
230: docKeywords.toString().split(" |,"),
231: docLongTitle.toString(),
232: "", // TODO: AUTHOR
233: (String[]) docSections
234: .toArray(new String[docSections.size()]),
235: docAbstrct.toString(), outputFile, docAnchors,
236: docImages);
237: }
238:
239: return result;
240: } catch (Exception e) {
241: if (e instanceof InterruptedException)
242: throw (InterruptedException) e;
243: if (e instanceof ParserException)
244: throw (ParserException) e;
245:
246: if (subDoc != null)
247: subDoc.close();
248:
249: // close the writer
250: if (docText != null)
251: try {
252: docText.close();
253: } catch (Exception ex) {/* ignore this */
254: }
255:
256: // delete the file
257: if (outputFile != null)
258: try {
259: outputFile.delete();
260: } catch (Exception ex) {/* ignore this */
261: }
262:
263: throw new ParserException(
264: "Unexpected error while parsing zip resource. "
265: + e.getClass().getName() + ": "
266: + e.getMessage(), location);
267: }
268: }
269:
270: public void reset() {
271: // Nothing todo here at the moment
272: super.reset();
273: }
274: }
|