001: // SZParserExtractCallback.java
002: // -------------------------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2004
007: //
008: // This file ist contributed by Franz Brausze
009: //
010: // This program is free software; you can redistribute it and/or modify
011: // it under the terms of the GNU General Public License as published by
012: // the Free Software Foundation; either version 2 of the License, or
013: // (at your option) any later version.
014: //
015: // This program is distributed in the hope that it will be useful,
016: // but WITHOUT ANY WARRANTY; without even the implied warranty of
017: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
018: // GNU General Public License for more details.
019: //
020: // You should have received a copy of the GNU General Public License
021: // along with this program; if not, write to the Free Software
022: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
023: //
024: // Using this software in any meaning (reading, learning, copying, compiling,
025: // running) means that you agree that the Author(s) is (are) not responsible
026: // for cost, loss of data or any harm that may be caused directly or indirectly
027: // by usage of this softare or this documentation. The usage of this software
028: // is on your own risk. The installation and usage (starting/running) of this
029: // software may allow other people or application to access your computer and
030: // any attached devices and is highly dependent on the configuration of the
031: // software which must be done by the user of the software; the author(s) is
032: // (are) also not responsible for proper configuration and usage of the
033: // software, even if provoked by documentation provided together with
034: // the software.
035: //
036: // Any changes to this file according to the GPL as documented in the file
037: // gpl.txt aside this file in the shipment you received can be done to the
038: // lines that follows this copyright notice here, but changes must not be
039: // done inside the copyright notive above. A re-distribution must contain
040: // the intact and unchanged copyright notice.
041: // Contributions and changes to the program code must be marked as such.
043: package de.anomic.plasma.parser.sevenzip;
045: import java.io.IOException;
046: import java.io.OutputStream;
047: import java.util.HashMap;
048: import java.util.Iterator;
049: import java.util.Map;
051: import SevenZip.ArchiveExtractCallback;
052: import SevenZip.Archive.IInArchive;
053: import SevenZip.Archive.SevenZipEntry;
054: import de.anomic.plasma.plasmaParser;
055: import de.anomic.plasma.plasmaParserDocument;
056: import de.anomic.plasma.parser.AbstractParser;
057: import de.anomic.plasma.parser.ParserException;
058: import de.anomic.server.serverCachedFileOutputStream;
059: import de.anomic.server.logging.serverLog;
060: import de.anomic.yacy.yacyURL;
062: // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
063: // and parse the extracted content
064: public class SZParserExtractCallback extends ArchiveExtractCallback {
066: private final serverLog log;
067: private final long maxRamSize;
068: private serverCachedFileOutputStream cfos = null;
069: private final plasmaParser parser;
070: private final plasmaParserDocument doc;
071: private final String prefix;
073: public SZParserExtractCallback(serverLog logger,
074: IInArchive handler, long maxRamSize,
075: plasmaParserDocument doc, String prefix) {
076: super .Init(handler);
077: this .log = logger;
078: this .maxRamSize = maxRamSize;
079: this .parser = new plasmaParser();
080: this .doc = doc;
081: this .prefix = prefix;
082: }
084: public void PrepareOperation(int arg0) {
085: this .extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
086: switch (arg0) {
087: case IInArchive.NExtract_NAskMode_kExtract:
088: this .log.logFine("Extracting " + this .filePath);
089: break;
090: case IInArchive.NExtract_NAskMode_kTest:
091: this .log.logFine("Testing " + this .filePath);
092: break;
093: case IInArchive.NExtract_NAskMode_kSkip:
094: this .log.logFine("Skipping " + this .filePath);
095: break;
096: }
097: }
099: public void SetOperationResult(int arg0) throws IOException {
100: if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
101: this .NumErrors++;
102: switch (arg0) {
103: case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
104: throw new IOException("Unsupported Method");
105: case IInArchive.NExtract_NOperationResult_kCRCError:
106: throw new IOException("CRC Failed");
107: case IInArchive.NExtract_NOperationResult_kDataError:
108: throw new IOException("Data Error");
109: default:
110: // throw new IOException("Unknown Error");
111: }
112: } else
113: try {
114: AbstractParser.checkInterruption();
116: if (this .cfos != null) {
117: // parse the file
118: plasmaParserDocument theDoc;
119: // workaround for relative links in file, normally '#' shall be used behind the location, see
120: // below for reversion of the effects
121: yacyURL url = yacyURL.newURL(doc.dc_source(),
122: this .prefix + "/" + super .filePath);
123: String mime = plasmaParser
124: .getMimeTypeByFileExt(super .filePath
125: .substring(super .filePath
126: .lastIndexOf('.') + 1));
127: if (this .cfos.isFallback()) {
128: theDoc = this .parser.parseSource(url, mime,
129: null, this .cfos.getContentFile());
130: } else {
131: theDoc = this .parser.parseSource(url, mime,
132: null, this .cfos.getContentBAOS());
133: }
135: // revert the above workaround
136: Map<yacyURL, String> nanchors = new HashMap<yacyURL, String>(
137: theDoc.getAnchors().size(), 1f);
138: Iterator<Map.Entry<yacyURL, String>> it = theDoc
139: .getAnchors().entrySet().iterator();
140: Map.Entry<yacyURL, String> entry;
141: String base = doc.dc_source().toNormalform(false,
142: true);
143: String u;
144: while (it.hasNext()) {
145: entry = it.next();
146: u = entry.getKey().toNormalform(true, true);
147: if (u.startsWith(base + "/")) {
148: String ref = "#"
149: + u.substring(base.length() + 1);
150: this .log.logFinest("changing "
151: + entry.getKey()
152: + " to use reference " + ref);
153: nanchors.put(new yacyURL(base + ref, null),
154: entry.getValue());
155: } else {
156: nanchors.put(entry.getKey(), entry
157: .getValue());
158: }
159: }
160: theDoc.getAnchors().clear();
161: theDoc.getAnchors().putAll(nanchors);
162: this .doc.addSubDocument(theDoc);
163: }
164: } catch (ParserException e) {
165: IOException ex = new IOException(
166: "error parsing extracted content of "
167: + super .filePath + ": "
168: + e.getMessage());
169: ex.initCause(e);
170: throw ex;
171: } catch (InterruptedException e) {
172: IOException ex = new IOException("interrupted");
173: ex.initCause(e);
174: throw ex;
175: }
176: }
178: public OutputStream GetStream(int index, int askExtractMode)
179: throws IOException {
180: SevenZipEntry item = super .archiveHandler.getEntry(index);
181: super .filePath = item.getName();
182: try {
183: AbstractParser.checkInterruption();
184: } catch (InterruptedException e) {
185: IOException ex = new IOException("interrupted");
186: ex.initCause(e);
187: throw ex;
188: }
189: this .cfos = (item.isDirectory()) ? null
190: : new serverCachedFileOutputStream(this .maxRamSize,
191: null, true, item.getSize());
192: return this .cfos;
193: }
195: public String getCurrentFilePath() {
196: return super.filePath;
197: }
198: }