001: /*
002: * Heritrix
003: *
004: * $Id: ExtractorSWF.java 4653 2006-09-25 18:58:50Z paul_jack $
005: *
006: * Created on March 19, 2004
007: *
008: * Copyright (C) 2003 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.crawler.extractor;
028:
029: import java.io.IOException;
030: import java.io.InputStream;
031: import java.util.logging.Logger;
032:
033: import org.archive.crawler.datamodel.CoreAttributeConstants;
034: import org.archive.crawler.datamodel.CrawlURI;
035:
036: import com.anotherbigidea.flash.interfaces.SWFTagTypes;
037: import com.anotherbigidea.flash.readers.SWFReader;
038: import com.anotherbigidea.flash.readers.TagParser;
039: import com.anotherbigidea.io.InStream;
040:
041: /**
042: * Extracts URIs from SWF (flash/shockwave) files.
043: *
044: * To test, here is a link to an swf that has links
045: * embedded inside of it: http://www.hitspring.com/index.swf.
046: *
047: * @author Igor Ranitovic
048: */
049: public class ExtractorSWF extends Extractor implements
050: CoreAttributeConstants {
051:
052: private static final long serialVersionUID = 3627359592408010589L;
053:
054: private static Logger logger = Logger.getLogger(ExtractorSWF.class
055: .getName());
056: protected long numberOfCURIsHandled = 0;
057: protected long numberOfLinksExtracted = 0;
058: // TODO: consider if this should be even smaller, because anything
059: // containing URLs wouldn't be this big
060: private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB
061:
062: /**
063: * @param name
064: */
065: public ExtractorSWF(String name) {
066: super (name, "Flash extractor. Extracts URIs from SWF "
067: + "(flash/shockwave) files.");
068: }
069:
070: protected void extract(CrawlURI curi) {
071: if (!isHttpTransactionContentToProcess(curi)) {
072: return;
073: }
074:
075: String contentType = curi.getContentType();
076: if (contentType == null) {
077: return;
078: }
079: if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
080: && (!curi.toString().toLowerCase().endsWith(".swf"))) {
081: return;
082: }
083:
084: numberOfCURIsHandled++;
085:
086: InputStream documentStream = null;
087: // Get the SWF file's content stream.
088: try {
089: documentStream = curi.getHttpRecorder().getRecordedInput()
090: .getContentReplayInputStream();
091: if (documentStream == null) {
092: return;
093: }
094:
095: // Create SWF action that will add discoved URIs to CrawlURI
096: // alist(s).
097: CrawlUriSWFAction curiAction = new CrawlUriSWFAction(curi,
098: getController());
099: // Overwrite parsing of specific tags that might have URIs.
100: CustomSWFTags customTags = new CustomSWFTags(curiAction);
101: // Get a SWFReader instance.
102: SWFReader reader = new SWFReader(getTagParser(customTags),
103: documentStream) {
104: /**
105: * Override because a corrupt SWF file can cause us to try
106: * read lengths that are hundreds of megabytes in size
107: * causing us to OOME.
108: *
109: * Below is copied from SWFReader parent class.
110: */
111: public int readOneTag() throws IOException {
112: int header = mIn.readUI16();
113: int type = header >> 6; //only want the top 10 bits
114: int length = header & 0x3F; //only want the bottom 6 bits
115: boolean longTag = (length == 0x3F);
116: if (longTag) {
117: length = (int) mIn.readUI32();
118: }
119: // Below test added for Heritrix use.
120: if (length > MAX_READ_SIZE) {
121: // skip to next, rather than throw IOException ending
122: // processing
123: mIn.skipBytes(length);
124: logger.info("oversized SWF tag (type=" + type
125: + ";length=" + length + ") skipped");
126: } else {
127: byte[] contents = mIn.read(length);
128: mConsumer.tag(type, longTag, contents);
129: }
130: return type;
131: }
132: };
133:
134: reader.readFile();
135: numberOfLinksExtracted += curiAction.getLinkCount();
136: } catch (IOException e) {
137: curi.addLocalizedError(getName(), e, "Fail reading.");
138: } finally {
139: try {
140: documentStream.close();
141: } catch (IOException e) {
142: curi.addLocalizedError(getName(), e, "Fail on close.");
143: }
144: }
145:
146: // Set flag to indicate that link extraction is completed.
147: curi.linkExtractorFinished();
148: logger
149: .fine(curi + " has " + numberOfLinksExtracted
150: + " links.");
151: }
152:
153: public String report() {
154: StringBuffer ret = new StringBuffer();
155: ret
156: .append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
157: ret
158: .append(" Function: Link extraction on Shockwave Flash "
159: + "documents (.swf)\n");
160:
161: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
162: + "\n");
163: ret.append(" Links extracted: " + numberOfLinksExtracted
164: + "\n\n");
165: return ret.toString();
166: }
167:
168: /**
169: * Get a TagParser
170: *
171: * A custom ExtractorTagParser which ignores all the big binary image/
172: * sound/font types which don't carry URLs is used, to avoid the
173: * occasionally fatal (OutOfMemoryError) memory bloat caused by the
174: * all-in-memory SWF library handling.
175: *
176: * @param customTags A custom tag parser.
177: * @return An SWFReader.
178: */
179: private TagParser getTagParser(CustomSWFTags customTags) {
180: return new ExtractorTagParser(customTags);
181: }
182:
183: /**
184: * TagParser customized to ignore SWFTags that
185: * will never contain extractable URIs.
186: */
187: protected class ExtractorTagParser extends TagParser {
188:
189: protected ExtractorTagParser(SWFTagTypes tagtypes) {
190: super (tagtypes);
191: }
192:
193: protected void parseDefineBits(InStream in) throws IOException {
194: // DO NOTHING - no URLs to be found in bits
195: }
196:
197: protected void parseDefineBitsJPEG3(InStream in)
198: throws IOException {
199: // DO NOTHING - no URLs to be found in bits
200: }
201:
202: protected void parseDefineBitsLossless(InStream in, int length,
203: boolean hasAlpha) throws IOException {
204: // DO NOTHING - no URLs to be found in bits
205: }
206:
207: protected void parseDefineButtonSound(InStream in)
208: throws IOException {
209: // DO NOTHING - no URLs to be found in sound
210: }
211:
212: protected void parseDefineFont(InStream in) throws IOException {
213: // DO NOTHING - no URLs to be found in font
214: }
215:
216: protected void parseDefineJPEG2(InStream in, int length)
217: throws IOException {
218: // DO NOTHING - no URLs to be found in jpeg
219: }
220:
221: protected void parseDefineJPEGTables(InStream in)
222: throws IOException {
223: // DO NOTHING - no URLs to be found in jpeg
224: }
225:
226: protected void parseDefineShape(int type, InStream in)
227: throws IOException {
228: // DO NOTHING - no URLs to be found in shape
229: }
230:
231: protected void parseDefineSound(InStream in) throws IOException {
232: // DO NOTHING - no URLs to be found in sound
233: }
234:
235: protected void parseFontInfo(InStream in, int length,
236: boolean isFI2) throws IOException {
237: // DO NOTHING - no URLs to be found in font info
238: }
239:
240: protected void parseDefineFont2(InStream in) throws IOException {
241: // DO NOTHING - no URLs to be found in bits
242: }
243: }
244: }
|