01: /*
02: * CrawlUriSWFAction
03: *
04: * $Id: CrawlUriSWFAction.java 4475 2006-08-11 06:09:46Z gojomo $
05: *
06: * Created on March 15, 2004
07: *
08: * Copyright (C) 2003 Internet Archive.
09: *
10: * This file is part of the Heritrix web crawler (crawler.archive.org).
11: *
12: * Heritrix is free software; you can redistribute it and/or modify
13: * it under the terms of the GNU Lesser Public License as published by
14: * the Free Software Foundation; either version 2.1 of the License, or
15: * any later version.
16: *
17: * Heritrix is distributed in the hope that it will be useful,
18: * but WITHOUT ANY WARRANTY; without even the implied warranty of
19: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20: * GNU Lesser Public License for more details.
21: *
22: * You should have received a copy of the GNU Lesser Public License
23: * along with Heritrix; if not, write to the Free Software
24: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25: */
26:
27: package org.archive.crawler.extractor;
28:
29: import java.io.IOException;
30:
31: import org.archive.crawler.datamodel.CrawlURI;
32: import org.archive.crawler.framework.CrawlController;
33:
34: import com.anotherbigidea.flash.writers.SWFActionsImpl;
35:
36: /**
37: * SWF action that handles discovered URIs.
38: *
39: * @author Igor Ranitovic
40: */
41: public class CrawlUriSWFAction extends SWFActionsImpl {
42: CrawlURI curi;
43: CrawlController controller; // for error reporting
44:
45: private long linkCount;
46: static final String JSSTRING = "javascript:";
47:
48: /**
49: *
50: * @param curi
51: */
52: public CrawlUriSWFAction(CrawlURI curi, CrawlController controller) {
53: assert (curi != null) : "CrawlURI should not be null";
54: this .curi = curi;
55: this .controller = controller;
56: this .linkCount = 0;
57: }
58:
59: /**
60: * Overwrite handling of discovered URIs.
61: *
62: * @param url Discovered URL.
63: * @param target Discovered target (currently not being used.)
64: * @throws IOException
65: */
66: public void getURL(String url, String target) throws IOException {
67: // I have done tests on a few tens of swf files and have not seen a need
68: // to use 'target.' Most of the time 'target' is not set, or it is set
69: // to '_self' or '_blank'.
70: if (url.startsWith(JSSTRING)) {
71: linkCount = +ExtractorJS.considerStrings(curi, url,
72: controller, false);
73: } else {
74: curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
75: Link.EMBED_HOP);
76: linkCount++;
77: }
78: }
79:
80: /**
81: * @return Total number of links extracted from a swf file.
82: */
83: public long getLinkCount() {
84: return linkCount;
85: }
86: }
|