01: /*
02: * AggressiveExtractorHTML
03: *
04: * $Id: AggressiveExtractorHTML.java 4713 2006-11-04 05:58:26Z gojomo $
05: *
06: * Created on Jan 6, 2004
07: *
08: * Copyright (C) 2004 Internet Archive.
09: *
10: * This file is part of the Heritrix web crawler (crawler.archive.org).
11: *
12: * Heritrix is free software; you can redistribute it and/or modify
13: * it under the terms of the GNU Lesser Public License as published by
14: * the Free Software Foundation; either version 2.1 of the License, or
15: * any later version.
16: *
17: * Heritrix is distributed in the hope that it will be useful,
18: * but WITHOUT ANY WARRANTY; without even the implied warranty of
19: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20: * GNU Lesser Public License for more details.
21: *
22: * You should have received a copy of the GNU Lesser Public License
23: * along with Heritrix; if not, write to the Free Software
24: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25: */
26:
27: package org.archive.crawler.extractor;
28:
29: import java.util.logging.Logger;
30:
31: import org.archive.crawler.datamodel.CrawlURI;
32:
33: /**
34: * Extended version of ExtractorHTML with more aggressive javascript link
35: * extraction where javascript code is parsed first with general HTML tags
36: * regexp, and than by javascript speculative link regexp.
37: *
38: * @author Igor Ranitovic
39: *
40: */
41: public class AggressiveExtractorHTML extends ExtractorHTML {
42:
43: private static final long serialVersionUID = 3586060081186247087L;
44:
45: static Logger logger = Logger
46: .getLogger(AggressiveExtractorHTML.class.getName());
47:
48: public AggressiveExtractorHTML(String name) {
49: super (
50: name,
51: "Aggressive HTML extractor. Subclasses ExtractorHTML "
52: + " so does all that it does, except in regard to javascript "
53: + " blocks. Here "
54: + " it first processes as JS as its parent does, but then it "
55: + " reruns through the JS treating it as HTML (May cause many "
56: + " false positives). It finishes by applying heuristics "
57: + " against script code looking for possible URIs. ");
58: }
59:
60: protected void processScript(CrawlURI curi, CharSequence sequence,
61: int endOfOpenTag) {
62: super .processScript(curi, sequence, endOfOpenTag);
63: // then, process entire javascript code as html code
64: // this may cause a lot of false positves
65: processGeneralTag(curi, sequence.subSequence(0, 6), sequence
66: .subSequence(endOfOpenTag, sequence.length()));
67: }
68:
69: /* (non-Javadoc)
70: * @see org.archive.crawler.framework.Processor#report()
71: */
72: public String report() {
73: StringBuffer ret = new StringBuffer(256);
74: ret
75: .append("Processor: org.archive.crawler.extractor.ExtractorHTML2\n");
76: ret
77: .append(" Function: Link extraction on HTML documents "
78: + "(including embedded CSS)\n");
79: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
80: + "\n");
81: ret.append(" Links extracted: " + numberOfLinksExtracted
82: + "\n\n");
83: return ret.toString();
84: }
85: }
|