001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Created on Nov 17, 2003
020: *
021: * To change the template for this generated file go to
022: * Window>Preferences>Java>Code Generation>Code and Comments
023: */
024: package org.archive.crawler.extractor;
025:
026: import java.io.IOException;
027: import java.util.logging.Logger;
028: import java.util.regex.Matcher;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CoreAttributeConstants;
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.framework.CrawlController;
034: import org.archive.io.ReplayCharSequence;
035: import org.archive.net.UURI;
036: import org.archive.util.DevUtils;
037: import org.archive.util.TextUtils;
038:
039: /**
040: * Processes Javascript files for strings that are likely to be
041: * crawlable URIs.
042: *
043: * @author gojomo
044: *
045: */
046: public class ExtractorJS extends Extractor implements
047: CoreAttributeConstants {
048:
049: private static final long serialVersionUID = -2231962381454717720L;
050:
051: private static Logger LOGGER = Logger
052: .getLogger("org.archive.crawler.extractor.ExtractorJS");
053:
054: static final String AMP = "&";
055: static final String ESCAPED_AMP = "&";
056: static final String WHITESPACE = "\\s";
057:
058: // finds whitespace-free strings in Javascript
059: // (areas between paired ' or " characters, possibly backslash-quoted
060: // on the ends, but not in the middle)
061: static final String JAVASCRIPT_STRING_EXTRACTOR = "(\\\\{0,8}+(?:\"|\'))(\\S{0,"
062: + UURI.MAX_URL_LENGTH + "}?)(?:\\1)";
063: // GROUPS:
064: // (G1) ' or " with optional leading backslashes
065: // (G2) whitespace-free string delimited on boths ends by G1
066:
067: // determines whether a string is likely URI
068: // (no whitespace or '<' '>', has an internal dot or some slash,
069: // begins and ends with either '/' or a word-char)
070: static final String STRING_URI_DETECTOR = "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";
071:
072: protected long numberOfCURIsHandled = 0;
073: protected static long numberOfLinksExtracted = 0;
074:
075: /**
076: * @param name
077: */
078: public ExtractorJS(String name) {
079: super (name,
080: "JavaScript extractor. Link extraction on JavaScript"
081: + " files (.js).");
082: }
083:
084: /* (non-Javadoc)
085: * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
086: */
087: public void extract(CrawlURI curi) {
088: if (!isHttpTransactionContentToProcess(curi)) {
089: return;
090: }
091: String contentType = curi.getContentType();
092: if ((contentType == null)) {
093: return;
094: }
095: // If content type is not js and if the viaContext
096: // does not begin with 'script', return.
097: if ((contentType.indexOf("javascript") < 0)
098: && (contentType.indexOf("jscript") < 0)
099: && (contentType.indexOf("ecmascript") < 0)
100: && (!curi.toString().toLowerCase().endsWith(".js"))
101: && (curi.getViaContext() == null || !curi
102: .getViaContext().toString().toLowerCase()
103: .startsWith("script"))) {
104: return;
105: }
106:
107: this .numberOfCURIsHandled++;
108:
109: ReplayCharSequence cs = null;
110: try {
111: cs = curi.getHttpRecorder().getReplayCharSequence();
112: } catch (IOException e) {
113: curi.addLocalizedError(this .getName(), e,
114: "Failed get of replay char sequence.");
115: }
116: if (cs == null) {
117: LOGGER.warning("Failed getting ReplayCharSequence: "
118: + curi.toString());
119: return;
120: }
121:
122: try {
123: try {
124: numberOfLinksExtracted += considerStrings(curi, cs,
125: getController(), true);
126: } catch (StackOverflowError e) {
127: DevUtils
128: .warnHandle(e, "ExtractorJS StackOverflowError");
129: }
130: // Set flag to indicate that link extraction is completed.
131: curi.linkExtractorFinished();
132: } finally {
133: // Done w/ the ReplayCharSequence. Close it.
134: if (cs != null) {
135: try {
136: cs.close();
137: } catch (IOException ioe) {
138: LOGGER
139: .warning(TextUtils
140: .exceptionToString(
141: "Failed close of ReplayCharSequence.",
142: ioe));
143: }
144: }
145: }
146: }
147:
148: public static long considerStrings(CrawlURI curi, CharSequence cs,
149: CrawlController controller, boolean handlingJSFile) {
150: long foundLinks = 0;
151: Matcher strings = TextUtils.getMatcher(
152: JAVASCRIPT_STRING_EXTRACTOR, cs);
153: while (strings.find()) {
154: CharSequence subsequence = cs.subSequence(strings.start(2),
155: strings.end(2));
156: Matcher uri = TextUtils.getMatcher(STRING_URI_DETECTOR,
157: subsequence);
158: if (uri.matches()) {
159: String string = uri.group();
160: string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
161: foundLinks++;
162: try {
163: if (handlingJSFile) {
164: curi.createAndAddLinkRelativeToVia(string,
165: Link.JS_MISC, Link.SPECULATIVE_HOP);
166: } else {
167: curi.createAndAddLinkRelativeToBase(string,
168: Link.JS_MISC, Link.SPECULATIVE_HOP);
169: }
170: } catch (URIException e) {
171: // There may not be a controller (e.g. If we're being run
172: // by the extractor tool).
173: if (controller != null) {
174: controller.logUriError(e, curi.getUURI(),
175: string);
176: } else {
177: LOGGER.info(curi + ", " + string + ": "
178: + e.getMessage());
179: }
180: }
181: } else {
182: foundLinks += considerStrings(curi, subsequence,
183: controller, handlingJSFile);
184: }
185: TextUtils.recycleMatcher(uri);
186: }
187: TextUtils.recycleMatcher(strings);
188: return foundLinks;
189: }
190:
191: /* (non-Javadoc)
192: * @see org.archive.crawler.framework.Processor#report()
193: */
194: public String report() {
195: StringBuffer ret = new StringBuffer();
196: ret
197: .append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
198: ret
199: .append(" Function: Link extraction on JavaScript code\n");
200: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
201: + "\n");
202: ret.append(" Links extracted: " + numberOfLinksExtracted
203: + "\n\n");
204:
205: return ret.toString();
206: }
207: }
|