001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Created on Nov 17, 2003
020: *
021: * To change the template for this generated file go to
022: * Window>Preferences>Java>Code Generation>Code and Comments
023: */
024: package org.archive.extractor;
025:
026: import java.util.LinkedList;
027: import java.util.regex.Matcher;
028: import java.util.regex.Pattern;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.extractor.Link;
032: import org.archive.net.UURI;
033: import org.archive.net.UURIFactory;
034: import org.archive.util.TextUtils;
035:
036: /**
037: * Uses regular expressions to find likely URIs inside Javascript.
038: *
039: * ROUGH DRAFT IN PROGRESS / incomplete... untested...
040: *
041: * @author gojomo
042: */
043: public class RegexpJSLinkExtractor extends CharSequenceLinkExtractor {
044: // private static Logger logger =
045: // Logger.getLogger(RegexpJSLinkExtractor.class.getName());
046:
047: static final String AMP = "&";
048: static final String ESCAPED_AMP = "&";
049: static final String WHITESPACE = "\\s";
050:
051: // finds whitespace-free strings in Javascript
052: // (areas between paired ' or " characters, possibly backslash-quoted
053: // on the ends, but not in the middle)
054: static final Pattern JAVASCRIPT_STRING_EXTRACTOR = Pattern
055: .compile("(\\\\{0,8}+(?:\"|\'))(.+?)(?:\\1)");
056:
057: // determines whether a string is likely URI
058: // (no whitespace or '<' '>', has an internal dot or some slash,
059: // begins and ends with either '/' or a word-char)
060: static final Pattern STRING_URI_DETECTOR = Pattern
061: .compile("(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)");
062:
063: Matcher strings;
064: LinkedList<Matcher> matcherStack = new LinkedList<Matcher>();
065:
066: protected boolean findNextLink() {
067: if (strings == null) {
068: strings = JAVASCRIPT_STRING_EXTRACTOR
069: .matcher(sourceContent);
070: }
071: while (strings != null) {
072: while (strings.find()) {
073: CharSequence subsequence = sourceContent.subSequence(
074: strings.start(2), strings.end(2));
075: Matcher uri = STRING_URI_DETECTOR.matcher(subsequence);
076: if ((subsequence.length() <= UURI.MAX_URL_LENGTH)
077: && uri.matches()) {
078: String string = uri.group();
079: string = TextUtils.replaceAll(ESCAPED_AMP, string,
080: AMP);
081: try {
082: Link link = new Link(source, UURIFactory
083: .getInstance(source, string),
084: Link.JS_MISC, Link.SPECULATIVE_HOP);
085: next.add(link);
086: return true;
087: } catch (URIException e) {
088: extractErrorListener.noteExtractError(e,
089: source, string);
090: }
091: } else {
092: // push current range
093: matcherStack.addFirst(strings);
094: // start looking inside string
095: strings = JAVASCRIPT_STRING_EXTRACTOR
096: .matcher(subsequence);
097: }
098: }
099: // continue at enclosing range, if available
100: strings = (Matcher) (matcherStack.isEmpty() ? null
101: : matcherStack.removeFirst());
102: }
103: return false;
104: }
105:
106: /* (non-Javadoc)
107: * @see org.archive.extractor.LinkExtractor#reset()
108: */
109: public void reset() {
110: super .reset();
111: matcherStack.clear();
112: strings = null;
113: }
114:
115: protected static CharSequenceLinkExtractor newDefaultInstance() {
116: return new RegexpJSLinkExtractor();
117: }
118: }
|