001: /*
002: * ExtractorCSS
003: *
004: * $Id: RegexpCSSLinkExtractor.java 4646 2006-09-22 17:23:04Z paul_jack $
005: *
006: * Created on Mar 29, 2005
007: *
008: * Copyright (C) 2005 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.extractor;
028:
029: import java.util.regex.Matcher;
030:
031: import org.apache.commons.httpclient.URIException;
032: import org.archive.crawler.extractor.Link;
033: import org.archive.net.UURIFactory;
034: import org.archive.util.DevUtils;
035: import org.archive.util.TextUtils;
036:
037: /**
038: * This extractor is parsing URIs from CSS type files.
039: * The format of a CSS URL value is 'url(' followed by optional white space
040: * followed by an optional single quote (') or double quote (") character
041: * followed by the URL itself followed by an optional single quote (') or
042: * double quote (") character followed by optional white space followed by ')'.
043: * Parentheses, commas, white space characters, single quotes (') and double
044: * quotes (") appearing in a URL must be escaped with a backslash:
045: * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
046: * the style sheet, not relative to the document. <a href="http://www.w3.org/TR/REC-CSS1#url">
047: * Source: www.w3.org</a>
048: *
049: * ROUGH DRAFT IN PROGRESS / incomplete... untested... major changes likely
050: *
051: * @author igor gojomo
052: *
053: **/
054:
055: public class RegexpCSSLinkExtractor extends CharSequenceLinkExtractor {
056:
057: // private static Logger logger =
058: // Logger.getLogger(RegexpCSSLinkExtractor.class.getName());
059:
060: private static String ESCAPED_AMP = "&";
061: // CSS escapes: "Parentheses, commas, whitespace characters, single
062: // quotes (') and double quotes (") appearing in a URL must be
063: // escaped with a backslash"
064: static final String CSS_BACKSLASH_ESCAPE = "\\\\([,'\"\\(\\)\\s])";
065:
066: protected Matcher uris;
067:
068: /**
069: * CSS URL extractor pattern.
070: *
071: * This pattern extracts URIs for CSS files
072: **/
073: static final String CSS_URI_EXTRACTOR = "(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)([^\\\"\'].*?)\\1\\s*[);]";
074:
075: protected boolean findNextLink() {
076: if (uris == null) {
077: uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR,
078: sourceContent);
079: // NOTE: this matcher can't be recycled in this method because
080: // it is reused on rentry
081: }
082: String cssUri;
083: try {
084: while (uris.find()) {
085: cssUri = uris.group(2);
086: // TODO: Escape more HTML Entities.
087: cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
088: // Remove backslashes when used as escape character in CSS URL
089: cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE,
090: cssUri, "$1");
091: // TODO: handle relative URIs?
092: try {
093: Link link = new Link(source, UURIFactory
094: .getInstance(base, cssUri),
095: Link.EMBED_MISC, Link.EMBED_HOP);
096: next.addLast(link);
097: } catch (URIException e) {
098: extractErrorListener.noteExtractError(e, source,
099: cssUri);
100: }
101: return true;
102: }
103: } catch (StackOverflowError e) {
104: DevUtils.warnHandle(e,
105: "RegexpCSSLinkExtractor StackOverflowError");
106: }
107: return false;
108: }
109:
110: public void reset() {
111: super .reset();
112: TextUtils.recycleMatcher(uris);
113: uris = null;
114: }
115:
116: protected static CharSequenceLinkExtractor newDefaultInstance() {
117: return new RegexpCSSLinkExtractor();
118: }
119: }
|