001: /*
002: * ExtractorCSS
003: *
004: * $Id: ExtractorCSS.java 4653 2006-09-25 18:58:50Z paul_jack $
005: *
006: * Created on Jan 6, 2004
007: *
008: * Copyright (C) 2004 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU Lesser Public License as published by
014: * the Free Software Foundation; either version 2.1 of the License, or
015: * any later version.
016: *
017: * Heritrix is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU Lesser Public License for more details.
021: *
022: * You should have received a copy of the GNU Lesser Public License
023: * along with Heritrix; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: package org.archive.crawler.extractor;
028:
029: import java.io.IOException;
030: import java.util.logging.Logger;
031: import java.util.regex.Matcher;
032:
033: import org.apache.commons.httpclient.URIException;
034: import org.archive.crawler.datamodel.CoreAttributeConstants;
035: import org.archive.crawler.datamodel.CrawlURI;
036: import org.archive.crawler.framework.CrawlController;
037: import org.archive.io.ReplayCharSequence;
038: import org.archive.net.UURI;
039: import org.archive.util.DevUtils;
040: import org.archive.util.TextUtils;
041:
042: /**
043: * This extractor is parsing URIs from CSS type files.
044: * The format of a CSS URL value is 'url(' followed by optional white space
045: * followed by an optional single quote (') or double quote (") character
046: * followed by the URL itself followed by an optional single quote (') or
047: * double quote (") character followed by optional white space followed by ')'.
048: * Parentheses, commas, white space characters, single quotes (') and double
049: * quotes (") appearing in a URL must be escaped with a backslash:
050: * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
051: * the style sheet, not relative to the document. <a href="http://www.w3.org/TR/REC-CSS1#url">
052: * Source: www.w3.org</a>
053: *
054: * @author Igor Ranitovic
055: *
056: **/
057:
058: public class ExtractorCSS extends Extractor implements
059: CoreAttributeConstants {
060:
061: private static final long serialVersionUID = -1540252885329424902L;
062:
063: private static Logger logger = Logger
064: .getLogger("org.archive.crawler.extractor.ExtractorCSS");
065:
066: private static String ESCAPED_AMP = "&";
067: // CSS escapes: "Parentheses, commas, whitespace characters, single
068: // quotes (') and double quotes (") appearing in a URL must be
069: // escaped with a backslash"
070: static final String CSS_BACKSLASH_ESCAPE = "\\\\([,'\"\\(\\)\\s])";
071:
072: /**
073: * CSS URL extractor pattern.
074: *
075: * This pattern extracts URIs for CSS files
076: **/
077: // static final String CSS_URI_EXTRACTOR =
078: // "url[(]\\s*([\"\']?)([^\\\"\\'].*?)\\1\\s*[)]";
079: static final String CSS_URI_EXTRACTOR = "(?i)(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)"
080: + // G1
081: "([^\\\"\'].{0," + UURI.MAX_URL_LENGTH + "}?)\\1\\s*[);]"; // G2
082: // GROUPS:
083: // (G1) optional ' or "
084: // (G2) URI
085:
086: private long numberOfCURIsHandled = 0;
087: private long numberOfLinksExtracted = 0;
088:
089: /**
090: * @param name
091: */
092: public ExtractorCSS(String name) {
093: super (name,
094: "CSS Extractor. Extracts links from Cascading Style"
095: + " Sheets (.css).");
096: }
097:
098: /**
099: * @param curi Crawl URI to process.
100: */
101: public void extract(CrawlURI curi) {
102: if (!isHttpTransactionContentToProcess(curi)) {
103: return;
104: }
105: String mimeType = curi.getContentType();
106: if (mimeType == null) {
107: return;
108: }
109: if ((mimeType.toLowerCase().indexOf("css") < 0)
110: && (!curi.toString().toLowerCase().endsWith(".css"))) {
111: return;
112: }
113: this .numberOfCURIsHandled++;
114:
115: ReplayCharSequence cs = null;
116: try {
117: cs = curi.getHttpRecorder().getReplayCharSequence();
118: } catch (IOException e) {
119: logger.severe("Failed getting ReplayCharSequence: "
120: + e.getMessage());
121: }
122: if (cs == null) {
123: logger.warning("Failed getting ReplayCharSequence: "
124: + curi.toString());
125: return;
126: }
127:
128: // We have a ReplayCharSequence open. Wrap all in finally so we
129: // for sure close it before we leave.
130: try {
131: this .numberOfLinksExtracted += processStyleCode(curi, cs,
132: getController());
133: // Set flag to indicate that link extraction is completed.
134: curi.linkExtractorFinished();
135: } finally {
136: if (cs != null) {
137: try {
138: cs.close();
139: } catch (IOException ioe) {
140: logger
141: .warning(TextUtils
142: .exceptionToString(
143: "Failed close of ReplayCharSequence.",
144: ioe));
145: }
146: }
147: }
148: }
149:
150: public static long processStyleCode(CrawlURI curi, CharSequence cs,
151: CrawlController controller) {
152: long foundLinks = 0;
153: Matcher uris = null;
154: String cssUri;
155: try {
156: uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, cs);
157: while (uris.find()) {
158: cssUri = uris.group(2);
159: // TODO: Escape more HTML Entities.
160: cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
161: // Remove backslashes when used as escape character in CSS URL
162: cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE,
163: cssUri, "$1");
164: foundLinks++;
165: try {
166: curi.createAndAddLinkRelativeToBase(cssUri,
167: Link.EMBED_MISC, Link.EMBED_HOP);
168: } catch (URIException e) {
169: // There may not be a controller (e.g. If we're being run
170: // by the extractor tool).
171: if (controller != null) {
172: controller.logUriError(e, curi.getUURI(),
173: cssUri);
174: } else {
175: logger.info(curi + ", " + cssUri + ": "
176: + e.getMessage());
177: }
178: }
179: }
180: } catch (StackOverflowError e) {
181: DevUtils.warnHandle(e, "ExtractorCSS StackOverflowError");
182: } finally {
183: TextUtils.recycleMatcher(uris);
184: }
185: return foundLinks;
186: }
187:
188: public String report() {
189: StringBuffer ret = new StringBuffer();
190: ret
191: .append("Processor: org.archive.crawler.extractor.ExtractorCSS\n");
192: ret
193: .append(" Function: Link extraction on Cascading Style Sheets (.css)\n");
194: ret.append(" CrawlURIs handled: " + numberOfCURIsHandled
195: + "\n");
196: ret.append(" Links extracted: " + numberOfLinksExtracted
197: + "\n\n");
198:
199: return ret.toString();
200: }
201: }
|