/* FetchHTTP.java
 *
 * $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $
 *
 * Created on Jun 5, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.fetcher;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.net.InetAddress;
import java.net.UnknownHostException;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.auth.AuthChallengeParser;
import org.apache.commons.httpclient.auth.AuthScheme;
import org.apache.commons.httpclient.auth.BasicScheme;
import org.apache.commons.httpclient.auth.DigestScheme;
import org.apache.commons.httpclient.auth.MalformedChallengeException;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.datamodel.credential.Rfc2617Credential;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.Filter;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.Type;
import org.archive.httpclient.ConfigurableX509TrustManager;
import org.archive.httpclient.HttpRecorderGetMethod;
import org.archive.httpclient.HttpRecorderMethod;
import org.archive.httpclient.HttpRecorderPostMethod;
import org.archive.httpclient.SingleHttpConnectionManager;
import org.archive.io.ObjectPlusFilesInputStream;
import org.archive.io.RecorderLengthExceededException;
import org.archive.io.RecorderTimeoutException;
import org.archive.io.RecorderTooMuchHeaderException;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.bdbje.EnhancedEnvironment;

import st.ata.util.AList;

import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;

/**
 * HTTP fetcher that uses the <a
 * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
 * HttpClient</a> library.
 *
 * @author Gordon Mohr
 * @author Igor Ranitovic
 * @author others
 * @version $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $
 */
public class FetchHTTP extends Processor
implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
    // be robust against trivial implementation changes
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(FetchHTTP.class, 1);

    private static Logger logger =
        Logger.getLogger(FetchHTTP.class.getName());

    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
    public static final String ATTR_DIGEST_CONTENT = "digest-content";
    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";

    /**
     * SSL trust level setting attribute name.
     */
    public static final String ATTR_TRUST = "trust-level";

    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;

    /**
     * This is the default value pre-1.4 (i.e. Long.MAX_VALUE). It needs
     * special handling, else arithmetic later in processing overflows to a
     * negative number.
     */
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;

    /**
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /**
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);

    /**
     * The different digest algorithms to choose between,
     * SHA-1 or MD5 at the moment.
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    public static String[] DIGEST_ALGORITHMS = { SHA1, MD5 };

    /**
     * Default algorithm to use for message digesting.
     */
    public static final String DEFAULT_DIGEST_ALGORITHM = SHA1;

    private transient HttpClient http = null;

    /**
     * How many 'instant retries' of HttpRecoverableExceptions have occurred
     *
     * Would like it to be 'long', but longs aren't atomic
     */
    private int recoveryRetries = 0;

    /**
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic
     */
    private int curisHandled = 0;

    /**
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";

    /**
     * What to log if midfetch abort.
     */
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";

    public static final String ATTR_SEND_CONNECTION_CLOSE = "send-connection-close";
    private static final Header HEADER_SEND_CONNECTION_CLOSE = new Header(
        "Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
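    // The max-length limit (minus one, as an inclusive last-byte-pos) gets
    // appended to RANGE_PREFIX, e.g. a 1000-byte limit yields
    // "Range: bytes=0-999"; see configureMethod().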
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";

    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);

    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);

    public static final String ATTR_LOCAL_ADDRESS = "bind-address";

    /**
     * Database backing cookie map, if using BDB
     */
    protected Database cookieDb;
    /**
     * Name of cookie BDB Database
     */
    public static final String COOKIEDB_NAME = "http_cookies";

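    // Register protocol socket factories that resolve IPs via Heritrix's
    // server cache (see SERVER_CACHE_KEY and configureHttp()) rather than
    // doing a fresh DNS lookup per connection.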
    static {
        Protocol.registerProtocol("http", new Protocol("http",
            new HeritrixProtocolSocketFactory(), 80));
        try {
            Protocol.registerProtocol("https", new Protocol("https",
                (ProtocolSocketFactory) new HeritrixSSLProtocolSocketFactory(),
                443));
        } catch (KeyManagementException e) {
            e.printStackTrace();
        } catch (KeyStoreException e) {
            e.printStackTrace();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }
    }
    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";

    /**
     * Socket factory that has the configurable trust manager installed.
     */
    private SSLSocketFactory sslfactory = null;

    /**
     * Constructor.
     *
     * @param name Name of this processor.
     */
    public FetchHTTP(String name) {
        super(name, "HTTP Fetcher");

        addElementToDefinition(new DecideRuleSequence(
            ATTR_MIDFETCH_DECIDE_RULES,
            "DecideRules which, if final decision is REJECT, "
            + "abort fetch after headers, before all content is "
            + "read."));

        addElementToDefinition(new SimpleType(
            ATTR_TIMEOUT_SECONDS,
            "If the fetch is not completed in this number of seconds, "
            + "even if it is making progress, give up. The URI will be "
            + "annotated as timeTrunc. Set to zero for no timeout. "
            + "(This is not recommended: threads could wait indefinitely "
            + "for the fetch to end.)",
            DEFAULT_TIMEOUT_SECONDS));
        Type e = addElementToDefinition(new SimpleType(
            ATTR_SOTIMEOUT_MS,
            "If a socket is unresponsive for this number of milliseconds, "
            + "give up on that connect/read. (This does not necessarily give "
            + "up on the fetch immediately; connects are subject to retries, "
            + "and reads will be retried until "
            + ATTR_TIMEOUT_SECONDS
            + " have elapsed.) Set to zero for no socket timeout. (This is "
            + "not recommended: a socket operation could hang indefinitely.)",
            DEFAULT_SOTIMEOUT_MS));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_FETCH_BANDWIDTH_MAX,
            "The maximum KB/sec to use when fetching data from a server. "
            + "0 means no maximum. Default: "
            + DEFAULT_FETCH_BANDWIDTH_MAX + ".",
            DEFAULT_FETCH_BANDWIDTH_MAX));
        e.setExpertSetting(true);
        e.setOverrideable(true);
        addElementToDefinition(new SimpleType(
            ATTR_MAX_LENGTH_BYTES,
            "Maximum length in bytes to fetch.\n"
            + "Fetch is truncated at this length. A value of 0 means no limit.",
            DEFAULT_MAX_LENGTH_BYTES));
        e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
            "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
            "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
            "File to preload cookies from", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
            "When crawl finishes save cookies to this file", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_TRUST,
            "SSL certificate trust level. Range is from the default 'open'"
            + " (trust all certs including expired, selfsigned, and those for"
            + " which we do not have a CA) through 'loose' (trust all valid"
            + " certificates including selfsigned), 'normal' (all valid"
            + " certificates not including selfsigned) to 'strict' (Cert is"
            + " valid and DN must match servername)",
            ConfigurableX509TrustManager.DEFAULT,
            ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
        e.setOverrideable(false);
        e.setExpertSetting(true);
        e = addElementToDefinition(new StringList(
            ATTR_ACCEPT_HEADERS,
            "Accept Headers to include in each request. Each must be the"
            + " complete header, e.g., 'Accept-Language: en'"));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
            "Proxy host IP (set only if needed).", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
            "Proxy port (set only if needed)", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_DEFAULT_ENCODING,
            "The character encoding to use for files that do not have one"
            + " specified in the HTTP response headers. Default: "
            + DEFAULT_CONTENT_CHARSET + ".",
            DEFAULT_CONTENT_CHARSET));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT,
            "Whether or not to perform an on-the-fly digest hash of"
            + " retrieved content-bodies.",
            DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_DIGEST_ALGORITHM,
            "Which algorithm (for example MD5 or SHA-1) to use to perform"
            + " an on-the-fly digest hash of retrieved content-bodies.",
            DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_SEND_IF_MODIFIED_SINCE,
            "Send 'If-Modified-Since' header, if previous 'Last-Modified' "
            + "fetch history information is available in URI history.",
            new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_SEND_IF_NONE_MATCH,
            "Send 'If-None-Match' header, if previous 'Etag' fetch "
            + "history information is available in URI history.",
            new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_SEND_CONNECTION_CLOSE,
            "Send 'Connection: close' header with every request.",
            new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_SEND_REFERER,
            "Send 'Referer' header with every request.\n"
            + "The 'Referer' header contains the location the crawler came "
            + "from: the page the current URI was discovered in. The "
            + "'Referer' is usually logged on the remote server and can be "
            + "of assistance to webmasters trying to figure out how a "
            + "crawler got to a particular area on a site.",
            new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_SEND_RANGE,
            "Send 'Range' header when a limit ("
            + ATTR_MAX_LENGTH_BYTES
            + ") on document size is in effect.\n"
            + "Be polite to the HTTP servers and send the 'Range' header, "
            + "stating that you are only interested in the first n bytes. "
            + "Only pertinent if "
            + ATTR_MAX_LENGTH_BYTES
            + " > 0. "
            + "Sending the 'Range' header results in a "
            + "'206 Partial Content' status response, which is better than "
            + "just cutting the response mid-download. On rare occasion, "
            + "sending 'Range' will generate a "
            + "'416 Request Range Not Satisfiable' response.",
            new Boolean(false)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
            ATTR_LOCAL_ADDRESS,
            "Local IP address or hostname to use when making connections "
            + "(binding sockets). When not specified, uses default local "
            + "address(es).", ""));
        e.setExpertSetting(true);
    }

    protected void innerProcess(final CrawlURI curi)
            throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this, due to protocol, retries, or other problems
            return;
        }

        this.curisHandled++;

        // Note begin time
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Get a reference to the HttpRecorder that is set into this ToeThread.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean) getUncheckedAttribute(curi,
            ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String) getUncheckedAttribute(curi,
                ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear
            rec.getRecordedInput().setDigest((MessageDigest) null);
        }

        // Below we do two inner classes that add check of midfetch
        // filters just as we're about to receive the response body.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn) throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn) throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }

        HostConfiguration customConfigOrNull = configureMethod(curi, method);

        // Set httpRecorder into curi. Subsequent code both here and later
        // in extractors expects to find the HttpRecorder in the CrawlURI.
        curi.setHttpRecorder(rec);

        // Populate credentials. Set config so auth. is not automatic.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);

        // set hardMax on bytes (if set by operator)
        long hardMax = getMaxLength(curi);
        // set overall timeout (if set by operator)
        long timeoutMs = 1000 * getTimeout(curi);
        // Get max fetch rate. It comes in as KB/sec, which numerically
        // equals bytes/ms (1000 bytes per 1000 ms).
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);

        try {
            this.http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // when too much header material, abort like other truncations
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
            failedExecuteCleanup(method, curi, e);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions in native
            // code... see
            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            failedExecuteCleanup(method, curi, e);
            return;
        }

        // set softMax on bytes to get (if implied by content-length)
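        // (getResponseContentLength() returns -1 when the server sent no
        // Content-Length header; a non-positive softMax means "no soft
        // limit" to the recorder below.)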
        long softMax = method.getResponseContentLength();

        try {
            if (!method.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // ensure recording has stopped
            rec.closeRecorders();
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completion time
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }

        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
        if (logger.isLoggable(Level.INFO)) {
            logger.info((curi.isPost() ? "POST" : "GET") + " "
                + curi.getUURI().toString() + " "
                + method.getStatusCode() + " "
                + rec.getRecordedInput().getSize() + " "
                + curi.getContentType());
        }

        if (curi.isSuccess() && addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
            if (logger.isLoggable(Level.FINE)) {
                // Print out the cookie. Might help with the debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.fine(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(method, curi);
        }

        if (rec.getRecordedInput().isOpen()) {
            logger.severe(curi.toString()
                + " RIS still open. Should have"
                + " been closed by method release: "
                + Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE,
                    "second-chance RIS close failed", e);
            }
        }
    }

    /**
     * Update CrawlURI internal sizes based on current transaction (and
     * in the case of 304s, history)
     *
     * @param curi CrawlURI
     * @param rec HttpRecorder
     */
    protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
        // set reporting size
        curi.setContentSize(rec.getRecordedInput().getSize());
        // special handling for 304-not modified
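        // e.g. (hypothetical figures) a 304 whose recorded size is 200 bytes,
        // with a reference-length of 5000 carried in fetch history, gets a
        // reported content-size of 5200.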
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
                && curi.containsKey(A_FETCH_HISTORY)) {
            AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
            if (history[0] != null && history[0]
                    .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
                long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
                // carry-forward previous 'reference-length' for future
                curi.putLong(A_REFERENCE_LENGTH, referenceLength);
                // increase content-size to virtual-size for reporting
                curi.setContentSize(rec.getRecordedInput().getSize()
                    + referenceLength);
            }
        }
    }

    protected void doAbort(CrawlURI curi, HttpMethod method,
            String annotation) {
        curi.addAnnotation(annotation);
        curi.getHttpRecorder().close();
        method.abort();
    }

    protected boolean checkMidfetchAbort(CrawlURI curi,
            HttpRecorderMethod method, HttpConnection conn) {
        if (curi.isPrerequisite()
                || rulesAccept(getMidfetchRule(curi), curi)) {
            return false;
        }
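        // The midfetch rules rejected this URI: mark where the content
        // begins so the fetch can be aborted after headers, before the
        // body is downloaded.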
        method.markContentBegin(conn);
        return true;
    }

    protected DecideRule getMidfetchRule(Object o) {
        try {
            return (DecideRule) getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * This method populates <code>curi</code> with response status and
     * content type.
     * @param curi CrawlURI to populate.
     * @param method Method to get response status and headers from.
     */
    protected void addResponseContent(HttpMethod method, CrawlURI curi) {
        curi.setFetchStatus(method.getStatusCode());
        Header ct = method.getResponseHeader("content-type");
        curi.setContentType((ct == null) ? null : ct.getValue());
        // Save method into curi too. Midfetch filters may want to leverage
        // info in here.
        curi.putObject(A_HTTP_TRANSACTION, method);
    }

    /**
     * Set the character encoding based on the result headers or default.
     *
     * The HttpClient returns its own default encoding ("ISO-8859-1") if one
     * isn't specified in the Content-Type response header. We give the user
     * the option of overriding this, so we need to detect the case where the
     * default is returned.
     *
     * Now, it may well be the case that the default returned by HttpClient
     * and the default defined by the user are the same.
     *
     * @param rec Recorder for this request.
     * @param method Method used for the request.
     */
    private void setCharacterEncoding(final HttpRecorder rec,
            final HttpMethod method) {
        String encoding = null;

        try {
            encoding = ((HttpMethodBase) method).getResponseCharSet();
            if (encoding == null
                    || encoding.equals(DEFAULT_CONTENT_CHARSET)) {
                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
            }
        } catch (Exception e) {
            logger.warning("Failed to get default encoding: "
                + e.getLocalizedMessage());
        }
        rec.setCharacterEncoding(encoding);
    }

    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param method Method we failed on.
     * @param exception Exception we failed with.
     */
    private void failedExecuteCleanup(final HttpMethod method,
            final CrawlURI curi, final Exception exception) {
        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
        method.releaseConnection();
    }

    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     * @param message Message to log with failure.
     * @param status Status to set on the fetch.
     */
    private void cleanup(final CrawlURI curi, final Exception exception,
            final String message, final int status) {
        curi.addLocalizedError(this.getName(), exception, message);
        curi.setFetchStatus(status);
        curi.getHttpRecorder().close();
    }

    /**
     * Can this processor fetch the given CrawlURI? May set a fetch
     * status if this processor would usually handle the CrawlURI,
     * but cannot in this instance.
     *
     * @param curi
     * @return True if processor can fetch.
     */
    private boolean canFetch(CrawlURI curi) {
        if (curi.getFetchStatus() < 0) {
            // Already marked as errored on an earlier pass through;
            // skip ahead to the post-processing chain.
            curi.skipToProcessorChain(getController()
                .getPostprocessorChain());
            return false;
        }
        String scheme = curi.getUURI().getScheme();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            // handles only plain http and https
            return false;
        }
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        // make sure the dns lookup succeeded
        if (host.getIP() == null && host.hasBeenLookedUp()) {
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return false;
        }
        return true;
    }

    /**
     * Configure the HttpMethod, setting options and headers.
     *
     * @param curi CrawlURI from which we pull configuration.
     * @param method The Method to configure.
     */
    protected HostConfiguration configureMethod(CrawlURI curi,
            HttpMethod method) {
        // Don't auto-follow redirects
        method.setFollowRedirects(false);

        // // set soTimeout
        // method.getParams().setSoTimeout(
        //     ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
        //         .intValue());

        // Set cookie policy.
        method.getParams().setCookiePolicy(
            ((Boolean) getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES))
                .booleanValue() ? CookiePolicy.IGNORE_COOKIES
                : CookiePolicy.BROWSER_COMPATIBILITY);

        // Use only HTTP/1.0 (to avoid receiving chunked responses)
        method.getParams().setVersion(HttpVersion.HTTP_1_0);

        CrawlOrder order = getSettingsHandler().getOrder();
        String userAgent = curi.getUserAgent();
        if (userAgent == null) {
            userAgent = order.getUserAgent(curi);
        }
        method.setRequestHeader("User-Agent", userAgent);
        method.setRequestHeader("From", order.getFrom(curi));

        // Set retry handler.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new HeritrixHttpMethodRetryHandler());

        final long maxLength = getMaxLength(curi);
        if (maxLength > 0 && ((Boolean) getUncheckedAttribute(curi,
                ATTR_SEND_RANGE)).booleanValue()) {
            method.addRequestHeader(RANGE,
                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
        }

        if (((Boolean) getUncheckedAttribute(curi,
                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
        }

        if (((Boolean) getUncheckedAttribute(curi, ATTR_SEND_REFERER))
                .booleanValue()) {
            // Per RFC2616, don't send the Referer header when the referring
            // page was https and the requested url is not.
            String via = curi.flattenVia();
            if (via != null && via.length() > 0
                    && !(via.startsWith(HTTPS_SCHEME) && curi.getUURI()
                        .getScheme().equals(HTTP_SCHEME))) {
                method.setRequestHeader(REFERER, via);
            }
        }

        if (!curi.isPrerequisite()) {
            setConditionalGetHeader(curi, method,
                ATTR_SEND_IF_MODIFIED_SINCE,
                CoreAttributeConstants.A_LAST_MODIFIED_HEADER,
                "If-Modified-Since");
            setConditionalGetHeader(curi, method,
                ATTR_SEND_IF_NONE_MATCH,
                CoreAttributeConstants.A_ETAG_HEADER,
                "If-None-Match");
        }

        // TODO: What happens if below method adds a header already
        // added above: e.g. Connection, Range, or Referer?
        setAcceptHeaders(curi, method);

        return configureProxy(curi);
    }

    /**
     * Set the given conditional-GET header, if the setting is enabled and
     * a suitable value is available in the URI history.
     * @param curi source CrawlURI
     * @param method HTTP operation pending
     * @param setting true/false enablement setting name to consult
     * @param sourceHeader header to consult in URI history
     * @param targetHeader header to set if possible
     */
    protected void setConditionalGetHeader(CrawlURI curi,
            HttpMethod method, String setting, String sourceHeader,
            String targetHeader) {
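        // e.g. a 'Last-Modified' value saved in this URI's fetch history is
        // sent back as this request's 'If-Modified-Since' (and likewise a
        // saved 'Etag' as 'If-None-Match').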
        if (((Boolean) getUncheckedAttribute(curi, setting))) {
            try {
                String previous = curi.getAList().getAListArray(
                    A_FETCH_HISTORY)[0].getString(sourceHeader);
                if (previous != null) {
                    method.setRequestHeader(targetHeader, previous);
                }
            } catch (RuntimeException e) {
                // for absent key, bad index, etc. just do nothing
            }
        }
    }

    /**
     * Setup proxy, based on attributes in CrawlURI and settings,
     * for this CrawlURI only.
     * @return HostConfiguration customized as necessary, or null if no
     * customization required
     */
    private HostConfiguration configureProxy(CrawlURI curi) {
        String proxy = (String) getAttributeEither(curi,
            ATTR_HTTP_PROXY_HOST);
        int port = -1;
        if (proxy.length() == 0) {
            proxy = null;
        } else {
            String portString = (String) getAttributeEither(curi,
                ATTR_HTTP_PROXY_PORT);
            port = portString.length() > 0 ? Integer.parseInt(portString)
                : -1;
        }
        HostConfiguration config = this.http.getHostConfiguration();
        boolean sameHost = (proxy == null) ? config.getProxyHost() == null
            : proxy.equals(config.getProxyHost());
        if (sameHost && config.getProxyPort() == port) {
            // no change from the global configuration
            return null;
        }
        config = new HostConfiguration(config); // copy of config
        config.setProxy(proxy, port);
        return config;
    }

    /**
     * Get a value either from inside the CrawlURI instance, or from
     * settings (module attributes).
     *
     * @param curi CrawlURI to consult
     * @param key key to lookup
     * @return value from either CrawlURI (preferred) or settings
     */
    protected Object getAttributeEither(CrawlURI curi, String key) {
        Object obj = curi != null ? curi.getObject(key) : null;
        if (obj == null) {
            obj = getUncheckedAttribute(curi, key);
        }
        return obj;
    }

    /**
     * Add credentials, if any, to the passed <code>method</code>.
     *
     * Do credential handling. Credentials live in two places. 1. Credentials
     * that succeeded are added to the CrawlServer (or rather, avatars for
     * credentials are what get added, because it is not safe to keep around
     * references to credentials). 2. Credentials still to be tried are in
     * the curi. Returns true if credentials to be tried were found.
     *
     * @param curi Current CrawlURI.
     * @param method The method to add to.
     * @return True if <code>method</code> was prepopulated with credentials
     * AND the credentials came from the <code>curi</code>, not from the
     * CrawlServer. The former is special in that if the <code>curi</code>
     * credentials succeed, then the caller needs to promote them from the
     * CrawlURI to the CrawlServer so they are available for all subsequent
     * CrawlURIs on this server.
     */
    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
        // First look at the server avatars. Add any that are to be
        // volunteered on every request (e.g. RFC2617 credentials); such
        // credentials return true from isEveryTime().
        CrawlServer server = getController().getServerCache()
            .getServerFor(curi);
        if (server.hasCredentialAvatars()) {
            Set avatars = server.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.isEveryTime()) {
                    c.populate(curi, this.http, method, ca.getPayload());
                }
            }
        }

        boolean result = false;

        // Now look in the curi. The curi will have credentials loaded either
        // by the handle401 method, if it is an RFC2617 challenge, or they
        // will have been set into the curi by the PreconditionEnforcer as
        // this login uri came through.
        if (curi.hasCredentialAvatars()) {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.populate(curi, this.http, method, ca.getPayload())) {
                    result = true;
                }
            }
        }

        return result;
    }

    /**
     * Promote successful credentials to the server.
     *
     * @param curi CrawlURI whose credentials we are to promote.
     */
    private void promoteCredentials(final CrawlURI curi) {
        if (!curi.hasCredentialAvatars()) {
            logger.severe("No credentials to promote when there should be: "
                + curi);
        } else {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                curi.removeCredentialAvatar(ca);
                // The server to attach to may not be the server that hosts
                // this passed curi. It might be on another subdomain.
                // The avatar needs to be added to the server that is
                // dependent on this precondition. Find it by name. Get the
                // name from the credential this avatar represents.
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                String cd = null;
                try {
                    cd = c.getCredentialDomain(curi);
                } catch (AttributeNotFoundException e) {
                    logger.severe("Failed to get cred domain for " + curi
                        + " for " + ca + ": " + e.getMessage());
                }
                if (cd != null) {
                    CrawlServer cs = getController().getServerCache()
                        .getServerFor(cd);
                    if (cs != null) {
                        cs.addCredentialAvatar(ca);
                    }
                }
            }
        }
    }

    /**
     * Server is looking for basic/digest auth credentials (RFC2617). If we
     * have any, put them into the CrawlURI and have it come around again.
     * Presence of the credential serves as a flag to the frontier to requeue
     * promptly. If we already tried this domain and still got a 401, then
     * our credentials are bad. Remove them and let this curi die.
     *
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     */
    protected void handle401(final HttpMethod method, final CrawlURI curi) {
        AuthScheme authscheme = getAuthScheme(method, curi);
        if (authscheme == null) {
            return;
        }
        String realm = authscheme.getRealm();

        // Look to see if this curi had rfc2617 avatars loaded. If so, are
        // any of them for this realm? If so, then the credential failed
        // if we got a 401 and it should be let die a natural 401 death.
        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
            curi, Rfc2617Credential.class);
        Rfc2617Credential extant = Rfc2617Credential.getByRealm(
            curiRfc2617Credentials, realm, curi);
        if (extant != null) {
            // Then we already tried this credential. Remove ANY rfc2617
            // credential since presence of a rfc2617 credential serves
            // as flag to frontier to requeue this curi and let the curi
            // die a natural death.
            extant.detachAll(curi);
            logger.warning("Auth failed (401) though we supplied realm "
                + realm + " to " + curi.toString());
        } else {
            // Look to see if we have a credential that corresponds to this
            // realm in the credential store. Filter by type and credential
            // domain. If not, let this curi die. Else, add it to the
            // curi and let it come around again. Add in the AuthScheme
            // we got too; it's needed when we go to run the auth the
            // second time around.
            CredentialStore cs = CredentialStore
                .getCredentialStore(getSettingsHandler());
            if (cs == null) {
                logger.severe("No credential store for " + curi);
            } else {
                CrawlServer server = getController().getServerCache()
                    .getServerFor(curi);
                Set storeRfc2617Credentials = cs.subset(curi,
                    Rfc2617Credential.class, server.getName());
                if (storeRfc2617Credentials == null
                        || storeRfc2617Credentials.size() <= 0) {
                    logger.info("No rfc2617 credentials for " + curi);
                } else {
                    Rfc2617Credential found = Rfc2617Credential.getByRealm(
                        storeRfc2617Credentials, realm, curi);
                    if (found == null) {
                        logger.info("No rfc2617 credentials for realm "
                            + realm + " in " + curi);
                    } else {
                        found.attach(curi, authscheme.getRealm());
                        logger.info("Found credential for realm " + realm
                            + " in store for " + curi.toString());
                    }
                }
            }
        }
    }

    /**
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     * @return Returns first wholesome authscheme found, else null.
     */
    protected AuthScheme getAuthScheme(final HttpMethod method,
            final CrawlURI curi) {
        Header[] headers = method.getResponseHeaders("WWW-Authenticate");
        if (headers == null || headers.length <= 0) {
            logger.info("We got a 401 but no WWW-Authenticate challenge: "
                + curi.toString());
            return null;
        }

        Map authschemes = null;
        try {
            authschemes = AuthChallengeParser.parseChallenges(headers);
        } catch (MalformedChallengeException e) {
            logger.info("Failed challenge parse: " + e.getMessage());
        }
        if (authschemes == null || authschemes.size() <= 0) {
            logger.info("We got a 401 and a WWW-Authenticate challenge"
                + " but failed to parse the header: " + curi.toString());
            return null;
        }

        AuthScheme result = null;
        // Use the first auth found.
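        // (Keys of the parsed map are scheme names, e.g. "basic" or
        // "digest"; values are the full challenge strings handed to
        // processChallenge() below.)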
        for (Iterator i = authschemes.keySet().iterator(); result == null
                && i.hasNext();) {
            String key = (String) i.next();
            String challenge = (String) authschemes.get(key);
            if (key == null || key.length() <= 0 || challenge == null
                    || challenge.length() <= 0) {
                logger.warning("Empty scheme: " + curi.toString() + ": "
                    + headers);
            }
            AuthScheme authscheme = null;
            if (key.equals("basic")) {
                authscheme = new BasicScheme();
            } else if (key.equals("digest")) {
                authscheme = new DigestScheme();
            } else {
                logger.info("Unsupported scheme: " + key);
                continue;
            }

            try {
                authscheme.processChallenge(challenge);
            } catch (MalformedChallengeException e) {
                logger.info(e.getMessage() + " " + curi + " " + headers);
                continue;
            }
            if (authscheme.isConnectionBased()) {
                logger.info("Connection based " + authscheme);
                continue;
            }

            if (authscheme.getRealm() == null
                    || authscheme.getRealm().length() <= 0) {
                logger.info("Empty realm " + authscheme + " for " + curi);
                continue;
            }
            result = authscheme;
        }

        return result;
    }

    /**
     * @param handler Settings Handler.
     * @param curi CrawlURI that got a 401.
     * @param type Class of credential to get from curi.
     * @return Set of credentials attached to this curi.
     */
    private Set<Credential> getCredentials(SettingsHandler handler,
            CrawlURI curi, Class type) {
        Set<Credential> result = null;

        if (curi.hasCredentialAvatars()) {
            for (Iterator i = curi.getCredentialAvatars().iterator();
                    i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                if (ca.match(type)) {
                    if (result == null) {
                        result = new HashSet<Credential>();
                    }
                    result.add(ca.getCredential(handler, curi));
                }
            }
        }
        return result;
    }

    public void initialTasks() {
        super.initialTasks();
        this.getController().addCrawlStatusListener(this);
        configureHttp();

        // load cookies from a file if specified in the order file.
        loadCookies();

        // I tried to get the default KeyManagers but it doesn't work unless
        // you point at a physical keystore. Passing null seems to do the
        // right thing, so we'll go with that.
        try {
            SSLContext context = SSLContext.getInstance("SSL");
            context.init(null,
                new TrustManager[] { new ConfigurableX509TrustManager(
                    (String) getAttribute(ATTR_TRUST)) },
                null);
            this.sslfactory = context.getSocketFactory();
        } catch (Exception e) {
            logger.log(Level.WARNING, "Failed configure of ssl context "
                + e.getMessage(), e);
        }
    }

    public void finalTasks() {
        // At the end save cookies to the file specified in the order file.
        saveCookies();
        cleanupHttp();
        super.finalTasks();
    }

    /**
     * Perform any final cleanup related to the HttpClient instance.
     */
    protected void cleanupHttp() {
        if (cookieDb != null) {
            try {
                cookieDb.sync();
                cookieDb.close();
            } catch (DatabaseException e) {
                e.printStackTrace();
            }
        }
    }

    protected void configureHttp() throws RuntimeException {
        // Get timeout. Use it for socket and for connection timeout.
        int timeout = (getSoTimeout(null) > 0) ? getSoTimeout(null) : 0;

        // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
        HttpConnectionManager cm = new SingleHttpConnectionManager();

        // TODO: The following settings should be made in the corresponding
        // HttpConnectionManager, not here.
        HttpConnectionManagerParams hcmp = cm.getParams();
        hcmp.setConnectionTimeout(timeout);
        hcmp.setStaleCheckingEnabled(true);
        // Leave tcpNoDelay false to minimize bandwidth usage (setting it to
        // true would disable Nagle's algorithm). Also, IBM JVMs < 1.4.2 give
        // an NPE setting this boolean on ssl sockets.
        hcmp.setTcpNoDelay(false);

        this.http = new HttpClient(cm);
        HttpClientParams hcp = this.http.getParams();
        // Set default socket timeout.
        hcp.setSoTimeout(timeout);
        // Set client to be version 1.0.
        hcp.setVersion(HttpVersion.HTTP_1_0);

        String addressStr = null;
        try {
            addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS);
        } catch (Exception e1) {
            // If exception, just use default.
        }
        if (addressStr != null && addressStr.length() > 0) {
            try {
                InetAddress localAddress = InetAddress.getByName(addressStr);
                this.http.getHostConfiguration().setLocalAddress(
                    localAddress);
            } catch (UnknownHostException e) {
                // Convert to RuntimeException so we get an exception out
                // if initialization fails.
                throw new RuntimeException("Unknown host " + addressStr
                    + " in " + ATTR_LOCAL_ADDRESS);
            }
        }

        configureHttpCookies();

        // Configure how we want the method to act.
        this.http.getParams().setParameter(
            HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));
        this.http.getParams().setParameter(
            HttpMethodParams.UNAMBIGUOUS_STATUS_LINE, new Boolean(false));
        this.http.getParams().setParameter(
            HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));
        this.http.getParams().setIntParameter(
            HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);

        HostConfiguration configOrNull = configureProxy(null);
        if (configOrNull != null) {
            // global proxy settings are in effect
            this.http.setHostConfiguration(configOrNull);
        }

        // Use our own protocol factory, one that gets the IP to use from
        // the heritrix cache (they're cached in CrawlHost instances).
        final ServerCache cache = getController().getServerCache();
        hcmp.setParameter(SERVER_CACHE_KEY, cache);
        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
    }

    /**
     * Set the HttpClient HttpState instance to use a BDB-backed
     * StoredSortedMap for cookie storage, if that option is chosen.
     */
    private void configureHttpCookies() {
        // If BDB-backed cookies chosen, replace map in HttpState
        if (((Boolean) getUncheckedAttribute(null, ATTR_BDB_COOKIES))
                .booleanValue()) {
            try {
                EnhancedEnvironment env = getController().getBdbEnvironment();
                StoredClassCatalog classCatalog = env.getClassCatalog();
                DatabaseConfig dbConfig = new DatabaseConfig();
                dbConfig.setTransactional(false);
                dbConfig.setAllowCreate(true);
                dbConfig.setDeferredWrite(true);
                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb,
                    new StringBinding(),
                    new SerialBinding(classCatalog, Cookie.class), true);
                this.http.getState().setCookiesMap(cookiesMap);
            } catch (DatabaseException e) {
                logger.severe(e.getMessage());
                e.printStackTrace();
            }
        }
    }

    /**
     * @param curi Current CrawlURI. Used to get context.
     * @return Socket timeout value.
     */
    private int getSoTimeout(CrawlURI curi) {
        Integer res = null;
        try {
            res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
        } catch (Exception e) {
            res = DEFAULT_SOTIMEOUT_MS;
        }
        return res.intValue();
    }

    /**
     * @param curi Current CrawlURI. Used to get context.
     * @return Timeout value for total request.
     */
    private int getTimeout(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
        } catch (Exception e) {
            res = DEFAULT_TIMEOUT_SECONDS;
        }
        return res.intValue();
    }

    private int getMaxFetchRate(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
        } catch (Exception e) {
            res = DEFAULT_FETCH_BANDWIDTH_MAX;
        }
        return res.intValue();
    }

    private long getMaxLength(CrawlURI curi) {
        Long res;
        try {
            res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
            if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
                res = DEFAULT_MAX_LENGTH_BYTES;
            }
        } catch (Exception e) {
            res = DEFAULT_MAX_LENGTH_BYTES;
        }
        return res.longValue();
    }

    /**
     * Load cookies from a file before the first fetch.
     * <p>
     * The file is a text file in Netscape's 'cookies.txt' file format.<br>
     * Example entry of cookies.txt file:<br>
     * <br>
     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
     * <br>
     * Each line has 7 tab-separated fields:<br>
     * <ul>
     * <li>1. DOMAIN: The domain that created, and has access to, the cookie
     * value.
     * <li>2. FLAG: A TRUE or FALSE value indicating whether hosts within the
     * given domain can access the cookie value.
     * <li>3. PATH: The path within the domain that the cookie value is valid
     * for.
     * <li>4. SECURE: A TRUE or FALSE value indicating whether a secure
     * connection is required to access the cookie value.
     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
     * <li>6. NAME: The name of the cookie value.
     * <li>7. VALUE: The cookie value.
     * </ul>
     *
     * @param cookiesFile file in Netscape's 'cookies.txt' format.
     */
    public void loadCookies(String cookiesFile) {
        // Do nothing if cookiesFile is not specified.
        if (cookiesFile == null || cookiesFile.length() <= 0) {
            return;
        }
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(cookiesFile, "r");
            String[] cookieParts;
            String line;
            Cookie cookie = null;
            while ((line = raf.readLine()) != null) {
                // Lines that start with '#' are comments; skip them.
                if (!line.startsWith("#")) {
                    cookieParts = line.split("\\t");
                    if (cookieParts.length == 7) {
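                        // Field order per the comment above: [0]=domain,
                        // [1]=flag, [2]=path, [3]=secure, [4]=expiration,
                        // [5]=name, [6]=value.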
                        // Create the cookie with no expiration date (-1).
                        // TODO: add this as an option.
                        cookie = new Cookie(cookieParts[0], cookieParts[5],
                            cookieParts[6], cookieParts[2], -1,
                            Boolean.valueOf(cookieParts[3]).booleanValue());

                        cookie.setDomainAttributeSpecified(
                            cookieParts[1].equalsIgnoreCase("true"));
                        this.http.getState().addCookie(cookie);
                        logger.fine("Adding cookie: "
                            + cookie.toExternalForm());
                    }
                }
            }
        } catch (FileNotFoundException e) {
            // We should probably throw FatalConfigurationException.
            System.out.println("Could not find file: " + cookiesFile
                + " (Element: " + ATTR_LOAD_COOKIES + ")");
        } catch (IOException e) {
            // We should probably throw FatalConfigurationException.
            e.printStackTrace();
        } finally {
            try {
                if (raf != null) {
                    raf.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
        ret.append(" Function: Fetch HTTP URIs\n");
        ret.append(" CrawlURIs handled: " + this.curisHandled + "\n");
        ret.append(" Recovery retries: " + this.recoveryRetries + "\n\n");

        return ret.toString();
    }

    /**
     * Load cookies from the file specified in the order file.
     * <p>
     * The file is a text file in Netscape's 'cookies.txt' format; see
     * {@link #loadCookies(String)} for the format details.
     */
    public void loadCookies() {
        try {
            loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
        } catch (MBeanException e) {
            logger.warning(e.getLocalizedMessage());
        } catch (ReflectionException e) {
            logger.warning(e.getLocalizedMessage());
        } catch (AttributeNotFoundException e) {
            logger.warning(e.getLocalizedMessage());
        }
    }
1537:
    /**
     * Saves cookies to the file specified in the order file.
     * <p>
     * Output file is in the Netscape 'cookies.txt' format.
     */
1544: public void saveCookies() {
1545: try {
1546: saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
1547: } catch (MBeanException e) {
1548: logger.warning(e.getLocalizedMessage());
1549: } catch (ReflectionException e) {
1550: logger.warning(e.getLocalizedMessage());
1551: } catch (AttributeNotFoundException e) {
1552: logger.warning(e.getLocalizedMessage());
1553: }
1554: }
1555:
    /**
     * Saves cookies to a file.
     * <p>
     * Output file is in the Netscape 'cookies.txt' format, except that
     * the EXPIRATION field is not written: each emitted line carries six
     * tab-separated fields (DOMAIN, FLAG, PATH, SECURE, NAME, VALUE).
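     * <p>
     * For example, a saved entry written by this method looks like
     * (fields are tab-separated):
     * <pre>
     * www.archive.org FALSE / FALSE details-visit texts-cralond
     * </pre>
     *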
1561: * @param saveCookiesFile output file.
1562: */
1563: public void saveCookies(String saveCookiesFile) {
        // Do nothing if saveCookiesFile is not specified.
1565: if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
1566: return;
1567: }
1568:
        FileOutputStream out = null;
        try {
            out = new FileOutputStream(new File(saveCookiesFile));
            @SuppressWarnings("unchecked")
            Map<String, Cookie> cookies = http.getState().getCookiesMap();
            String tab = "\t";
            out.write("# Heritrix Cookie File\n".getBytes());
            out.write("# This file is in the Netscape cookies.txt format\n\n"
                .getBytes());
            for (Cookie cookie : cookies.values()) {
                // Guess at an initial size.
                MutableString line = new MutableString(1024 * 2);
                line.append(cookie.getDomain());
                line.append(tab);
                line.append(cookie.isDomainAttributeSpecified() ? "TRUE"
                    : "FALSE");
                line.append(tab);
                line.append(cookie.getPath());
                line.append(tab);
                line.append(cookie.getSecure() ? "TRUE" : "FALSE");
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);
                line.append((null == cookie.getValue()) ? ""
                    : cookie.getValue());
                line.append("\n");
                out.write(line.toString().getBytes());
            }
        } catch (FileNotFoundException e) {
            // We should probably throw FatalConfigurationException.
            logger.severe("Could not find file: " + saveCookiesFile
                + " (Element: " + ATTR_SAVE_COOKIES + ")");
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "Failed writing cookies to " + saveCookiesFile, e);
1607: } finally {
1608: try {
1609: if (out != null) {
1610: out.close();
1611: }
1612: } catch (IOException e) {
                logger.log(Level.WARNING,
                    "Failed closing " + saveCookiesFile, e);
1614: }
1615: }
1616: }
1617:
1618: /* (non-Javadoc)
1619: * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
1620: */
1621: protected void listUsedFiles(List<String> list) {
        // Add the cookie load and save files, if configured.
1624: try {
1625: String tmp = (String) getAttribute(ATTR_LOAD_COOKIES);
1626: if (tmp != null && tmp.length() > 0) {
1627: File file = getSettingsHandler()
1628: .getPathRelativeToWorkingDirectory(tmp);
1629: list.add(file.getAbsolutePath());
1630: }
1631: tmp = (String) getAttribute(ATTR_SAVE_COOKIES);
1632: if (tmp != null && tmp.length() > 0) {
1633: File file = getSettingsHandler()
1634: .getPathRelativeToWorkingDirectory(tmp);
1635: list.add(file.getAbsolutePath());
1636: }
        } catch (AttributeNotFoundException e) {
            logger.warning(e.getLocalizedMessage());
        } catch (MBeanException e) {
            logger.warning(e.getLocalizedMessage());
        } catch (ReflectionException e) {
            logger.warning(e.getLocalizedMessage());
        }
1647: }
1648:
    /**
     * Sets the configured Accept* request headers, if any, on the given
     * method. Each configured entry must be a single "Name: value" header
     * string (for example, "Accept-Language: en"); entries that do not
     * split into a name and a value are logged as invalid and skipped.
     */
    private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1650: try {
1651: StringList accept_headers = (StringList) getAttribute(
1652: ATTR_ACCEPT_HEADERS, curi);
1653: if (!accept_headers.isEmpty()) {
                for (ListIterator i = accept_headers.listIterator();
                        i.hasNext();) {
1656: String hdr = (String) i.next();
1657: String[] nvp = hdr.split(": +");
1658: if (nvp.length == 2) {
1659: get.setRequestHeader(nvp[0], nvp[1]);
1660: } else {
1661: logger.warning("Invalid accept header: " + hdr);
1662: }
1663: }
1664: }
1665: } catch (AttributeNotFoundException e) {
1666: logger.severe(e.getMessage());
1667: }
1668: }
1669:
    // Custom serialization: the HttpClient instance is rebuilt rather
    // than serialized, so its cookies are written out explicitly here
    // and reinstated via PostRestore on deserialization.
1671: private void writeObject(ObjectOutputStream stream)
1672: throws IOException {
1673: stream.defaultWriteObject();
1674: // save cookies
1675: @SuppressWarnings("unchecked")
1676: Collection<Cookie> c = http.getState().getCookiesMap().values();
1677: Cookie[] cookies = c.toArray(new Cookie[c.size()]);
1678: stream.writeObject(cookies);
1679: }
1680:
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Cookie[] cookies = (Cookie[]) stream.readObject();
        ObjectPlusFilesInputStream coistream =
            (ObjectPlusFilesInputStream) stream;
        // Defer HttpClient reconstruction and cookie reinstatement until
        // the rest of the object graph has been restored.
        coistream.registerFinishTask(new PostRestore(cookies));
1687: }
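    // Illustrative round trip (an assumption about how Heritrix's
    // checkpoint machinery drives this pair, inferred from the methods
    // above): writeObject(...) persists the current cookies alongside
    // the default serialized state; on restore, readObject(...) reads
    // them back and PostRestore reinstates them into a freshly
    // configured HttpClient.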
1688:
1689: /**
1690: * @return Returns the http instance.
1691: */
1692: protected HttpClient getHttp() {
        return this.http;
1694: }
1695:
    /**
     * Deferred task run after deserialization completes: rebuilds the
     * HttpClient via configureHttp() and reinstates the serialized
     * cookies into its state.
     */
    class PostRestore implements Runnable {
        Cookie[] cookies;

        public PostRestore(Cookie[] cookies) {
            this.cookies = cookies;
        }
1702:
1703: public void run() {
1704: configureHttp();
1705: for (int i = 0; i < cookies.length; i++) {
1706: getHttp().getState().addCookie(cookies[i]);
1707: }
1708: }
1709: }
1710:
1711: /* (non-Javadoc)
1712: * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1713: */
1714: public void crawlStarted(String message) {
        // Nothing to do at crawl start.
1716: }
1717:
    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
     */
    public void crawlCheckpoint(File checkpointDir) {
        try {
            // Sync the cookie database to disk so the checkpoint
            // captures the current cookie state.
            cookieDb.sync();
        } catch (DatabaseException e) {
            // A checkpoint without a synced cookie DB would be
            // incomplete; fail loudly.
            throw new RuntimeException(e);
        }
1728: }
1729:
1730: /* (non-Javadoc)
1731: * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1732: */
1733: public void crawlEnding(String sExitMessage) {
        // Nothing to do while the crawl is ending.
1735: }
1736:
1737: /* (non-Javadoc)
1738: * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1739: */
1740: public void crawlEnded(String sExitMessage) {
        // Release the HttpClient so it can be garbage collected.
        this.http = null;
1742: }
1743:
1744: /* (non-Javadoc)
1745: * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1746: */
1747: public void crawlPausing(String statusMessage) {
        // Nothing to do on pausing.
1749: }
1750:
1751: /* (non-Javadoc)
1752: * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1753: */
1754: public void crawlPaused(String statusMessage) {
        // Nothing to do on pause.
1756: }
1757:
1758: /* (non-Javadoc)
1759: * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1760: */
1761: public void crawlResuming(String statusMessage) {
        // Nothing to do on resume.
1763: }
1764: }
|