Source Code Cross Referenced for FetchHTTP.java in Web Crawler » heritrix » org.archive.crawler.fetcher



0001:        /* FetchHTTP.java
0002:         *
0003:         * $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $
0004:         *
0005:         * Created on Jun 5, 2003
0006:         *
0007:         * Copyright (C) 2003 Internet Archive.
0008:         *
0009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
0010:         *
0011:         * Heritrix is free software; you can redistribute it and/or modify
0012:         * it under the terms of the GNU Lesser Public License as published by
0013:         * the Free Software Foundation; either version 2.1 of the License, or
0014:         * any later version.
0015:         *
0016:         * Heritrix is distributed in the hope that it will be useful,
0017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0019:         * GNU Lesser Public License for more details.
0020:         *
0021:         * You should have received a copy of the GNU Lesser Public License
0022:         * along with Heritrix; if not, write to the Free Software
0023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0024:         */
0025:        package org.archive.crawler.fetcher;
0026:
0027:        import it.unimi.dsi.mg4j.util.MutableString;
0028:
0029:        import java.io.File;
0030:        import java.io.FileNotFoundException;
0031:        import java.io.FileOutputStream;
0032:        import java.io.IOException;
0033:        import java.io.ObjectInputStream;
0034:        import java.io.ObjectOutputStream;
0035:        import java.io.RandomAccessFile;
0036:        import java.security.KeyManagementException;
0037:        import java.security.KeyStoreException;
0038:        import java.security.MessageDigest;
0039:        import java.security.NoSuchAlgorithmException;
0040:        import java.util.Collection;
0041:        import java.util.HashSet;
0042:        import java.util.Iterator;
0043:        import java.util.List;
0044:        import java.util.ListIterator;
0045:        import java.util.Map;
0046:        import java.util.Set;
0047:        import java.util.logging.Level;
0048:        import java.util.logging.Logger;
0049:        import java.net.InetAddress;
0050:        import java.net.UnknownHostException;
0051:
0052:        import javax.management.AttributeNotFoundException;
0053:        import javax.management.MBeanException;
0054:        import javax.management.ReflectionException;
0055:        import javax.net.ssl.SSLContext;
0056:        import javax.net.ssl.SSLSocketFactory;
0057:        import javax.net.ssl.TrustManager;
0058:
0059:        import org.apache.commons.httpclient.Cookie;
0060:        import org.apache.commons.httpclient.Header;
0061:        import org.apache.commons.httpclient.HostConfiguration;
0062:        import org.apache.commons.httpclient.HttpClient;
0063:        import org.apache.commons.httpclient.HttpConnection;
0064:        import org.apache.commons.httpclient.HttpConnectionManager;
0065:        import org.apache.commons.httpclient.HttpException;
0066:        import org.apache.commons.httpclient.HttpMethod;
0067:        import org.apache.commons.httpclient.HttpMethodBase;
0068:        import org.apache.commons.httpclient.HttpState;
0069:        import org.apache.commons.httpclient.HttpStatus;
0070:        import org.apache.commons.httpclient.HttpVersion;
0071:        import org.apache.commons.httpclient.auth.AuthChallengeParser;
0072:        import org.apache.commons.httpclient.auth.AuthScheme;
0073:        import org.apache.commons.httpclient.auth.BasicScheme;
0074:        import org.apache.commons.httpclient.auth.DigestScheme;
0075:        import org.apache.commons.httpclient.auth.MalformedChallengeException;
0076:        import org.apache.commons.httpclient.cookie.CookiePolicy;
0077:        import org.apache.commons.httpclient.params.HttpClientParams;
0078:        import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
0079:        import org.apache.commons.httpclient.params.HttpMethodParams;
0080:        import org.apache.commons.httpclient.protocol.Protocol;
0081:        import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
0082:        import org.archive.crawler.Heritrix;
0083:        import org.archive.crawler.datamodel.CoreAttributeConstants;
0084:        import org.archive.crawler.datamodel.CrawlHost;
0085:        import org.archive.crawler.datamodel.CrawlOrder;
0086:        import org.archive.crawler.datamodel.CrawlServer;
0087:        import org.archive.crawler.datamodel.CrawlURI;
0088:        import org.archive.crawler.datamodel.CredentialStore;
0089:        import org.archive.crawler.datamodel.FetchStatusCodes;
0090:        import org.archive.crawler.datamodel.ServerCache;
0091:        import org.archive.crawler.datamodel.credential.Credential;
0092:        import org.archive.crawler.datamodel.credential.CredentialAvatar;
0093:        import org.archive.crawler.datamodel.credential.Rfc2617Credential;
0094:        import org.archive.crawler.deciderules.DecideRule;
0095:        import org.archive.crawler.deciderules.DecideRuleSequence;
0096:        import org.archive.crawler.event.CrawlStatusListener;
0097:        import org.archive.crawler.framework.Filter;
0098:        import org.archive.crawler.framework.Processor;
0099:        import org.archive.crawler.settings.MapType;
0100:        import org.archive.crawler.settings.SettingsHandler;
0101:        import org.archive.crawler.settings.SimpleType;
0102:        import org.archive.crawler.settings.StringList;
0103:        import org.archive.crawler.settings.Type;
0104:        import org.archive.httpclient.ConfigurableX509TrustManager;
0105:        import org.archive.httpclient.HttpRecorderGetMethod;
0106:        import org.archive.httpclient.HttpRecorderMethod;
0107:        import org.archive.httpclient.HttpRecorderPostMethod;
0108:        import org.archive.httpclient.SingleHttpConnectionManager;
0109:        import org.archive.io.ObjectPlusFilesInputStream;
0110:        import org.archive.io.RecorderLengthExceededException;
0111:        import org.archive.io.RecorderTimeoutException;
0112:        import org.archive.io.RecorderTooMuchHeaderException;
0113:        import org.archive.util.ArchiveUtils;
0114:        import org.archive.util.HttpRecorder;
0115:        import org.archive.util.bdbje.EnhancedEnvironment;
0116:
0117:        import st.ata.util.AList;
0118:
0119:        import com.sleepycat.bind.serial.SerialBinding;
0120:        import com.sleepycat.bind.serial.StoredClassCatalog;
0121:        import com.sleepycat.bind.tuple.StringBinding;
0122:        import com.sleepycat.collections.StoredSortedMap;
0123:        import com.sleepycat.je.Database;
0124:        import com.sleepycat.je.DatabaseConfig;
0125:        import com.sleepycat.je.DatabaseException;
0126:        import com.sleepycat.je.Environment;
0127:
0128:        /**
0129:         * HTTP fetcher that uses <a
0130:         * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
0131:         * HttpClient</a> library.
0132:         *
0133:         * @author Gordon Mohr
0134:         * @author Igor Ranitovic
0135:         * @author others
0136:         * @version $Id: FetchHTTP.java 5093 2007-04-24 21:48:34Z gojomo $
0137:         */
0138:        public class FetchHTTP extends Processor implements 
0139:                CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
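            /*
             * Usage sketch (added commentary, not from the original source): FetchHTTP
             * is normally instantiated and configured by the Heritrix settings
             * framework from the crawl order rather than constructed by hand.  Roughly:
             *
             *   FetchHTTP fetcher = new FetchHTTP("HTTP");  // name used in the fetch chain
             *   // The framework registers it in the processor chain, then calls
             *   // innerProcess(curi) for every CrawlURI routed to the fetcher and
             *   // crawlEnded() when the crawl finishes.
             *
             * Lifecycle details beyond what appears in this excerpt are assumptions
             * based on the Processor and CrawlStatusListener contracts.
             */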
0140:            // be robust against trivial implementation changes
0141:            private static final long serialVersionUID = ArchiveUtils
0142:                    .classnameBasedUID(FetchHTTP.class, 1);
0143:
0144:            private static Logger logger = Logger.getLogger(FetchHTTP.class
0145:                    .getName());
0146:
0147:            public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
0148:            public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
0149:            public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
0150:            public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
0151:            public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
0152:            public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
0153:            public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
0154:            public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
0155:            public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
0156:            public static final String ATTR_DIGEST_CONTENT = "digest-content";
0157:            public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
0158:            public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";
0159:
0160:            /**
0161:             * SSL trust level setting attribute name.
0162:             */
0163:            public static final String ATTR_TRUST = "trust-level";
0164:
0165:            private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
0166:            private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
0167:            private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
0168:            private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;
0169:
0170:            /**
0171:             * This is the default value pre-1.4. Needs special handling or else it
0172:             * is treated as a negative number when doing math later in processing.
0173:             */
0174:            private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;
0175:
0176:            /**
0177:             * Default character encoding to use for pages that do not specify.
0178:             */
0179:            private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;
0180:
0181:            /**
0182:             * Default whether to perform on-the-fly digest hashing of content-bodies.
0183:             */
0184:            static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);
0185:
0186:            /**
0187:             * The different digest algorithms to choose between:
0188:             * SHA-1 or MD5 at the moment.
0189:             */
0190:            public static final String SHA1 = "sha1";
0191:            public static final String MD5 = "md5";
0192:            public static String[] DIGEST_ALGORITHMS = { SHA1, MD5 };
0193:
0194:            /**
0195:             * Default algorithm to use for message digesting.
0196:             */
0197:            public static final String DEFAULT_DIGEST_ALGORITHM = SHA1;
0198:
0199:            private transient HttpClient http = null;
0200:
0201:            /**
0202:             * How many 'instant retries' of HttpRecoverableExceptions have occurred
0203:             * 
0204:             * Would like it to be 'long', but longs aren't atomic
0205:             */
0206:            private int recoveryRetries = 0;
0207:
0208:            /**
0209:             * Count of crawl uris handled.
0210:             * Would like to be 'long', but longs aren't atomic
0211:             */
0212:            private int curisHandled = 0;
0213:
0214:            /**
0215:             * Rules to apply mid-fetch, just after receipt of the response
0216:             * headers before we start to download body.
0217:             */
0218:            public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";
0219:
0220:            /**
0221:             * What to log if midfetch abort.
0222:             */
0223:            private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";
0224:
0225:            public static final String ATTR_SEND_CONNECTION_CLOSE = "send-connection-close";
0226:            private static final Header HEADER_SEND_CONNECTION_CLOSE = new Header(
0227:                    "Connection", "close");
0228:            public static final String ATTR_SEND_REFERER = "send-referer";
0229:            public static final String ATTR_SEND_RANGE = "send-range";
0230:            public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
0231:            public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
0232:            public static final String REFERER = "Referer";
0233:            public static final String RANGE = "Range";
0234:            public static final String RANGE_PREFIX = "bytes=0-";
0235:            public static final String HTTP_SCHEME = "http";
0236:            public static final String HTTPS_SCHEME = "https";
0237:
0238:            public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
0239:            private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);
0240:
0241:            public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
0242:            private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);
0243:
0244:            public static final String ATTR_LOCAL_ADDRESS = "bind-address";
0245:
0246:            /**
0247:             * Database backing cookie map, if using BDB
0248:             */
0249:            protected Database cookieDb;
0250:            /**
0251:             * Name of cookie BDB Database
0252:             */
0253:            public static final String COOKIEDB_NAME = "http_cookies";
0254:
0255:            static {
0256:                Protocol.registerProtocol("http", new Protocol("http",
0257:                        new HeritrixProtocolSocketFactory(), 80));
0258:                try {
0259:                    Protocol
0260:                            .registerProtocol(
0261:                                    "https",
0262:                                    new Protocol(
0263:                                            "https",
0264:                                            ((ProtocolSocketFactory) new HeritrixSSLProtocolSocketFactory()),
0265:                                            443));
0266:                } catch (KeyManagementException e) {
0267:                    e.printStackTrace();
0268:                } catch (KeyStoreException e) {
0269:                    e.printStackTrace();
0270:                } catch (NoSuchAlgorithmException e) {
0271:                    e.printStackTrace();
0272:                }
0273:            }
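            // Added commentary: registering custom "http" and "https" Protocol objects
            // above replaces Commons HttpClient's default socket factories JVM-wide, so
            // every connection this crawler opens goes through
            // HeritrixProtocolSocketFactory / HeritrixSSLProtocolSocketFactory.  Among
            // other things, that is what lets pre-resolved IPs from the server cache
            // and the configurable SSL trust level (ATTR_TRUST) take effect.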
0274:            static final String SERVER_CACHE_KEY = "heritrix.server.cache";
0275:            static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";
0276:
0277:            /**
0278:             * Socket factory that has the configurable trust manager installed.
0279:             */
0280:            private SSLSocketFactory sslfactory = null;
0281:
0282:            /**
0283:             * Constructor.
0284:             *
0285:             * @param name Name of this processor.
0286:             */
0287:            public FetchHTTP(String name) {
0288:                super (name, "HTTP Fetcher");
0289:
0290:                addElementToDefinition(new DecideRuleSequence(
0291:                        ATTR_MIDFETCH_DECIDE_RULES,
0292:                        "DecideRules which, if final decision is REJECT, "
0293:                                + "abort fetch after headers before all content is"
0294:                                + " read."));
0295:
0296:                addElementToDefinition(new SimpleType(
0297:                        ATTR_TIMEOUT_SECONDS,
0298:                        "If the fetch is not completed in this number of seconds, "
0299:                                + "even if it is making progress, give up. The URI will be "
0300:                                + "annotated as timeTrunc. Set to zero for no timeout. "
0301:                                + "(This is not recommended: threads could wait indefinitely "
0302:                                + "for the fetch to end.)",
0303:                        DEFAULT_TIMEOUT_SECONDS));
0304:                Type e = addElementToDefinition(new SimpleType(
0305:                        ATTR_SOTIMEOUT_MS,
0306:                        "If a socket is unresponsive for this number of milliseconds, "
0307:                                + "give up on that connect/read. (This does not necessarily give "
0308:                                + "up on the fetch immediately; connects are subject to retries "
0309:                                + "and reads will be retried until "
0310:                                + ATTR_TIMEOUT_SECONDS
0311:                                + " have elapsed.) Set to zero for no socket timeout. (This is "
0312:                                + "not recommended: a socket operation could hang indefinitely.)",
0313:                        DEFAULT_SOTIMEOUT_MS));
0314:                e.setExpertSetting(true);
0315:                e = addElementToDefinition(new SimpleType(
0316:                        ATTR_FETCH_BANDWIDTH_MAX,
0317:                        "The maximum KB/sec to use when fetching data from a server. "
0318:                                + "0 means no maximum.  Default: "
0319:                                + DEFAULT_FETCH_BANDWIDTH_MAX + ".",
0320:                        DEFAULT_FETCH_BANDWIDTH_MAX));
0321:                e.setExpertSetting(true);
0322:                e.setOverrideable(true);
0323:                addElementToDefinition(new SimpleType(
0324:                        ATTR_MAX_LENGTH_BYTES,
0325:                        "Maximum length in bytes to fetch.\n"
0326:                                + "Fetch is truncated at this length. A value of 0 means no limit.",
0327:                        DEFAULT_MAX_LENGTH_BYTES));
0328:                e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
0329:                        "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
0330:                e.setOverrideable(true);
0331:                e.setExpertSetting(true);
0332:                e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
0333:                        "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
0334:                e.setExpertSetting(true);
0335:
0336:                e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
0337:                        "File to preload cookies from", ""));
0338:                e.setExpertSetting(true);
0339:                e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
0340:                        "When crawl finishes save cookies to this file", ""));
0341:                e.setExpertSetting(true);
0342:                e = addElementToDefinition(new SimpleType(
0343:                        ATTR_TRUST,
0344:                        "SSL certificate trust level.  Range is from the default 'open'"
0345:                                + " (trust all certs including expired, selfsigned, and those for"
0346:                                + " which we do not have a CA) through 'loose' (trust all valid"
0347:                                + " certificates including selfsigned), 'normal' (all valid"
0348:                                + " certificates not including selfsigned) to 'strict' (Cert is"
0349:                                + " valid and DN must match servername)",
0350:                        ConfigurableX509TrustManager.DEFAULT,
0351:                        ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
0352:                e.setOverrideable(false);
0353:                e.setExpertSetting(true);
0354:                e = addElementToDefinition(new StringList(
0355:                        ATTR_ACCEPT_HEADERS,
0356:                        "Accept Headers to include in each request. Each must be the"
0357:                                + " complete header, e.g., 'Accept-Language: en'"));
0358:                e.setExpertSetting(true);
0359:                e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
0360:                        "Proxy host IP (set only if needed).", ""));
0361:                e.setExpertSetting(true);
0362:                e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
0363:                        "Proxy port (set only if needed)", ""));
0364:                e.setExpertSetting(true);
0365:                e = addElementToDefinition(new SimpleType(
0366:                        ATTR_DEFAULT_ENCODING,
0367:                        "The character encoding to use for files that do not have one"
0368:                                + " specified in the HTTP response headers.  Default: "
0369:                                + DEFAULT_CONTENT_CHARSET + ".",
0370:                        DEFAULT_CONTENT_CHARSET));
0371:                e.setExpertSetting(true);
0372:                e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT,
0373:                        "Whether or not to perform an on-the-fly digest hash of"
0374:                                + " retrieved content-bodies.",
0375:                        DEFAULT_DIGEST_CONTENT));
0376:                e.setExpertSetting(true);
0377:                e = addElementToDefinition(new SimpleType(
0378:                        ATTR_DIGEST_ALGORITHM,
0379:                        "Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest"
0380:                                + " hash of retrieved content-bodies.",
0381:                        DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
0382:                e.setExpertSetting(true);
0383:                e = addElementToDefinition(new SimpleType(
0384:                        ATTR_SEND_IF_MODIFIED_SINCE,
0385:                        "Send 'If-Modified-Since' header, if previous 'Last-Modified' "
0386:                                + "fetch history information is available in URI history.",
0387:                        new Boolean(true)));
0388:                e.setOverrideable(true);
0389:                e.setExpertSetting(true);
0390:                e = addElementToDefinition(new SimpleType(
0391:                        ATTR_SEND_IF_NONE_MATCH,
0392:                        "Send 'If-None-Match' header, if previous 'Etag' fetch "
0393:                                + "history information is available in URI history.",
0394:                        new Boolean(true)));
0395:                e.setOverrideable(true);
0396:                e.setExpertSetting(true);
0397:                e = addElementToDefinition(new SimpleType(
0398:                        ATTR_SEND_CONNECTION_CLOSE,
0399:                        "Send 'Connection: close' header with every request.",
0400:                        new Boolean(true)));
0401:                e.setOverrideable(true);
0402:                e.setExpertSetting(true);
0403:                e = addElementToDefinition(new SimpleType(
0404:                        ATTR_SEND_REFERER,
0405:                        "Send 'Referer' header with every request.\n"
0406:                                + "The 'Referer' header contains the location the crawler came "
0407:                                + "from, "
0408:                                + "the page the current URI was discovered in. The 'Referer' "
0409:                                + "usually is "
0410:                                + "logged on the remote server and can be of assistance to "
0411:                                + "webmasters trying to figure out how a crawler got to a "
0412:                                + "particular area on a site.", new Boolean(
0413:                                true)));
0414:                e.setOverrideable(true);
0415:                e.setExpertSetting(true);
0416:                e = addElementToDefinition(new SimpleType(
0417:                        ATTR_SEND_RANGE,
0418:                        "Send 'Range' header when a limit ("
0419:                                + ATTR_MAX_LENGTH_BYTES
0420:                                + ") is set on document size.\n"
0421:                                + "Be polite to the HTTP servers and send the 'Range' header, "
0422:                                + "stating that you are only interested in the first n bytes. "
0423:                                + "Only pertinent if "
0424:                                + ATTR_MAX_LENGTH_BYTES
0425:                                + " > 0. "
0426:                                + "Sending the 'Range' header results in a "
0427:                                + "'206 Partial Content' status response, which is better than "
0428:                                + "just cutting the response mid-download. On rare occasion, "
0429:                                + "sending 'Range' will "
0430:                                + "generate a '416 Requested Range Not Satisfiable' response.",
0431:                        new Boolean(false)));
0432:                e.setOverrideable(true);
0433:                e.setExpertSetting(true);
0434:                e = addElementToDefinition(new SimpleType(
0435:                        ATTR_LOCAL_ADDRESS,
0436:                        "Local IP address or hostname to use when making connections "
0437:                                + "(binding sockets). When not specified, uses default local "
0438:                                + "address(es).", ""));
0439:                e.setExpertSetting(true);
0440:            }
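            /*
             * Configuration sketch (added, illustrative only): in a Heritrix 1.x crawl
             * order these attributes appear as child elements of this processor's entry
             * in the fetch chain.  The element names mirror the ATTR_* constants defined
             * above; the surrounding XML structure is an assumption about the usual
             * order.xml layout, not taken from this file:
             *
             *   <newObject name="HTTP" class="org.archive.crawler.fetcher.FetchHTTP">
             *     <integer name="timeout-seconds">1200</integer>
             *     <integer name="sotimeout-ms">20000</integer>
             *     <long name="max-length-bytes">0</long>
             *     <boolean name="digest-content">true</boolean>
             *   </newObject>
             */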
0441:
0442:            protected void innerProcess(final CrawlURI curi)
0443:                    throws InterruptedException {
0444:                if (!canFetch(curi)) {
0445:                    // Cannot fetch this, due to protocol, retries, or other problems
0446:                    return;
0447:                }
0448:
0449:                this .curisHandled++;
0450:
0451:                // Note begin time
0452:                curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
0453:
0454:                // Get a reference to the HttpRecorder that is set into this ToeThread.
0455:                HttpRecorder rec = HttpRecorder.getHttpRecorder();
0456:
0457:                // Shall we get a digest on the content downloaded?
0458:                boolean digestContent = ((Boolean) getUncheckedAttribute(curi,
0459:                        ATTR_DIGEST_CONTENT)).booleanValue();
0460:                String algorithm = null;
0461:                if (digestContent) {
0462:                    algorithm = ((String) getUncheckedAttribute(curi,
0463:                            ATTR_DIGEST_ALGORITHM));
0464:                    rec.getRecordedInput().setDigest(algorithm);
0465:                } else {
0466:                    // clear
0467:                    rec.getRecordedInput().setDigest((MessageDigest) null);
0468:                }
0469:
0470:                // Below we define two anonymous inner classes that add a check of the
0471:                // midfetch filters just as we're about to receive the response body.
0472:                String curiString = curi.getUURI().toString();
0473:                HttpMethodBase method = null;
0474:                if (curi.isPost()) {
0475:                    method = new HttpRecorderPostMethod(curiString, rec) {
0476:                        protected void readResponseBody(HttpState state,
0477:                                HttpConnection conn) throws IOException,
0478:                                HttpException {
0479:                            addResponseContent(this , curi);
0480:                            if (checkMidfetchAbort(curi,
0481:                                    this .httpRecorderMethod, conn)) {
0482:                                doAbort(curi, this , MIDFETCH_ABORT_LOG);
0483:                            } else {
0484:                                super .readResponseBody(state, conn);
0485:                            }
0486:                        }
0487:                    };
0488:                } else {
0489:                    method = new HttpRecorderGetMethod(curiString, rec) {
0490:                        protected void readResponseBody(HttpState state,
0491:                                HttpConnection conn) throws IOException,
0492:                                HttpException {
0493:                            addResponseContent(this , curi);
0494:                            if (checkMidfetchAbort(curi,
0495:                                    this .httpRecorderMethod, conn)) {
0496:                                doAbort(curi, this , MIDFETCH_ABORT_LOG);
0497:                            } else {
0498:                                super .readResponseBody(state, conn);
0499:                            }
0500:                        }
0501:                    };
0502:                }
0503:
0504:                HostConfiguration customConfigOrNull = configureMethod(curi,
0505:                        method);
0506:
0507:                // Set httpRecorder into curi. Subsequent code both here and later
0508:                // in extractors expects to find the HttpRecorder in the CrawlURI.
0509:                curi.setHttpRecorder(rec);
0510:
0511:                // Populate credentials. Set config so auth. is not automatic.
0512:                boolean addedCredentials = populateCredentials(curi, method);
0513:                method.setDoAuthentication(addedCredentials);
0514:
0515:                // set hardMax on bytes (if set by operator)
0516:                long hardMax = getMaxLength(curi);
0517:                // set overall timeout (if set by operator)
0518:                long timeoutMs = 1000 * getTimeout(curi);
0519:                // Get max fetch rate (bytes/ms). It comes in in KB/sec
0520:                long maxRateKBps = getMaxFetchRate(curi);
0521:                rec.getRecordedInput().setLimits(hardMax, timeoutMs,
0522:                        maxRateKBps);
0523:
0524:                try {
0525:                    this .http.executeMethod(customConfigOrNull, method);
0526:                } catch (RecorderTooMuchHeaderException ex) {
0527:                    // when too much header material, abort like other truncations
0528:                    doAbort(curi, method, HEADER_TRUNC);
0529:                } catch (IOException e) {
0530:                    failedExecuteCleanup(method, curi, e);
0531:                    return;
0532:                } catch (ArrayIndexOutOfBoundsException e) {
0533:                    // For weird windows-only ArrayIndex exceptions in native
0534:                    // code... see
0535:                    // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
0536:                    // treating as if it were an IOException
0537:                    failedExecuteCleanup(method, curi, e);
0538:                    return;
0539:                }
0540:
0541:                // set softMax on bytes to get (if implied by content-length) 
0542:                long softMax = method.getResponseContentLength();
0543:
0544:                try {
0545:                    if (!method.isAborted()) {
0546:                        // Force read-to-end, so that any socket hangs occur here,
0547:                        // not in later modules.
0548:                        rec.getRecordedInput().readFullyOrUntil(softMax);
0549:                    }
0550:                } catch (RecorderTimeoutException ex) {
0551:                    doAbort(curi, method, TIMER_TRUNC);
0552:                } catch (RecorderLengthExceededException ex) {
0553:                    doAbort(curi, method, LENGTH_TRUNC);
0554:                } catch (IOException e) {
0555:                    cleanup(curi, e, "readFully", S_CONNECT_LOST);
0556:                    return;
0557:                } catch (ArrayIndexOutOfBoundsException e) {
0558:                    // For weird windows-only ArrayIndex exceptions from native code
0559:                    // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
0560:                    // treating as if it were an IOException
0561:                    cleanup(curi, e, "readFully", S_CONNECT_LOST);
0562:                    return;
0563:                } finally {
0564:                    // ensure recording has stopped
0565:                    rec.closeRecorders();
0566:                    if (!method.isAborted()) {
0567:                        method.releaseConnection();
0568:                    }
0569:                    // Note completion time
0570:                    curi.putLong(A_FETCH_COMPLETED_TIME, System
0571:                            .currentTimeMillis());
0572:                    // Set the response charset into the HttpRecord if available.
0573:                    setCharacterEncoding(rec, method);
0574:                    setSizes(curi, rec);
0575:                }
0576:
0577:                if (digestContent) {
0578:                    curi.setContentDigest(algorithm, rec.getRecordedInput()
0579:                            .getDigestValue());
0580:                }
0581:                if (logger.isLoggable(Level.INFO)) {
0582:                    logger.info((curi.isPost() ? "POST" : "GET") + " "
0583:                            + curi.getUURI().toString() + " "
0584:                            + method.getStatusCode() + " "
0585:                            + rec.getRecordedInput().getSize() + " "
0586:                            + curi.getContentType());
0587:                }
0588:
0589:                if (curi.isSuccess() && addedCredentials) {
0590:                    // Promote the credentials from the CrawlURI to the CrawlServer
0591:                    // so they are available for all subsequent CrawlURIs on this
0592:                    // server.
0593:                    promoteCredentials(curi);
0594:                    if (logger.isLoggable(Level.FINE)) {
0595:                        // Print out the cookie.  Might help with the debugging.
0596:                        Header setCookie = method
0597:                                .getResponseHeader("set-cookie");
0598:                        if (setCookie != null) {
0599:                            logger.fine(setCookie.toString().trim());
0600:                        }
0601:                    }
0602:                } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
0603:                    // 401 is not 'success'.
0604:                    handle401(method, curi);
0605:                }
0606:
0607:                if (rec.getRecordedInput().isOpen()) {
0608:                    logger.severe(curi.toString()
0609:                            + " RIS still open. Should have"
0610:                            + " been closed by method release: "
0611:                            + Thread.currentThread().getName());
0612:                    try {
0613:                        rec.getRecordedInput().close();
0614:                    } catch (IOException e) {
0615:                        logger.log(Level.SEVERE,
0616:                                "second-chance RIS close failed", e);
0617:                    }
0618:                }
0619:            }
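            // Added summary: innerProcess() above is the complete per-URI fetch cycle:
            // pick a recorder-backed GET or POST method, configure headers, proxy and
            // credentials, execute, force the body to be read so socket hangs surface
            // here rather than in later processors, record size/digest/charset, and
            // finally promote successful credentials or hand a 401 to handle401().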
0620:
0621:            /**
0622:             * Update CrawlURI internal sizes based on current transaction (and
0623:             * in the case of 304s, history) 
0624:             * 
0625:             * @param curi CrawlURI
0626:             * @param rec HttpRecorder
0627:             */
0628:            protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
0629:                // set reporting size
0630:                curi.setContentSize(rec.getRecordedInput().getSize());
0631:                // special handling for 304-not modified
0632:                if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
0633:                        && curi.containsKey(A_FETCH_HISTORY)) {
0634:                    AList history[] = curi.getAList().getAListArray(
0635:                            A_FETCH_HISTORY);
0636:                    if (history[0] != null
0637:                            && history[0]
0638:                                    .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
0639:                        long referenceLength = history[0]
0640:                                .getLong(A_REFERENCE_LENGTH);
0641:                        // carry-forward previous 'reference-length' for future
0642:                        curi.putLong(A_REFERENCE_LENGTH, referenceLength);
0643:                        // increase content-size to virtual-size for reporting
0644:                        curi.setContentSize(rec.getRecordedInput().getSize()
0645:                                + referenceLength);
0646:                    }
0647:                }
0648:            }
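            // Worked example (added, illustrative numbers): if a revisit returns
            // 304 Not Modified with a 240-byte recorded response, and the first entry
            // in A_FETCH_HISTORY carries A_REFERENCE_LENGTH = 45000, the reported
            // content-size becomes 240 + 45000 = 45240 and the 45000 reference-length
            // is carried forward for future revisits.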
0649:
0650:            protected void doAbort(CrawlURI curi, HttpMethod method,
0651:                    String annotation) {
0652:                curi.addAnnotation(annotation);
0653:                curi.getHttpRecorder().close();
0654:                method.abort();
0655:            }
0656:
0657:            protected boolean checkMidfetchAbort(CrawlURI curi,
0658:                    HttpRecorderMethod method, HttpConnection conn) {
0659:                if (curi.isPrerequisite()
0660:                        || rulesAccept(getMidfetchRule(curi), curi)) {
0661:                    return false;
0662:                }
0663:                method.markContentBegin(conn);
0664:                return true;
0665:            }
0666:
0667:            protected DecideRule getMidfetchRule(Object o) {
0668:                try {
0669:                    return (DecideRule) getAttribute(o,
0670:                            ATTR_MIDFETCH_DECIDE_RULES);
0671:                } catch (AttributeNotFoundException e) {
0672:                    throw new RuntimeException(e);
0673:                }
0674:            }
0675:
0676:            /**
0677:             * This method populates <code>curi</code> with response status and
0678:             * content type.
0679:             * @param curi CrawlURI to populate.
0680:             * @param method Method to get response status and headers from.
0681:             */
0682:            protected void addResponseContent(HttpMethod method, CrawlURI curi) {
0683:                curi.setFetchStatus(method.getStatusCode());
0684:                Header ct = method.getResponseHeader("content-type");
0685:                curi.setContentType((ct == null) ? null : ct.getValue());
0686:                // Save method into curi too.  Midfetch filters may want to leverage
0687:                // info in here.
0688:                curi.putObject(A_HTTP_TRANSACTION, method);
0689:            }
0690:
0691:            /**
0692:             * Set the character encoding based on the result headers or default.
0693:             *
0694:             * The HttpClient returns its own default encoding ("ISO-8859-1") if one
0695:             * isn't specified in the Content-Type response header. We give the user
0696:             * the option of overriding this, so we need to detect the case where the
0697:             * default is returned.
0698:             *
0699:             * Now, it may well be the case that the default returned by HttpClient
0700:             * and the default defined by the user are the same.
0701:             * 
0702:             * @param rec Recorder for this request.
0703:             * @param method Method used for the request.
0704:             */
0705:            private void setCharacterEncoding(final HttpRecorder rec,
0706:                    final HttpMethod method) {
0707:                String encoding = null;
0708:
0709:                try {
0710:                    encoding = ((HttpMethodBase) method).getResponseCharSet();
0711:                    if (encoding == null
0712:                            || encoding.equals(DEFAULT_CONTENT_CHARSET)) {
0713:                        encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
0714:                    }
0715:                } catch (Exception e) {
0716:                    logger.warning("Failed to get default encoding: "
0717:                            + e.getLocalizedMessage());
0718:                }
0719:                rec.setCharacterEncoding(encoding);
0720:            }
0721:
0722:            /**
0723:             * Cleanup after a failed method execute.
0724:             * @param curi CrawlURI we failed on.
0725:             * @param method Method we failed on.
0726:             * @param exception Exception we failed with.
0727:             */
0728:            private void failedExecuteCleanup(final HttpMethod method,
0729:                    final CrawlURI curi, final Exception exception) {
0730:                cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
0731:                method.releaseConnection();
0732:            }
0733:
0734:            /**
0735:             * Cleanup after a failed method execute.
0736:             * @param curi CrawlURI we failed on.
0737:             * @param exception Exception we failed with.
0738:             * @param message Message to log with failure.
0739:             * @param status Status to set on the fetch.
0740:             */
0741:            private void cleanup(final CrawlURI curi,
0742:                    final Exception exception, final String message,
0743:                    final int status) {
0744:                curi.addLocalizedError(this .getName(), exception, message);
0745:                curi.setFetchStatus(status);
0746:                curi.getHttpRecorder().close();
0747:            }
0748:
0749:            /**
0750:             * Can this processor fetch the given CrawlURI. May set a fetch
0751:             * status if this processor would usually handle the CrawlURI,
0752:             * but cannot in this instance.
0753:             *
0754:             * @param curi
0755:             * @return True if processor can fetch.
0756:             */
0757:            private boolean canFetch(CrawlURI curi) {
0758:                if (curi.getFetchStatus() < 0) {
0759:                    // already marked as errored this pass through;
0760:                    // skip to end of the processor chain
0761:                    curi.skipToProcessorChain(getController()
0762:                            .getPostprocessorChain());
0763:                    return false;
0764:                }
0765:                String scheme = curi.getUURI().getScheme();
0766:                if (!(scheme.equals("http") || scheme.equals("https"))) {
0767:                    // handles only plain http and https
0768:                    return false;
0769:                }
0770:                CrawlHost host = getController().getServerCache().getHostFor(
0771:                        curi);
0772:                // make sure the dns lookup succeeded
0773:                if (host.getIP() == null && host.hasBeenLookedUp()) {
0774:                    curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
0775:                    return false;
0776:                }
0777:                return true;
0778:            }
0779:
0780:            /**
0781:             * Configure the HttpMethod setting options and headers.
0782:             *
0783:             * @param curi CrawlURI from which we pull configuration.
0784:             * @param method The Method to configure.
0785:             */
0786:            protected HostConfiguration configureMethod(CrawlURI curi,
0787:                    HttpMethod method) {
0788:                // Don't auto-follow redirects
0789:                method.setFollowRedirects(false);
0790:
0791:                //        // set soTimeout
0792:                //        method.getParams().setSoTimeout(
0793:                //                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
0794:                //                        .intValue());
0795:
0796:                // Set cookie policy.
0797:                method
0798:                        .getParams()
0799:                        .setCookiePolicy(
0800:                                (((Boolean) getUncheckedAttribute(curi,
0801:                                        ATTR_IGNORE_COOKIES)).booleanValue()) ? CookiePolicy.IGNORE_COOKIES
0802:                                        : CookiePolicy.BROWSER_COMPATIBILITY);
0803:
0804:                // Use only HTTP/1.0 (to avoid receiving chunked responses)
0805:                method.getParams().setVersion(HttpVersion.HTTP_1_0);
0806:
0807:                CrawlOrder order = getSettingsHandler().getOrder();
0808:                String userAgent = curi.getUserAgent();
0809:                if (userAgent == null) {
0810:                    userAgent = order.getUserAgent(curi);
0811:                }
0812:                method.setRequestHeader("User-Agent", userAgent);
0813:                method.setRequestHeader("From", order.getFrom(curi));
0814:
0815:                // Set retry handler.
0816:                method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
0817:                        new HeritrixHttpMethodRetryHandler());
0818:
0819:                final long maxLength = getMaxLength(curi);
0820:                if (maxLength > 0
0821:                        && ((Boolean) getUncheckedAttribute(curi,
0822:                                ATTR_SEND_RANGE)).booleanValue()) {
0823:                    method.addRequestHeader(RANGE, RANGE_PREFIX.concat(Long
0824:                            .toString(maxLength - 1)));
0825:                }
0826:
0827:                if (((Boolean) getUncheckedAttribute(curi,
0828:                        ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
0829:                    method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
0830:                }
0831:
0832:                if (((Boolean) getUncheckedAttribute(curi, ATTR_SEND_REFERER))
0833:                        .booleanValue()) {
0834:                    // RFC2616 says no referer header if referer is https and the url
0835:                    // is not
0836:                    String via = curi.flattenVia();
0837:                    if (via != null
0838:                            && via.length() > 0
0839:                            && !(via.startsWith(HTTPS_SCHEME) && curi.getUURI()
0840:                                    .getScheme().equals(HTTP_SCHEME))) {
0841:                        method.setRequestHeader(REFERER, via);
0842:                    }
0843:                }
0844:
0845:                if (!curi.isPrerequisite()) {
0846:                    setConditionalGetHeader(curi, method,
0847:                            ATTR_SEND_IF_MODIFIED_SINCE,
0848:                            CoreAttributeConstants.A_LAST_MODIFIED_HEADER,
0849:                            "If-Modified-Since");
0850:                    setConditionalGetHeader(curi, method,
0851:                            ATTR_SEND_IF_NONE_MATCH,
0852:                            CoreAttributeConstants.A_ETAG_HEADER,
0853:                            "If-None-Match");
0854:                }
0855:
0856:                // TODO: What happens if below method adds a header already
0857:                // added above: e.g. Connection, Range, or Referer?
0858:                setAcceptHeaders(curi, method);
0859:
0860:                return configureProxy(curi);
0861:            }
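            /*
             * Request sketch (added, illustrative): with default settings the method
             * configured above sends an HTTP/1.0 request along the lines of
             *
             *   GET /some/page HTTP/1.0
             *   User-Agent: <user-agent from the crawl order>
             *   From: <operator contact from the crawl order>
             *   Connection: close
             *   Referer: <flattened via, when send-referer applies>
             *
             * with Range, If-Modified-Since/If-None-Match and Accept headers added only
             * when the corresponding settings and URI history make them applicable.
             */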
0862:
0863:            /**
0864:             * Set the given conditional-GET header, if the setting is enabled and
0865:             * a suitable value is available in the URI history. 
0866:             * @param curi source CrawlURI
0867:             * @param method HTTP operation pending
0868:             * @param setting true/false enablement setting name to consult
0869:             * @param sourceHeader header to consult in URI history
0870:             * @param targetHeader header to set if possible
0871:             */
0872:            protected void setConditionalGetHeader(CrawlURI curi,
0873:                    HttpMethod method, String setting, String sourceHeader,
0874:                    String targetHeader) {
0875:                if (((Boolean) getUncheckedAttribute(curi, setting))) {
0876:                    try {
0877:                        String previous = curi.getAList().getAListArray(
0878:                                A_FETCH_HISTORY)[0].getString(sourceHeader);
0879:                        if (previous != null) {
0880:                            method.setRequestHeader(targetHeader, previous);
0881:                        }
0882:                    } catch (RuntimeException e) {
0883:                        // for absent key, bad index, etc. just do nothing
0884:                    }
0885:                }
0886:            }
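            // Added example (illustrative values): if the previous visit recorded
            // "Last-Modified: Tue, 01 May 2007 12:00:00 GMT" in the URI's fetch history
            // and send-if-modified-since is enabled, the next request carries
            // "If-Modified-Since: Tue, 01 May 2007 12:00:00 GMT"; a 304 reply is then
            // accounted for by setSizes() above.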
0887:
0888:            /**
0889:             * Setup proxy, based on attributes in CrawlURI and settings, 
0890:             * for this CrawlURI only. 
0891:             * @return HostConfiguration customized as necessary, or null if no
0892:             * customization required
0893:             */
0894:            private HostConfiguration configureProxy(CrawlURI curi) {
0895:                String proxy = (String) getAttributeEither(curi,
0896:                        ATTR_HTTP_PROXY_HOST);
0897:                int port = -1;
0898:                if (proxy.length() == 0) {
0899:                    proxy = null;
0900:                } else {
0901:                    String portString = (String) getAttributeEither(curi,
0902:                            ATTR_HTTP_PROXY_PORT);
0903:                    port = portString.length() > 0 ? Integer
0904:                            .parseInt(portString) : -1;
0905:                }
0906:                HostConfiguration config = this .http.getHostConfiguration();
0907:                if (config.getProxyHost() == proxy
0908:                        && config.getProxyPort() == port) {
0909:                    // no change
0910:                    return null;
0911:                }
0912:                if (proxy != null && proxy.equals(config.getProxyHost())
0913:                        && config.getProxyPort() == port) {
0914:                    // no change
0915:                    return null;
0916:                }
0917:                config = new HostConfiguration(config); // copy of config
0918:                config.setProxy(proxy, port);
0919:                return config;
0920:            }
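            // Added note: a null return from configureProxy() means "no per-URI
            // customization"; HttpClient.executeMethod(null, method) then falls back
            // to the client's default HostConfiguration, so any globally configured
            // proxy settings apply unchanged.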
0921:
0922:            /**
0923:             * Get a value either from inside the CrawlURI instance, or from 
0924:             * settings (module attributes). 
0925:             * 
0926:             * @param curi CrawlURI to consult
0927:             * @param key key to lookup
0928:             * @return value from either CrawlURI (preferred) or settings
0929:             */
0930:            protected Object getAttributeEither(CrawlURI curi, String key) {
0931:                Object obj = curi != null ? curi.getObject(key) : null;
0932:                if (obj == null) {
0933:                    obj = getUncheckedAttribute(curi, key);
0934:                }
0935:                return obj;
0936:            }
0937:
0938:            /**
0939:             * Add credentials if any to passed <code>method</code>.
0940:             *
0941:             * Do credential handling.  Credentials are in two places.  1. Credentials
0942:             * that succeeded are added to the CrawlServer (Or rather, avatars for
0943:             * credentials are what's added because it's not safe to keep around
0944:             * references to credentials).  2. Credentials to be tried are in the curi.
0945:             * Returns true if credentials to be tried were found.
0946:             *
0947:             * @param curi Current CrawlURI.
0948:             * @param method The method to add to.
0949:             * @return True if prepopulated <code>method</code> with credentials AND the
0950:             * credentials came from the <code>curi</code>, not from the CrawlServer.
0951:             * The former is special in that if the <code>curi</code> credentials
0952:             * succeed, then the caller needs to promote them from the CrawlURI to the
0953:             * CrawlServer so they are available for all subsequent CrawlURIs on this
0954:             * server.
0955:             */
0956:            private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
0957:                // First look at the server avatars. Add any that are to be volunteered
0958:                // on every request (e.g. RFC2617 credentials).  'Every time' creds
0959:                // return true when we call isEveryTime().
0960:                CrawlServer server = getController().getServerCache()
0961:                        .getServerFor(curi);
0962:                if (server.hasCredentialAvatars()) {
0963:                    Set avatars = server.getCredentialAvatars();
0964:                    for (Iterator i = avatars.iterator(); i.hasNext();) {
0965:                        CredentialAvatar ca = (CredentialAvatar) i.next();
0966:                        Credential c = ca.getCredential(getSettingsHandler(),
0967:                                curi);
0968:                        if (c.isEveryTime()) {
0969:                            c
0970:                                    .populate(curi, this .http, method, ca
0971:                                            .getPayload());
0972:                        }
0973:                    }
0974:                }
0975:
0976:                boolean result = false;
0977:
0978:                // Now look in the curi.  The curi will have credentials loaded either
0979:                // by the handle401 method if it's an rfc2617 credential, or they'll have
0980:                // been set into the curi by the PreconditionEnforcer as this login uri came through.
0981:                if (curi.hasCredentialAvatars()) {
0982:                    Set avatars = curi.getCredentialAvatars();
0983:                    for (Iterator i = avatars.iterator(); i.hasNext();) {
0984:                        CredentialAvatar ca = (CredentialAvatar) i.next();
0985:                        Credential c = ca.getCredential(getSettingsHandler(),
0986:                                curi);
0987:                        if (c
0988:                                .populate(curi, this .http, method, ca
0989:                                        .getPayload())) {
0990:                            result = true;
0991:                        }
0992:                    }
0993:                }
0994:
0995:                return result;
0996:            }
0997:
0998:            /**
0999:             * Promote successful credential to the server.
1000:             *
1001:             * @param curi CrawlURI whose credentials we are to promote.
1002:             */
1003:            private void promoteCredentials(final CrawlURI curi) {
1004:                if (!curi.hasCredentialAvatars()) {
1005:                    logger.severe("No credentials to promote when there should be "
1006:                            + curi);
1008:                } else {
1009:                    Set avatars = curi.getCredentialAvatars();
1010:                    for (Iterator i = avatars.iterator(); i.hasNext();) {
1011:                        CredentialAvatar ca = (CredentialAvatar) i.next();
1012:                        curi.removeCredentialAvatar(ca);
1013:                        // The server to attach to may not be the server that hosts
1014:                        // this passed curi.  It might be on another subdomain.
1015:                        // The avatar needs to be added to the server that is dependent
1016:                        // on this precondition.  Find it by name.  Get the name from
1017:                        // the credential this avatar represents.
1018:                        Credential c = ca.getCredential(getSettingsHandler(),
1019:                                curi);
1020:                        String cd = null;
1021:                        try {
1022:                            cd = c.getCredentialDomain(curi);
1023:                        } catch (AttributeNotFoundException e) {
1024:                            logger.severe("Failed to get cred domain for "
1025:                                    + curi + " for " + ca + ": "
1026:                                    + e.getMessage());
1027:                        }
1028:                        if (cd != null) {
1029:                            CrawlServer cs = getController().getServerCache()
1030:                                    .getServerFor(cd);
1031:                            if (cs != null) {
1032:                                cs.addCredentialAvatar(ca);
1033:                            }
1034:                        }
1035:                    }
1036:                }
1037:            }
1038:
1039:            /**
1040:             * Server is looking for basic/digest auth credentials (RFC2617). If we have
1041:             * any, put them into the CrawlURI and have it come around again. Presence
1042:             * of the credential serves as a flag to the frontier to requeue promptly.
1043:             * If we already tried this domain and still got a 401, then our
1044:             * credentials are bad. Remove them and let this curi die.
1045:             *
1046:             * @param method Method that got a 401.
1047:             * @param curi CrawlURI that got a 401.
1048:             */
1049:            protected void handle401(final HttpMethod method,
1050:                    final CrawlURI curi) {
1051:                AuthScheme authscheme = getAuthScheme(method, curi);
1052:                if (authscheme == null) {
1053:                    return;
1054:                }
1055:                String realm = authscheme.getRealm();
1056:
1057:                // Look to see if this curi had rfc2617 avatars loaded.  If so, are
1058:                // any of them for this realm?  If so, the credential has already been
1059:                // tried and failed, and this curi should die a natural 401 death.
1060:                Set curiRfc2617Credentials = getCredentials(
1061:                        getSettingsHandler(), curi, Rfc2617Credential.class);
1062:                Rfc2617Credential extant = Rfc2617Credential.getByRealm(
1063:                        curiRfc2617Credentials, realm, curi);
1064:                if (extant != null) {
1065:                    // Then we have already tried this credential.  Remove ANY rfc2617
1066:                    // credential, since the presence of an rfc2617 credential is the
1067:                    // flag that tells the frontier to requeue this curi; without it,
1068:                    // the curi dies a natural death.
1069:                    extant.detachAll(curi);
1070:                    logger.warning("Auth failed (401) though supplied realm "
1071:                            + realm + " to " + curi.toString());
1072:                } else {
1073:                    // Look to see if we have a credential that corresponds to this
1074:                    // realm in the credential store.  Filter by type and credential
1075:                    // domain.  If not, let this curi die. Else, add it to the
1076:                    // curi and let it come around again. Add in the AuthScheme
1077:                    // we got too; it's needed when we run the auth the second
1078:                    // time around.
1079:                    CredentialStore cs = CredentialStore
1080:                            .getCredentialStore(getSettingsHandler());
1081:                    if (cs == null) {
1082:                        logger.severe("No credential store for " + curi);
1083:                    } else {
1084:                        CrawlServer server = getController().getServerCache()
1085:                                .getServerFor(curi);
1086:                        Set storeRfc2617Credentials = cs.subset(curi,
1087:                                Rfc2617Credential.class, server.getName());
1088:                        if (storeRfc2617Credentials == null
1089:                                || storeRfc2617Credentials.size() <= 0) {
1090:                            logger.info("No rfc2617 credentials for " + curi);
1091:                        } else {
1092:                            Rfc2617Credential found = Rfc2617Credential
1093:                                    .getByRealm(storeRfc2617Credentials, realm,
1094:                                            curi);
1095:                            if (found == null) {
1096:                                logger.info("No rfc2617 credentials for realm "
1097:                                        + realm + " in " + curi);
1098:                            } else {
1099:                                found.attach(curi, authscheme.getRealm());
1100:                                logger.info("Found credential for realm "
1101:                                        + realm + " in store for "
1102:                                        + curi.toString());
1103:                            }
1104:                        }
1105:                    }
1106:                }
1107:            }
1108:
1109:            /**
1110:             * @param method Method that got a 401.
1111:             * @param curi CrawlURI that got a 401.
1112:             * @return The first usable authscheme found, else null.
1113:             */
1114:            protected AuthScheme getAuthScheme(final HttpMethod method,
1115:                    final CrawlURI curi) {
1116:                Header[] headers = method
1117:                        .getResponseHeaders("WWW-Authenticate");
1118:                if (headers == null || headers.length <= 0) {
1119:                    logger.info("We got a 401 but no WWW-Authenticate challenge: "
1120:                            + curi.toString());
1122:                    return null;
1123:                }
1124:
1125:                Map authschemes = null;
1126:                try {
1127:                    authschemes = AuthChallengeParser.parseChallenges(headers);
1128:                } catch (MalformedChallengeException e) {
1129:                    logger.info("Failed challenge parse: " + e.getMessage());
1130:                }
1131:                if (authschemes == null || authschemes.size() <= 0) {
1132:                    logger.info("We got a 401 and WWW-Authenticate challenge"
1133:                            + " but failed parse of the header "
1134:                            + curi.toString());
1135:                    return null;
1136:                }
1137:
1138:                AuthScheme result = null;
1139:                // Use the first auth found.
1140:                for (Iterator i = authschemes.keySet().iterator(); result == null
1141:                        && i.hasNext();) {
1142:                    String key = (String) i.next();
1143:                    String challenge = (String) authschemes.get(key);
1144:                    if (key == null || key.length() <= 0 || challenge == null
1145:                            || challenge.length() <= 0) {
1146:                        logger.warning("Empty scheme: " + curi + ": " + headers);
1147:                        continue; // key.equals("basic") below would NPE on a null key
1148:                    }
1149:                    AuthScheme authscheme = null;
1150:                    if (key.equals("basic")) {
1151:                        authscheme = new BasicScheme();
1152:                    } else if (key.equals("digest")) {
1153:                        authscheme = new DigestScheme();
1154:                    } else {
1155:                        logger.info("Unsupported scheme: " + key);
1156:                        continue;
1157:                    }
1158:
1159:                    try {
1160:                        authscheme.processChallenge(challenge);
1161:                    } catch (MalformedChallengeException e) {
1162:                        logger.info(e.getMessage() + " " + curi + " "
1163:                                + headers);
1165:                        continue;
1166:                    }
1167:                    if (authscheme.isConnectionBased()) {
1168:                        logger.info("Connection based " + authscheme);
1169:                        continue;
1170:                    }
1171:
1172:                    if (authscheme.getRealm() == null
1173:                            || authscheme.getRealm().length() <= 0) {
1174:                        logger.info("Empty realm " + authscheme + " for "
1175:                                + curi);
1176:                        continue;
1177:                    }
1178:                    result = authscheme;
1179:                }
1180:
1181:                return result;
1182:            }
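
            // Illustrative sketch, not part of the original class: how a single
            // RFC 2617 challenge maps onto an HttpClient 3.x AuthScheme, mirroring
            // what getAuthScheme() above does after parsing the WWW-Authenticate
            // header.  The realm value is made up for the example.
            private static AuthScheme basicChallengeSketch()
                    throws MalformedChallengeException {
                BasicScheme scheme = new BasicScheme();
                // A 401 response offering basic auth carries a header like:
                //   WWW-Authenticate: Basic realm="example-realm"
                scheme.processChallenge("Basic realm=\"example-realm\"");
                // Now scheme.getRealm() is "example-realm" and isConnectionBased()
                // is false, so the loop in getAuthScheme() would accept this scheme.
                return scheme;
            }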
1183:
1184:            /**
1185:             * @param handler Settings Handler.
1186:             * @param curi CrawlURI that got a 401.
1187:             * @param type Class of credential to get from curi.
1188:             * @return Set of credentials attached to this curi.
1189:             */
1190:            private Set<Credential> getCredentials(SettingsHandler handler,
1191:                    CrawlURI curi, Class type) {
1192:                Set<Credential> result = null;
1193:
1194:                if (curi.hasCredentialAvatars()) {
1195:                    for (Iterator i = curi.getCredentialAvatars().iterator(); i
1196:                            .hasNext();) {
1197:                        CredentialAvatar ca = (CredentialAvatar) i.next();
1198:                        if (ca.match(type)) {
1199:                            if (result == null) {
1200:                                result = new HashSet<Credential>();
1201:                            }
1202:                            result.add(ca.getCredential(handler, curi));
1203:                        }
1204:                    }
1205:                }
1206:                return result;
1207:            }
1208:
1209:            public void initialTasks() {
1210:                super.initialTasks();
1211:                this.getController().addCrawlStatusListener(this);
1212:                configureHttp();
1213:
1214:                // load cookies from a file if specified in the order file.
1215:                loadCookies();
1216:
1217:                // I tried to get the default KeyManagers but it doesn't work unless you
1218:                // point at a physical keystore. Passing null seems to do the right
1219:                // thing, so we'll go with that.
1220:                try {
1221:                    SSLContext context = SSLContext.getInstance("SSL");
1222:                    context.init(null,
1223:                            new TrustManager[] { new ConfigurableX509TrustManager(
1224:                                    (String) getAttribute(ATTR_TRUST)) },
1225:                            null);
1228:                    this.sslfactory = context.getSocketFactory();
1229:                } catch (Exception e) {
1230:                    logger.log(Level.WARNING,
1231:                            "Failed configure of ssl context " + e.getMessage(),
1232:                            e);
1234:                }
1235:            }
1236:
1237:            public void finalTasks() {
1238:                // At the end save cookies to the file specified in the order file.
1239:                saveCookies();
1240:                cleanupHttp();
1241:                super.finalTasks();
1242:            }
1243:
1244:            /**
1245:             * Perform any final cleanup related to the HttpClient instance.
1246:             */
1247:            protected void cleanupHttp() {
1248:                if (cookieDb != null) {
1249:                    try {
1250:                        cookieDb.sync();
1251:                        cookieDb.close();
1252:                    } catch (DatabaseException e) {
1253:                        // TODO Auto-generated catch block
1254:                        e.printStackTrace();
1255:                    }
1256:                }
1257:            }
1258:
1259:            protected void configureHttp() throws RuntimeException {
1260:                // Get timeout.  Use it for socket and for connection timeout.
1261:                int timeout = (getSoTimeout(null) > 0) ? getSoTimeout(null) : 0;
1262:
1263:                // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
1264:                HttpConnectionManager cm = new SingleHttpConnectionManager();
1265:
1266:                // TODO: The following settings should be made in the corresponding
1267:                // HttpConnectionManager, not here.
1268:                HttpConnectionManagerParams hcmp = cm.getParams();
1269:                hcmp.setConnectionTimeout(timeout);
1270:                hcmp.setStaleCheckingEnabled(true);
1271:                // Setting TCP_NODELAY to true would disable Nagle's algorithm; leaving
1272:                // it false minimizes bandwidth usage.  IBM JVMs < 1.4.2 throw an NPE
1273:                // when this boolean is set on SSL sockets.
1274:                hcmp.setTcpNoDelay(false);
1275:
1276:                this.http = new HttpClient(cm);
1277:                HttpClientParams hcp = this.http.getParams();
1278:                // Set default socket timeout.
1279:                hcp.setSoTimeout(timeout);
1280:                // Set client to be version 1.0.
1281:                hcp.setVersion(HttpVersion.HTTP_1_0);
1282:
1283:                String addressStr = null;
1284:                try {
1285:                    addressStr = (String) getAttribute(ATTR_LOCAL_ADDRESS);
1286:                } catch (Exception e1) {
1287:                    // If exception, just use default.
1288:                }
1289:                if (addressStr != null && addressStr.length() > 0) {
1290:                    try {
1291:                        InetAddress localAddress = InetAddress
1292:                                .getByName(addressStr);
1293:                        this .http.getHostConfiguration().setLocalAddress(
1294:                                localAddress);
1295:                    } catch (UnknownHostException e) {
1296:                        // Convert to RuntimeException so we get an exception out
1297:                        // if initialization fails.
1298:                        throw new RuntimeException("Unknown host " + addressStr
1299:                                + " in " + ATTR_LOCAL_ADDRESS);
1300:                    }
1301:                }
1302:
1303:                configureHttpCookies();
1304:
1305:                // Configure how we want the method to act.
1306:                this.http.getParams().setParameter(
1307:                        HttpMethodParams.SINGLE_COOKIE_HEADER, Boolean.TRUE);
1309:                this.http.getParams().setParameter(
1310:                        HttpMethodParams.UNAMBIGUOUS_STATUS_LINE, Boolean.FALSE);
1312:                this.http.getParams().setParameter(
1313:                        HttpMethodParams.STRICT_TRANSFER_ENCODING, Boolean.FALSE);
1315:                this.http.getParams().setIntParameter(
1316:                        HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);
1317:
1318:                HostConfiguration configOrNull = configureProxy(null);
1319:                if (configOrNull != null) {
1320:                    // global proxy settings are in effect
1321:                    this.http.setHostConfiguration(configOrNull);
1322:                }
1323:
1324:                // Use our own protocol factory, one that gets the IP to use from the
1325:                // Heritrix cache (IPs are cached in CrawlHost instances).
1326:                final ServerCache cache = getController().getServerCache();
1327:                hcmp.setParameter(SERVER_CACHE_KEY, cache);
1328:                hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);
1329:            }
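
            // Illustrative sketch, not part of the original class: the core of the
            // HttpClient 3.x setup pattern configureHttp() uses above -- a single
            // connection manager whose params carry the connect timeout, plus client
            // params carrying the socket timeout and HTTP version.  The 20-second
            // value is an example, not a Heritrix default.
            private static HttpClient minimalClientSketch() {
                HttpConnectionManager cm = new SingleHttpConnectionManager();
                cm.getParams().setConnectionTimeout(20000); // connect timeout, ms
                HttpClient client = new HttpClient(cm);
                client.getParams().setSoTimeout(20000);     // socket read timeout, ms
                client.getParams().setVersion(HttpVersion.HTTP_1_0);
                return client;
            }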
1330:
1331:            /**
1332:             * Set the HttpClient HttpState instance to use a BDB-backed
1333:             * StoredSortedMap for cookie storage, if that option is chosen.
1334:             */
1335:            private void configureHttpCookies() {
1336:                // If Bdb-backed cookies chosen, replace map in HttpState
1337:                if (((Boolean) getUncheckedAttribute(null, ATTR_BDB_COOKIES))
1338:                        .booleanValue()) {
1339:                    try {
1340:                        EnhancedEnvironment env = getController()
1341:                                .getBdbEnvironment();
1342:                        StoredClassCatalog classCatalog = env.getClassCatalog();
1343:                        DatabaseConfig dbConfig = new DatabaseConfig();
1344:                        dbConfig.setTransactional(false);
1345:                        dbConfig.setAllowCreate(true);
1346:                        dbConfig.setDeferredWrite(true);
1347:                        cookieDb = env.openDatabase(null, COOKIEDB_NAME,
1348:                                dbConfig);
1349:                        StoredSortedMap cookiesMap = new StoredSortedMap(
1350:                                cookieDb, new StringBinding(),
1351:                                new SerialBinding(classCatalog, Cookie.class),
1352:                                true);
1353:                        this.http.getState().setCookiesMap(cookiesMap);
1354:                    } catch (DatabaseException e) {
1355:                        // TODO Auto-generated catch block
1356:                        logger.severe(e.getMessage());
1357:                        e.printStackTrace();
1358:                    }
1359:                }
1360:            }
1361:
1362:            /**
1363:             * @param curi Current CrawlURI.  Used to get context.
1364:             * @return Socket timeout value.
1365:             */
1366:            private int getSoTimeout(CrawlURI curi) {
1367:                Integer res = null;
1368:                try {
1369:                    res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
1370:                } catch (Exception e) {
1371:                    res = DEFAULT_SOTIMEOUT_MS;
1372:                }
1373:                return res.intValue();
1374:            }
1375:
1376:            /**
1377:             * @param curi Current CrawlURI.  Used to get context.
1378:             * @return Timeout value for total request.
1379:             */
1380:            private int getTimeout(CrawlURI curi) {
1381:                Integer res;
1382:                try {
1383:                    res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
1384:                } catch (Exception e) {
1385:                    res = DEFAULT_TIMEOUT_SECONDS;
1386:                }
1387:                return res.intValue();
1388:            }
1389:
1390:            private int getMaxFetchRate(CrawlURI curi) {
1391:                Integer res;
1392:                try {
1393:                    res = (Integer) getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
1394:                } catch (Exception e) {
1395:                    res = DEFAULT_FETCH_BANDWIDTH_MAX;
1396:                }
1397:                return res.intValue();
1398:            }
1399:
1400:            private long getMaxLength(CrawlURI curi) {
1401:                Long res;
1402:                try {
1403:                    res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
1404:                    if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
1405:                        res = DEFAULT_MAX_LENGTH_BYTES;
1406:                    }
1407:                } catch (Exception e) {
1408:                    res = DEFAULT_MAX_LENGTH_BYTES;
1409:                }
1410:                return res.longValue();
1411:            }
1412:
1413:            /**
1414:             * Load cookies from a file before the first fetch.
1415:             * <p>
1416:             * The file is a text file in Netscape's 'cookies.txt' file format.<br>
1417:             * Example entry of a cookies.txt file:<br>
1418:             * <br>
1419:             * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1420:             * <br>
1421:             * Each line has 7 tab-separated fields:<br>
1422:             * <li>1. DOMAIN: The domain that created, and that can access, the cookie
1423:             * value.
1424:             * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1425:             * domain can access the cookie value.
1426:             * <li>3. PATH: The path within the domain that the cookie value is valid
1427:             * for.
1428:             * <li>4. SECURE: A TRUE or FALSE value indicating whether a secure
1429:             * connection is required to access the cookie value.
1430:             * <li>5. EXPIRATION: The expiration time of the cookie value (Unix epoch seconds).
1431:             * <li>6. NAME: The name of the cookie value.
1432:             * <li>7. VALUE: The cookie value.
1433:             *
1434:             * @param cookiesFile file in Netscape's 'cookies.txt' format.
1435:             */
1436:            public void loadCookies(String cookiesFile) {
1437:                // Do nothing if cookiesFile is not specified.
1438:                if (cookiesFile == null || cookiesFile.length() <= 0) {
1439:                    return;
1440:                }
1441:                RandomAccessFile raf = null;
1442:                try {
1443:                    raf = new RandomAccessFile(cookiesFile, "r");
1444:                    String[] cookieParts;
1445:                    String line;
1446:                    Cookie cookie = null;
1447:                    while ((line = raf.readLine()) != null) {
1448:                        // A line that starts with '#' is a comment line, so skip it.
1449:                        if (!line.startsWith("#")) {
1450:                            cookieParts = line.split("\\t");
1451:                            if (cookieParts.length == 7) {
1452:                                // Create cookie with no expiration date (-1 value).
1453:                                // TODO: add this as an option.
1454:                                cookie = new Cookie(cookieParts[0],
1455:                                        cookieParts[5], cookieParts[6],
1456:                                        cookieParts[2], -1, Boolean.valueOf(
1457:                                                cookieParts[3]).booleanValue());
1458:
1459:                                if (cookieParts[1].toLowerCase().equals("true")) {
1460:                                    cookie.setDomainAttributeSpecified(true);
1461:                                } else {
1462:                                    cookie.setDomainAttributeSpecified(false);
1463:                                }
1464:                                this.http.getState().addCookie(cookie);
1465:                                logger.fine("Adding cookie: "
1466:                                        + cookie.toExternalForm());
1467:                            }
1468:                        }
1469:                    }
1470:                } catch (FileNotFoundException e) {
1471:                    // We should probably throw FatalConfigurationException.
1472:                    System.out.println("Could not find file: " + cookiesFile
1473:                            + " (Element: " + ATTR_LOAD_COOKIES + ")");
1474:
1475:                } catch (IOException e) {
1476:                    // We should probably throw FatalConfigurationException.
1477:                    e.printStackTrace();
1478:                } finally {
1479:                    try {
1480:                        if (raf != null) {
1481:                            raf.close();
1482:                        }
1483:                    } catch (IOException e) {
1484:                        e.printStackTrace();
1485:                    }
1486:                }
1487:            }
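
            // Illustrative sketch, not part of the original class: how one
            // tab-separated cookies.txt line (the sample from the javadoc above)
            // maps onto the Cookie constructor that loadCookies() uses.  The -1
            // means "no expiration date", as in the code above.
            private static Cookie cookieLineSketch() {
                String line = "www.archive.org\tFALSE\t/\tFALSE\t1074567117\t"
                        + "details-visit\ttexts-cralond";
                String[] p = line.split("\\t");
                // Fields: 0=DOMAIN 1=FLAG 2=PATH 3=SECURE 4=EXPIRATION 5=NAME 6=VALUE
                Cookie cookie = new Cookie(p[0], p[5], p[6], p[2], -1,
                        Boolean.valueOf(p[3]).booleanValue());
                cookie.setDomainAttributeSpecified(
                        Boolean.valueOf(p[1]).booleanValue());
                return cookie;
            }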
1488:
1489:            /* (non-Javadoc)
1490:             * @see org.archive.crawler.framework.Processor#report()
1491:             */
1492:            public String report() {
1493:                StringBuffer ret = new StringBuffer();
1494:                ret.append("Processor: org.archive.crawler.fetcher.FetchHTTP\n");
1496:                ret.append("  Function:          Fetch HTTP URIs\n");
1497:                ret.append("  CrawlURIs handled: " + this.curisHandled + "\n");
1498:                ret.append("  Recovery retries:   " + this.recoveryRetries
1499:                        + "\n\n");
1500:
1501:                return ret.toString();
1502:            }
1503:
1504:            /**
1505:             * Load cookies from the file specified in the order file.
1506:             *
1507:             * <p>
1508:             * The file is a text file in Netscape's 'cookies.txt' file format.<br>
1509:             * Example entry of a cookies.txt file:<br>
1510:             * <br>
1511:             * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
1512:             * <br>
1513:             * Each line has 7 tab-separated fields:<br>
1514:             * <li>1. DOMAIN: The domain that created, and that can access, the cookie
1515:             * value.
1516:             * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
1517:             * domain can access the cookie value.
1518:             * <li>3. PATH: The path within the domain that the cookie value is valid
1519:             * for.
1520:             * <li>4. SECURE: A TRUE or FALSE value indicating whether a secure
1521:             * connection is required to access the cookie value.
1522:             * <li>5. EXPIRATION: The expiration time of the cookie value (Unix epoch seconds).
1523:             * <li>6. NAME: The name of the cookie value.
1524:             * <li>7. VALUE: The cookie value.
1525:             */
1526:            public void loadCookies() {
1527:                try {
1528:                    loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
1529:                } catch (MBeanException e) {
1530:                    logger.warning(e.getLocalizedMessage());
1531:                } catch (ReflectionException e) {
1532:                    logger.warning(e.getLocalizedMessage());
1533:                } catch (AttributeNotFoundException e) {
1534:                    logger.warning(e.getLocalizedMessage());
1535:                }
1536:            }
1537:
1538:            /**
1539:             * Saves cookies to the file specified in the order file.
1540:             *
1541:             * Output file is in the Netscape 'cookies.txt' format.
1542:             *
1543:             */
1544:            public void saveCookies() {
1545:                try {
1546:                    saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
1547:                } catch (MBeanException e) {
1548:                    logger.warning(e.getLocalizedMessage());
1549:                } catch (ReflectionException e) {
1550:                    logger.warning(e.getLocalizedMessage());
1551:                } catch (AttributeNotFoundException e) {
1552:                    logger.warning(e.getLocalizedMessage());
1553:                }
1554:            }
1555:
1556:            /**
1557:             * Saves cookies to a file.
1558:             *
1559:             * Output file is in the Netscape 'cookies.txt' format.
1560:             *
1561:             * @param saveCookiesFile output file.
1562:             */
1563:            public void saveCookies(String saveCookiesFile) {
1564:                // Do nothing if cookiesFile is not specified.
1565:                if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
1566:                    return;
1567:                }
1568:
1569:                FileOutputStream out = null;
1570:                try {
1571:                    out = new FileOutputStream(new File(saveCookiesFile));
1572:                    @SuppressWarnings("unchecked")
1573:                    Map<String, Cookie> cookies = http.getState()
1574:                            .getCookiesMap();
1575:                    String tab = "\t";
1576:                    out.write("# Heritrix Cookie File\n".getBytes());
1577:                    out.write("# This file is the Netscape cookies.txt format\n\n"
1578:                            .getBytes());
1580:                    for (Cookie cookie : cookies.values()) {
1581:                        MutableString line = new MutableString(1024 * 2 /*Guess an initial size*/);
1582:                        line.append(cookie.getDomain());
1583:                        line.append(tab);
1584:                        line.append(cookie.isDomainAttributeSpecified()
1585:                                ? "TRUE" : "FALSE");
1587:                        line.append(tab);
1588:                        line.append(cookie.getPath());
1589:                        line.append(tab);
1590:                        line.append(cookie.getSecure() ? "TRUE" : "FALSE");
1592:                        line.append(tab);
1593:                        line.append(cookie.getName());
1594:                        line.append(tab);
1595:                        line.append((null == cookie.getValue()) ? "" : cookie
1596:                                .getValue());
1597:                        line.append("\n");
1598:                        out.write(line.toString().getBytes());
1599:                    }
1600:                } catch (FileNotFoundException e) {
1601:                    // We should probably throw FatalConfigurationException.
1602:                    System.out.println("Could not find file: "
1603:                            + saveCookiesFile + " (Element: "
1604:                            + ATTR_SAVE_COOKIES + ")");
1605:                } catch (IOException e) {
1606:                    e.printStackTrace();
1607:                } finally {
1608:                    try {
1609:                        if (out != null) {
1610:                            out.close();
1611:                        }
1612:                    } catch (IOException e) {
1613:                        e.printStackTrace();
1614:                    }
1615:                }
1616:            }
1617:
1618:            /* (non-Javadoc)
1619:             * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
1620:             */
1621:            protected void listUsedFiles(List<String> list) {
1622:                // List the cookie load and save files, if configured.
1624:                try {
1625:                    String tmp = (String) getAttribute(ATTR_LOAD_COOKIES);
1626:                    if (tmp != null && tmp.length() > 0) {
1627:                        File file = getSettingsHandler()
1628:                                .getPathRelativeToWorkingDirectory(tmp);
1629:                        list.add(file.getAbsolutePath());
1630:                    }
1631:                    tmp = (String) getAttribute(ATTR_SAVE_COOKIES);
1632:                    if (tmp != null && tmp.length() > 0) {
1633:                        File file = getSettingsHandler()
1634:                                .getPathRelativeToWorkingDirectory(tmp);
1635:                        list.add(file.getAbsolutePath());
1636:                    }
1637:                } catch (AttributeNotFoundException e) {
1638:                    // TODO Auto-generated catch block
1639:                    e.printStackTrace();
1640:                } catch (MBeanException e) {
1641:                    // TODO Auto-generated catch block
1642:                    e.printStackTrace();
1643:                } catch (ReflectionException e) {
1644:                    // TODO Auto-generated catch block
1645:                    e.printStackTrace();
1646:                }
1647:            }
1648:
1649:            private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
1650:                try {
1651:                    StringList accept_headers = (StringList) getAttribute(
1652:                            ATTR_ACCEPT_HEADERS, curi);
1653:                    if (!accept_headers.isEmpty()) {
1654:                        for (ListIterator i = accept_headers.listIterator(); i
1655:                                .hasNext();) {
1656:                            String hdr = (String) i.next();
1657:                            String[] nvp = hdr.split(": +");
1658:                            if (nvp.length == 2) {
1659:                                get.setRequestHeader(nvp[0], nvp[1]);
1660:                            } else {
1661:                                logger.warning("Invalid accept header: " + hdr);
1662:                            }
1663:                        }
1664:                    }
1665:                } catch (AttributeNotFoundException e) {
1666:                    logger.severe(e.getMessage());
1667:                }
1668:            }
1669:
1670:            // custom serialization
1671:            private void writeObject(ObjectOutputStream stream)
1672:                    throws IOException {
1673:                stream.defaultWriteObject();
1674:                // save cookies
1675:                @SuppressWarnings("unchecked")
1676:                Collection<Cookie> c = http.getState().getCookiesMap().values();
1677:                Cookie[] cookies = c.toArray(new Cookie[c.size()]);
1678:                stream.writeObject(cookies);
1679:            }
1680:
1681:            private void readObject(ObjectInputStream stream)
1682:                    throws IOException, ClassNotFoundException {
1683:                stream.defaultReadObject();
1684:                Cookie cookies[] = (Cookie[]) stream.readObject();
1685:                ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream) stream;
1686:                coistream.registerFinishTask(new PostRestore(cookies));
1687:            }
1688:
1689:            /**
1690:             * @return Returns the http instance.
1691:             */
1692:            protected HttpClient getHttp() {
1693:                return this.http;
1694:            }
1695:
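
            /**
             * Finish task run after deserialization: rebuilds the HttpClient via
             * configureHttp() and re-adds the cookies that writeObject() saved.
             */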
1696:            class PostRestore implements Runnable {
1697:                Cookie cookies[];
1698:
1699:                public PostRestore(Cookie cookies[]) {
1700:                    this.cookies = cookies;
1701:                }
1702:
1703:                public void run() {
1704:                    configureHttp();
1705:                    for (int i = 0; i < cookies.length; i++) {
1706:                        getHttp().getState().addCookie(cookies[i]);
1707:                    }
1708:                }
1709:            }
1710:
1711:            /* (non-Javadoc)
1712:             * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
1713:             */
1714:            public void crawlStarted(String message) {
1715:                // TODO Auto-generated method stub
1716:            }
1717:
1718:            /* (non-Javadoc)
1719:             * @see org.archive.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
1720:             */
1721:            public void crawlCheckpoint(File checkpointDir) {
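                // Sync the deferred-write cookie database (if one is in use) so
                // the checkpoint captures the current cookie state.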
1722:                try {
1723:                    if (cookieDb != null) cookieDb.sync();
1724:                } catch (DatabaseException e) {
1726:                    throw new RuntimeException(e);
1727:                }
1728:            }
1729:
1730:            /* (non-Javadoc)
1731:             * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
1732:             */
1733:            public void crawlEnding(String sExitMessage) {
1734:                // TODO Auto-generated method stub
1735:            }
1736:
1737:            /* (non-Javadoc)
1738:             * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
1739:             */
1740:            public void crawlEnded(String sExitMessage) {
1741:                this.http = null;
1742:            }
1743:
1744:            /* (non-Javadoc)
1745:             * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
1746:             */
1747:            public void crawlPausing(String statusMessage) {
1748:                // TODO Auto-generated method stub
1749:            }
1750:
1751:            /* (non-Javadoc)
1752:             * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
1753:             */
1754:            public void crawlPaused(String statusMessage) {
1755:                // TODO Auto-generated method stub
1756:            }
1757:
1758:            /* (non-Javadoc)
1759:             * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
1760:             */
1761:            public void crawlResuming(String statusMessage) {
1762:                // TODO Auto-generated method stub
1763:            }
1764:        }