001: /* HeritrixProtocolSocketFactory
002: *
003: * Created on Oct 8, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.fetcher;
024:
025: import java.io.IOException;
026: import java.net.InetAddress;
027: import java.net.InetSocketAddress;
028: import java.net.Socket;
029: import java.net.SocketTimeoutException;
030: import java.net.UnknownHostException;
031:
032: import org.apache.commons.httpclient.ConnectTimeoutException;
033: import org.apache.commons.httpclient.params.HttpConnectionParams;
034: import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
035: import org.archive.crawler.datamodel.CrawlHost;
036: import org.archive.crawler.datamodel.ServerCache;
037:
038: /**
039: * Version of protocol socket factory that tries to get IP from heritrix IP
040: * cache -- if its been set into the HttpConnectionParameters.
041: *
042: * Copied the guts of DefaultProtocolSocketFactory. This factory gets
043: * setup by {@link FetchHTTP}.
044: *
045: * @author stack
046: * @version $Date: 2006-08-29 22:47:03 +0000 (Tue, 29 Aug 2006) $, $Revision: 4553 $
047: */
048: public class HeritrixProtocolSocketFactory implements
049: ProtocolSocketFactory {
050: /**
051: * Constructor.
052: */
053: public HeritrixProtocolSocketFactory() {
054: super ();
055: }
056:
057: /**
058: * @see #createSocket(java.lang.String,int,java.net.InetAddress,int)
059: */
060: public Socket createSocket(String host, int port,
061: InetAddress localAddress, int localPort)
062: throws IOException, UnknownHostException {
063: return new Socket(host, port, localAddress, localPort);
064: }
065:
066: /**
067: * Attempts to get a new socket connection to the given host within the
068: * given time limit.
069: * <p>
070: * This method employs several techniques to circumvent the limitations
071: * of older JREs that do not support connect timeout. When running in
072: * JRE 1.4 or above reflection is used to call
073: * Socket#connect(SocketAddress endpoint, int timeout) method. When
074: * executing in older JREs a controller thread is executed. The
075: * controller thread attempts to create a new socket within the given
076: * limit of time. If socket constructor does not return until the
077: * timeout expires, the controller terminates and throws an
078: * {@link ConnectTimeoutException}
079: * </p>
080: *
081: * @param host the host name/IP
082: * @param port the port on the host
083: * @param localAddress the local host name/IP to bind the socket to
084: * @param localPort the port on the local machine
085: * @param params {@link HttpConnectionParams Http connection parameters}
086: *
087: * @return Socket a new socket
088: *
089: * @throws IOException if an I/O error occurs while creating the socket
090: * @throws UnknownHostException if the IP address of the host cannot be
091: * @throws IOException if an I/O error occurs while creating the socket
092: * @throws UnknownHostException if the IP address of the host cannot be
093: * determined
094: * @throws ConnectTimeoutException if socket cannot be connected within the
095: * given time limit
096: *
097: * @since 3.0
098: */
099: public Socket createSocket(final String host, final int port,
100: final InetAddress localAddress, final int localPort,
101: final HttpConnectionParams params) throws IOException,
102: UnknownHostException, ConnectTimeoutException {
103: // Below code is from the DefaultSSLProtocolSocketFactory#createSocket
104: // method only it has workarounds to deal with pre-1.4 JVMs. I've
105: // cut these out.
106: if (params == null) {
107: throw new IllegalArgumentException(
108: "Parameters may not be null");
109: }
110: Socket socket = null;
111: int timeout = params.getConnectionTimeout();
112: if (timeout == 0) {
113: socket = createSocket(host, port, localAddress, localPort);
114: } else {
115: socket = new Socket();
116: ServerCache cache = (ServerCache) params
117: .getParameter(FetchHTTP.SERVER_CACHE_KEY);
118: InetAddress hostAddress = (cache != null) ? getHostAddress(
119: cache, host) : null;
120: InetSocketAddress address = (hostAddress != null) ? new InetSocketAddress(
121: hostAddress, port)
122: : new InetSocketAddress(host, port);
123: socket.bind(new InetSocketAddress(localAddress, localPort));
124: try {
125: socket.connect(address, timeout);
126: } catch (SocketTimeoutException e) {
127: // Add timeout info. to the exception.
128: throw new SocketTimeoutException(e.getMessage()
129: + ": timeout set at "
130: + Integer.toString(timeout) + "ms.");
131: }
132: assert socket.isConnected() : "Socket not connected "
133: + host;
134: }
135: return socket;
136: }
137:
138: /**
139: * Get host address using first the heritrix cache of addresses, then,
140: * failing that, go to the dnsjava cache.
141: *
142: * Default access and static so can be used by other classes in this
143: * package.
144: *
145: * @param host Host whose address we're to fetch.
146: * @return an IP address for this host or null if one can't be found
147: * in caches.
148: * @exception IOException If we fail to get host IP from ServerCache.
149: */
150: static InetAddress getHostAddress(final ServerCache cache,
151: final String host) throws IOException {
152: InetAddress result = null;
153: if (cache != null) {
154: CrawlHost ch = cache.getHostFor(host);
155: if (ch != null) {
156: result = ch.getIP();
157: }
158: }
159: if (result == null) {
160: throw new IOException("Failed to get host " + host
161: + " address from ServerCache");
162: }
163: return result;
164: }
165:
166: /**
167: * @see ProtocolSocketFactory#createSocket(java.lang.String,int)
168: */
169: public Socket createSocket(String host, int port)
170: throws IOException, UnknownHostException {
171: return new Socket(host, port);
172: }
173:
174: /**
175: * All instances of DefaultProtocolSocketFactory are the same.
176: * @param obj Object to compare.
177: * @return True if equal
178: */
179: public boolean equals(Object obj) {
180: return ((obj != null) && obj.getClass().equals(
181: HeritrixProtocolSocketFactory.class));
182: }
183:
184: /**
185: * All instances of DefaultProtocolSocketFactory have the same hash code.
186: * @return Hash code for this object.
187: */
188: public int hashCode() {
189: return HeritrixProtocolSocketFactory.class.hashCode();
190: }
191: }
|