0001: package net.matuschek.http;
0002:
0003: /*************************************************
0004: Copyright (c) 2001/2002 by Daniel Matuschek
0005: *************************************************/
0006:
0007: import java.io.BufferedInputStream;
0008: import java.io.BufferedWriter;
0009: import java.io.IOException;
0010: import java.io.InputStream;
0011: import java.io.OutputStreamWriter;
0012: import java.net.InetAddress;
0013: import java.net.URL;
0014: import java.net.UnknownHostException;
0015: import java.text.ParseException;
0016: import java.text.SimpleDateFormat;
0017: import java.util.Date;
0018: import java.util.Locale;
0019: import java.util.TimeZone;
0020: import java.util.Properties;
0021:
0022: import net.matuschek.http.connection.HttpConnection;
0023: import net.matuschek.http.connection.HttpsHelper;
0024: import net.matuschek.http.cookie.Cookie;
0025: import net.matuschek.http.cookie.CookieException;
0026: import net.matuschek.http.cookie.CookieManager;
0027: import net.matuschek.http.cookie.MemoryCookieManager;
0028: import net.matuschek.util.Base64;
0029: import net.matuschek.util.ByteBuffer;
0030: import net.matuschek.util.ChunkedInputStream;
0031: import net.matuschek.util.LimitedBandwidthStream;
0032:
0033: import org.apache.log4j.Category;
0034:
0035: /**
0036: * Class for retrieving documents from HTTP servers.
0037: *
0038: * <p>The main purpose of this class is to retrieve a document
0039: * from an HTTP server. </p>
0040: *
0041: * <p>For many purposes the Java URLInputStream is good for this,
0042: * but if you want to have full control over the HTTP headers
0043: * (both request and response headers), HttpTool is the answer. </p>
0044: *
0045: * <p>Also it defines a callback interface to inform a client about
0046: * the state of the current download operation. </p>
0047: *
0048: * <p>It is possible to abort a download after getting the
0049: * HTTP response headers from the server (e.g. if a document of
0050: * this Content-Type is useless for your application or the document
0051: * is to big or whatever you like) </p>
0052: *
0053: * <p>HttpTool is reusuable. You should initializes it once and use
0054: * it for every download operation.</p>
0055: *
0056: * @author Daniel Matuschek
0057: * @version $Id: HttpTool.java,v 1.28 2004/03/26 20:28:44 matuschd Exp $
0058: */
0059: public class HttpTool {
0060:
0061: /** Carriage return */
0062: final static byte CR = 13;
0063:
0064: /** Line feed */
0065: final static byte LF = 10;
0066:
0067: /** used HTTP version */
0068: final static String HTTP_VERSION = "HTTP/1.1";
0069:
0070: /* Status constants */
0071:
0072: /** HTTP connection will be established */
0073: public final static int STATUS_CONNECTING = 0;
0074: /** HTTP connection was established, but no data where retrieved */
0075: public final static int STATUS_CONNECTED = 1;
0076: /** data will be retrieved now */
0077: public final static int STATUS_RETRIEVING = 2;
0078: /** download finished */
0079: public final static int STATUS_DONE = 3;
0080: /** download could not be finished because a DownloadRule denied it */
0081: public final static int STATUS_DENIEDBYRULE = 4;
0082:
0083: /** default HTTP port */
0084: private final static int DEFAULT_HTTPPORT = 80;
0085:
0086: /** default HTTPS port */
0087: private final static int DEFAULT_HTTPSPORT = 443;
0088:
0089: /** default agent name */
0090: private final static String AGENTNAME = "JoBo/1.4beta "
0091: + "(http://www.matuschek.net/jobo.html)";
0092:
0093: /**
0094: * default update interval for calls of the callback interfaces
0095: * (in bytes)
0096: */
0097: private final static int DEFAULT_UPDATEINTERVAL = 1024;
0098:
0099: /** default socket timeout in seconds */
0100: private final static int DEFAULT_SOCKETTIMEOUT = 20;
0101:
0102: /** HTTP AgentName header */
0103: private String agentName = AGENTNAME;
0104:
0105: /** HTTP Referer header */
0106: private String referer = null;
0107:
0108: /** HTTP From header */
0109: private String fromAddress = null;
0110:
0111: /** Date of the HTTP If-Modified-Since header */
0112: private Date modifyDate = null;
0113:
0114: /**
0115: * maximal used bandwidth in bytes per second
0116: * 0 disables bandwidth limitations
0117: */
0118: private int bandwidth = 0;
0119:
0120: /** proxy address */
0121: private InetAddress proxyAddr = null;
0122:
0123: /** proxy port number */
0124: private int proxyPort = 0;
0125:
0126: /** textual description of the proxy (format host:port) */
0127: private String proxyDescr = "";
0128:
0129: /** timeout for getting data in seconds */
0130: private int socketTimeout = DEFAULT_SOCKETTIMEOUT;
0131:
0132: /** HttpTool should accept and use cookies */
0133: private boolean cookiesEnabled = true;
0134:
0135: /** Log4J Category object for logging */
0136: private Category log = null;
0137:
0138: /** Authentication infos */
0139: private Properties userInfos = new Properties();
0140:
0141: /** @link dependency */
0142: /*#HttpDoc lnkHttpDoc;*/
0143:
0144: /**
0145: * defines after how many bytes read from the web
0146: * server the Callback interface will be called
0147: * (default updates after one kilobyte)
0148: */
0149: private int updateInterval = DEFAULT_UPDATEINTERVAL;
0150:
0151: /**
0152: * callback interface that will be used after n bytes are
0153: * read from the web server to update the state of the current
0154: * retrieve operation to the application
0155: */
0156: private HttpToolCallback callback = null;
0157:
0158: /**
0159: * DownloadRuleSet tells the HttpTool, if it should download
0160: * the whole file after getting the headers
0161: */
0162: private DownloadRuleSet downloadRules = null;
0163:
0164: /**
0165: * The cookie manager will be used to store cookies
0166: */
0167: private CookieManager cookieManager = null;
0168:
0169: /**
0170: * The DateFormat instance will be used to format If-Modified-Since requests
0171: */
0172: static SimpleDateFormat df;
0173:
0174: private NTLMAuthorization ntlmAuthorization = null;
0175:
0176: /*
0177: * Initialize df to a formatter for timezone "GMT" and locale Locale.US
0178: * without changing the default timezone. If-Modified-Since requests need
0179: * to be in that format.
0180: */
0181: static {
0182: TimeZone local = TimeZone.getDefault();
0183: TimeZone gmt = TimeZone.getTimeZone("GMT");
0184: TimeZone.setDefault(gmt);
0185: df = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z",
0186: Locale.US);
0187: TimeZone.setDefault(local);
0188: }
0189:
0190: /**
0191: * Initializes HttpTool with a new CookieManager (that will not contain
0192: * any cookie).
0193: * Enables logging
0194: */
0195: public HttpTool() {
0196: this .cookieManager = new MemoryCookieManager();
0197: log = Category.getInstance(getClass().getName());
0198: }
0199:
0200: /**
0201: * Sets the Referer: HTTP header
0202: * @param referer value for the Referer header
0203: */
0204: public void setReferer(String referer) {
0205: this .referer = referer;
0206: }
0207:
0208: /**
0209: * Sets the User-Agent: HTTP header
0210: * @param name name of the user agent (may contain spaces)
0211: */
0212: public void setAgentName(String name) {
0213: this .agentName = name;
0214: }
0215:
0216: /**
0217: * Gets the current setting of the User-Agent HTTP header
0218: * @return the User-Agent name
0219: */
0220: public String getAgentName() {
0221: return agentName;
0222: }
0223:
0224: /**
0225: * <b>Insiders BugFix</b>
0226: * This method finishes the MemoryCleanupManager.
0227: */
0228: public void finish() {
0229: if (cookieManager != null) {
0230: cookieManager.finish();
0231: }
0232: }
0233:
0234: /**
0235: * Sets the DownloadRules for this object <br />
0236: * A download rule uses the HTTP return headers to decide if the
0237: * download should be finished.
0238: * @param rule a DownloadRule
0239: */
0240: public void setDownloadRuleSet(DownloadRuleSet rules) {
0241: this .downloadRules = rules;
0242: }
0243:
0244: /**
0245: * Gets the DownloadRules for this object
0246: * @return a DownloadRuleSet
0247: */
0248: public DownloadRuleSet getDownloadRuleSet() {
0249: return this .downloadRules;
0250: }
0251:
0252: /**
0253: * Gets the timeout for getting data in seconds
0254: * @return the value of sockerTimeout
0255: * @see #setTimeout(int)
0256: */
0257: public int getTimeout() {
0258: return this .socketTimeout;
0259: }
0260:
0261: /**
0262: * Sets the timeout for getting data. If HttpTool can't read
0263: * data from a remote web server after this number of seconds
0264: * it will stop the download of the current file
0265: * @param timeout Timeout in seconds
0266: */
0267: public void setTimeout(int timeout) {
0268: this .socketTimeout = timeout;
0269: }
0270:
0271: /**
0272: * Enable/disable cookies
0273: * @param enable if true, HTTP cookies will be enabled, if false
0274: * HttpTool will not use cookies
0275: */
0276: public void setEnableCookies(boolean enable) {
0277: this .cookiesEnabled = enable;
0278: }
0279:
0280: /**
0281: * Get the status of the cookie engine
0282: * @return true, if HTTP cookies are enabled, false otherwise
0283: */
0284: public boolean getEnableCookies() {
0285: return this .cookiesEnabled;
0286: }
0287:
0288: /**
0289: * sets a proxy to use
0290: * @param proxyDescr the Proxy definition in the format host:port
0291: */
0292: public void setProxy(String proxyDescr) throws HttpException {
0293: proxyAddr = null;
0294: proxyPort = 0;
0295: String proxyHost = null;
0296:
0297: if ((proxyDescr != null) && (!proxyDescr.equals(""))) {
0298: int pos = proxyDescr.indexOf(":");
0299: if (pos > 0) {
0300: try {
0301: String port = proxyDescr.substring(pos + 1);
0302: proxyHost = proxyDescr.substring(0, pos);
0303: proxyPort = Integer.parseInt(port);
0304: proxyAddr = InetAddress.getByName(proxyHost);
0305: } catch (NumberFormatException e) {
0306: throw new HttpException(
0307: "Proxy definition incorrect, "
0308: + "port not numeric: " + proxyDescr);
0309: } catch (UnknownHostException e) {
0310: throw new HttpException("Host not found: "
0311: + proxyHost);
0312: }
0313: } else {
0314: throw new HttpException("Proxy definition incorrect, "
0315: + "fomat must be host:port: " + proxyDescr);
0316: }
0317: }
0318: this .proxyDescr = proxyDescr;
0319: }
0320:
0321: /**
0322: * Gets a textual representation of the current proxy settings
0323: * @return return the proxy settings in the format host:port
0324: */
0325: public String getProxy() {
0326: return proxyDescr;
0327: }
0328:
0329: /**
0330: * Set the value of the "If-Modified-Since" header
0331: * Usually, this is null and HttpTool will retrieve every
0332: * document. Setting this to a date will retrieve only
0333: * documents that were modified since this time
0334: */
0335: public void setIfModifiedSince(Date modifyDate) {
0336: this .modifyDate = modifyDate;
0337: }
0338:
0339: /**
0340: * Returns the date used for the "If-Modified-Since" header
0341: * @return a Date object if the "If-Modified-Since" header is set,
0342: * null otherwise
0343: */
0344: public Date getIfModifiedSince() {
0345: return this .modifyDate;
0346: }
0347:
0348: /**
0349: * Sets the content From: HTTP header
0350: * @param fromAdress an email adress (e.g. some@where.com)
0351: */
0352: public void setFromAddress(String fromAddress) {
0353: this .fromAddress = fromAddress;
0354: }
0355:
0356: /**
0357: * Gets the current callback object
0358: * @return the defined HttpToolCallback object
0359: */
0360: public HttpToolCallback getCallback() {
0361: return callback;
0362: }
0363:
0364: /**
0365: * Get the value of bandwidth.
0366: * @return value of bandwidth.
0367: */
0368: public int getBandwidth() {
0369: return bandwidth;
0370: }
0371:
0372: /**
0373: * Set the value of bandwidth.
0374: * @param bandwith Value to assign to bandwidth.
0375: */
0376: public void setBandwidth(int bandwidth) {
0377: this .bandwidth = bandwidth;
0378: }
0379:
0380: /**
0381: * Sets a callback object
0382: *
0383: * If set this object will be used to inform about the current
0384: * status of the download. HttpTool will call methods of this
0385: * object while retrieving a document.
0386: *
0387: * @param callback a callback object
0388: * @see HttpToolCallback
0389: */
0390: public void setCallback(HttpToolCallback callback) {
0391: this .callback = callback;
0392: }
0393:
0394: /**
0395: * Gets the current update interval
0396: * @return the update interval in bytes
0397: * @see #setUpdateInterval(int)
0398: */
0399: public int getUpdateInterval() {
0400: return updateInterval;
0401: }
0402:
0403: /**
0404: * Sets the callback update interval
0405: *
0406: * This setting is used if a callback object is defined. Then after
0407: * reading this number of bytes, the method
0408: * <code>setHttpToolDocCurrentSize</code> will be called.
0409: * You should not set this to a value smaller then 1000 unless your
0410: * bandwidth is very small, because it will slow down downloads.
0411: *
0412: * @param updateInterval update interval in bytes
0413: *
0414: * @see HttpToolCallbackInterface#setHttpToolDocCurrentSize(int)
0415: */
0416: public void setUpdateInterval(int updateInterval) {
0417: if (updateInterval > 0) {
0418: this .updateInterval = updateInterval;
0419: } else {
0420: throw new IllegalArgumentException(
0421: "updateInterval must be > 0 (was " + updateInterval
0422: + ")");
0423: }
0424: }
0425:
0426: /**
0427: * Sets the CookieManager for this HttpTool
0428: * By default a MemoryCookieManager will be used, but you can
0429: * use this method to use your own CookieManager implementation
0430: *
0431: * @param cm an object that implements the CookieManager interface
0432: */
0433: public void setCookieManager(CookieManager cm) {
0434: this .cookieManager = cm;
0435: }
0436:
0437: /**
0438: * Gets the CookieManager used by this HttpTool
0439: *
0440: * @return the CookieManager that will be used by this HttpTool
0441: */
0442: public CookieManager getCookieManager() {
0443: return this .cookieManager;
0444: }
0445:
0446: /**
0447: * Delete all cookies
0448: */
0449: public void clearCookies() {
0450: if (cookieManager != null) {
0451: cookieManager.clear();
0452: }
0453: }
0454:
0455: /**
0456: * Retrieves a document from the given URL.
0457: * If Cookies are enabled it will use the CookieManager to set Cookies
0458: * it got from former retrieveDocument operations.
0459: *
0460: * @param u the URL to retrieve (only http:// supported yet)
0461: * @param method HttpConstants.GET for a GET request, HttpConstants.POST
0462: * for a POST request
0463: * @param parameters additional parameters. Will be added to the URL if
0464: * this is a GET request, posted if it is a POST request
0465: * @return a HttpDoc if a document was retrieved, null otherwise
0466: *
0467: * @see HttpConstants
0468: */
0469: public HttpDoc retrieveDocument(URL u, int method, String parameters)
0470: throws HttpException {
0471: DocAndConnection docAndConnection = retrieveDocumentInternal(u,
0472: method, parameters, null, null);
0473: HttpDoc doc = docAndConnection != null ? docAndConnection.httpDoc
0474: : null;
0475: if (doc != null && doc.getHttpCode() == 401) {
0476: String authProtName = NTLMAuthorization.WWW_AUTHENTICATE_HEADER;
0477: String authProtValue = doc.getHeaderValue(authProtName);
0478: if (authProtValue == null) {
0479: authProtName = NTLMAuthorization.PROXY_AUTHENTICATE_HEADER;
0480: authProtValue = doc.getHeaderValue(authProtName);
0481: }
0482: if (authProtValue.indexOf(NTLMAuthorization.NTLM_TAG) >= 0
0483: || authProtValue.indexOf("Negotiate") >= 0) {
0484:
0485: try {
0486: // STEP 1 - send NTLM-Request
0487: NTLMAuthorization authorization = (NTLMAuthorization) ntlmAuthorization
0488: .clone();
0489: authorization.setHost(u.getHost());
0490: // log.info("NTLM-Authentication: " + authorization);
0491: String auth = authorization.getRequest();
0492: docAndConnection = retrieveDocumentInternal(u,
0493: method, parameters, null, auth);
0494:
0495: // STEP 2 - receive NTLM-Nonce
0496: doc = docAndConnection.httpDoc;
0497: authProtValue = doc.getHeaderValue(authProtName);
0498: authorization.extractNonce(authProtValue);
0499:
0500: // STEP 3 - send NTLM-Response
0501: auth = authorization.getResponse();
0502: docAndConnection = retrieveDocumentInternal(u,
0503: method, parameters,
0504: docAndConnection.httpConnection, auth);
0505: if (docAndConnection != null) {
0506: doc = docAndConnection.httpDoc;
0507: if (docAndConnection.httpConnection != null) {
0508: docAndConnection.httpConnection.close();
0509: }
0510: } else {
0511: doc = null; // BUGFIX (Not modified files return null)
0512: }
0513:
0514: } catch (Exception e) {
0515: log.error("NTLM-Authentication Error: "
0516: + e.getMessage());
0517: throw new HttpException(e.getMessage());
0518: }
0519: }
0520: }
0521: return doc;
0522: }
0523:
0524: /**
0525: * Internal structure to keep connection after retrieval of doc.
0526: */
0527: protected class DocAndConnection {
0528: HttpDoc httpDoc;
0529: HttpConnection httpConnection;
0530: }
0531:
0532: /**
0533: * Same like method without parameter httpConnection, but this
0534: * method uses the passed connection.
0535: * @param u
0536: * @param method
0537: * @param parameters
0538: * @param httpConnection (Use this connection)
0539: * @return DocAndConnection
0540: * @throws HttpException
0541: */
0542: protected DocAndConnection retrieveDocumentInternal(URL u,
0543: int method, String parameters, HttpConnection httpConn,
0544: String ntlmAuthorizationInfo) throws HttpException {
0545: String host = null;
0546: InetAddress addr = null;
0547: String path = null;
0548: String requestPath = null;
0549: String protocol = null;
0550: String userinfo = null;
0551: boolean chunkedEncoding = false;
0552: boolean secureConnection = false;
0553: ChunkedInputStream chunkStream = null;
0554:
0555: // Content-Length
0556: int docSize = -1;
0557:
0558: int port = 0;
0559: HttpDoc doc = new HttpDoc();
0560: int i = 0;
0561:
0562: // set document URL
0563: doc.setURL(u);
0564:
0565: // document buffer
0566: ByteBuffer buff = new ByteBuffer();
0567:
0568: // the connection to the HTTP server
0569: // HttpConnection httpConn = null;
0570:
0571: InputStream is = null;
0572: BufferedWriter bwrite = null;
0573:
0574: // get host
0575: host = u.getHost();
0576: if (host == null) {
0577: throw new HttpException("no host part in URL found");
0578: }
0579:
0580: // get address, if not using a proxy
0581: // if the client runs behind a proxy it is possible, that name
0582: // resolution for the internet is not possible
0583: if (!useProxy()) {
0584: try {
0585: addr = InetAddress.getByName(host);
0586: } catch (UnknownHostException e) {
0587: addr = null;
0588: }
0589: if (addr == null) {
0590: throw new HttpException("host part (" + host
0591: + ") does not resolve");
0592: }
0593: }
0594:
0595: // get path
0596: path = u.getFile();
0597: if (path.equals("")) {
0598: path = "/";
0599: }
0600: // replace spaces
0601: path = path.replaceAll(" ", "%20");
0602:
0603: // get protocol and port
0604: port = u.getPort();
0605: protocol = u.getProtocol().toLowerCase();
0606: if (protocol.equals("http")) {
0607: if (port == -1) {
0608: port = DEFAULT_HTTPPORT;
0609: }
0610: } else if (protocol.equals("https")) {
0611: if (port == -1) {
0612: port = DEFAULT_HTTPSPORT;
0613: }
0614: secureConnection = true;
0615: } else {
0616: throw new HttpException("protocol " + protocol
0617: + " not supported");
0618: }
0619:
0620: // if using the proxy, request path is the whole URL, otherwise only
0621: // the path part of the URL
0622: if (useProxy() && (!secureConnection)) {
0623: requestPath = "http://" + host + path;
0624: } else {
0625: requestPath = path;
0626: }
0627:
0628: // get user info
0629: userinfo = u.getUserInfo();
0630: if (userinfo != null) {
0631: if (userinfo.equals("")) {
0632: userinfo = null;
0633: } else {
0634: // Store user info for this host
0635: userInfos.setProperty(host, userinfo);
0636: }
0637: } else {
0638: // do we hae a stored user info?
0639: userinfo = userInfos.getProperty(host);
0640: }
0641:
0642: if (callback != null) {
0643: callback.setHttpToolDocUrl(u.toString());
0644: callback.setHttpToolStatus(STATUS_CONNECTING);
0645: }
0646:
0647: // okay, we got all needed information, try to connect to the host
0648: try {
0649: if (httpConn == null) {
0650: // connect and initialize streams
0651: // timeout is stored in seconds in HttpTool, but
0652: // HttpConnection uses milliseconds
0653: if (secureConnection) {
0654: HttpsHelper helper = new HttpsHelper(proxyAddr,
0655: proxyPort, useProxy());
0656: httpConn = helper.createHttpsConnection(host, port);
0657: } else {
0658: if (useProxy()) {
0659: httpConn = HttpConnection.createConnection(
0660: proxyAddr, proxyPort,
0661: socketTimeout * 1000);
0662: } else {
0663: httpConn = HttpConnection.createConnection(
0664: addr, port, socketTimeout * 1000);
0665: }
0666: }
0667: }
0668:
0669: is = new LimitedBandwidthStream(new BufferedInputStream(
0670: httpConn.getInputStream(), 256), bandwidth);
0671: bwrite = new BufferedWriter(new OutputStreamWriter(httpConn
0672: .getOutputStream()));
0673:
0674: if (callback != null) {
0675: callback.setHttpToolStatus(STATUS_CONNECTED);
0676: }
0677:
0678: // write HTTP request
0679: // get or post ?
0680: if (method == HttpConstants.GET) {
0681: bwrite.write("GET ");
0682: bwrite.write(requestPath);
0683: if ((parameters != null) && (!parameters.equals(""))) {
0684: bwrite.write("?");
0685: bwrite.write(parameters);
0686: }
0687:
0688: } else if (method == HttpConstants.POST) {
0689: bwrite.write("POST " + requestPath);
0690: } else {
0691: throw new HttpException("HTTP method " + method
0692: + " not supported");
0693: }
0694:
0695: // last part of request line
0696: bwrite.write(" ");
0697: bwrite.write(HTTP_VERSION);
0698: bwrite.write("\r\n");
0699:
0700: // Referer header only if defined
0701: if (referer != null) {
0702: bwrite.write("Referer: " + referer + "\r\n");
0703: }
0704:
0705: // if cookies are enabled, write a Cookie: header
0706: if (cookiesEnabled) {
0707: String cookieString = cookieManager.cookiesForURL(u);
0708: if (cookieString != null) {
0709: bwrite.write("Cookie: ");
0710: bwrite.write(cookieString);
0711: bwrite.write("\r\n");
0712: log.debug("Cookie request header: " + cookieString);
0713: }
0714: }
0715:
0716: // Write other headers
0717: bwrite.write("Host: " + host + "\r\n");
0718: bwrite.write("User-Agent: " + agentName + "\r\n");
0719: bwrite.write("Accept: */*\r\n");
0720: if (ntlmAuthorizationInfo == null) {
0721: bwrite.write("Connection: close\r\n");
0722: } else {
0723: bwrite.write("Connection: keep-alive\r\n");
0724: }
0725:
0726: // Write "From:" header only if a fromAddress is defined
0727: if (fromAddress != null) {
0728: bwrite.write("From: " + fromAddress + "\r\n");
0729: }
0730:
0731: // if we have username and password, lets write an Authorization
0732: // header
0733: if (userinfo != null) {
0734: // special hack to support usernames with "@"
0735: // TO DO: find a better solution for this problem
0736: userinfo = userinfo.replace('%', '@');
0737: bwrite.write("Authorization: Basic ");
0738: bwrite.write(Base64.encode(userinfo));
0739: bwrite.write("\r\n");
0740: log.debug(userinfo);
0741:
0742: }
0743:
0744: if (ntlmAuthorizationInfo != null) {
0745: bwrite.write("Authorization: NTLM ");
0746: bwrite.write(ntlmAuthorizationInfo);
0747: bwrite.write("\r\n");
0748: }
0749:
0750: // if there is a "If-Modified-Since" date, also write this header
0751: if (modifyDate != null) {
0752: String dateStr = df.format(modifyDate);
0753:
0754: bwrite.write("If-Modified-Since: ");
0755: bwrite.write(dateStr);
0756: bwrite.write("\r\n");
0757: log.debug("If-Modified-Since header: " + dateStr);
0758: }
0759:
0760: // for a POST request we also need a content-length header
0761: if (method == HttpConstants.POST) {
0762: bwrite
0763: .write("Content-Type: application/x-www-form-urlencoded\r\n");
0764: bwrite.write("Content-Length: " + parameters.length()
0765: + "\r\n");
0766: }
0767:
0768: // finished headers
0769: bwrite.write("\r\n");
0770: // if this is a POST request, we have to add the POST parameters
0771: if (method == HttpConstants.POST) {
0772: bwrite.write(parameters);
0773: }
0774: bwrite.flush();
0775:
0776: if (callback != null) {
0777: callback.setHttpToolStatus(STATUS_RETRIEVING);
0778: }
0779:
0780: // read the first line (HTTP return code)
0781: while ((i = is.read()) != 10) {
0782: if (i == -1) {
0783: throw new HttpException(
0784: "Could not get HTTP return code "
0785: + "(buffer content is "
0786: + buff.toString() + ")");
0787: }
0788: buff.append((byte) i);
0789: }
0790:
0791: String httpCode = lineString(buff.getContent());
0792: buff.clean();
0793: doc.setHttpCode(httpCode);
0794:
0795: // read the HTTP headers
0796: boolean finishedHeaders = false;
0797: while (!finishedHeaders) {
0798: i = is.read();
0799: if (i == -1) {
0800: throw new HttpException(
0801: "Could not read HTTP headers");
0802: }
0803: if (i >= 32) {
0804: buff.append((byte) i);
0805: }
0806: // HTTP header processing
0807: if (i == LF) {
0808: String line = lineString(buff.getContent());
0809:
0810: buff.clean();
0811: // empty line means "end of headers"
0812: if (line.trim().equals("")) {
0813: finishedHeaders = true;
0814: } else {
0815: HttpHeader head = new HttpHeader(line);
0816: doc.addHeader(head);
0817:
0818: if (cookiesEnabled && head.isSetCookie()) {
0819: try {
0820: Cookie cookie = new Cookie(head
0821: .toLine(), u);
0822: cookieManager.add(cookie);
0823: log.debug("Got a cookie " + cookie);
0824: } catch (CookieException e) {
0825: log.info("Could not interpret cookie: "
0826: + e.getMessage());
0827: }
0828: }
0829:
0830: // Content chunked ?
0831: if (head.getName().equalsIgnoreCase(
0832: "Transfer-Encoding")
0833: && head.getValue().equalsIgnoreCase(
0834: "chunked")) {
0835: chunkedEncoding = true;
0836: }
0837:
0838: }
0839: }
0840: }
0841: buff.clean();
0842:
0843: // if there is a DownloadRule, ask if we should download
0844: // the data
0845: if (downloadRules != null) {
0846: // if it is not allowed to download this URL, close socket
0847: // and return a null document
0848: boolean isNotModified = false;
0849: if (modifyDate != null) {
0850: HttpHeader lastModifiedHeader = doc
0851: .getHttpHeader("Last-Modified");
0852: if (lastModifiedHeader != null) {
0853: try {
0854: Date lastModifiedDate = df
0855: .parse(lastModifiedHeader
0856: .getValue());
0857: if (lastModifiedDate.compareTo(modifyDate) <= 0) {
0858: isNotModified = true;
0859: }
0860: } catch (ParseException e) {
0861: }
0862: }
0863: }
0864:
0865: if (!downloadRules.downloadAllowed(doc.getHttpHeader())
0866: || isNotModified) {
0867: if (doc.isNotModified()) {
0868: log.info("If-Not-Modified successfull for: "
0869: + u);
0870: } else if (isNotModified) {
0871: log.info("Header indicates not modified for: "
0872: + u);
0873: } else {
0874: log
0875: .info("Download not allowed by download rule.");
0876: }
0877: // Close connection
0878: httpConn.close();
0879: httpConn = null;
0880:
0881: if (callback != null) {
0882: callback.setHttpToolStatus(STATUS_DENIEDBYRULE);
0883: }
0884: return null;
0885: }
0886: }
0887:
0888: // if we got encoding "chunked", use the ChunkedInputStream
0889: if (chunkedEncoding) {
0890: chunkStream = new ChunkedInputStream(is);
0891: }
0892:
0893: // did we got an Content-Length header ?
0894: HttpHeader contentLength = doc
0895: .getHeader(HttpHeader.CONTENT_LENGTH);
0896: if (contentLength != null) {
0897:
0898: try {
0899: docSize = Integer
0900: .parseInt(contentLength.getValue());
0901: } catch (NumberFormatException e) {
0902: log
0903: .error("Got a malformed Content-Length header from the server");
0904: docSize = -1;
0905: }
0906:
0907: // send information to callback
0908: if (callback != null) {
0909: callback.setHttpToolDocSize(docSize);
0910: }
0911:
0912: // initialize the byte buffer with the given document size
0913: // there is no need to increase the buffer size dynamically
0914: if (docSize > 0) {
0915: buff.setSize(docSize);
0916: }
0917: }
0918:
0919: // read data
0920: boolean finished = false;
0921: int count = 0;
0922:
0923: while (!finished) {
0924:
0925: if (chunkedEncoding) {
0926: i = chunkStream.read();
0927: } else {
0928: i = is.read();
0929: }
0930:
0931: if (i == -1) {
0932: // this should only happen on HTTP/1.0 responses
0933: // without a Content-Length header
0934: finished = true;
0935: } else {
0936: buff.append((byte) i);
0937: count++;
0938: }
0939:
0940: // finished ?
0941: // there are other tests then wait until read gives us a -1:
0942:
0943: // if there was a Content-Length header stop after reading the
0944: // given number of bytes
0945: if (count == docSize) {
0946: finished = true;
0947: }
0948:
0949: // if it is a chunked stream we should use the isDone method
0950: // to look if we reached the end
0951: if (chunkedEncoding) {
0952: if (chunkStream.isDone()) {
0953: finished = true;
0954: }
0955: }
0956:
0957: // should we call the callback interface ?
0958: if (callback != null) {
0959: if (((buff.length() % updateInterval) == 0)
0960: || finished) {
0961: callback.setHttpToolDocCurrentSize(buff
0962: .length());
0963: }
0964: }
0965:
0966: }
0967:
0968: doc.setContent(buff.getContent());
0969:
0970: if (ntlmAuthorizationInfo == null) {
0971: // close everything
0972: // bwrite.close();
0973: // is.close();
0974: httpConn.close();
0975: httpConn = null;
0976: }
0977:
0978: if (callback != null) {
0979: callback.setHttpToolStatus(STATUS_DONE);
0980: }
0981:
0982: } catch (IOException e) {
0983: throw new HttpException(e.getMessage());
0984: }
0985:
0986: DocAndConnection docAndConnection = new DocAndConnection();
0987: docAndConnection.httpDoc = doc;
0988: docAndConnection.httpConnection = httpConn;
0989:
0990: return docAndConnection;
0991: }
0992:
0993: /**
0994: * should I use a proxy ?
0995: * @return true if a proxy was configured, false otherwise
0996: */
0997: protected boolean useProxy() {
0998: return (proxyAddr != null);
0999: }
1000:
1001: /**
1002: * convert an array of bytes to a String. if the last byte is an CR
1003: * it will be ignored
1004: */
1005: protected String lineString(byte[] b) {
1006: if (b.length == 0) {
1007: return "";
1008: }
1009:
1010: if (b[b.length - 1] != CR) {
1011: return new String(b);
1012: } else {
1013: return new String(b, 0, b.length - 1);
1014: }
1015: }
1016:
1017: public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
1018: this .ntlmAuthorization = ntlmAuthorization;
1019: }
1020:
1021: public NTLMAuthorization getNtlmAuthorization() {
1022: return ntlmAuthorization;
1023: }
1024:
1025: }
|