001: /*
002: * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
003: *
004: * Copyright (c) 2001 Brian Pitcher
005: *
006: * Permission is hereby granted, free of charge, to any person obtaining a
007: * copy of this software and associated documentation files (the "Software"),
008: * to deal in the Software without restriction, including without limitation
009: * the rights to use, copy, modify, merge, publish, distribute, sublicense,
010: * and/or sell copies of the Software, and to permit persons to whom the
011: * Software is furnished to do so, subject to the following conditions:
012: *
013: * The above copyright notice and this permission notice shall be included in
014: * all copies or substantial portions of the Software.
015: *
016: * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
017: * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
018: * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
019: * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
020: * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
021: * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
022: * SOFTWARE.
023: */
024:
025: // $Header: /cvsroot/weblech/weblech/src/weblech/spider/URLGetter.java,v 1.2 2002/06/02 08:02:45 weblech Exp $
026: package weblech.spider;
027:
028: import org.apache.log4j.Category;
029:
030: import java.net.HttpURLConnection;
031: import java.net.URL;
032: import java.net.Authenticator;
033: import java.io.*;
034:
035: import weblech.util.Log4j;
036:
037: public class URLGetter {
038: private final static Category _logClass = Category
039: .getInstance(URLGetter.class);
040:
041: static {
042: Log4j.init();
043: }
044:
045: private int failureCount = 0;
046:
047: private final SpiderConfig config;
048:
049: public URLGetter(SpiderConfig config) {
050: _logClass.debug("URLGetter()");
051: this .config = config;
052:
053: Authenticator.setDefault(new DumbAuthenticator(config
054: .getBasicAuthUser(), config.getBasicAuthPassword()));
055: }
056:
057: public URLObject getURL(URLToDownload url) {
058: _logClass.debug("getURL(" + url + ")");
059:
060: if (failureCount > 10) {
061: _logClass
062: .warn("Lots of failures recently, waiting 5 seconds before attempting download");
063: try {
064: Thread.sleep(5 * 1000);
065: } catch (InterruptedException e) {
066: }
067: ;
068: failureCount = 0;
069: }
070:
071: URL requestedURL = url.getURL();
072: URL referer = url.getReferer();
073:
074: try {
075: _logClass.debug("Creating HTTP connection to "
076: + requestedURL);
077: HttpURLConnection conn = (HttpURLConnection) requestedURL
078: .openConnection();
079: if (referer != null) {
080: _logClass.debug("Setting Referer header to " + referer);
081: conn.setRequestProperty("Referer", referer
082: .toExternalForm());
083: }
084:
085: if (config.getUserAgent() != null) {
086: _logClass.debug("Setting User-Agent to "
087: + config.getUserAgent());
088: conn.setRequestProperty("User-Agent", config
089: .getUserAgent());
090: }
091:
092: conn.setUseCaches(false);
093:
094: _logClass.debug("Opening URL");
095: long startTime = System.currentTimeMillis();
096: conn.connect();
097:
098: String resp = conn.getResponseMessage();
099: _logClass.debug("Remote server response: " + resp);
100:
101: String respStr = conn.getHeaderField(0);
102: _logClass.info("Server response: " + respStr);
103:
104: for (int i = 1;; i++) {
105: String key = conn.getHeaderFieldKey(i);
106: if (key == null) {
107: break;
108: }
109: String value = conn.getHeaderField(key);
110: _logClass
111: .debug("Received header " + key + ": " + value);
112: }
113:
114: _logClass
115: .debug("Getting buffered input stream from remote connection");
116: BufferedInputStream remoteBIS = new BufferedInputStream(
117: conn.getInputStream());
118: ByteArrayOutputStream baos = new ByteArrayOutputStream(
119: 10240);
120: byte[] buf = new byte[1024];
121: int bytesRead = 0;
122: while (bytesRead >= 0) {
123: baos.write(buf, 0, bytesRead);
124: bytesRead = remoteBIS.read(buf);
125: }
126:
127: byte[] content = baos.toByteArray();
128: long timeTaken = System.currentTimeMillis() - startTime;
129: if (timeTaken < 100)
130: timeTaken = 500;
131:
132: int bytesPerSec = (int) ((double) content.length / ((double) timeTaken / 1000.0));
133: _logClass.info("Downloaded " + content.length + " bytes, "
134: + bytesPerSec + " bytes/sec");
135: if (content.length < conn.getContentLength()) {
136: _logClass.warn("Didn't download full content for URL: "
137: + url);
138: failureCount++;
139: return null;
140: }
141: return new URLObject(requestedURL, conn.getContentType(),
142: content, config);
143: } catch (FileNotFoundException fnfe) {
144: _logClass.warn("File not found: " + fnfe.getMessage());
145: return null;
146: } catch (IOException ioe) {
147: _logClass.warn("Caught IO Exception: " + ioe.getMessage(),
148: ioe);
149: failureCount++;
150: return null;
151: }
152: }
153: }
|