001: /*
002: * Copyright 2005-2008 by Lars Torunski
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.torunski.crawler.parser.httpclient;
018:
019: import java.io.IOException;
020: import java.io.InputStreamReader;
021: import java.io.Reader;
022: import java.util.Collection;
023: import java.util.Collections;
024:
025: import org.apache.commons.logging.Log;
026: import org.apache.commons.logging.LogFactory;
027:
028: import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
029: import org.apache.commons.httpclient.Header;
030: import org.apache.commons.httpclient.HeaderElement;
031: import org.apache.commons.httpclient.HttpStatus;
032: import org.apache.commons.httpclient.methods.GetMethod;
033: import org.apache.commons.httpclient.params.HttpMethodParams;
034:
035: import com.torunski.crawler.filter.ILinkFilter;
036: import com.torunski.crawler.link.Link;
037: import com.torunski.crawler.parser.IParser;
038: import com.torunski.crawler.parser.PageData;
039: import com.torunski.crawler.util.LinksUtil;
040:
041: /**
042: * A simple parser based on Apache Jakarta Commons HttpClient to show how to
043: * implement an own parser.
044: *
045: * This class is <em>NOT</em> thread-safe. If it is accessed from multiple
046: * threads concurrently, proper synchronization must be provided by the callers.
047: *
048: * This parser provides full support for HTTP over Secure Sockets Layer (SSL) or
049: * IETF Transport Layer Security (TLS) protocols by leveraging the Java Secure
050: * Socket Extension (JSSE).
051: *
052: * http://jakarta.apache.org/commons/httpclient/
053: *
054: * @author Lars Torunski
055: * @version $Revision: 1.18 $
056: */
057: public class SimpleHttpClientParser extends AbstractHttpClient
058: implements IParser {
059:
060: private static final transient Log log = LogFactory
061: .getLog(SimpleHttpClientParser.class);
062:
063: /**
064: * The constructor of SimpleHttpClientParser for single http connections.
065: */
066: public SimpleHttpClientParser() {
067: this (false);
068: }
069:
070: /**
071: * Creates an instance of SimpleHttpClientParser.
072: * @param multiThreaded true for creating a multi threaded connection manager else only a single connection is allowed
073: */
074: public SimpleHttpClientParser(boolean multiThreaded) {
075: super (multiThreaded);
076: }
077:
078: /**
079: * @see com.torunski.crawler.parser.IParser#load(com.torunski.crawler.link.Link)
080: */
081: public PageData load(Link link) {
082: String uri = link.getURI();
083: log.info("download: " + uri);
084:
085: // Create a method instance.
086: GetMethod httpGet = null;
087: try {
088: httpGet = new GetMethod(uri);
089: } catch (Exception e) {
090: log.info("HTTP get failed for " + uri);
091: return new PageDataHttpClient(link, PageData.ERROR);
092: }
093:
094: // Provide a custom retry handler
095: httpGet.getParams().setParameter(
096: HttpMethodParams.RETRY_HANDLER,
097: new DefaultHttpMethodRetryHandler(5, false));
098: httpGet
099: .setRequestHeader("User-Agent",
100: "SmartAndSimpleWebCrawler/1.1 (https://crawler.dev.java.net)");
101:
102: StringBuffer responseBody = new StringBuffer(64 * 1024);
103: try {
104: // Execute the method.
105: int statusCode = client.executeMethod(httpGet);
106:
107: if (statusCode != HttpStatus.SC_OK) {
108: log.info("Method failed: " + httpGet.getStatusLine()
109: + " for " + uri);
110: }
111:
112: if (!containsText(httpGet)) {
113: log.warn("URL does not contain text: " + uri);
114: responseBody = null;
115: } else {
116: // read the response body as a stream
117: Reader reader = new InputStreamReader(httpGet
118: .getResponseBodyAsStream(), httpGet
119: .getResponseCharSet());
120: try {
121: char[] buffer = new char[4096];
122: int len;
123: while ((len = reader.read(buffer)) > 0) {
124: responseBody.append(buffer, 0, len);
125: }
126: } finally {
127: reader.close();
128: }
129: }
130: } catch (IOException e) {
131: log.warn("Failed to download file = " + uri, e);
132: } finally {
133: // Release the connection.
134: httpGet.releaseConnection();
135: }
136:
137: if (responseBody != null) {
138: return new PageDataHttpClient(link, responseBody.toString());
139: } else {
140: return new PageDataHttpClient(link, PageData.ERROR);
141: }
142: }
143:
144: private boolean containsText(GetMethod method) {
145: Header contentType = method.getResponseHeader("content-type");
146: if (contentType != null) {
147: HeaderElement[] elements = contentType.getElements();
148: for (int i = 0; i < elements.length; i++) {
149: String name = elements[i].getName();
150: if ((name != null) && (name.startsWith("text"))) {
151: return true;
152: }
153: }
154: // if no correct content-type is found, so it isn't text
155: return false;
156: }
157: // if no content type is set, it may be text
158: return true;
159: }
160:
161: /**
162: * @see com.torunski.crawler.parser.IParser#parse(com.torunski.crawler.parser.PageData, com.torunski.crawler.filter.ILinkFilter)
163: */
164: public Collection parse(PageData pageData, ILinkFilter linkFilter) {
165: if (!(pageData instanceof PageDataHttpClient)) {
166: log.warn("Type mismatch in " + this .getClass().getName());
167: return Collections.EMPTY_LIST;
168: }
169: return LinksUtil.retrieveLinks(pageData.getLink().getURI(),
170: (String) pageData.getData(), linkFilter);
171: }
172:
173: // --- PageData implementation ---
174:
175: private static class PageDataHttpClient extends PageData {
176:
177: /** the data of the page */
178: private String data;
179:
180: /**
181: * @param uri the uri of the data
182: * @param data the data of the uri
183: */
184: public PageDataHttpClient(Link link, String data) {
185: super (link, PageData.OK);
186: this .data = data;
187: }
188:
189: /**
190: * @param uri the uri of the data
191: * @param status the status
192: */
193: public PageDataHttpClient(Link link, int status) {
194: super (link, status);
195: }
196:
197: /**
198: * @see com.torunski.crawler.parser.PageData#getData()
199: */
200: public Object getData() {
201: return data;
202: }
203:
204: }
205:
206: }
|