001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.jetspeed.search.handlers;
018:
019: // Java imports
020: import java.io.IOException;
021: import java.net.URL;
022:
023: import org.apache.commons.httpclient.HttpClient;
024: import org.apache.commons.httpclient.HttpException;
025: import org.apache.commons.httpclient.methods.GetMethod;
026: import org.apache.jetspeed.search.AbstractObjectHandler;
027: import org.apache.jetspeed.search.BaseParsedObject;
028:
029: /**
030: * This object handler deals with URLs.
031: *
032: * @author <a href="mailto:morciuch@apache.org">Mark Orciuch</a>
033: * @version $Id: URLToDocHandler.java 516448 2007-03-09 16:25:47Z ate $
034: */
035: public class URLToDocHandler extends AbstractObjectHandler {
036: /**
037: * Static initialization of the logger for this class
038: */
039: //private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLToDocHandler.class.getName());
040: /**
041: * Parses a specific object into a document suitable for index placement
042: *
043: * @param o
044: * @return
045: */
046: public org.apache.jetspeed.search.ParsedObject parseObject(Object o) {
047: org.apache.jetspeed.search.ParsedObject result = new BaseParsedObject();
048:
049: if ((o instanceof URL) == false) {
050: //logger.error("URLToDocHandler: invalid object type: " + o);
051: return null;
052: }
053:
054: URL pageToAdd = (URL) o;
055:
056: HttpClient client = new HttpClient();
057: GetMethod method = new GetMethod(pageToAdd.toString());
058: method.setFollowRedirects(true);
059: int statusCode = -1;
060: int attempt = 0;
061:
062: try {
063: // We will retry up to 3 times.
064: while (statusCode == -1 && attempt < 3) {
065: try {
066: // execute the method.
067: client.executeMethod(method);
068: statusCode = method.getStatusCode();
069: //if (logger.isDebugEnabled())
070: {
071: //logger.debug("URL = " + pageToAdd.toString() + "Status code = " + statusCode);
072: }
073: } catch (HttpException e) {
074: // We will retry
075: attempt++;
076: } catch (IOException e) {
077: return null;
078: }
079: }
080: // Check that we didn't run out of retries.
081: if (statusCode != -1) {
082: String content = null;
083: try {
084: content = method.getResponseBodyAsString();
085: } catch (Exception ioe) {
086: //logger.error("Getting content for " + pageToAdd.toString(), ioe);
087: }
088:
089: if (content != null) {
090: try {
091: result.setKey(java.net.URLEncoder.encode(
092: pageToAdd.toString(), "UTF-8"));
093: result
094: .setType(org.apache.jetspeed.search.ParsedObject.OBJECT_TYPE_URL);
095: // TODO: We should extract the <title> tag here.
096: result.setTitle(pageToAdd.toString());
097: result.setContent(content);
098: result.setDescription("");
099: result.setLanguage("");
100: result.setURL(pageToAdd);
101: result.setClassName(o.getClass().getName());
102: //logger.info("Parsed '" + pageToAdd.toString() + "'");
103: } catch (Exception e) {
104: e.printStackTrace();
105: //logger.error("Adding document to index", e);
106: }
107: }
108: }
109: } finally {
110: method.releaseConnection();
111: }
112:
113: return result;
114:
115: }
116: }
|