01: /* AddRedirectFromRootServerToScope
02: *
03: * Created on May 25, 2005
04: *
05: * Copyright (C) 2005 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23:
24: package org.archive.crawler.deciderules;
25:
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.net.UURI;
30:
31: public class AddRedirectFromRootServerToScope extends
32: PredicatedDecideRule {
33:
34: private static final long serialVersionUID = 2644131585813079064L;
35:
36: private static final Logger LOGGER = Logger
37: .getLogger(AddRedirectFromRootServerToScope.class.getName());
38: private static final String SLASH = "/";
39:
40: public AddRedirectFromRootServerToScope(String name) {
41: super (name);
42: setDescription("Allow URI only if it is a redirect and via URI is a "
43: + "root server (host's slash page) that is within the "
44: + "scope. Also mark the URI as a seed.");
45: }
46:
47: protected boolean evaluate(Object object) {
48: UURI via = getVia(object);
49: if (via == null) {
50: return false;
51: }
52: CandidateURI curi = (CandidateURI) object;
53: if (curi == null) {
54: return false;
55: }
56: try {
57: // Mark URI as seed if via is from different host, URI is not a seed
58: // already, URI is redirect and via is root server
59: if (curi.getUURI().getHostBasename() != null
60: && via.getHostBasename() != null
61: && !curi.getUURI().getHostBasename().equals(
62: via.getHostBasename()) && curi.isLocation()
63: && via.getPath().equals(SLASH)) {
64: curi.setIsSeed(true);
65: LOGGER.info("Adding " + object.toString()
66: + " to seeds via " + getVia(object).toString());
67: return true;
68: }
69: } catch (URIException e) {
70: e.printStackTrace();
71: } catch (Exception e) {
72: e.printStackTrace();
73: // Return false since we could not get hostname or something else
74: // went wrong
75: }
76: return false;
77: }
78:
79: private UURI getVia(Object o) {
80: return (o instanceof CandidateURI) ? ((CandidateURI) o)
81: .getVia() : null;
82: }
83: }
|