001: /* ScopePlusOneDecideRule
002: *
003: * Created on Aug 22, 2005
004: *
005: * Copyright 2005 Regents of the University of California, All rights reserved
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.deciderules;
024:
025: import java.util.logging.Level;
026: import java.util.logging.Logger;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.archive.crawler.datamodel.CandidateURI;
031: import org.archive.crawler.settings.SimpleType;
032: import org.archive.net.UURI;
033: import org.archive.util.SurtPrefixSet;
034:
035: /**
036: * Rule allows one level of discovery beyond configured scope
037: * (e.g. Domain, plus the first otherwise out-of-scope link from an
038: * in-scope page, but not further hops from that first page)
039: *
040: * @author Shifra Raffel
041: * @version $Date: 2006-09-25 17:16:55 +0000 (Mon, 25 Sep 2006) $ $Revision: 4649 $
042: */
043: public class ScopePlusOneDecideRule extends SurtPrefixedDecideRule {
044:
045: private static final long serialVersionUID = -6344162369024146340L;
046:
047: public static final String ATTR_SCOPE = "host-or-domain-scope";
048: public static final String HOST = "Host";
049: public static final String DOMAIN = "Domain";
050:
051: private static final Logger logger = Logger
052: .getLogger(ScopePlusOneDecideRule.class.getName());
053:
054: /**
055: * Constructor.
056: * @param name
057: */
058: public ScopePlusOneDecideRule(String name) {
059: super (name);
060: setDescription("ScopePlusOneDecideRule. Rule allows one level of discovery "
061: + "beyond configured scope (e.g. Domain, plus the first "
062: + "otherwise out-of-scope link from an in-scope page, but "
063: + "no further hops from that first otherwise-out-of-scope page). "
064: + "surts-source-file is optional. Use surts-dump-file option "
065: + "when testing.");
066: addElementToDefinition(new SimpleType(
067: ATTR_SCOPE,
068: "Restrict to host, e.g. archive.org excludes audio.archive.org, "
069: + "or expand to domain as well, e.g. archive.org includes all "
070: + "*.archive.org", DOMAIN, new String[] { HOST,
071: DOMAIN }));
072: }
073:
074: /**
075: * Evaluate whether given object comes from a URI which is in scope
076: *
077: * @param object to evaluate
078: * @return true if URI is either in scope or its via is
079: */
080: protected boolean evaluate(Object object) {
081: boolean result = false;
082: if (!(object instanceof CandidateURI)) {
083: // Can't evaluate if not a candidate URI
084: return false;
085: }
086: SurtPrefixSet set = getPrefixes(object);
087: UURI u = UURI.from(object);
088: // First, is the URI itself in scope?
089: boolean firstResult = isInScope(u, set);
090: if (logger.isLoggable(Level.FINE)) {
091: logger.fine("Tested scope of UURI itself '" + u
092: + " and result was " + firstResult);
093: }
094: if (firstResult == true) {
095: result = true;
096: } else {
097: // This object is not itself within scope, but
098: // see whether its via might be
099: UURI via = getVia(object);
100: if (via == null) {
101: // If there is no via and the URL doesn't match scope,reject it
102: return false;
103: }
104: // If the via is within scope, accept it
105: result = isInScope(via, set);
106: if (logger.isLoggable(Level.FINE)) {
107: logger.fine("Tested via UURI '" + via
108: + " and result was " + result);
109: }
110: }
111: return result;
112: }
113:
114: /**
115: * Synchronized get of prefix set to use
116: *
117: * @return SurtPrefixSet to use for check
118: *@see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
119: */
120: protected synchronized SurtPrefixSet getPrefixes() {
121: return getPrefixes(null);
122: }
123:
124: /**
125: * Synchronized get of prefix set to use.
126: * @param o Context object.
127: *
128: * @return SurtPrefixSet to use for check
129: * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
130: */
131: protected synchronized SurtPrefixSet getPrefixes(Object o) {
132: if (surtPrefixes == null) {
133: readPrefixes(o);
134: }
135: return surtPrefixes;
136: }
137:
138: /**
139: * Patch the SURT prefix set so that it only includes the appropriate
140: * prefixes.
141: * @param o Context object.
142: * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
143: */
144: protected void readPrefixes(Object o) {
145: buildSurtPrefixSet();
146: // See whether Host or Domain was chosen
147: String scope = this .getScope(o);
148: if (scope.equals(HOST)) {
149: surtPrefixes.convertAllPrefixesToHosts();
150: } else if (scope.equals(DOMAIN)) {
151: surtPrefixes.convertAllPrefixesToDomains();
152: }
153: dumpSurtPrefixSet();
154: }
155:
156: private UURI getVia(Object o) {
157: return (o instanceof CandidateURI) ? ((CandidateURI) o)
158: .getVia() : null;
159: }
160:
161: /**
162: * Decide whether using host or domain scope
163: * @param o Context
164: * @return String Host or domain
165: *
166: */
167: protected String getScope(Object o) {
168: try {
169: String scope = (String) getAttribute(o, ATTR_SCOPE);
170: if (scope.equals(HOST)) {
171: return HOST;
172: } else if (scope.equals(DOMAIN)) {
173: return DOMAIN;
174: } else {
175: assert false : "Unrecognized scope " + scope
176: + ". Should never happen!";
177: }
178: } catch (AttributeNotFoundException e) {
179: logger.severe(e.getMessage());
180: }
181: return null; // Basically the rule is inactive if this occurs.
182: }
183:
184: //check that the URI is in scope
185: private boolean isInScope(Object o, SurtPrefixSet set) {
186: boolean iResult = false;
187: UURI u = (UURI) o;
188: if (u == null) {
189: return false;
190: }
191: String candidateSurt = u.getSurtForm();
192: // also want to treat https as http
193: if (candidateSurt.startsWith("https:")) {
194: candidateSurt = "http:" + candidateSurt.substring(6);
195: }
196: if (set.containsPrefixOf(candidateSurt)) {
197: iResult = true;
198: }
199: return iResult;
200: }
201: }
|