001: /* TooManyPathSegmentsDecideRule
002: *
003: * $Id: TooManyPathSegmentsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on Apr 1, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
027: import org.archive.crawler.datamodel.CandidateURI;
028: import org.archive.crawler.settings.SimpleType;
029: import org.archive.crawler.settings.Type;
030:
031: /**
032: * Rule REJECTs any CrawlURIs whose total number of path-segments (as
033: * indicated by the count of '/' characters not including the first '//')
034: * is over a given threshold.
035: *
036: * @author gojomo
037: */
038: public class TooManyPathSegmentsDecideRule extends PredicatedDecideRule {
039:
040: private static final long serialVersionUID = 147079100367815075L;
041:
042: public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth";
043:
044: /**
045: * Default maximum value.
046: * Default access so available to unit test.
047: */
048: static final Integer DEFAULT_MAX_PATH_DEPTH = new Integer(20);
049:
050: /**
051: * Usual constructor.
052: * @param name Name of this DecideRule.
053: */
054: public TooManyPathSegmentsDecideRule(String name) {
055: super (name);
056: setDescription("TooManyPathSegmentsDecideRule. REJECTs URIs with "
057: + "more total path-segments (as indicated by '/' characters) "
058: + "than the configured '" + ATTR_MAX_PATH_DEPTH + "'.");
059:
060: // make default REJECT (overriding superclass) & always-default
061: Type type = addElementToDefinition(new SimpleType(
062: ATTR_DECISION, "Decision to be applied", REJECT,
063: ALLOWED_TYPES));
064: type.setTransient(true);
065:
066: addElementToDefinition(new SimpleType(
067: ATTR_MAX_PATH_DEPTH,
068: "Number of"
069: + " path segments beyond which this rule will reject URIs.",
070: DEFAULT_MAX_PATH_DEPTH));
071:
072: }
073:
074: /**
075: * Evaluate whether given object is over the threshold number of
076: * path-segments.
077: *
078: * @param object
079: * @return true if the path-segments is exceeded
080: */
081: protected boolean evaluate(Object object) {
082: boolean result = false;
083: CandidateURI curi = null;
084: try {
085: curi = (CandidateURI) object;
086: } catch (ClassCastException e) {
087: // if not CrawlURI, always disregard
088: return result;
089: }
090: String uri = curi.toString();
091: int count = 0;
092: int threshold = getThresholdSegments(object);
093: for (int i = 0; i < uri.length(); i++) {
094: if (uri.charAt(i) == '/') {
095: count++;
096: }
097: if (count > threshold) {
098: result = true;
099: break;
100: }
101: }
102: return result;
103: }
104:
105: /**
106: * @param obj
107: * @return path-segments cutoff threshold
108: */
109: private int getThresholdSegments(Object obj) {
110: // add 2 for start-of-authority slashes (not path segments)
111: return ((Integer) getUncheckedAttribute(obj,
112: ATTR_MAX_PATH_DEPTH)).intValue() + 2;
113: }
114: }
|