001: /* PathologicalPathDecideRule
002: *
003: * $Id: PathologicalPathDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on Apr 1, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
033:
034: /**
035: * Rule REJECTs any URI which contains an excessive number of identical,
036: * consecutive path-segments (eg http://example.com/a/a/a/boo.html == 3 '/a'
037: * segments)
038: *
039: * @author gojomo
040: */
041: public class PathologicalPathDecideRule extends MatchesRegExpDecideRule {
042:
043: private static final long serialVersionUID = -1803997581321178499L;
044:
045: private static final Logger logger = Logger
046: .getLogger(PathologicalPathDecideRule.class.getName());
047:
048: public static final String ATTR_REPETITIONS = "max-repetitions";
049:
050: /**
051: * Default maximum repetitions.
052: * Default access so accessible by unit test.
053: */
054: static final Integer DEFAULT_REPETITIONS = new Integer(2);
055:
056: protected String constructedRegexp;
057:
058: /** Constructs a new PathologicalPathFilter.
059: *
060: * @param name the name of the filter.
061: */
062: public PathologicalPathDecideRule(String name) {
063: super (name);
064: setDescription("PathologicalPathDecideRule. This rule"
065: + " is used to avoid crawler traps by adding a constraint on"
066: + " how many times a path-segment pattern in the URI may be"
067: + " repeated. A URI will be REJECTed if the same path-segment"
068: + " repeats more than '" + ATTR_REPETITIONS
069: + "' in a row.");
070:
071: // make default REJECT (overriding superclass) & always-default
072: Type type = addElementToDefinition(new SimpleType(
073: ATTR_DECISION, "Decision to be applied", REJECT,
074: ALLOWED_TYPES));
075: type.setTransient(true);
076:
077: // disable direct setting of regexp from superclass
078: type = getElementFromDefinition(ATTR_REGEXP);
079: type.setTransient(true);
080:
081: type = addElementToDefinition(new SimpleType(
082: ATTR_REPETITIONS,
083: "Number of times the pattern should be allowed to occur. "
084: + "This rule returns its decision (usually REJECT) if a "
085: + "path-segment is repeated more than number of times.",
086: DEFAULT_REPETITIONS));
087: // overriding would require reconstruction of regexp every test
088: type.setOverrideable(false);
089: }
090:
091: /**
092: * Construct the regexp string to be matched against the URI.
093: * @param o an object to extract a URI from.
094: * @return the regexp pattern.
095: */
096: protected String getRegexp(Object o) {
097: if (constructedRegexp == null) {
098: // race no concern: assignment is atomic, happy with any last value
099: constructedRegexp = constructRegexp();
100: }
101: return constructedRegexp;
102: }
103:
104: protected String constructRegexp() {
105: int rep = 0;
106: try {
107: rep = ((Integer) getAttribute(null, ATTR_REPETITIONS))
108: .intValue();
109: } catch (AttributeNotFoundException e) {
110: logger.severe(e.getMessage());
111: }
112: return (rep == 0) ? null : ".*?/(.*?/)\\1{" + rep + ",}.*";
113: }
114:
115: /**
116: * Repetitions may have changed; refresh constructedRegexp
117: *
118: * @see org.archive.crawler.deciderules.DecideRule#kickUpdate()
119: */
120: public void kickUpdate() {
121: super.kickUpdate();
122: constructedRegexp = constructRegexp();
123: }
124: }
|