001: /* PathologicalFilter
002: *
003: * $Id: PathologicalPathFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
004: *
005: * Created on Feb 20, 2004
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.filter;
026:
027: import java.util.logging.Logger;
028:
029: import javax.management.AttributeNotFoundException;
030:
031: import org.archive.crawler.datamodel.CrawlURI;
032: import org.archive.crawler.deciderules.DecideRule;
033: import org.archive.crawler.deciderules.DecidingFilter;
034: import org.archive.crawler.settings.SimpleType;
035: import org.archive.crawler.settings.Type;
036:
037: /**
038: * Checks if a URI contains a repeated pattern.
039: *
040: * This filter is checking if a pattern is repeated a specific number of times.
041: * The use is to avoid crawler traps where the server adds the same pattern to
042: * the requested URI like: <code>http://host/img/img/img/img....</code>. This
043: * filter returns TRUE if the path is pathological. FALSE otherwise.
044: *
045: * @author John Erik Halse
046: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
047: * equivalent {@link DecideRule}.
048: */
049: public class PathologicalPathFilter extends URIRegExpFilter {
050:
051: private static final long serialVersionUID = 2797805167250054353L;
052:
053: private static final Logger logger = Logger
054: .getLogger(PathologicalPathFilter.class.getName());
055:
056: public static final String ATTR_REPETITIONS = "repetitions";
057:
058: public static final Integer DEFAULT_REPETITIONS = new Integer(3);
059:
060: private final String REGEX_PREFIX = ".*?/(.*?/)\\1{";
061: private final String REGEX_SUFFIX = ",}.*";
062:
063: /** Constructs a new PathologicalPathFilter.
064: *
065: * @param name the name of the filter.
066: */
067: public PathologicalPathFilter(String name) {
068: super (name);
069: setDescription("Pathological path filter *Deprecated* Use"
070: + "DecidingFilter and equivalent DecideRule instead. "
071: + "The Pathologicalpath filter"
072: + " is used to avoid crawler traps by adding a constraint on"
073: + " how many times a pattern in the URI could be repeated."
074: + " Returns false if the path is NOT pathological (There"
075: + " are no subpath reptitions or reptitions are less than"
076: + " the '" + ATTR_REPETITIONS + "' limit).");
077:
078: Type type = getElementFromDefinition(ATTR_MATCH_RETURN_VALUE);
079: type.setTransient(true);
080:
081: type = getElementFromDefinition(ATTR_REGEXP);
082: type.setTransient(true);
083:
084: addElementToDefinition(new SimpleType(
085: ATTR_REPETITIONS,
086: "Number of times the pattern should be allowed to occur. \n"
087: + "This filter returns true if number of repetitions of a"
088: + " pattern exceeds this value",
089: DEFAULT_REPETITIONS));
090: }
091:
092: /**
093: * Construct the regexp string to be matched aginst the URI.
094: * @param o an object to extract a URI from.
095: * @return the regexp pattern.
096: */
097: protected String getRegexp(Object o) {
098: int rep = 0;
099: try {
100: rep = ((Integer) getAttribute(o, ATTR_REPETITIONS))
101: .intValue();
102: } catch (AttributeNotFoundException e) {
103: logger.severe(e.getMessage());
104: }
105: return rep == 0 ? null : REGEX_PREFIX + (rep - 1)
106: + REGEX_SUFFIX;
107: }
108:
109: protected boolean getFilterOffPosition(CrawlURI curi) {
110: return false;
111: }
112: }
|