001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * PathDepthFilter.java
020: * Created on Oct 3, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.filter;
025:
026: import java.util.logging.Logger;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CandidateURI;
032: import org.archive.crawler.datamodel.CrawlURI;
033: import org.archive.crawler.deciderules.DecideRule;
034: import org.archive.crawler.deciderules.DecidingFilter;
035: import org.archive.crawler.framework.Filter;
036: import org.archive.crawler.settings.SimpleType;
037: import org.archive.net.UURI;
038:
039: /**
040: * Accepts all urls passed in with a path depth
041: * less or equal than the max-path-depth
042: * value.
043: *
044: * @author Igor Ranitovic
045: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
046: * equivalent {@link DecideRule}.
047: */
048: public class PathDepthFilter extends Filter {
049:
050: private static final long serialVersionUID = 1626115117327154205L;
051:
052: private static final Logger logger = Logger
053: .getLogger(PathDepthFilter.class.getName());
054: public static final String ATTR_MATCH_RETURN_VALUE = "path-less-or-equal-return";
055: public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth";
056: Integer maxPathDepth = new Integer(Integer.MAX_VALUE);
057: final static char slash = '/';
058:
059: /**
060: * @param name
061: */
062: public PathDepthFilter(String name) {
063: super (name, "Path depth less or equal filter *Deprecated* Use"
064: + "DecidingFilter and equivalent DecideRule instead.");
065: addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH,
066: "Max path" + " depth for which this filter will match",
067: maxPathDepth));
068: addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
069: "What to return when path depth is less or equal to max path"
070: + " depth. \n", new Boolean(true)));
071: }
072:
073: protected boolean innerAccepts(Object o) {
074: String path = null;
075: if (o == null) {
076: return false;
077: }
078:
079: if (o instanceof CandidateURI) {
080: try {
081: if (((CandidateURI) o).getUURI() != null) {
082: path = ((CandidateURI) o).getUURI().getPath();
083: }
084: } catch (URIException e) {
085: logger.severe("Failed getpath for "
086: + ((CandidateURI) o).getUURI());
087: }
088: } else if (o instanceof UURI) {
089: try {
090: path = ((UURI) o).getPath();
091: } catch (URIException e) {
092: logger.severe("Failed getpath for " + o);
093: }
094: }
095:
096: if (path == null) {
097: return true;
098: }
099:
100: int count = 0;
101: for (int i = path.indexOf(slash); i != -1; i = path.indexOf(
102: slash, i + 1)) {
103: count++;
104: }
105:
106: if (o instanceof CrawlURI) {
107: try {
108: this .maxPathDepth = (Integer) getAttribute(
109: ATTR_MAX_PATH_DEPTH, (CrawlURI) o);
110: } catch (AttributeNotFoundException e) {
111: logger.severe(e.getMessage());
112: }
113: }
114:
115: return (this .maxPathDepth != null) ? count <= this .maxPathDepth
116: .intValue() : false;
117: }
118:
119: protected boolean returnTrueIfMatches(CrawlURI curi) {
120: try {
121: return ((Boolean) getAttribute(ATTR_MATCH_RETURN_VALUE,
122: curi)).booleanValue();
123: } catch (AttributeNotFoundException e) {
124: logger.severe(e.getMessage());
125: return true;
126: }
127: }
128:
129: protected boolean getFilterOffPosition(CrawlURI curi) {
130: return returnTrueIfMatches(curi);
131: }
132: }
|