001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * HopsFilter.java
020: * Created on Oct 3, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.filter;
025:
026: import java.util.logging.Logger;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.archive.crawler.datamodel.CandidateURI;
031: import org.archive.crawler.datamodel.CrawlURI;
032: import org.archive.crawler.extractor.Link;
033: import org.archive.crawler.framework.CrawlScope;
034: import org.archive.crawler.framework.Filter;
035: import org.archive.crawler.scope.ClassicScope;
036:
037: /**
038: * Accepts (returns for)) for all CandidateURIs passed in
039: * with a link-hop-count greater than the max-link-hops
040: * value.
041: *
042: * @author gojomo
043: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
044: * equivalent {@link DecideRule}.
045: */
046: public class HopsFilter extends Filter {
047:
048: private static final long serialVersionUID = -5943030310651023640L;
049:
050: private static final Logger logger = Logger
051: .getLogger(HopsFilter.class.getName());
052:
053: /**
054: * @param name
055: */
056: public HopsFilter(String name) {
057: super (name, "Hops filter *Deprecated* Use"
058: + "DecidingFilter and equivalent DecideRule instead");
059: }
060:
061: int maxLinkHops = Integer.MAX_VALUE;
062: int maxTransHops = Integer.MAX_VALUE;
063:
064: /* (non-Javadoc)
065: * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
066: */
067: protected boolean innerAccepts(Object o) {
068: if (!(o instanceof CandidateURI)) {
069: return false;
070: }
071: String path = ((CandidateURI) o).getPathFromSeed();
072: int linkCount = 0;
073: int transCount = 0;
074: for (int i = path.length() - 1; i >= 0; i--) {
075: if (path.charAt(i) == Link.NAVLINK_HOP) {
076: linkCount++;
077: } else if (linkCount == 0) {
078: transCount++;
079: }
080: }
081: if (o instanceof CrawlURI) {
082: CrawlURI curi = (CrawlURI) o;
083: CrawlScope scope = (CrawlScope) globalSettings().getModule(
084: CrawlScope.ATTR_NAME);
085: try {
086: maxLinkHops = ((Integer) scope.getAttribute(
087: ClassicScope.ATTR_MAX_LINK_HOPS, curi))
088: .intValue();
089: maxTransHops = ((Integer) scope.getAttribute(
090: ClassicScope.ATTR_MAX_TRANS_HOPS, curi))
091: .intValue();
092: } catch (AttributeNotFoundException e) {
093: logger.severe(e.getMessage());
094: // Basically, true means the filter is PASSing this URI.
095: return true;
096: }
097: }
098:
099: return (linkCount > maxLinkHops) || (transCount > maxTransHops);
100: }
101: }
|