001: /* HopsPathMatchesRegExpDecideRule
002: *
003: * $Id: HopsPathMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on June 23, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import javax.management.AttributeNotFoundException;
031:
032: import org.archive.crawler.datamodel.CandidateURI;
033: import org.archive.crawler.settings.SimpleType;
034: import org.archive.util.TextUtils;
035:
036: /**
037: * Rule applies configured decision to any CrawlURIs whose 'hops-path'
038: * (string like "LLXE" etc.) matches the supplied regexp.
039: *
040: * @author gojomo
041: */
042: public class HopsPathMatchesRegExpDecideRule extends
043: PredicatedDecideRule {
044:
045: private static final long serialVersionUID = -8881013912393934053L;
046:
047: private static final Logger logger = Logger
048: .getLogger(HopsPathMatchesRegExpDecideRule.class.getName());
049:
050: public static final String ATTR_REGEXP = "regexp";
051:
052: /**
053: * Usual constructor.
054: * @param name
055: */
056: public HopsPathMatchesRegExpDecideRule(String name) {
057: super (name);
058: setDescription("HopsPathMatchesRegExpDecideRule. Applies the "
059: + "configured decision to URIs whose hops-path (string with "
060: + "L E R X P etc) matches the supplied regular expression.");
061: addElementToDefinition(new SimpleType(ATTR_REGEXP,
062: "Java regular" + "expression to match.", ""));
063: }
064:
065: /**
066: * Evaluate whether given object (if CandidateURI) has hops-path
067: * matching configured regexp
068: *
069: * @param object
070: * @return true if regexp is matched
071: */
072: protected boolean evaluate(Object object) {
073: try {
074: String regexp = getRegexp(object);
075: String str = ((CandidateURI) object).getPathFromSeed();
076: boolean result = (regexp == null) ? false : TextUtils
077: .matches(regexp, str);
078: if (logger.isLoggable(Level.FINE)) {
079: logger.fine("Tested '" + str + "' match with regex '"
080: + regexp + " and result was " + result);
081: }
082: return result;
083: } catch (ClassCastException e) {
084: // if not CrawlURI, always disregard
085: return false;
086: }
087: }
088:
089: /**
090: * Get the regular expression string to match the URI against.
091: *
092: * @param o the object for which the regular expression should be
093: * matched against.
094: * @return the regular expression to match against.
095: */
096: protected String getRegexp(Object o) {
097: try {
098: return (String) getAttribute(o, ATTR_REGEXP);
099: } catch (AttributeNotFoundException e) {
100: logger.severe(e.getMessage());
101: return null; // Basically the filter is inactive if this occurs.
102: }
103: }
104: }
|