001: /* MatchesRegExpDecideRule
002: *
003: * $Id: MatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on Apr 4, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import javax.management.AttributeNotFoundException;
031:
032: import org.archive.crawler.settings.SimpleType;
033: import org.archive.util.TextUtils;
034:
035: /**
036: * Rule applies configured decision to any CrawlURIs whose String URI
037: * matches the supplied regexp.
038: *
039: * @author gojomo
040: */
041: public class MatchesRegExpDecideRule extends PredicatedDecideRule {
042:
043: private static final long serialVersionUID = 6441410917074319295L;
044:
045: private static final Logger logger = Logger
046: .getLogger(MatchesRegExpDecideRule.class.getName());
047:
048: public static final String ATTR_REGEXP = "regexp";
049:
050: /**
051: * Usual constructor.
052: * @param name
053: */
054: public MatchesRegExpDecideRule(String name) {
055: super (name);
056: setDescription("MatchesRegExpDecideRule. Applies the configured "
057: + "decision to URIs matching the supplied regular expression.");
058: addElementToDefinition(new SimpleType(ATTR_REGEXP,
059: "Java regular" + "expression to match.", ""));
060: }
061:
062: /**
063: * Evaluate whether given object's string version
064: * matches configured regexp
065: *
066: * @param object
067: * @return true if regexp is matched
068: */
069: protected boolean evaluate(Object object) {
070: try {
071: String regexp = getRegexp(object);
072: String str = object.toString();
073: boolean result = (regexp == null) ? false : TextUtils
074: .matches(regexp, str);
075: if (logger.isLoggable(Level.FINE)) {
076: logger.fine("Tested '" + str + "' match with regex '"
077: + regexp + " and result was " + result);
078: }
079: return result;
080: } catch (ClassCastException e) {
081: // if not CrawlURI, always disregard
082: return false;
083: }
084: }
085:
086: /**
087: * Get the regular expression string to match the URI against.
088: *
089: * @param o the object for which the regular expression should be
090: * matched against.
091: * @return the regular expression to match against.
092: */
093: protected String getRegexp(Object o) {
094: try {
095: return (String) getAttribute(o, ATTR_REGEXP);
096: } catch (AttributeNotFoundException e) {
097: logger.severe(e.getMessage());
098: return null; // Basically the filter is inactive if this occurs.
099: }
100: }
101: }
|