001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * RegExpFilter.java
020: * Created on Apr 16, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.filter;
025:
026: import java.util.logging.Level;
027: import java.util.logging.Logger;
028:
029: import javax.management.AttributeNotFoundException;
030:
031: import org.archive.crawler.datamodel.CrawlURI;
032: import org.archive.crawler.deciderules.DecideRule;
033: import org.archive.crawler.deciderules.DecidingFilter;
034: import org.archive.crawler.framework.Filter;
035: import org.archive.crawler.settings.SimpleType;
036: import org.archive.util.TextUtils;
037:
038: /**
039: * Compares passed object -- a CrawlURI, UURI, or String --
040: * against a regular expression, accepting matches.
041: *
042: * @author Gordon Mohr
043: * @deprecated As of release 1.10.0. Replaced by {@link DecidingFilter} and
044: * equivalent {@link DecideRule}.
045: */
046: public class URIRegExpFilter extends Filter {
047:
048: private static final long serialVersionUID = 1878356276332865537L;
049:
050: private static final Logger logger = Logger
051: .getLogger(URIRegExpFilter.class.getName());
052: public static final String ATTR_REGEXP = "regexp";
053: public static final String ATTR_MATCH_RETURN_VALUE = "if-match-return";
054:
055: /**
056: * @param name Filter name.
057: */
058: public URIRegExpFilter(String name) {
059: this (name,
060: "URI regexp filter *Deprecated* Use DecidingFilter and "
061: + "equivalent DecideRule instead. ", "");
062: addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
063: "What to return when"
064: + " regular expression matches. \n",
065: new Boolean(true)));
066: addElementToDefinition(new SimpleType(ATTR_REGEXP,
067: "Java regular expression.", ""));
068: }
069:
070: public URIRegExpFilter(String name, String regexp) {
071: this (name, "URI regexp filter.", regexp);
072: }
073:
074: protected URIRegExpFilter(String name, String description,
075: String regexp) {
076: super (name, description);
077: addElementToDefinition(new SimpleType(ATTR_MATCH_RETURN_VALUE,
078: "What to return when"
079: + " regular expression matches. \n",
080: new Boolean(true)));
081: addElementToDefinition(new SimpleType(ATTR_REGEXP,
082: "Java regular expression.", regexp));
083: }
084:
085: protected boolean innerAccepts(Object o) {
086: String regexp = getRegexp(o);
087: String str = o.toString();
088: boolean result = (regexp == null) ? false : TextUtils.matches(
089: regexp, str);
090: if (logger.isLoggable(Level.FINE)) {
091: logger.fine("Tested '" + str + "' match with regex '"
092: + getRegexp(o) + " and result was " + result);
093: }
094: return result;
095: }
096:
097: /**
098: * Get the regular expression string to match the URI against.
099: *
100: * @param o the object for which the regular expression should be
101: * matched against.
102: * @return the regular expression to match against.
103: */
104: protected String getRegexp(Object o) {
105: try {
106: return (String) getAttribute(o, ATTR_REGEXP);
107: } catch (AttributeNotFoundException e) {
108: logger.severe(e.getMessage());
109: // Basically the filter is inactive if this occurs
110: // (The caller should be returning false when regexp is null).
111: return null;
112: }
113: }
114:
115: protected boolean returnTrueIfMatches(CrawlURI curi) {
116: try {
117: return ((Boolean) getAttribute(ATTR_MATCH_RETURN_VALUE,
118: curi)).booleanValue();
119: } catch (AttributeNotFoundException e) {
120: logger.severe(e.getMessage());
121: return true;
122: }
123: }
124: }
|