001: /* MatchesListRegExpDecideRule
002: *
003: * $Id: MatchesListRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
004: *
005: * Created on 30.5.2005
006: *
007: * Copyright (C) 2005 Kristinn Sigurdsson
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.deciderules;
026:
027: import java.util.Iterator;
028: import java.util.List;
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031:
032: import javax.management.AttributeNotFoundException;
033:
034: import org.archive.crawler.settings.SimpleType;
035: import org.archive.crawler.settings.StringList;
036: import org.archive.util.TextUtils;
037:
038: /**
039: * Rule applies configured decision to any CrawlURIs whose String URI
040: * matches the supplied regexps.
041: * <p>
042: * The list of regular expressions can be considered logically AND or OR.
043: *
044: * @author Kristinn Sigurdsson
045: *
046: * @see MatchesRegExpDecideRule
047: */
048: public class MatchesListRegExpDecideRule extends PredicatedDecideRule {
049:
050: private static final long serialVersionUID = 3011579758573454930L;
051:
052: private static final Logger logger = Logger
053: .getLogger(MatchesListRegExpDecideRule.class.getName());
054:
055: public static final String ATTR_REGEXP_LIST = "regexp-list";
056: public static final String ATTR_LIST_LOGIC = "list-logic";
057:
058: public static final String DEFAULT_LIST_LOGIC = "OR";
059: public static final String[] LEGAL_LIST_LOGIC = { "OR", "AND" };
060:
061: /**
062: * Usual constructor.
063: * @param name
064: */
065: public MatchesListRegExpDecideRule(String name) {
066: super (name);
067: setDescription("MatchesListRegExpDecideRule. Applies the configured "
068: + "decision to URIs matching the supplied regular expressions.\n"
069: + "The list of regular expressions can be considered logically AND "
070: + "or OR.");
071: addElementToDefinition(new SimpleType(
072: ATTR_LIST_LOGIC,
073: "Should the list of regular "
074: + "expressions be considered as logically AND or OR when "
075: + "matching.", DEFAULT_LIST_LOGIC,
076: LEGAL_LIST_LOGIC));
077: addElementToDefinition(new StringList(
078: ATTR_REGEXP_LIST,
079: "The list of "
080: + "regular expressions to evalute against the URI."));
081: }
082:
083: /**
084: * Evaluate whether given object's string version
085: * matches configured regexps
086: *
087: * @param o
088: * @return true if regexps are matched
089: */
090: protected boolean evaluate(Object o) {
091: try {
092: List regexps = getRegexp(o);
093: if (regexps.size() == 0) {
094: return false;
095: }
096: String str = o.toString();
097: Iterator it = regexps.iterator();
098:
099: boolean listLogicOR = isListLogicOR(o);
100: // Result is initialized so that if OR based the default assumption is
101: // false (find no matches) but if AND based the default assumption is
102: // true (finds no non-matches)
103: boolean result = listLogicOR == false;
104:
105: while (it.hasNext()) {
106: String regexp = (String) it.next();
107: boolean matches = TextUtils.matches(regexp, str);
108:
109: if (logger.isLoggable(Level.FINER)) {
110: logger.finer("Tested '" + str
111: + "' match with regex '" + regexp
112: + " and result was " + matches);
113: }
114:
115: if (matches) {
116: if (listLogicOR) {
117: // OR based and we just got a match, done!
118: result = true;
119: break;
120: }
121: } else {
122: if (listLogicOR == false) {
123: // AND based and we just found a non-match, done!
124: result = false;
125: break;
126: }
127: }
128: }
129:
130: if (logger.isLoggable(Level.FINE) && result) {
131: logger.fine("Matched: " + str);
132: }
133:
134: return result;
135: } catch (ClassCastException e) {
136: // if not CrawlURI, always disregard
137: return false;
138: }
139: }
140:
141: /**
142: * Get the regular expressions list to match the URI against.
143: *
144: * @param o the object for which the regular expression should be
145: * matched against.
146: * @return the regular expression to match against.
147: */
148: protected List getRegexp(Object o) {
149: try {
150: return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
151: } catch (AttributeNotFoundException e) {
152: logger.severe(e.getMessage());
153: // Basically the filter is inactive if this occurs
154: // (The caller should be returning false when regexp is null).
155: return null;
156: }
157: }
158:
159: protected boolean isListLogicOR(Object o) {
160: String logic = DEFAULT_LIST_LOGIC;
161: try {
162: logic = (String) getAttribute(o, ATTR_LIST_LOGIC);
163: } catch (AttributeNotFoundException e) {
164: logger.severe(e.getMessage());
165: }
166: return logic.equals("OR") ? true : false;
167: }
168: }
|