01: /* $Id: FetchStatusMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
02: *
03: * Created on Sep 4, 2006
04: *
05: * Copyright (C) 2006 Olaf Freyer.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.deciderules;
24:
25: import java.util.logging.Level;
26: import java.util.logging.Logger;
27: import javax.management.AttributeNotFoundException;
28:
29: import org.archive.crawler.datamodel.CrawlURI;
30: import org.archive.crawler.settings.SimpleType;
31: import org.archive.util.TextUtils;
32:
33: public class FetchStatusMatchesRegExpDecideRule extends
34: PredicatedDecideRule {
35:
36: private static final long serialVersionUID = -3088156729860241312L;
37:
38: private final Logger logger = Logger.getLogger(this .getClass()
39: .getName());
40:
41: public static final String ATTR_REGEXP = "regexp";
42:
43: /**
44: * Usual constructor.
45: * @param name Name of this DecideRule.
46: */
47: public FetchStatusMatchesRegExpDecideRule(String name) {
48: super (name);
49: setDescription("FetchStatusMatchesRegExpDecideRule. Applies "
50: + "configured decision to any URI that has a fetch status matching "
51: + "the given regular expression.");
52: addElementToDefinition(new SimpleType(ATTR_REGEXP,
53: "Java regular" + "expression to match.", ""));
54: }
55:
56: protected boolean evaluate(Object object) {
57: try {
58: String regexp = getRegexp(object);
59: CrawlURI curi = (CrawlURI) object;
60: String str = String.valueOf(curi.getFetchStatus());
61: boolean result = (regexp == null) ? false : TextUtils
62: .matches(regexp, str);
63: if (logger.isLoggable(Level.FINE)) {
64: logger.fine("Tested '" + str + "' match with regex '"
65: + regexp + " and result was " + result);
66: }
67: return result;
68: } catch (ClassCastException e) {
69: // if not CrawlURI, always disregard
70: return false;
71: }
72: }
73:
74: /**
75: * Get the regular expression string to match the URI against.
76: *
77: * @param o the object for which the regular expression should be
78: * matched against.
79: * @return the regular expression to match against.
80: */
81: protected String getRegexp(Object o) {
82: try {
83: return (String) getAttribute(o, ATTR_REGEXP);
84: } catch (AttributeNotFoundException e) {
85: logger.severe(e.getMessage());
86: return null; // Basically the filter is inactive if this occurs.
87: }
88: }
89: }
|