01: /* ClassKeyMatchesRegExpDecideRule
02: *
03: * $Id: ClassKeyMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
04: *
05: * Created on Apr 4, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.deciderules;
26:
27: import java.util.logging.Level;
28: import java.util.logging.Logger;
29:
30: import org.archive.crawler.datamodel.CandidateURI;
31: import org.archive.util.TextUtils;
32:
33: /**
34: * Rule applies configured decision to any CrawlURI class key -- i.e.
35: * {@link CandidateURI#getClassKey()} -- matches matches supplied regexp.
36: *
37: * @author gojomo
38: */
39: public class ClassKeyMatchesRegExpDecideRule extends
40: MatchesRegExpDecideRule {
41:
42: private static final long serialVersionUID = 1178873944436973294L;
43:
44: private static final Logger logger = Logger
45: .getLogger(ClassKeyMatchesRegExpDecideRule.class.getName());
46:
47: /**
48: * Usual constructor.
49: * @param name
50: */
51: public ClassKeyMatchesRegExpDecideRule(String name) {
52: super (name);
53: setDescription("ClassKeyMatchesRegExpDecideRule. "
54: + "Applies the configured "
55: + "decision to class keys matching the supplied "
56: + "regular expression. Class keys are values set into "
57: + "an URL by the Frontier. They are usually the names "
58: + "of queues used by the Frontier. Class keys can "
59: + "look like hostname + port or be plain IPs (It will "
60: + "depend on the Frontier implementation/configuration).");
61: }
62:
63: /**
64: * Evaluate passed object.
65: * Test first that its CandidateURI. If so, does it have a class key.
66: * If not, ask frontier for its classkey. Then test against regex.
67: *
68: * @param object
69: * @return true if regexp is matched
70: */
71: protected boolean evaluate(Object object) {
72: try {
73: CandidateURI cauri = (CandidateURI) object;
74: String classKey = cauri.getClassKey();
75: if (classKey == null || classKey.length() <= 0) {
76: classKey = getSettingsHandler().getOrder()
77: .getController().getFrontier().getClassKey(
78: cauri);
79: cauri.setClassKey(classKey);
80: }
81: String regexp = getRegexp(cauri);
82: boolean result = (regexp == null) ? false : TextUtils
83: .matches(regexp, cauri.getClassKey());
84: if (logger.isLoggable(Level.FINE)) {
85: logger.fine("Tested '" + cauri.getClassKey()
86: + "' match with regex '" + regexp
87: + " and result was " + result);
88: }
89: return result;
90: } catch (ClassCastException e) {
91: // if not CrawlURI, always disregard
92: return false;
93: }
94: }
95: }
|