01: /* RegularExpressionCriteria
02: *
03: * $Id: RegularExpressionCriteria.java 3704 2005-07-18 17:30:21Z stack-sf $
04: *
05: * Created on Apr 8, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.settings.refinements;
26:
27: import org.archive.net.UURI;
28: import org.archive.util.TextUtils;
29:
30: /**
31: * A refinement criteria that test if a URI matches a regular expression.
32: *
33: * @author John Erik Halse
34: */
35: public class RegularExpressionCriteria implements Criteria {
36: private String regexp = "";
37:
38: /**
39: * Create a new instance of RegularExpressionCriteria.
40: */
41: public RegularExpressionCriteria() {
42: super ();
43: }
44:
45: /**
46: * Create a new instance of RegularExpressionCriteria initializing it with
47: * a regular expression.
48: *
49: * @param regexp the regular expression for this criteria.
50: */
51: public RegularExpressionCriteria(String regexp) {
52: setRegexp(regexp);
53: }
54:
55: /* (non-Javadoc)
56: * @see org.archive.crawler.settings.refinements.Criteria#isWithinRefinementBounds(org.archive.crawler.datamodel.UURI, int)
57: */
58: public boolean isWithinRefinementBounds(UURI uri) {
59: return (uri == null || uri == null) ? false : TextUtils
60: .matches(regexp, uri.toString());
61: }
62:
63: /**
64: * Get the regular expression to be matched against a URI.
65: *
66: * @return Returns the regexp.
67: */
68: public String getRegexp() {
69: return regexp;
70: }
71:
72: /**
73: * Set the regular expression to be matched against a URI.
74: *
75: * @param regexp The regexp to set.
76: */
77: public void setRegexp(String regexp) {
78: this .regexp = regexp;
79: }
80:
81: /* (non-Javadoc)
82: * @see org.archive.crawler.settings.refinements.Criteria#getName()
83: */
84: public String getName() {
85: return "Regular expression criteria";
86: }
87:
88: /* (non-Javadoc)
89: * @see org.archive.crawler.settings.refinements.Criteria#getDescription()
90: */
91: public String getDescription() {
92: return "Accept URIs that match the following regular expression: "
93: + getRegexp();
94: }
95: }
|