01: /* $Id: $
02: *
03: * Copyright (C) 2007 Olaf Freyer
04: *
05: * This file is part of the Heritrix web crawler (crawler.archive.org).
06: *
07: * Heritrix is free software; you can redistribute it and/or modify
08: * it under the terms of the GNU Lesser Public License as published by
09: * the Free Software Foundation; either version 2.1 of the License, or
10: * any later version.
11: *
12: * Heritrix is distributed in the hope that it will be useful,
13: * but WITHOUT ANY WARRANTY; without even the implied warranty of
14: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15: * GNU Lesser Public License for more details.
16: *
17: * You should have received a copy of the GNU Lesser Public License
18: * along with Heritrix; if not, write to the Free Software
19: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20: */
21: package org.archive.crawler.deciderules;
22:
23: import org.archive.crawler.datamodel.CrawlURI;
24: import org.archive.util.TextUtils;
25:
26: /**
27: * DecideRule whose decision is applied if the URI's content-type
28: * is present and matches the supplied regular expression.
29: *
30: * @author Olaf Freyer
31: */
32: public class ContentTypeMatchesRegExpDecideRule extends
33: MatchesRegExpDecideRule {
34: private static final long serialVersionUID = -2066930281015155843L;
35:
36: public ContentTypeMatchesRegExpDecideRule(String name) {
37: super (name);
38: setDescription("ContentTypeMatchesRegExpDecideRule. Applies the "
39: + "configured decision to URIs matching the supplied regular "
40: + "expression. Cannot be used until after fetcher processors. "
41: + "Only then is the Content-Type known. A good place for this "
42: + "rule is at the writer step processing. If the content-type "
43: + "is null, 301s usually have no content-type, this deciderule "
44: + "will PASS.");
45: }
46:
47: @Override
48: protected boolean evaluate(Object o) {
49: if (!(o instanceof CrawlURI)) {
50: return false;
51: }
52: String content_type = ((CrawlURI) o).getContentType();
53: String regexp = getRegexp(o);
54: return (regexp == null || content_type == null) ? false
55: : TextUtils.matches(getRegexp(o), content_type);
56: }
57: }
|