01: /* ContentTypeRegExpFilter.java
02: *
03: * Created on Sep 13, 2004
04: *
05: * Copyright (C) 2004 Tom Emerson.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.filter;
24:
25: import org.archive.crawler.datamodel.CrawlURI;
26: import org.archive.util.TextUtils;
27:
28: /**
29: * Compares the content-type of the passed CrawlURI to a regular expression.
30: *
31: * @author Tom Emerson
32: * @version $Date: 2006-09-25 18:41:10 +0000 (Mon, 25 Sep 2006) $, $Revision: 4652 $
33: * @deprecated As of release 1.10.0. To be replaced by an equivalent
34: * {@link DecideRule}.
35: */
36: public class ContentTypeRegExpFilter extends URIRegExpFilter {
37:
38: private static final long serialVersionUID = 206378978342655106L;
39:
40: private static final String DESCRIPTION = "ContentType regexp filter"
41: + "*Deprecated* To be replaced by an equivalent DecideRule. "
42: + "Cannot be used until after fetcher processors. Only then is the"
43: + " Content-Type known. A good place for this filter is at"
44: + " the writer step processing. If the content-type is null,"
45: + " 301s usually have no content-type, the filter returns true.";
46:
47: /**
48: * @param name Filter name.
49: */
50: public ContentTypeRegExpFilter(String name) {
51: super (name, DESCRIPTION, "");
52: }
53:
54: public ContentTypeRegExpFilter(String name, String regexp) {
55: super (name, DESCRIPTION, regexp);
56: }
57:
58: protected boolean innerAccepts(Object o) {
59: // FIXME: can o ever be anything but a CrawlURI?
60: if (!(o instanceof CrawlURI)) {
61: return false;
62: }
63: String content_type = ((CrawlURI) o).getContentType();
64: String regexp = getRegexp(o);
65: return (regexp == null) ? false : (content_type == null) ? true
66: : TextUtils.matches(getRegexp(o), content_type);
67: }
68: }
|