01: /* $Id: ExceedsDocumentLengthTresholdDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
02: *
03: * Created on 28.8.2006
04: *
05: * Copyright (C) 2006 Olaf Freyer
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.deciderules;
24:
25: import org.archive.crawler.settings.SimpleType;
26:
27: public class ExceedsDocumentLengthTresholdDecideRule extends
28: NotExceedsDocumentLengthTresholdDecideRule {
29:
30: private static final long serialVersionUID = -3008503096295212224L;
31:
32: /**
33: * Usual constructor.
34: * @param name Name of this rule.
35: */
36: public ExceedsDocumentLengthTresholdDecideRule(String name) {
37: super (name);
38: setDescription("ExceedsDocumentLengthTresholdDecideRule. ACCEPTs URIs "
39: + "with content length exceeding a given treshold. "
40: + "Either examines HTTP header content length or "
41: + "actual downloaded content length and returns false "
42: + "for documents exceeding a given length treshold.");
43:
44: addElementToDefinition(new SimpleType(
45: ATTR_CONTENT_LENGTH_TRESHOLD,
46: "Min "
47: + "content-length this filter will allow to pass through. If -1, "
48: + "then no limit.",
49: DEFAULT_CONTENT_LENGTH_TRESHOLD));
50: }
51:
52: /**
53: * @param contentLength content length to check against treshold
54: * @param obj Context object.
55: * @return contentLength exceeding treshold?
56: */
57: protected Boolean makeDecision(int contentLength, Object obj) {
58: return contentLength > getContentLengthTreshold(obj);
59: }
60:
61: /**
62: * @param obj Context object.
63: * @return content length threshold
64: */
65: protected int getContentLengthTreshold(Object obj) {
66: int len = ((Integer) getUncheckedAttribute(obj,
67: ATTR_CONTENT_LENGTH_TRESHOLD)).intValue();
68: return len == -1 ? 0 : len;
69: }
70: }
|