001: /* $Id: NotExceedsDocumentLengthTresholdDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
002: *
003: * Created on 28.8.2006
004: *
005: * Copyright (C) 2006 Olaf Freyer
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.deciderules;
024:
025: import java.util.logging.Level;
026: import java.util.logging.Logger;
027:
028: import org.apache.commons.httpclient.HttpMethod;
029: import org.archive.crawler.datamodel.CoreAttributeConstants;
030: import org.archive.crawler.datamodel.CrawlURI;
031: import org.archive.crawler.settings.SimpleType;
032:
033: public class NotExceedsDocumentLengthTresholdDecideRule extends
034: PredicatedDecideRule implements CoreAttributeConstants {
035:
036: private static final long serialVersionUID = -8774160016195991876L;
037:
038: private static final Logger logger = Logger
039: .getLogger(NotExceedsDocumentLengthTresholdDecideRule.class
040: .getName());
041: public static final String ATTR_CONTENT_LENGTH_TRESHOLD = "content-length-treshold";
042: static final Integer DEFAULT_CONTENT_LENGTH_TRESHOLD = -1;
043: public static final String ATTR_USE_AS_MIDFETCH = "use-as-midfetch-filter";
044: static final Boolean DEFAULT_USE_AS_MIDFETCH = new Boolean(true);
045:
046: // Header predictor state constants
047: public static final int HEADER_PREDICTS_MISSING = -1;
048:
049: public NotExceedsDocumentLengthTresholdDecideRule(String name) {
050: super (name);
051: setDescription("NotExceedsDocumentLengthTresholdDecideRule. "
052: + "REJECTs URIs "
053: + "with content length exceeding a given treshold. "
054: + "Either examines HTTP header content length or "
055: + "actual downloaded content length and returns false "
056: + "for documents exceeding a given length treshold.");
057:
058: addElementToDefinition(new SimpleType(
059: ATTR_USE_AS_MIDFETCH,
060: "Shall this rule be used as a midfetch rule? If true, "
061: + "this rule will determine content length based on HTTP "
062: + "header information, otherwise the size of the already "
063: + "downloaded content will be used.",
064: DEFAULT_USE_AS_MIDFETCH));
065:
066: addElementToDefinition(new SimpleType(
067: ATTR_CONTENT_LENGTH_TRESHOLD,
068: "Max "
069: + "content-length this filter will allow to pass through. If -1, "
070: + "then no limit.",
071: DEFAULT_CONTENT_LENGTH_TRESHOLD));
072: }
073:
074: protected boolean evaluate(Object object) {
075: try {
076: CrawlURI curi = (CrawlURI) object;
077:
078: int contentlength = HEADER_PREDICTS_MISSING;
079:
080: //filter used as midfetch filter
081: if (getIsMidfetchRule(object)) {
082:
083: if (curi.containsKey(A_HTTP_TRANSACTION) == false) {
084: // Missing header info, let pass
085: if (logger.isLoggable(Level.INFO)) {
086: logger
087: .info("Error: Missing HttpMethod object in "
088: + "CrawlURI. "
089: + curi.toString());
090: }
091: return false;
092: }
093:
094: // Initially assume header info is missing
095: HttpMethod method = (HttpMethod) curi
096: .getObject(A_HTTP_TRANSACTION);
097:
098: // get content-length
099: String newContentlength = null;
100: if (method.getResponseHeader("content-length") != null) {
101: newContentlength = method.getResponseHeader(
102: "content-length").getValue();
103: }
104:
105: if (newContentlength != null
106: && newContentlength.length() > 0) {
107: try {
108: contentlength = Integer
109: .parseInt(newContentlength);
110: } catch (NumberFormatException nfe) {
111: // Ignore.
112: }
113: }
114:
115: // If no document length was reported or format was wrong,
116: // let pass
117: if (contentlength == HEADER_PREDICTS_MISSING) {
118: return false;
119: }
120: } else {
121: contentlength = (int) curi.getContentSize();
122: }
123:
124: return makeDecision(contentlength, object);
125:
126: } catch (ClassCastException e) {
127: // if not CrawlURI, always disregard
128: return false;
129: }
130: }
131:
132: /**
133: * @param contentLength content length to check against treshold
134: * @param obj Context object.
135: * @return contentLength not exceeding treshold?
136: */
137: protected Boolean makeDecision(int contentLength, Object obj) {
138: return contentLength < getContentLengthTreshold(obj);
139: }
140:
141: /**
142: * @param obj Context object.
143: * @return content length threshold
144: */
145: protected int getContentLengthTreshold(Object obj) {
146: int len = ((Integer) getUncheckedAttribute(obj,
147: ATTR_CONTENT_LENGTH_TRESHOLD)).intValue();
148: return len == -1 ? Integer.MAX_VALUE : len;
149: }
150:
151: /**
152: * @param obj Context object.
153: * @return to be used as midfetch rule?
154: */
155: private Boolean getIsMidfetchRule(Object obj) {
156: return ((Boolean) getUncheckedAttribute(obj,
157: ATTR_USE_AS_MIDFETCH)).booleanValue();
158: }
159: }
|