001: /* ContentBasedWaitEvaluator
002: *
003: * $Id: ContentBasedWaitEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
004: *
005: * Created on 1.4.2005
006: *
007: * Copyright (C) 2005 Kristinn Sigurdsson
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.postprocessor;
026:
027: import javax.management.AttributeNotFoundException;
028:
029: import org.archive.crawler.datamodel.CrawlURI;
030: import org.archive.crawler.settings.SimpleType;
031: import org.archive.util.TextUtils;
032:
033: /**
034: * A WaitEvaluator that compares the CrawlURIs content type to a configurable
035: * regular expression. If it matches, then the wait evaluation is performed.
036: * Otherwise the processor passes on the CrawlURI, doing nothing.
037: *
038: * @author Kristinn Sigurdsson
039: *
040: * @see org.archive.crawler.postprocessor.WaitEvaluator
041: */
042: public class ContentBasedWaitEvaluator extends WaitEvaluator {
043:
044: private static final long serialVersionUID = 1623347208782997347L;
045:
046: /** The regular expression that we limit this evaluator to. */
047: public final static String ATTR_CONTENT_REGEXPR = "content-regular-expression";
048: protected final static String DEFAULT_CONTENT_REGEXPR = "^.*$"; //Everything
049:
050: /**
051: * Constructor
052: *
053: * @param name The name of the module
054: */
055: public ContentBasedWaitEvaluator(String name) {
056: this (
057: name,
058: "Evaluates how long to wait before fetching a URI again. "
059: + "Only handles CrawlURIs whose content type matches the "
060: + "regular expression set. "
061: + "Typically, this processor should be in the post processing "
062: + "chain. It will pass if another wait evaluator has already "
063: + "processed the CrawlURI.",
064: DEFAULT_CONTENT_REGEXPR, DEFAULT_INITIAL_WAIT_INTERVAL,
065: DEFAULT_MAX_WAIT_INTERVAL, DEFAULT_MIN_WAIT_INTERVAL,
066: DEFAULT_UNCHANGED_FACTOR, DEFAULT_CHANGED_FACTOR);
067: }
068:
069: /**
070: * Constructor
071: *
072: * @param name The name of the module
073: * @param description Description of the module
074: * @param default_inital_wait_interval The default value for initial wait
075: * time
076: * @param default_max_wait_interval The maximum value for wait time
077: * @param default_min_wait_interval The minimum value for wait time
078: * @param default_unchanged_factor The factor for changing wait times of
079: * unchanged documents (will be multiplied by this value)
080: * @param default_changed_factor The factor for changing wait times of
081: * changed documents (will be divided by this value)
082: */
083: public ContentBasedWaitEvaluator(String name, String description,
084: String defaultRegExpr, Long default_inital_wait_interval,
085: Long default_max_wait_interval,
086: Long default_min_wait_interval,
087: Double default_unchanged_factor,
088: Double default_changed_factor) {
089: super (name, description, default_inital_wait_interval,
090: default_max_wait_interval, default_min_wait_interval,
091: default_unchanged_factor, default_changed_factor);
092:
093: addElementToDefinition(new SimpleType(ATTR_CONTENT_REGEXPR,
094: "Only URIs whose content type matches this regular "
095: + "expression will be evaluated.",
096: defaultRegExpr));
097:
098: }
099:
100: protected void innerProcess(CrawlURI curi)
101: throws InterruptedException {
102: // Check if content type is available and if it matches the reg.expr.
103: String content_type = curi.getContentType();
104: if (content_type == null) {
105: // No content type, exit
106: return;
107: }
108: String regexpr;
109: try {
110: regexpr = (String) getAttribute(curi, ATTR_CONTENT_REGEXPR);
111: } catch (AttributeNotFoundException e) {
112: logger
113: .warning("Regular expression for content type not found");
114: return;
115: }
116:
117: if (TextUtils.matches(regexpr, content_type) == false) {
118: // Content type does not match reg.expr. Exit
119: return;
120: }
121: // Ok, it matches, invoke parent method.
122:
123: super.innerProcess(curi);
124: }
125: }
|