001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * SimplePreselector.java
020: * Created on Sep 22, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.prefetch;
025:
026: import javax.management.AttributeNotFoundException;
027:
028: import org.archive.crawler.datamodel.CrawlURI;
029: import org.archive.crawler.datamodel.FetchStatusCodes;
030: import org.archive.crawler.framework.Scoper;
031: import org.archive.crawler.settings.SimpleType;
032: import org.archive.crawler.settings.Type;
033: import org.archive.util.TextUtils;
034:
035: /**
036: * If set to recheck the crawl's scope, gives a yes/no on whether
037: * a CrawlURI should be processed at all. If not, its status
038: * will be marked OUT_OF_SCOPE and the URI will skip directly
039: * to the first "postprocessor".
040: *
041: *
042: * @author gojomo
043: *
044: */
045: public class Preselector extends Scoper implements FetchStatusCodes {
046:
047: private static final long serialVersionUID = 3738560264369561017L;
048:
049: /** whether to reapply crawl scope at this step */
050: public static final String ATTR_RECHECK_SCOPE = "recheck-scope";
051: /** indicator allowing all URIs (of a given host, typically) to
052: * be blocked at this step*/
053: public static final String ATTR_BLOCK_ALL = "block-all";
054: /** indicator allowing all matching URIs to be blocked at this step */
055: public static final String ATTR_BLOCK_BY_REGEXP = "block-by-regexp";
056: /** indicator allowing all matching URIs */
057: public static final String ATTR_ALLOW_BY_REGEXP = "allow-by-regexp";
058:
059: /**
060: * Constructor.
061: * @param name Name of this processor.
062: */
063: public Preselector(String name) {
064: super (
065: name,
066: "Preselector. Does one last bit of checking to make "
067: + "sure that the current URI should be fetched.");
068: Type e;
069: e = addElementToDefinition(new SimpleType(
070: ATTR_RECHECK_SCOPE,
071: "Recheck if uri is in scope. This is meaningful if the scope"
072: + " is altered during a crawl. URIs are checked against the"
073: + " scope when they are added to queues. Setting this value to"
074: + " true forces the URI to be checked against the scope when it"
075: + " is comming out of the queue, possibly after the scope is"
076: + " altered.", new Boolean(false)));
077: e.setExpertSetting(true);
078:
079: e = addElementToDefinition(new SimpleType(
080: ATTR_BLOCK_ALL,
081: "Block all URIs from being processed. This is most likely to"
082: + " be used in overrides to easily reject certain hosts from"
083: + " being processed.", new Boolean(false)));
084: e.setExpertSetting(true);
085:
086: e = addElementToDefinition(new SimpleType(ATTR_BLOCK_BY_REGEXP,
087: "Block all URIs matching the regular expression from being"
088: + " processed.", ""));
089: e.setExpertSetting(true);
090:
091: e = addElementToDefinition(new SimpleType(ATTR_ALLOW_BY_REGEXP,
092: "Allow only URIs matching the regular expression to be"
093: + " processed.", ""));
094: e.setExpertSetting(true);
095: }
096:
097: protected void innerProcess(CrawlURI curi) {
098: // Check if uris should be blocked
099: try {
100: if (((Boolean) getAttribute(ATTR_BLOCK_ALL, curi))
101: .booleanValue()) {
102: curi.setFetchStatus(S_BLOCKED_BY_USER);
103: curi.skipToProcessorChain(getController()
104: .getPostprocessorChain());
105: }
106: } catch (AttributeNotFoundException e) {
107: // Act as attribute was false, that is: do nothing.
108: }
109:
110: // Check if allowed by regular expression
111: try {
112: String regexp = (String) getAttribute(ATTR_ALLOW_BY_REGEXP,
113: curi);
114: if (regexp != null && !regexp.equals("")) {
115: if (!TextUtils.matches(regexp, curi.toString())) {
116: curi.setFetchStatus(S_BLOCKED_BY_USER);
117: curi.skipToProcessorChain(getController()
118: .getPostprocessorChain());
119: }
120: }
121: } catch (AttributeNotFoundException e) {
122: // Act as regexp was null, that is: do nothing.
123: }
124:
125: // Check if blocked by regular expression
126: try {
127: String regexp = (String) getAttribute(ATTR_BLOCK_BY_REGEXP,
128: curi);
129: if (regexp != null && !regexp.equals("")) {
130: if (TextUtils.matches(regexp, curi.toString())) {
131: curi.setFetchStatus(S_BLOCKED_BY_USER);
132: curi.skipToProcessorChain(getController()
133: .getPostprocessorChain());
134: }
135: }
136: } catch (AttributeNotFoundException e) {
137: // Act as regexp was null, that is: do nothing.
138: }
139:
140: // Possibly recheck scope
141: try {
142: if (((Boolean) getAttribute(ATTR_RECHECK_SCOPE, curi))
143: .booleanValue()) {
144: if (!isInScope(curi)) {
145: // Scope rejected
146: curi.setFetchStatus(S_OUT_OF_SCOPE);
147: curi.skipToProcessorChain(getController()
148: .getPostprocessorChain());
149: }
150: }
151: } catch (AttributeNotFoundException e) {
152: // Act as attribute was false, that is: do nothing.
153: }
154: }
155: }
|