001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * Processor.java
020: * Created on Apr 16, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.framework;
025:
026: import java.lang.reflect.Constructor;
027: import java.util.Iterator;
028: import java.util.logging.Level;
029: import java.util.logging.Logger;
030:
031: import javax.management.AttributeNotFoundException;
032:
033: import org.archive.crawler.datamodel.CrawlURI;
034: import org.archive.crawler.deciderules.DecideRule;
035: import org.archive.crawler.deciderules.DecideRuleSequence;
036: import org.archive.crawler.settings.MapType;
037: import org.archive.crawler.settings.ModuleType;
038: import org.archive.crawler.settings.SimpleType;
039:
040: /**
041: * Base class for URI processing classes.
042: *
043: * <p> Each URI is processed by a user defined series of processors. This class
044: * provides the basic infrastructure for these but does not actually do
045: * anything. New processors can be easily created by subclassing this class.
046: *
047: * <p> Classes subclassing this one should not trap InterruptedExceptions.
048: * They should be allowed to propagate to the ToeThread executing the processor.
049: * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
050: * if the <tt>interrupted</tt> flag is set.
051: *
052: * @author Gordon Mohr
053: *
054: * @see org.archive.crawler.framework.ToeThread
055: */
056: public class Processor extends ModuleType {
057:
058: private static final long serialVersionUID = 6248563827413710226L;
059:
060: /**
061: * Key to use asking settings for decide-rules value.
062: */
063: public static final String ATTR_DECIDE_RULES = "decide-rules";
064: /** local name for decide-rules */
065: protected String attrDecideRules;
066:
067: /**
068: * Key to use asking settings for enabled value.
069: */
070: public final static String ATTR_ENABLED = "enabled";
071:
072: private Processor defaultNextProcessor = null;
073:
074: private static Logger logger = Logger
075: .getLogger("org.archive.crawler.framework.Processor");
076:
077: /**
078: * @param name
079: * @param description
080: */
081: public Processor(String name, String description) {
082: super (name, description);
083: addElementToDefinition(new SimpleType(ATTR_ENABLED,
084: "Is processor enabled", new Boolean(true)));
085: attrDecideRules = getName() + "#" + ATTR_DECIDE_RULES;
086: addElementToDefinition(new DecideRuleSequence(attrDecideRules,
087: "DecideRules which, if their final decision is REJECT, "
088: + "prevent this Processor from running."));
089: }
090:
091: /**
092: * Perform processing on the given CrawlURI.
093: *
094: * @param curi
095: * @throws InterruptedException
096: */
097: public final void process(CrawlURI curi)
098: throws InterruptedException {
099: // by default, arrange for curi to proceed to next processor
100: curi.setNextProcessor(getDefaultNextProcessor(curi));
101:
102: // Check if this processor is enabled before processing
103: try {
104: if (!((Boolean) getAttribute(ATTR_ENABLED, curi))
105: .booleanValue()) {
106: return;
107: }
108: } catch (AttributeNotFoundException e) {
109: logger.severe(e.getMessage());
110: }
111:
112: if (rulesAccept(curi)) {
113: innerProcess(curi);
114: } else {
115: innerRejectProcess(curi);
116: }
117: }
118:
119: protected void checkForInterrupt() throws InterruptedException {
120: if (Thread.interrupted()) {
121: throw new InterruptedException("interrupted");
122: }
123: }
124:
125: /**
126: * @param curi CrawlURI instance.
127: * @throws InterruptedException
128: */
129: protected void innerRejectProcess(CrawlURI curi)
130: throws InterruptedException {
131: // by default do nothing
132: }
133:
134: /**
135: * Classes subclassing this one should override this method to perform
136: * their custom actions on the CrawlURI.
137: *
138: * @param curi The CrawlURI being processed.
139: * @throws InterruptedException
140: */
141: protected void innerProcess(CrawlURI curi)
142: throws InterruptedException {
143: // by default do nothing
144: }
145:
146: /**
147: * Classes subclassing this one should override this method to perform
148: * processor specific actions.
149: * <p>
150: *
151: * This method is garanteed to be called after the crawl is set up, but
152: * before any URI-processing has occured.
153: */
154: protected void initialTasks() {
155: // by default do nothing
156: }
157:
158: /**
159: * Classes subclassing this one should override this method to perform
160: * processor specific actions.
161: *
162: */
163: protected void finalTasks() {
164: // by default do nothing
165: }
166:
167: protected DecideRule getDecideRule(Object o) {
168: try {
169: return (DecideRule) getAttribute(o, attrDecideRules);
170: } catch (AttributeNotFoundException e) {
171: throw new RuntimeException(e);
172: }
173: }
174:
175: protected boolean rulesAccept(Object o) {
176: return rulesAccept(getDecideRule(o), o);
177: }
178:
179: protected boolean rulesAccept(DecideRule rule, Object o) {
180: return rule.decisionFor(o) != DecideRule.REJECT;
181: }
182:
183: /**
184: * Returns the next processor for the given CrawlURI in the processor chain.
185: * @param curi The CrawlURI that we want to find the next processor for.
186: * @return The next processor for the given CrawlURI in the processor chain.
187: */
188: public Processor getDefaultNextProcessor(CrawlURI curi) {
189: return defaultNextProcessor;
190: }
191:
192: /** Set the default next processor in the chain.
193: *
194: * @param nextProcessor the default next processor in the chain.
195: */
196: public void setDefaultNextProcessor(Processor nextProcessor) {
197: defaultNextProcessor = nextProcessor;
198: }
199:
200: /**
201: * Get the controller object.
202: *
203: * @return the controller object.
204: */
205: public CrawlController getController() {
206: return getSettingsHandler().getOrder().getController();
207: }
208:
209: public Processor spawn(int serialNum) {
210: Processor newInst = null;
211: try {
212: Constructor co = getClass().getConstructor(
213: new Class[] { String.class });
214: newInst = (Processor) co
215: .newInstance(new Object[] { getName() + serialNum });
216: getParent().setAttribute(newInst);
217: newInst.setTransient(true);
218: } catch (Exception e) {
219: // TODO Auto-generated catch block
220: e.printStackTrace();
221: }
222: return newInst;
223: }
224:
225: /**
226: * Compiles and returns a report (in human readable form) about the status
227: * of the processor. The processor's name (of implementing class) should
228: * always be included.
229: * <p>
230: * Examples of stats declared would include:<br>
231: * * Number of CrawlURIs handled.<br>
232: * * Number of links extracted (for link extractors)<br>
233: * etc.
234: *
235: * @return A human readable report on the processor's state.
236: */
237: public String report() {
238: return ""; // Default behavior.
239: }
240:
241: /**
242: * @param curi CrawlURI to examine.
243: * @return True if content to process -- content length is > 0
244: * -- and links have not yet been extracted.
245: */
246: protected boolean isContentToProcess(CrawlURI curi) {
247: return !curi.hasBeenLinkExtracted()
248: && curi.getContentLength() > 0;
249: }
250:
251: /**
252: * @param curi CrawlURI to examine.
253: * @return True if {@link #isContentToProcess(CrawlURI)} and
254: * the CrawlURI represents a successful http transaction.
255: */
256: protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
257: return isContentToProcess(curi) && curi.isHttpTransaction()
258: && curi.isSuccess();
259: }
260:
261: /**
262: * @param contentType Found content type.
263: * @param expectedPrefix String to find at start of contenttype: e.g.
264: * <code>text/html</code>.
265: * @return True if passed content-type begins with
266: * expected mimetype.
267: */
268: protected boolean isExpectedMimeType(String contentType,
269: String expectedPrefix) {
270: return contentType != null
271: && contentType.toLowerCase().startsWith(expectedPrefix);
272: }
273:
274: public void kickUpdate() {
275: // by default do nothing
276: }
277: }
|