001: /* BeanShellProcessor
002: *
003: * Created on Aug 4, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.processor;
024:
025: import java.io.File;
026: import java.io.IOException;
027: import java.util.Collections;
028: import java.util.HashMap;
029: import java.util.Map;
030: import java.util.logging.Level;
031: import java.util.logging.Logger;
032:
033: import org.archive.crawler.datamodel.CrawlURI;
034: import org.archive.crawler.datamodel.FetchStatusCodes;
035: import org.archive.crawler.framework.Processor;
036: import org.archive.crawler.settings.SimpleType;
037: import org.archive.crawler.settings.Type;
038:
039: import bsh.EvalError;
040: import bsh.Interpreter;
041:
042: /**
043: * A processor which runs a BeanShell script on the CrawlURI.
044: *
045: * Script source may be provided via a file
046: * local to the crawler.
047: * Script source should define
048: * a method with one argument, 'run(curi)'. Each processed CrawlURI is
049: * passed to this script method.
050: *
051: * Other variables available to the script include 'self' (this
052: * BeanShellProcessor instance) and 'controller' (the crawl's
053: * CrawlController instance).
054: *
055: * @author gojomo
056: * @version $Date: 2006-09-25 20:19:54 +0000 (Mon, 25 Sep 2006) $, $Revision: 4654 $
057: */
058: public class BeanShellProcessor extends Processor implements
059: FetchStatusCodes {
060:
061: private static final long serialVersionUID = 6926589944337050754L;
062:
063: private static final Logger logger = Logger
064: .getLogger(BeanShellProcessor.class.getName());
065:
066: /** setting for script file */
067: public final static String ATTR_SCRIPT_FILE = "script-file";
068:
069: /** whether each thread should have its own script runner (true), or
070: * they should share a single script runner with synchronized access */
071: public final static String ATTR_ISOLATE_THREADS = "isolate-threads";
072:
073: protected ThreadLocal<Interpreter> threadInterpreter;
074: protected Interpreter sharedInterpreter;
075: public Map<Object, Object> sharedMap = Collections
076: .synchronizedMap(new HashMap<Object, Object>());
077:
078: /**
079: * Constructor.
080: * @param name Name of this processor.
081: */
082: public BeanShellProcessor(String name) {
083: super (
084: name,
085: "BeanShellProcessor. Runs the BeanShell script source "
086: + "(supplied directly or via a file path) against the "
087: + "current URI. Source should define a script method "
088: + "'process(curi)' which will be passed the current CrawlURI. "
089: + "The script may also access this BeanShellProcessor via"
090: + "the 'self' variable and the CrawlController via the "
091: + "'controller' variable.");
092: Type t = addElementToDefinition(new SimpleType(
093: ATTR_SCRIPT_FILE, "BeanShell script file", ""));
094: t.setOverrideable(false);
095: t = addElementToDefinition(new SimpleType(
096: ATTR_ISOLATE_THREADS,
097: "Whether each ToeThread should get its own independent "
098: + "script context, or they should share synchronized access "
099: + "to one context. Default is true, meaning each threads "
100: + "gets its own isolated context.", true));
101: t.setOverrideable(false);
102:
103: }
104:
105: protected synchronized void innerProcess(CrawlURI curi) {
106: // depending on previous configuration, interpreter may
107: // be local to this thread or shared
108: Interpreter interpreter = getInterpreter();
109: synchronized (interpreter) {
110: // synchronization is harmless for local thread interpreter,
111: // necessary for shared interpreter
112: try {
113: interpreter.set("curi", curi);
114: interpreter.eval("process(curi)");
115: } catch (EvalError e) {
116: // TODO Auto-generated catch block
117: e.printStackTrace();
118: }
119: }
120: }
121:
122: /**
123: * Get the proper Interpreter instance -- either shared or local
124: * to this thread.
125: * @return Interpreter to use
126: */
127: protected Interpreter getInterpreter() {
128: if (sharedInterpreter != null) {
129: return sharedInterpreter;
130: }
131: Interpreter interpreter = threadInterpreter.get();
132: if (interpreter == null) {
133: interpreter = newInterpreter();
134: threadInterpreter.set(interpreter);
135: }
136: return interpreter;
137: }
138:
139: /**
140: * Create a new Interpreter instance, preloaded with any supplied
141: * source code or source file and the variables 'self' (this
142: * BeanShellProcessor) and 'controller' (the CrawlController).
143: *
144: * @return the new Interpreter instance
145: */
146: protected Interpreter newInterpreter() {
147: Interpreter interpreter = new Interpreter();
148: try {
149: interpreter.set("self", this );
150: interpreter.set("controller", getController());
151:
152: String filePath = (String) getUncheckedAttribute(null,
153: ATTR_SCRIPT_FILE);
154: if (filePath.length() > 0) {
155: try {
156: File file = getSettingsHandler()
157: .getPathRelativeToWorkingDirectory(filePath);
158: interpreter.source(file.getPath());
159: } catch (IOException e) {
160: logger.log(Level.SEVERE,
161: "unable to read script file", e);
162: }
163: }
164: } catch (EvalError e) {
165: // TODO Auto-generated catch block
166: e.printStackTrace();
167: }
168:
169: return interpreter;
170: }
171:
172: protected void initialTasks() {
173: super .initialTasks();
174: kickUpdate();
175: }
176:
177: /**
178: * Setup (or reset) Intepreter variables, as appropraite based on
179: * thread-isolation setting.
180: */
181: public void kickUpdate() {
182: // TODO make it so running state (tallies, etc.) isn't lost on changes
183: // unless unavoidable
184: if ((Boolean) getUncheckedAttribute(null, ATTR_ISOLATE_THREADS)) {
185: sharedInterpreter = null;
186: threadInterpreter = new ThreadLocal<Interpreter>();
187: } else {
188: sharedInterpreter = newInterpreter();
189: threadInterpreter = null;
190: }
191: }
192: }
|