0001: /* Heritrix
0002: *
0003: * $Id: Heritrix.java 4858 2007-01-15 23:37:08Z stack-sf $
0004: *
0005: * Created on May 15, 2003
0006: *
0007: * Copyright (C) 2003 Internet Archive.
0008: *
0009: * This file is part of the Heritrix web crawler (crawler.archive.org).
0010: *
0011: * Heritrix is free software; you can redistribute it and/or modify
0012: * it under the terms of the GNU Lesser Public License as published by
0013: * the Free Software Foundation; either version 2.1 of the License, or
0014: * any later version.
0015: *
0016: * Heritrix is distributed in the hope that it will be useful,
0017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0019: * GNU Lesser Public License for more details.
0020: *
0021: * You should have received a copy of the GNU Lesser Public License
0022: * along with Heritrix; if not, write to the Free Software
0023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0024: */
0025: package org.archive.crawler;
0026:
0027: import java.io.File;
0028: import java.io.FileInputStream;
0029: import java.io.FileNotFoundException;
0030: import java.io.FileOutputStream;
0031: import java.io.IOException;
0032: import java.io.InputStream;
0033: import java.io.PrintStream;
0034: import java.io.PrintWriter;
0035: import java.net.HttpURLConnection;
0036: import java.net.InetAddress;
0037: import java.net.URL;
0038: import java.net.URLConnection;
0039: import java.net.UnknownHostException;
0040: import java.util.ArrayList;
0041: import java.util.Arrays;
0042: import java.util.Collection;
0043: import java.util.Collections;
0044: import java.util.Enumeration;
0045: import java.util.Hashtable;
0046: import java.util.Iterator;
0047: import java.util.List;
0048: import java.util.Map;
0049: import java.util.Properties;
0050: import java.util.StringTokenizer;
0051: import java.util.TimeZone;
0052: import java.util.Vector;
0053: import java.util.logging.Level;
0054: import java.util.logging.LogManager;
0055: import java.util.logging.Logger;
0056:
0057: import javax.management.Attribute;
0058: import javax.management.AttributeList;
0059: import javax.management.AttributeNotFoundException;
0060: import javax.management.DynamicMBean;
0061: import javax.management.InstanceAlreadyExistsException;
0062: import javax.management.InstanceNotFoundException;
0063: import javax.management.InvalidAttributeValueException;
0064: import javax.management.MBeanInfo;
0065: import javax.management.MBeanNotificationInfo;
0066: import javax.management.MBeanOperationInfo;
0067: import javax.management.MBeanRegistration;
0068: import javax.management.MBeanRegistrationException;
0069: import javax.management.MBeanServer;
0070: import javax.management.MBeanServerFactory;
0071: import javax.management.MalformedObjectNameException;
0072: import javax.management.NotCompliantMBeanException;
0073: import javax.management.ObjectName;
0074: import javax.management.ReflectionException;
0075: import javax.management.RuntimeOperationsException;
0076: import javax.management.openmbean.CompositeData;
0077: import javax.management.openmbean.CompositeDataSupport;
0078: import javax.management.openmbean.CompositeType;
0079: import javax.management.openmbean.OpenDataException;
0080: import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
0081: import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
0082: import javax.management.openmbean.OpenMBeanInfoSupport;
0083: import javax.management.openmbean.OpenMBeanOperationInfoSupport;
0084: import javax.management.openmbean.OpenMBeanParameterInfo;
0085: import javax.management.openmbean.OpenMBeanParameterInfoSupport;
0086: import javax.management.openmbean.OpenType;
0087: import javax.management.openmbean.SimpleType;
0088: import javax.management.openmbean.TabularData;
0089: import javax.management.openmbean.TabularDataSupport;
0090: import javax.management.openmbean.TabularType;
0091: import javax.naming.CompoundName;
0092: import javax.naming.Context;
0093: import javax.naming.NameNotFoundException;
0094: import javax.naming.NamingException;
0095: import javax.naming.NoInitialContextException;
0096:
0097: import org.apache.commons.cli.Option;
0098: import org.archive.crawler.admin.CrawlJob;
0099: import org.archive.crawler.admin.CrawlJobErrorHandler;
0100: import org.archive.crawler.admin.CrawlJobHandler;
0101: import org.archive.crawler.datamodel.CredentialStore;
0102: import org.archive.crawler.datamodel.credential.Credential;
0103: import org.archive.crawler.event.CrawlStatusListener;
0104: import org.archive.crawler.framework.AlertManager;
0105: import org.archive.crawler.framework.CrawlController;
0106: import org.archive.crawler.framework.exceptions.FatalConfigurationException;
0107: import org.archive.crawler.framework.exceptions.InitializationException;
0108: import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
0109: import org.archive.crawler.settings.XMLSettingsHandler;
0110: import org.archive.io.SinkHandler;
0111: import org.archive.io.SinkHandlerLogRecord;
0112: import org.archive.net.UURI;
0113: import org.archive.util.FileUtils;
0114: import org.archive.util.IoUtils;
0115: import org.archive.util.JmxUtils;
0116: import org.archive.util.JndiUtils;
0117: import org.archive.util.PropertyUtils;
0118: import org.archive.util.TextUtils;
0119:
0120: import sun.net.www.protocol.file.FileURLConnection;
0121:
0122: /**
0123: * Main class for Heritrix crawler.
0124: *
0125: * Heritrix is usually launched by a shell script that backgrounds heritrix
0126: * and redirects all stdout and stderr emitted by heritrix to a log file. So
0127: * that startup messages emitted subsequent to the redirection of stdout and
0128: * stderr show on the console, this class prints usage or startup output
0129: * such as where the web UI can be found, etc., to a STARTLOG that the shell
0130: * script is waiting on. As soon as the shell script sees output in this file,
0131: * it prints its content and breaks out of its wait.
0132: * See ${HERITRIX_HOME}/bin/heritrix.
0133: *
0134: * <p>Heritrix can also be embedded or launched by webapp initialization or
0135: * by JMX bootstrapping. So far I count 4 methods of instantiation:
0136: * <ol>
0137: * <li>From this class's main -- the method usually used;</li>
0138: * <li>From the Heritrix UI (The local-instances.jsp) page;</li>
0139: * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
0140: * <li>A container such as tomcat or jboss.</li>
0141: * </ol>
0142: *
0143: * @author gojomo
0144: * @author Kristinn Sigurdsson
0145: * @author Stack
0146: */
0147: public class Heritrix implements DynamicMBean, MBeanRegistration {
0148: /**
0149: * Heritrix logging instance.
0150: */
0151: private static final Logger logger = Logger
0152: .getLogger(Heritrix.class.getName());
0153:
/**
 * Temporary directory: the value of the <code>java.io.tmpdir</code> system
 * property, or <code>/tmp</code> if that property is not set.
 */
private static final File TMPDIR = new File(System.getProperty(
        "java.io.tmpdir", "/tmp"));
0156:
0157: /**
0158: * Name of the heritrix properties file.
0159: */
0160: private static final String PROPERTIES = "heritrix.properties";
0161:
0162: /**
0163: * Name of the key to use specifying alternate heritrix properties on
0164: * command line.
0165: */
0166: private static final String PROPERTIES_KEY = PROPERTIES;
0167:
0168: /**
0169: * Prefix used on properties we'll add to the System.properties list.
0170: */
0171: private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
0172:
0173: /**
0174: * Instance of web server if one was started.
0175: */
0176: private static SimpleHttpServer httpServer = null;
0177:
0178: /**
0179: * CrawlJob handler. Manages multiple crawl jobs at runtime.
0180: */
0181: private CrawlJobHandler jobHandler = null;
0182:
0183: /**
0184: * Heritrix start log file.
0185: *
0186: * This file contains standard out produced by this main class for startup
0187: * only. Used by heritrix shell script. Name here MUST match that in the
0188: * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell
0189: * wrapper has on this here java heritrix.
0190: */
0191: private static final String STARTLOG = "heritrix_dmesg.log";
0192:
0193: /**
0194: * Default encoding.
0195: *
0196: * Used for content when fetching if none specified.
0197: */
0198: public static final String DEFAULT_ENCODING = "ISO-8859-1";
0199:
0200: /**
0201: * Heritrix stderr/stdout log file.
0202: *
0203: * This file should have nothing in it except messages over which we have
0204: * no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
0205: * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
0206: * this program has with the wrapper shell script. Shell can actually
0207: * pass us an alternate to use for this file.
0208: */
0209: private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
0210:
0211: /**
0212: * Where to write this class's startup output.
0213: *
0214: * This out should only be used if Heritrix is being run from the
0215: * command-line.
0216: */
0217: private static PrintWriter out = null;
0218:
0219: /**
0220: * The org.archive package
0221: */
0222: private static final String ARCHIVE_PACKAGE = "org.archive.";
0223:
0224: /**
0225: * The crawler package.
0226: */
0227: private static final String CRAWLER_PACKAGE = Heritrix.class
0228: .getName().substring(0,
0229: Heritrix.class.getName().lastIndexOf('.'));
0230:
0231: /**
0232: * The root context for a webapp.
0233: */
0234: private static final String ROOT_CONTEXT = "/";
0235:
0236: /**
0237: * Set to true if application is started from command line.
0238: */
0239: private static boolean commandLine = false;
0240:
0241: /**
0242: * True if container initialization has been run.
0243: */
0244: private static boolean containerInitialized = false;
0245:
0246: /**
0247: * True if properties have been loaded.
0248: */
0249: private static boolean propertiesLoaded = false;
0250:
/**
 * File name suffix of a jar file.
 */
private static final String JAR_SUFFIX = ".jar";

/**
 * Alerting facade of this instance. Assigned in the constructor to an
 * adapter that forwards every call to the shared SinkHandler.
 */
private AlertManager alertManager;
0254:
0255: /**
0256: * The context of the GUI webapp. Default is root.
0257: */
0258: private static String adminContext = ROOT_CONTEXT;
0259:
0260: /**
0261: * True if we're to put up a GUI.
0262: * Cmdline processing can override.
0263: */
0264: private static boolean gui = !PropertyUtils
0265: .getBooleanProperty("heritrix.cmdline.nowui");
0266:
0267: /**
0268: * Port to put the GUI up on.
0269: * Cmdline processing can override.
0270: */
0271: private static int guiPort = SimpleHttpServer.DEFAULT_PORT;
0272:
0273: /**
0274: * A collection containing only localhost. Used as default value
0275: * for guiHosts, and passed to SimpleHttpServer when doing selftest.
0276: */
0277: final private static Collection<String> LOCALHOST_ONLY = Collections
0278: .unmodifiableList(Arrays
0279: .asList(new String[] { "127.0.0.1" }));
0280:
0281: /**
0282: * Hosts to bind the GUI webserver to.
0283: * By default, only contains localhost.
0284: * Set to an empty collection to indicate that all available network
0285: * interfaces should be used for the webserver.
0286: */
0287: private static Collection<String> guiHosts = LOCALHOST_ONLY;
0288:
0289: /**
0290: * Web UI server, realm, context name.
0291: */
0292: private static String ADMIN = "admin";
0293:
0294: // OpenMBean support.
0295: /**
0296: * The MBean server we're registered with (May be null).
0297: */
0298: private MBeanServer mbeanServer = null;
0299:
0300: /**
0301: * MBean name we were registered as.
0302: */
0303: private ObjectName mbeanName = null;
0304:
0305: /**
0306: * Keep reference to all instances of Heritrix.
0307: * Used by the UI to figure which of the local Heritrice it should
0308: * be going against and to figure what to shutdown on the way out (If
0309: * there was always a JMX Agent, we wouldn't need to keep this list. We
0310: * could always ask the JMX Agent for all instances. UPDATE: True we could
0311: * always ask the JMX Agent but we might keep around this local reference
0312: * because it will allow faster, less awkward -- think of marshalling the args
0313: * for JMX invoke operation -- access to local Heritrix instances. A new
0314: * usage for this instances Map is in CrawlJob#preRegister to find the hosting
0315: * Heritrix instance).
0316: */
0317: private static Map<String, Heritrix> instances = new Hashtable<String, Heritrix>();
0318:
/**
 * MBean info for this instance. Built by <code>buildMBeanInfo()</code> in
 * the constructor and set back to null by {@link #destroy()}.
 */
private OpenMBeanInfoSupport openMBeanInfo;

// Attribute names. All of them are gathered in ATTRIBUTE_LIST below.
// NOTE(review): presumably these are the attribute names exposed through
// the OpenMBean interface -- confirm against buildMBeanInfo/getAttribute.
private final static String STATUS_ATTR = "Status";
private final static String VERSION_ATTR = "Version";
private final static String ISRUNNING_ATTR = "IsRunning";
private final static String ISCRAWLING_ATTR = "IsCrawling";
private final static String ALERTCOUNT_ATTR = "AlertCount";
private final static String NEWALERTCOUNT_ATTR = "NewAlertCount";
private final static String CURRENTJOB_ATTR = "CurrentJob";

/**
 * Fixed-size list of every attribute name declared above.
 */
private final static List ATTRIBUTE_LIST;
static {
    ATTRIBUTE_LIST = Arrays.asList(new String[] { STATUS_ATTR,
            VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,
            ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR });
}
0333:
// Operation names. All of them are gathered in OPERATION_LIST below.
// NOTE(review): presumably these are the operation names exposed through
// the OpenMBean interface -- confirm against buildMBeanInfo/invoke.
private final static String START_OPER = "start";
private final static String STOP_OPER = "stop";
private final static String DESTROY_OPER = "destroy";
private final static String INTERRUPT_OPER = "interrupt";
private final static String START_CRAWLING_OPER = "startCrawling";
private final static String STOP_CRAWLING_OPER = "stopCrawling";
private final static String ADD_CRAWL_JOB_OPER = "addJob";
private final static String TERMINATE_CRAWL_JOB_OPER = "terminateCurrentJob";
private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";
private final static String ALERT_OPER = "alert";
private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
private final static String PENDING_JOBS_OPER = "pendingJobs";
private final static String COMPLETED_JOBS_OPER = "completedJobs";
private final static String CRAWLEND_REPORT_OPER = "crawlendReport";
private final static String SHUTDOWN_OPER = "shutdown";
private final static String LOG_OPER = "log";
private final static String REBIND_JNDI_OPER = "rebindJNDI";

/**
 * Fixed-size list of every operation name declared above. The order of
 * the entries differs from the declaration order of the constants.
 */
private final static List OPERATION_LIST;
static {
    OPERATION_LIST = Arrays.asList(new String[] { START_OPER,
            STOP_OPER, INTERRUPT_OPER, START_CRAWLING_OPER,
            STOP_CRAWLING_OPER, ADD_CRAWL_JOB_OPER,
            ADD_CRAWL_JOB_BASEDON_OPER, DELETE_CRAWL_JOB_OPER,
            ALERT_OPER, PENDING_JOBS_OPER, COMPLETED_JOBS_OPER,
            CRAWLEND_REPORT_OPER, SHUTDOWN_OPER, LOG_OPER,
            DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
            REBIND_JNDI_OPER });
}
// Open-type descriptors for jobs; both start out null.
// NOTE(review): presumably initialised when the MBean info is built and
// used to describe job rows returned over JMX -- confirm.
private CompositeType jobCompositeType = null;
private TabularType jobsTabularType = null;

// NOTE(review): presumably the item names of the job composite type --
// confirm against the code that builds jobCompositeType.
private static final String[] JOB_KEYS = new String[] { "uid",
        "name", "status" };

// NOTE(review): presumably the web UI administrator login and password;
// where they are assigned is not visible from here -- confirm.
private static String adminUsername;

private static String adminPassword;
0370:
/**
 * Creates the default Heritrix instance without registering it with JMX.
 *
 * <p>Same as calling <code>Heritrix(null, false)</code>. The assumption is
 * that this constructor is used by something like a JMX agent creating a
 * Heritrix instance at the command of a remote client, in which case the
 * invoking agent does the registration itself.
 *
 * @throws IOException Propagated from the constructor delegated to.
 */
public Heritrix() throws IOException {
    this((String) null, false);
}
0382:
/**
 * Creates the default (unnamed) Heritrix instance.
 *
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @throws IOException Propagated from the constructor delegated to.
 */
public Heritrix(final boolean jmxregister) throws IOException {
    this((String) null, jmxregister);
}
0386:
/**
 * Constructor.
 *
 * <p>Delegates to the three-argument constructor, passing a new
 * CrawlJobHandler constructed with the directory returned by
 * {@link #getJobsdir()}.
 *
 * @param name If null, we bring up the default Heritrix instance.
 * @param jmxregister True if we are to register this instance with JMX
 * agent.
 * @throws IOException If the jobs directory cannot be determined, or
 * propagated from the constructor delegated to.
 */
public Heritrix(final String name, final boolean jmxregister)
        throws IOException {
    this (name, jmxregister, new CrawlJobHandler(getJobsdir()));
}
0398:
/**
 * Creates a Heritrix instance around the supplied job handler.
 *
 * <p>Runs the container initialization, stores the job handler, builds the
 * MBean info, installs an AlertManager that forwards to the shared
 * SinkHandler, and finally hands the new instance to
 * <code>registerHeritrix</code>.
 *
 * @param name If null, we bring up the default Heritrix instance.
 * @param jmxregister True if this instance is to be registered with the
 * JMX agent.
 * @param cjh CrawlJobHandler to use.
 * @throws IOException For instance if container initialization fails.
 * @throws NullPointerException If no SinkHandler instance is available.
 * @throws RuntimeException Wrapping any checked exception thrown by the
 * registration.
 */
public Heritrix(final String name, final boolean jmxregister,
        final CrawlJobHandler cjh) throws IOException {
    containerInitialization();
    this.jobHandler = cjh;
    this.openMBeanInfo = buildMBeanInfo();

    // The SinkHandler is a global, so it collects the alerts of every
    // running Heritrix instance. That will need addressing (add the name
    // of the instance that raised the alert to the sink record?).
    final SinkHandler sink = SinkHandler.getInstance();
    if (sink == null) {
        throw new NullPointerException("SinkHandler not found.");
    }

    // Alert manager that simply forwards to the sink; alert IDs arrive as
    // strings and are parsed to longs before being passed on.
    this.alertManager = new AlertManager() {
        public int getCount() {
            return sink.getCount();
        }

        public int getNewCount() {
            return sink.getUnreadCount();
        }

        public Vector getAll() {
            return sink.getAll();
        }

        public Vector getNewAll() {
            return sink.getAllUnread();
        }

        public SinkHandlerLogRecord get(String alertID) {
            final long id = Long.parseLong(alertID);
            return sink.get(id);
        }

        public void add(SinkHandlerLogRecord record) {
            sink.publish(record);
        }

        public void read(String alertID) {
            final long id = Long.parseLong(alertID);
            sink.read(id);
        }

        public void remove(String alertID) {
            final long id = Long.parseLong(alertID);
            sink.remove(id);
        }
    };

    // Registration failures are fatal for the instance; surface them as
    // unchecked exceptions, cause preserved.
    try {
        Heritrix.registerHeritrix(this, name, jmxregister);
    } catch (MalformedObjectNameException e) {
        throw new RuntimeException(e);
    } catch (NotCompliantMBeanException e) {
        throw new RuntimeException(e);
    } catch (MBeanRegistrationException e) {
        throw new RuntimeException(e);
    } catch (InstanceAlreadyExistsException e) {
        throw new RuntimeException(e);
    }
}
0467:
0468: /**
0469: * Run setup tasks for this 'container'. Idempotent.
0470: *
0471: * @throws IOException
0472: */
0473: protected static void containerInitialization() throws IOException {
0474: if (Heritrix.containerInitialized) {
0475: return;
0476: }
0477: Heritrix.containerInitialized = true;
0478: // Load up the properties. This invocation adds heritrix properties
0479: // to system properties so all available via System.getProperty.
0480: // Note, loadProperties and patchLogging have global effects. May be an
0481: // issue if we're running inside a container such as tomcat or jboss.
0482: Heritrix.loadProperties();
0483: Heritrix.patchLogging();
0484: Heritrix.configureTrustStore();
0485: // Will run on SIGTERM but not on SIGKILL, unfortunately.
0486: // Otherwise, ensures we cleanup after ourselves (Deregister from
0487: // JMX and JNDI).
0488: Runtime.getRuntime().addShutdownHook(
0489: Heritrix.getShutdownThread(false, 0,
0490: "Heritrix shutdown hook"));
0491: // Register this heritrix 'container' though we may be inside another
0492: // tomcat or jboss container.
0493: try {
0494: registerContainerJndi();
0495: } catch (Exception e) {
0496: logger.log(Level.WARNING,
0497: "Failed jndi container registration.", e);
0498: }
0499: }
0500:
0501: /**
0502: * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
0503: * they want to cleanup the instance.
0504: * Of note, there may be Heritrix threads still hanging around after the
0505: * call to destroy completes. They'll eventually go down after they've
0506: * finished their cleanup routines. In particular, if you are watching
0507: * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister
0508: * ahead of the CrawlJob JMX bean that its hosting.
0509: */
0510: public void destroy() {
0511: stop();
0512: try {
0513: Heritrix.unregisterHeritrix(this );
0514: } catch (InstanceNotFoundException e) {
0515: e.printStackTrace();
0516: } catch (MBeanRegistrationException e) {
0517: e.printStackTrace();
0518: } catch (NullPointerException e) {
0519: e.printStackTrace();
0520: }
0521: this .jobHandler = null;
0522: this .openMBeanInfo = null;
0523: }
0524:
0525: /**
0526: * Launch program.
0527: * Optionally will launch a web server to host UI. Will also register
0528: * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM
0529: * Agent).
0530: *
0531: * @param args Command line arguments.
0532: * @throws Exception
0533: */
0534: public static void main(String[] args) throws Exception {
0535: Heritrix.commandLine = true;
0536:
0537: // Set timezone here. Would be problematic doing it if we're running
0538: // inside in a container.
0539: TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
0540:
0541: File startLog = new File(getHeritrixHome(), STARTLOG);
0542: Heritrix.out = new PrintWriter(isDevelopment() ? System.out
0543: : new PrintStream(new FileOutputStream(startLog)));
0544:
0545: try {
0546: containerInitialization();
0547: String status = doCmdLineArgs(args);
0548: if (status != null) {
0549: Heritrix.out.println(status);
0550: }
0551: }
0552:
0553: catch (Exception e) {
0554: // Show any exceptions in STARTLOG.
0555: e.printStackTrace(Heritrix.out);
0556: throw e;
0557: }
0558:
0559: finally {
0560: // If not development, close the file that signals the wrapper
0561: // script that we've started. Otherwise, just flush it; if in
0562: // development, the output is probably a console.
0563: if (!isDevelopment()) {
0564: if (Heritrix.out != null) {
0565: Heritrix.out.close();
0566: }
0567: System.out.println("Heritrix version: "
0568: + Heritrix.getVersion());
0569: } else {
0570: if (Heritrix.out != null) {
0571: Heritrix.out.flush();
0572: }
0573: }
0574: }
0575: }
0576:
0577: protected static String doCmdLineArgs(final String[] args)
0578: throws Exception {
0579: // Get defaults for commandline arguments from the properties file.
0580: String tmpStr = PropertyUtils
0581: .getPropertyOrNull("heritrix.context");
0582: if (tmpStr != null) {
0583: Heritrix.adminContext = tmpStr;
0584: }
0585: tmpStr = PropertyUtils
0586: .getPropertyOrNull("heritrix.cmdline.port");
0587: if (tmpStr != null) {
0588: Heritrix.guiPort = Integer.parseInt(tmpStr);
0589: }
0590: tmpStr = PropertyUtils
0591: .getPropertyOrNull("heritrix.cmdline.admin");
0592: String adminLoginPassword = (tmpStr == null) ? "" : tmpStr;
0593: String crawlOrderFile = PropertyUtils
0594: .getPropertyOrNull("heritrix.cmdline.order");
0595: tmpStr = PropertyUtils
0596: .getPropertyOrNull("heritrix.cmdline.run");
0597: boolean runMode = PropertyUtils
0598: .getBooleanProperty("heritrix.cmdline.run");
0599: boolean selfTest = false;
0600: String selfTestName = null;
0601: CommandLineParser clp = new CommandLineParser(args,
0602: Heritrix.out, Heritrix.getVersion());
0603: List arguments = clp.getCommandLineArguments();
0604: Option[] options = clp.getCommandLineOptions();
0605:
0606: // Check passed argument. Only one argument, the ORDER_FILE is allowed.
0607: // If one argument, make sure exists and xml suffix.
0608: if (arguments.size() > 1) {
0609: clp.usage(1);
0610: } else if (arguments.size() == 1) {
0611: crawlOrderFile = (String) arguments.get(0);
0612: if (!(new File(crawlOrderFile).exists())) {
0613: clp.usage("ORDER.XML <" + crawlOrderFile
0614: + "> specified does not exist.", 1);
0615: }
0616: // Must end with '.xml'
0617: if (crawlOrderFile.length() > 4
0618: && !crawlOrderFile.substring(
0619: crawlOrderFile.length() - 4)
0620: .equalsIgnoreCase(".xml")) {
0621: clp.usage("ORDER.XML <" + crawlOrderFile
0622: + "> does not have required '.xml' suffix.", 1);
0623: }
0624: }
0625:
0626: // Now look at options passed.
0627: for (int i = 0; i < options.length; i++) {
0628: switch (options[i].getId()) {
0629: case 'h':
0630: clp.usage();
0631: break;
0632:
0633: case 'a':
0634: adminLoginPassword = options[i].getValue();
0635: break;
0636:
0637: case 'n':
0638: if (crawlOrderFile == null) {
0639: clp.usage("You must specify an ORDER_FILE with"
0640: + " '--nowui' option.", 1);
0641: }
0642: Heritrix.gui = false;
0643: break;
0644:
0645: case 'b':
0646: Heritrix.guiHosts = parseHosts(options[i].getValue());
0647: break;
0648:
0649: case 'p':
0650: try {
0651: Heritrix.guiPort = Integer.parseInt(options[i]
0652: .getValue());
0653: } catch (NumberFormatException e) {
0654: clp.usage("Failed parse of port number: "
0655: + options[i].getValue(), 1);
0656: }
0657: if (Heritrix.guiPort <= 0) {
0658: clp.usage("Nonsensical port number: "
0659: + options[i].getValue(), 1);
0660: }
0661: break;
0662:
0663: case 'r':
0664: runMode = true;
0665: break;
0666:
0667: case 's':
0668: selfTestName = options[i].getValue();
0669: selfTest = true;
0670: break;
0671:
0672: default:
0673: assert false : options[i].getId();
0674: }
0675: }
0676:
0677: // Ok, we should now have everything to launch the program.
0678: String status = null;
0679: if (selfTest) {
0680: // If more than just '--selftest' and '--port' passed, then
0681: // there is confusion on what is being asked of us. Print usage
0682: // rather than proceed.
0683: for (int i = 0; i < options.length; i++) {
0684: if (options[i].getId() != 'p'
0685: && options[i].getId() != 's') {
0686: clp.usage(1);
0687: }
0688: }
0689:
0690: if (arguments.size() > 0) {
0691: // No arguments accepted by selftest.
0692: clp.usage(1);
0693: }
0694: status = selftest(selfTestName, Heritrix.guiPort);
0695: } else {
0696: if (!isValidLoginPasswordString(adminLoginPassword)) {
0697: clp.usage(
0698: "Invalid admin login:password value, or none "
0699: + "specified. ", 1);
0700: }
0701:
0702: if (!Heritrix.gui) {
0703: if (options.length > 1) {
0704: // If more than just '--nowui' passed, then there is
0705: // confusion on what is being asked of us. Print usage
0706: // rather than proceed.
0707: clp.usage(1);
0708: }
0709: Heritrix h = new Heritrix(true);
0710: status = h.doOneCrawl(crawlOrderFile);
0711: } else {
0712: status = startEmbeddedWebserver(Heritrix.guiHosts,
0713: Heritrix.guiPort, adminLoginPassword);
0714: Heritrix h = new Heritrix(true);
0715:
0716: String tmp = h.launch(crawlOrderFile, runMode);
0717: if (tmp != null) {
0718: status += ('\n' + tmp);
0719: }
0720: }
0721: }
0722: return status;
0723: }
0724:
0725: /**
0726: * @return The file we dump stdout and stderr into.
0727: */
0728: public static String getHeritrixOut() {
0729: String tmp = System.getProperty("heritrix.out");
0730: if (tmp == null || tmp.length() == 0) {
0731: tmp = Heritrix.DEFAULT_HERITRIX_OUT;
0732: }
0733: return tmp;
0734: }
0735:
0736: /**
0737: * Exploit <code>-Dheritrix.home</code> if available to us.
0738: * Is current working dir if no heritrix.home property supplied.
0739: * @return Heritrix home directory.
0740: * @throws IOException
0741: */
0742: protected static File getHeritrixHome() throws IOException {
0743: File heritrixHome = null;
0744: String home = System.getProperty("heritrix.home");
0745: if (home != null && home.length() > 0) {
0746: heritrixHome = new File(home);
0747: if (!heritrixHome.exists()) {
0748: throw new IOException("HERITRIX_HOME <" + home
0749: + "> does not exist.");
0750: }
0751: } else {
0752: heritrixHome = new File(new File("").getAbsolutePath());
0753: }
0754: return heritrixHome;
0755: }
0756:
0757: /**
0758: * @return The directory into which we put jobs. If the system property
0759: * 'heritrix.jobsdir' is set, we will use its value in place of the default
0760: * 'jobs' directory in the current working directory.
0761: * @throws IOException
0762: */
0763: public static File getJobsdir() throws IOException {
0764: Heritrix.loadProperties(); // if called in constructor
0765: String jobsdirStr = System.getProperty("heritrix.jobsdir",
0766: "jobs");
0767: File jobsdir = new File(jobsdirStr);
0768: return (jobsdir.isAbsolute()) ? jobsdir : new File(
0769: getHeritrixHome(), jobsdirStr);
0770: }
0771:
0772: /**
0773: * Get and check for existence of expected subdir.
0774: *
0775: * If development flag set, then look for dir under src dir.
0776: *
0777: * @param subdirName Dir to look for.
0778: * @return The extant subdir. Otherwise null if we're running
0779: * in a webapp context where there is no conf directory available.
0780: * @throws IOException if unable to find expected subdir.
0781: */
0782: protected static File getSubDir(String subdirName)
0783: throws IOException {
0784: return getSubDir(subdirName, true);
0785: }
0786:
0787: /**
0788: * Get and optionally check for existence of subdir.
0789: *
0790: * If development flag set, then look for dir under src dir.
0791: *
0792: * @param subdirName Dir to look for.
0793: * @param fail True if we are to fail if directory does not
0794: * exist; false if we are to return false if the directory does not exist.
0795: * @return The extant subdir. Otherwise null if we're running
0796: * in a webapp context where there is no subdir directory available.
0797: * @throws IOException if unable to find expected subdir.
0798: */
0799: protected static File getSubDir(String subdirName, boolean fail)
0800: throws IOException {
0801: String path = isDevelopment() ? "src" + File.separator
0802: + subdirName : subdirName;
0803: File dir = new File(getHeritrixHome(), path);
0804: if (!dir.exists()) {
0805: if (fail) {
0806: throw new IOException("Cannot find subdir: "
0807: + subdirName);
0808: }
0809: dir = null;
0810: }
0811: return dir;
0812: }
0813:
0814: /**
0815: * Test string is valid login/password string.
0816: *
0817: * A valid login/password string has the login and password compounded
0818: * w/ a ':' delimiter.
0819: *
0820: * @param str String to test.
0821: * @return True if valid password/login string.
0822: */
0823: protected static boolean isValidLoginPasswordString(String str) {
0824: boolean isValid = false;
0825: StringTokenizer tokenizer = new StringTokenizer(str, ":");
0826: if (tokenizer.countTokens() == 2) {
0827: String login = ((String) tokenizer.nextElement()).trim();
0828: String password = ((String) tokenizer.nextElement()).trim();
0829: if (login.length() > 0 && password.length() > 0) {
0830: isValid = true;
0831: }
0832: }
0833: return isValid;
0834: }
0835:
0836: protected static boolean isDevelopment() {
0837: return System.getProperty("heritrix.development") != null;
0838: }
0839:
0840: /**
0841: * Load the heritrix.properties file.
0842: *
0843: * Adds any property that starts with
0844: * <code>HERITRIX_PROPERTIES_PREFIX</code>
0845: * or <code>ARCHIVE_PACKAGE</code>
0846: * into system properties (except logging '.level' directives).
0847: * @return Loaded properties.
0848: * @throws IOException
0849: */
0850: protected static Properties loadProperties() throws IOException {
0851: if (Heritrix.propertiesLoaded) {
0852: return System.getProperties();
0853: }
0854: Heritrix.propertiesLoaded = true;
0855:
0856: Properties properties = new Properties();
0857: properties.load(getPropertiesInputStream());
0858:
0859: // Any property that begins with ARCHIVE_PACKAGE, make it
0860: // into a system property. While iterating, check to see if anything
0861: // defined on command-line, and if so, it overrules whats in
0862: // heritrix.properties.
0863: for (Enumeration e = properties.keys(); e.hasMoreElements();) {
0864: String key = ((String) e.nextElement()).trim();
0865: if (key.startsWith(ARCHIVE_PACKAGE)
0866: || key.startsWith(HERITRIX_PROPERTIES_PREFIX)) {
0867: // Don't add the heritrix.properties entries that are
0868: // changing the logging level of particular classes.
0869: if (key.indexOf(".level") < 0) {
0870: if (System.getProperty(key) == null
0871: || System.getProperty(key).length() == 0) {
0872: System.setProperty(key, properties.getProperty(
0873: key).trim());
0874: }
0875: }
0876: }
0877: }
0878: return properties;
0879: }
0880:
0881: protected static InputStream getPropertiesInputStream()
0882: throws IOException {
0883: File file = null;
0884: // Look to see if properties have been passed on the cmd-line.
0885: String alternateProperties = System.getProperty(PROPERTIES_KEY);
0886: if (alternateProperties != null
0887: && alternateProperties.length() > 0) {
0888: file = new File(alternateProperties);
0889: }
0890: // Get properties from conf directory if one available.
0891: if ((file == null || !file.exists())
0892: && getConfdir(false) != null) {
0893: file = new File(getConfdir(), PROPERTIES);
0894: if (!file.exists()) {
0895: // If no properties file in the conf dir, set file back to
0896: // null so we go looking for heritrix.properties on classpath.
0897: file = null;
0898: }
0899: }
0900: // If not on the command-line, there is no conf dir. Then get the
0901: // properties from the CLASSPATH (Classpath file separator is always
0902: // '/', whatever the platform.
0903: InputStream is = (file != null) ? new FileInputStream(file)
0904: : Heritrix.class.getResourceAsStream("/"
0905: + PROPERTIES_KEY);
0906: if (is == null) {
0907: throw new IOException("Failed to load properties file from"
0908: + " filesystem or from classpath.");
0909: }
0910: return is;
0911: }
0912:
0913: /**
0914: * If the user hasn't altered the default logging parameters, tighten them
0915: * up somewhat: some of our libraries are way too verbose at the INFO or
0916: * WARNING levels.
0917: *
0918: * This might be a problem running inside in someone else's
0919: * container. Container's seem to prefer commons logging so we
0920: * ain't messing them doing the below.
0921: *
0922: * @throws IOException
0923: * @throws SecurityException
0924: */
0925: protected static void patchLogging() throws SecurityException,
0926: IOException {
0927: if (System.getProperty("java.util.logging.config.class") != null) {
0928: return;
0929: }
0930:
0931: if (System.getProperty("java.util.logging.config.file") != null) {
0932: return;
0933: }
0934:
0935: // No user-set logging properties established; use defaults
0936: // from distribution-packaged 'heritrix.properties'.
0937: LogManager.getLogManager().readConfiguration(
0938: getPropertiesInputStream());
0939: }
0940:
0941: /**
0942: * Configure our trust store.
0943: *
0944: * If system property is defined, then use it for our truststore. Otherwise
0945: * use the heritrix truststore under conf directory if it exists.
0946: *
0947: * <p>If we're not launched from the command-line, we will not be able
0948: * to find our truststore. The truststore is nor normally used so rare
0949: * should this be a problem (In case where we don't use find our trust
0950: * store, we'll use the 'default' -- either the JVMs or the containers).
0951: */
0952: protected static void configureTrustStore() {
0953: // Below must be defined in jsse somewhere but can' find it.
0954: final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";
0955: String value = System.getProperty(TRUSTSTORE_KEY);
0956: File confdir = null;
0957: try {
0958: confdir = getConfdir(false);
0959: } catch (IOException e) {
0960: logger.log(Level.WARNING, "Failed to get confdir.", e);
0961: }
0962: if ((value == null || value.length() <= 0) && confdir != null) {
0963: // Use the heritrix store if it exists on disk.
0964: File heritrixStore = new File(confdir, "heritrix.cacerts");
0965: if (heritrixStore.exists()) {
0966: value = heritrixStore.getAbsolutePath();
0967: }
0968: }
0969:
0970: if (value != null && value.length() > 0) {
0971: System.setProperty(TRUSTSTORE_KEY, value);
0972: }
0973: }
0974:
0975: /**
0976: * Run the selftest
0977: *
0978: * @param oneSelfTestName Name of a test if we are to run one only rather
0979: * than the default running all tests.
0980: * @param port Port number to use for web UI.
0981: *
0982: * @exception Exception
0983: * @return Status of how selftest startup went.
0984: */
0985: protected static String selftest(final String oneSelfTestName,
0986: final int port) throws Exception {
0987: // Put up the webserver w/ the root and selftest webapps only.
0988: final String SELFTEST = "selftest";
0989: Heritrix.httpServer = new SimpleHttpServer(SELFTEST,
0990: Heritrix.adminContext, LOCALHOST_ONLY, port, true);
0991: // Set up digest auth for a section of the server so selftest can run
0992: // auth tests. Looks like can only set one login realm going by the
0993: // web.xml dtd. Otherwise, would be nice to selftest basic and digest.
0994: // Have login, password and role all be SELFTEST. Must match what is
0995: // in the selftest order.xml file.
0996: Heritrix.httpServer.setAuthentication(SELFTEST,
0997: Heritrix.adminContext, SELFTEST, SELFTEST, SELFTEST);
0998: Heritrix.httpServer.startServer();
0999: // Get the order file from the CLASSPATH unless we're running in dev
1000: // environment.
1001: File selftestDir = (isDevelopment()) ? new File(getConfdir(),
1002: SELFTEST) : new File(File.separator + SELFTEST);
1003: File crawlOrderFile = new File(selftestDir, "order.xml");
1004: // Create a job based off the selftest order file. Then use this as
1005: // a template to pass jobHandler.newJob(). Doing this gets our
1006: // selftest output to show under the jobs directory.
1007: // Pass as a seed a pointer to the webserver we just put up.
1008: final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
1009: String selfTestUrl = "http://" + ROOTURI + '/';
1010: if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
1011: selfTestUrl += (oneSelfTestName + '/');
1012: }
1013: CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
1014: oneSelfTestName, selfTestUrl);
1015: Heritrix h = new Heritrix("Selftest", true, cjh);
1016: CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
1017: job = h.getJobHandler().newJob(job, null, SELFTEST,
1018: "Integration self test", selfTestUrl,
1019: CrawlJob.PRIORITY_CRITICAL);
1020: h.getJobHandler().addJob(job);
1021: // Before we start, need to change some items in the settings file.
1022: CredentialStore cs = (CredentialStore) job.getSettingsHandler()
1023: .getOrder().getAttribute(CredentialStore.ATTR_NAME);
1024: for (Iterator i = cs.iterator(null); i.hasNext();) {
1025: ((Credential) i.next()).setCredentialDomain(null, ROOTURI);
1026: }
1027: h.getJobHandler().startCrawler();
1028: StringBuffer buffer = new StringBuffer();
1029: buffer.append("Heritrix " + Heritrix.getVersion()
1030: + " selftest started.");
1031: buffer.append("\nSelftest first crawls " + selfTestUrl
1032: + " and then runs an analysis.");
1033: buffer.append("\nResult of analysis printed to "
1034: + getHeritrixOut() + " when done.");
1035: buffer.append("\nSelftest job directory for logs and arcs:\n"
1036: + job.getDirectory().getAbsolutePath());
1037: return buffer.toString();
1038: }
1039:
1040: /**
1041: * Launch the crawler without a web UI and run the passed crawl only.
1042: *
1043: * Specialized version of {@link #launch()}.
1044: *
1045: * @param crawlOrderFile The crawl order to crawl.
1046: * @throws InitializationException
1047: * @throws InvalidAttributeValueException
1048: * @return Status string.
1049: */
1050: protected String doOneCrawl(String crawlOrderFile)
1051: throws InitializationException,
1052: InvalidAttributeValueException {
1053: return doOneCrawl(crawlOrderFile, null);
1054: }
1055:
1056: /**
1057: * Launch the crawler without a web UI and run passed crawl only.
1058: *
1059: * Specialized version of {@link #launch()}.
1060: *
1061: * @param crawlOrderFile The crawl order to crawl.
1062: * @param listener Register this crawl status listener before starting
1063: * crawl (You can use this listener to notice end-of-crawl).
1064: * @throws InitializationException
1065: * @throws InvalidAttributeValueException
1066: * @return Status string.
1067: */
1068: protected String doOneCrawl(String crawlOrderFile,
1069: CrawlStatusListener listener)
1070: throws InitializationException,
1071: InvalidAttributeValueException {
1072: XMLSettingsHandler handler = new XMLSettingsHandler(new File(
1073: crawlOrderFile));
1074: handler.initialize();
1075: CrawlController controller = new CrawlController();
1076: controller.initialize(handler);
1077: if (listener != null) {
1078: controller.addCrawlStatusListener(listener);
1079: }
1080: controller.requestCrawlStart();
1081: return "Crawl started using " + crawlOrderFile + ".";
1082: }
1083:
1084: /**
1085: * Launch the crawler for a web UI.
1086: *
1087: * Crawler hangs around waiting on jobs.
1088: *
1089: * @exception Exception
1090: * @return A status string describing how the launch went.
1091: * @throws Exception
1092: */
1093: public String launch() throws Exception {
1094: return launch(null, false);
1095: }
1096:
1097: /**
1098: * Launch the crawler for a web UI.
1099: *
1100: * Crawler hangs around waiting on jobs.
1101: *
1102: * @param crawlOrderFile File to crawl. May be null.
1103: * @param runMode Whether crawler should be set to run mode.
1104: *
1105: * @exception Exception
1106: * @return A status string describing how the launch went.
1107: */
1108: public String launch(String crawlOrderFile, boolean runMode)
1109: throws Exception {
1110: String status = null;
1111: if (crawlOrderFile != null) {
1112: addCrawlJob(crawlOrderFile, "Autolaunched", "", "");
1113: if (runMode) {
1114: this .jobHandler.startCrawler();
1115: status = "Job being crawled: " + crawlOrderFile;
1116: } else {
1117: status = "Crawl job ready and pending: "
1118: + crawlOrderFile;
1119: }
1120: } else if (runMode) {
1121: // The use case is that jobs are to be run on a schedule and that
1122: // if the crawler is in run mode, then the scheduled job will be
1123: // run at appropriate time. Otherwise, not.
1124: this .jobHandler.startCrawler();
1125: status = "Crawler set to run mode.";
1126: }
1127: return status;
1128: }
1129:
1130: /**
1131: * Start up the embedded Jetty webserver instance.
1132: * This is done when we're run from the command-line.
1133: * @param port Port number to use for web UI.
1134: * @param adminLoginPassword Compound of login and password.
1135: * @throws Exception
1136: * @return Status on webserver startup.
1137: * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword)
1138: */
1139: protected static String startEmbeddedWebserver(final int port,
1140: final boolean lho, final String adminLoginPassword)
1141: throws Exception {
1142: ArrayList<String> hosts = new ArrayList<String>();
1143: if (lho) {
1144: hosts.add("127.0.0.1");
1145: }
1146: return startEmbeddedWebserver(hosts, port, adminLoginPassword);
1147: }
1148:
1149: /**
1150: * Parses a list of host names.
1151: *
1152: * <p>If the given string is <code>/</code>, then an empty
1153: * collection is returned. This indicates that all available network
1154: * interfaces should be used.
1155: *
1156: * <p>Otherwise, the string must contain a comma-separated list of
1157: * IP addresses or host names. The parsed list is then returned.
1158: *
1159: * @param hosts the string to parse
1160: * @return the parsed collection of hosts
1161: */
1162: private static Collection<String> parseHosts(String hosts) {
1163: hosts = hosts.trim();
1164: if (hosts.equals("/")) {
1165: return new ArrayList<String>(1);
1166: }
1167: String[] hostArray = hosts.split(",");
1168: for (int i = 0; i < hostArray.length; i++) {
1169: hostArray[i] = hostArray[i].trim();
1170: }
1171: return Arrays.asList(hostArray);
1172: }
1173:
1174: /**
1175: * Start up the embedded Jetty webserver instance.
1176: * This is done when we're run from the command-line.
1177: *
1178: * @param hosts a list of IP addresses or hostnames to bind to, or an
1179: * empty collection to bind to all available network
1180: * interfaces
1181: * @param port Port number to use for web UI.
1182: * @param adminLoginPassword Compound of login and password.
1183: * @throws Exception
1184: * @return Status on webserver startup.
1185: */
1186: protected static String startEmbeddedWebserver(
1187: Collection<String> hosts, int port,
1188: String adminLoginPassword) throws Exception {
1189: adminUsername = adminLoginPassword.substring(0,
1190: adminLoginPassword.indexOf(":"));
1191: adminPassword = adminLoginPassword.substring(adminLoginPassword
1192: .indexOf(":") + 1);
1193: Heritrix.httpServer = new SimpleHttpServer("admin",
1194: Heritrix.adminContext, hosts, port, false);
1195:
1196: final String DOTWAR = ".war";
1197: final String SELFTEST = "selftest";
1198:
1199: // Look for additional WAR files beyond 'selftest' and 'admin'.
1200: File[] wars = getWarsdir().listFiles();
1201: for (int i = 0; i < wars.length; i++) {
1202: if (wars[i].isFile()) {
1203: final String warName = wars[i].getName();
1204: final String warNameNC = warName.toLowerCase();
1205: if (warNameNC.endsWith(DOTWAR)
1206: && !warNameNC.equals(ADMIN + DOTWAR)
1207: && !warNameNC.equals(SELFTEST + DOTWAR)) {
1208: int dot = warName.indexOf('.');
1209: Heritrix.httpServer.addWebapp(warName.substring(0,
1210: dot), null, true);
1211: }
1212: }
1213: }
1214:
1215: // Name of passed 'realm' must match what is in configured in web.xml.
1216: // We'll use ROLE for 'realm' and 'role'.
1217: final String ROLE = ADMIN;
1218: Heritrix.httpServer.setAuthentication(ROLE,
1219: Heritrix.adminContext, adminUsername, adminPassword,
1220: ROLE);
1221: Heritrix.httpServer.startServer();
1222: StringBuffer buffer = new StringBuffer();
1223: buffer.append("Heritrix " + Heritrix.getVersion()
1224: + " is running.");
1225: for (String host : httpServer.getHosts()) {
1226: buffer.append("\nWeb console is at: http://");
1227: buffer.append(host).append(':').append(port);
1228: }
1229: buffer.append("\nWeb console login and password: "
1230: + adminUsername + "/" + adminPassword);
1231: return buffer.toString();
1232: }
1233:
1234: /**
1235: * Replace existing administrator login info with new info.
1236: *
1237: * @param newUsername new administrator login username
1238: * @param newPassword new administrator login password
1239: */
1240: public static void resetAuthentication(String newUsername,
1241: String newPassword) {
1242: Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
1243: newUsername, newPassword);
1244: adminUsername = newUsername;
1245: adminPassword = newPassword;
1246: logger.info("administrative login changed to " + newUsername
1247: + ":" + newPassword);
1248: }
1249:
1250: protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
1251: File crawlOrderFile, String name)
1252: throws InvalidAttributeValueException {
1253: XMLSettingsHandler settings = new XMLSettingsHandler(
1254: crawlOrderFile);
1255: settings.initialize();
1256: return new CrawlJob(handler.getNextJobUID(), name, settings,
1257: new CrawlJobErrorHandler(Level.SEVERE),
1258: CrawlJob.PRIORITY_HIGH, crawlOrderFile
1259: .getAbsoluteFile().getParentFile());
1260: }
1261:
1262: /**
1263: * This method is called when we have an order file to hand that we want
1264: * to base a job on. It leaves the order file in place and just starts up
1265: * a job that uses all the order points to for locations for logs, etc.
1266: * @param orderPathOrUrl Path to an order file or to a seeds file.
1267: * @param name Name to use for this job.
1268: * @param description
1269: * @param seeds
1270: * @return A status string.
1271: * @throws IOException
1272: * @throws FatalConfigurationException
1273: */
1274: public String addCrawlJob(String orderPathOrUrl, String name,
1275: String description, String seeds) throws IOException,
1276: FatalConfigurationException {
1277: if (!UURI.hasScheme(orderPathOrUrl)) {
1278: // Assume its a file path.
1279: return addCrawlJob(new File(orderPathOrUrl), name,
1280: description, seeds);
1281: }
1282:
1283: // Otherwise, must be an URL.
1284: URL url = new URL(orderPathOrUrl);
1285:
1286: // Handle http and file only for now (Tried to handle JarUrlConnection
1287: // but too awkward undoing jar stream. Rather just look for URLs that
1288: // end in '.jar').
1289: String result = null;
1290: URLConnection connection = url.openConnection();
1291: if (connection instanceof HttpURLConnection) {
1292: result = addCrawlJob(url, (HttpURLConnection) connection,
1293: name, description, seeds);
1294: } else if (connection instanceof FileURLConnection) {
1295: result = addCrawlJob(new File(url.getPath()), name,
1296: description, seeds);
1297: } else {
1298: throw new UnsupportedOperationException("No support for "
1299: + connection);
1300: }
1301:
1302: return result;
1303: }
1304:
1305: protected String addCrawlJob(final URL url,
1306: final HttpURLConnection connection, final String name,
1307: final String description, final String seeds)
1308: throws IOException, FatalConfigurationException {
1309: // Look see if its a jar file. If it is undo it.
1310: boolean isJar = url.getPath() != null
1311: && url.getPath().toLowerCase().endsWith(JAR_SUFFIX);
1312: // If http url connection, bring down the resource local.
1313: File localFile = File.createTempFile(Heritrix.class.getName(),
1314: isJar ? JAR_SUFFIX : null, TMPDIR);
1315: connection.connect();
1316: String result = null;
1317: try {
1318: IoUtils.readFullyToFile(connection.getInputStream(),
1319: localFile);
1320: result = addCrawlJob(localFile, name, description, seeds);
1321: } catch (IOException ioe) {
1322: // Cleanup if an Exception.
1323: localFile.delete();
1324: localFile = null;
1325: } finally {
1326: connection.disconnect();
1327: // If its a jar file, then we made a job based on the jar contents.
1328: // Its no longer needed. Remove it. If not a jar file, then leave
1329: // the file around because the job depends on it.
1330: if (isJar && localFile != null && localFile.exists()) {
1331: localFile.delete();
1332: }
1333: }
1334: return result;
1335: }
1336:
1337: protected String addCrawlJob(final File order, final String name,
1338: final String description, final String seeds)
1339: throws FatalConfigurationException, IOException {
1340: CrawlJob addedJob = null;
1341: if (this .jobHandler == null) {
1342: throw new NullPointerException(
1343: "Heritrix jobhandler is null.");
1344: }
1345: try {
1346: if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {
1347: return addCrawlJobBasedonJar(order, name, description,
1348: seeds);
1349: }
1350: addedJob = this .jobHandler.addJob(createCrawlJob(
1351: this .jobHandler, order, name));
1352: } catch (InvalidAttributeValueException e) {
1353: FatalConfigurationException fce = new FatalConfigurationException(
1354: "Converted InvalidAttributeValueException on "
1355: + order.getAbsolutePath() + ": "
1356: + e.getMessage());
1357: fce.setStackTrace(e.getStackTrace());
1358: }
1359: return addedJob != null ? addedJob.getUID() : null;
1360: }
1361:
1362: /**
1363: * Undo jar file and use as basis for a new job.
1364: * @param jarFile Pointer to file that holds jar.
1365: * @param name Name to use for new job.
1366: * @param description
1367: * @param seeds
1368: * @return Message.
1369: * @throws IOException
1370: * @throws FatalConfigurationException
1371: */
1372: protected String addCrawlJobBasedonJar(final File jarFile,
1373: final String name, final String description,
1374: final String seeds) throws IOException,
1375: FatalConfigurationException {
1376: if (jarFile == null || !jarFile.exists()) {
1377: throw new FileNotFoundException(jarFile.getAbsolutePath());
1378: }
1379: // Create a directory with a tmp name. Do it by first creating file,
1380: // removing it, then creating the directory. There is a hole during
1381: // which the OS may put a file of same exact name in our way but
1382: // unlikely.
1383: File dir = File.createTempFile(Heritrix.class.getName(),
1384: ".expandedjar", TMPDIR);
1385: dir.delete();
1386: dir.mkdir();
1387: try {
1388: org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
1389: // Expect to find an order file at least.
1390: File orderFile = new File(dir, "order.xml");
1391: if (!orderFile.exists()) {
1392: throw new IOException("Missing order: "
1393: + orderFile.getAbsolutePath());
1394: }
1395: CrawlJob job = createCrawlJobBasedOn(orderFile, name,
1396: description, seeds);
1397: // Copy into place any seeds and settings directories before we
1398: // add job to Heritrix to crawl.
1399: File seedsFile = new File(dir, "seeds.txt");
1400: if (seedsFile.exists()) {
1401: FileUtils.copyFiles(seedsFile, new File(job
1402: .getDirectory(), seedsFile.getName()));
1403: }
1404: File settingsDir = new File(dir, "settings");
1405: if (settingsDir.exists()) {
1406: FileUtils.copyFiles(settingsDir, job.getDirectory());
1407: }
1408: addCrawlJob(job);
1409: return job.getUID();
1410: } finally {
1411: // After job has been added, no more need of expanded content.
1412: // (Let the caller be responsible for cleanup of jar. Sometimes
1413: // its should be deleted -- when its a local copy of a jar pulled
1414: // across the net -- wherease other times, if its a jar passed
1415: // in w/ a 'file' scheme, it shouldn't be deleted.
1416: org.archive.util.FileUtils.deleteDir(dir);
1417: }
1418: }
1419:
1420: public String addCrawlJobBasedOn(String jobUidOrProfile,
1421: String name, String description, String seeds) {
1422: try {
1423: CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
1424: if (cj == null) {
1425: throw new InvalidAttributeValueException(
1426: jobUidOrProfile
1427: + " is not a job UID or profile name (Job UIDs are "
1428: + " usually the 14 digit date portion of job name).");
1429: }
1430: CrawlJob job = addCrawlJobBasedOn(cj.getSettingsHandler()
1431: .getOrderFile(), name, description, seeds);
1432: return job.getUID();
1433: } catch (Exception e) {
1434: e.printStackTrace();
1435: return "Exception on " + jobUidOrProfile + ": "
1436: + e.getMessage();
1437: }
1438: }
1439:
1440: protected CrawlJob addCrawlJobBasedOn(final File orderFile,
1441: final String name, final String description,
1442: final String seeds) throws FatalConfigurationException {
1443: return addCrawlJob(createCrawlJobBasedOn(orderFile, name,
1444: description, seeds));
1445: }
1446:
1447: protected CrawlJob createCrawlJobBasedOn(final File orderFile,
1448: final String name, final String description,
1449: final String seeds) throws FatalConfigurationException {
1450: CrawlJob job = getJobHandler().newJob(orderFile, name,
1451: description, seeds);
1452: return CrawlJobHandler.ensureNewJobWritten(job, name,
1453: description);
1454: }
1455:
1456: protected CrawlJob addCrawlJob(final CrawlJob job) {
1457: return getJobHandler().addJob(job);
1458: }
1459:
1460: public void startCrawling() {
1461: if (getJobHandler() == null) {
1462: throw new NullPointerException(
1463: "Heritrix jobhandler is null.");
1464: }
1465: getJobHandler().startCrawler();
1466: }
1467:
1468: public void stopCrawling() {
1469: if (getJobHandler() == null) {
1470: throw new NullPointerException(
1471: "Heritrix jobhandler is null.");
1472: }
1473: getJobHandler().stopCrawler();
1474: }
1475:
1476: /**
1477: * Get the heritrix version.
1478: *
1479: * @return The heritrix version. May be null.
1480: */
1481: public static String getVersion() {
1482: return System.getProperty("heritrix.version");
1483: }
1484:
1485: /**
1486: * Get the job handler
1487: *
1488: * @return The CrawlJobHandler being used.
1489: */
1490: public CrawlJobHandler getJobHandler() {
1491: return this .jobHandler;
1492: }
1493:
1494: /**
1495: * Get the configuration directory.
1496: * @return The conf directory under HERITRIX_HOME or null if none can
1497: * be found.
1498: * @throws IOException
1499: */
1500: public static File getConfdir() throws IOException {
1501: return getConfdir(true);
1502: }
1503:
1504: /**
1505: * Get the configuration directory.
1506: * @param fail Throw IOE if can't find directory if true, else just
1507: * return null.
1508: * @return The conf directory under HERITRIX_HOME or null (or an IOE) if
1509: * can't be found.
1510: * @throws IOException
1511: */
1512: public static File getConfdir(final boolean fail)
1513: throws IOException {
1514: final String key = "heritrix.conf";
1515: // Look to see if heritrix.conf property passed on the cmd-line.
1516: String tmp = System.getProperty(key);
1517: // if not fall back to default $HERITIX_HOME/conf
1518: if (tmp == null || tmp.length() == 0) {
1519: return getSubDir("conf", fail);
1520: }
1521: File dir = new File(tmp);
1522: if (!dir.exists()) {
1523: if (fail) {
1524: throw new IOException("Cannot find conf dir: " + tmp);
1525: } else {
1526: logger
1527: .log(
1528: Level.WARNING,
1529: "Specified "
1530: + key
1531: + " dir does not exist. Falling back on default");
1532: }
1533: dir = getSubDir("conf", fail);
1534: }
1535: return dir;
1536: }
1537:
1538: /**
1539: * @return Returns the httpServer. May be null if one was not started.
1540: */
1541: public static SimpleHttpServer getHttpServer() {
1542: return Heritrix.httpServer;
1543: }
1544:
1545: /**
1546: * @throws IOException
1547: * @return Returns the directory under which reside the WAR files
1548: * we're to load into the servlet container.
1549: */
1550: public static File getWarsdir() throws IOException {
1551: return getSubDir("webapps");
1552: }
1553:
1554: /**
1555: * Prepars for program shutdown. This method does it's best to prepare the
1556: * program so that it can exit normally. It will kill the httpServer and
1557: * terminate any running job.<br>
1558: * It is advisible to wait a few (~1000) millisec after calling this method
1559: * and before calling performHeritrixShutDown() to allow as many threads as
1560: * possible to finish what they are doing.
1561: */
1562: public static void prepareHeritrixShutDown() {
1563: // Stop and destroy all running Heritrix instances.
1564: // Get array of the key set to avoid CCEs for case where call to
1565: // destroy does a remove of an instance from Heritrix.instances.
1566: final Object[] keys = Heritrix.instances.keySet().toArray();
1567: for (int i = 0; i < keys.length; i++) {
1568: ((Heritrix) Heritrix.instances.get(keys[i])).destroy();
1569: }
1570:
1571: try {
1572: deregisterJndi(getJndiContainerName());
1573: } catch (NameNotFoundException e) {
1574: // We were probably unbound already. Ignore.
1575: logger.log(Level.WARNING, "deregistration of jndi", e);
1576: } catch (Exception e) {
1577: e.printStackTrace();
1578: }
1579:
1580: if (Heritrix.httpServer != null) {
1581: // Shut down the web access.
1582: try {
1583: Heritrix.httpServer.stopServer();
1584: } catch (InterruptedException e) {
1585: // Generally this can be ignored, but we'll print a stack trace
1586: // just in case.
1587: e.printStackTrace();
1588: } finally {
1589: Heritrix.httpServer = null;
1590: }
1591: }
1592: }
1593:
1594: /**
1595: * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1596: * prior to this method.
1597: */
1598: public static void performHeritrixShutDown() {
1599: performHeritrixShutDown(0);
1600: }
1601:
1602: /**
1603: * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1604: * prior to this method.
1605: *
1606: * @param exitCode Code to pass System.exit.
1607: *
1608: */
1609: public static void performHeritrixShutDown(int exitCode) {
1610: System.exit(exitCode);
1611: }
1612:
1613: /**
1614: * Shutdown all running heritrix instances and the JVM.
1615: * Assumes stop has already been called.
1616: * @param exitCode Exit code to pass system exit.
1617: */
1618: public static void shutdown(final int exitCode) {
1619: getShutdownThread(true, exitCode, "Heritrix shutdown").start();
1620: }
1621:
1622: protected static Thread getShutdownThread(final boolean sysexit,
1623: final int exitCode, final String name) {
1624: Thread t = new Thread(name) {
1625: public void run() {
1626: Heritrix.prepareHeritrixShutDown();
1627: if (sysexit) {
1628: Heritrix.performHeritrixShutDown(exitCode);
1629: }
1630: }
1631: };
1632: t.setDaemon(true);
1633: return t;
1634: }
1635:
1636: public static void shutdown() {
1637: shutdown(0);
1638: }
1639:
1640: /**
1641: * Register Heritrix with JNDI, JMX, and with the static hashtable of all
1642: * Heritrix instances known to this JVM.
1643: *
1644: * If launched from cmdline, register Heritrix MBean if an agent to register
1645: * ourselves with. Usually this method will only have effect if we're
1646: * running in a 1.5.0 JDK and command line options such as
1647: * '-Dcom.sun.management.jmxremote.port=8082
1648: * -Dcom.sun.management.jmxremote.authenticate=false
1649: * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
1650: * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
1651: * and Management Using JMX</a>
1652: * for more on the command line options and how to connect to the
1653: * Heritrix bean using the JDK 1.5.0 jconsole tool. We register currently
1654: * with first server we find (TODO: Make configurable).
1655: *
1656: * <p>If we register successfully with a JMX agent, then part of the
1657: * registration will include our registering ourselves with JNDI.
1658: *
1659: * <p>Finally, add the heritrix instance to the hashtable of all the
1660: * Heritrix instances floating in the current VM. This latter registeration
1661: * happens whether or no there is a JMX agent to register with. This is
1662: * a list we keep out of convenience so its easy iterating over all
1663: * all instances calling stop when main application is going down.
1664: *
1665: * @param h Instance of heritrix to register.
1666: * @param name Name to use for this Heritrix instance.
1667: * @param jmxregister True if we are to register this instance with JMX.
1668: * @throws NullPointerException
1669: * @throws MalformedObjectNameException
1670: * @throws NotCompliantMBeanException
1671: * @throws MBeanRegistrationException
1672: * @throws InstanceAlreadyExistsException
1673: */
1674: protected static void registerHeritrix(final Heritrix h,
1675: final String name, final boolean jmxregister)
1676: throws MalformedObjectNameException,
1677: InstanceAlreadyExistsException, MBeanRegistrationException,
1678: NotCompliantMBeanException {
1679: MBeanServer server = getMBeanServer();
1680: if (server != null) {
1681: // Are we to manage the jmx registration? Or is it being done for
1682: // us by an external process: e.g. This instance was created by
1683: // MBeanAgent.
1684: if (jmxregister) {
1685: ObjectName objName = (name == null || name.length() <= 0) ? getJmxObjectName()
1686: : getJmxObjectName(name);
1687: registerMBean(server, h, objName);
1688: }
1689: } else {
1690: // JMX ain't available. Put this instance into the list of Heritrix
1691: // instances so findable by the UI (Normally this is done in the
1692: // JMX postRegister routine below). When no JMX, can only have
1693: // one instance of Heritrix so no need to do the deregisteration.
1694: Heritrix.instances.put(h.getNoJmxName(), h);
1695: }
1696: }
1697:
1698: protected static void unregisterHeritrix(final Heritrix h)
1699: throws InstanceNotFoundException,
1700: MBeanRegistrationException, NullPointerException {
1701: MBeanServer server = getMBeanServer();
1702: if (server != null) {
1703: server.unregisterMBean(h.mbeanName);
1704: } else {
1705: // JMX ain't available. Remove from list of Heritrix instances.
1706: // Usually this is done by the JMX postDeregister below.
1707: Heritrix.instances.remove(h.getNoJmxName());
1708: }
1709: }
1710:
1711: /**
1712: * Get MBeanServer.
1713: * Currently uses first MBeanServer found. This will definetly not be whats
1714: * always wanted. TODO: Make which server settable. Also, if none, put up
1715: * our own MBeanServer.
1716: * @return An MBeanServer to register with or null.
1717: */
1718: public static MBeanServer getMBeanServer() {
1719: MBeanServer result = null;
1720: List servers = MBeanServerFactory.findMBeanServer(null);
1721: if (servers == null) {
1722: return result;
1723: }
1724: for (Iterator i = servers.iterator(); i.hasNext();) {
1725: MBeanServer server = (MBeanServer) i.next();
1726: if (server == null) {
1727: continue;
1728: }
1729: result = server;
1730: break;
1731: }
1732: return result;
1733: }
1734:
1735: public static MBeanServer registerMBean(final Object objToRegister,
1736: final String name, final String type)
1737: throws InstanceAlreadyExistsException,
1738: MBeanRegistrationException, NotCompliantMBeanException {
1739: MBeanServer server = getMBeanServer();
1740: if (server != null) {
1741: server = registerMBean(server, objToRegister, name, type);
1742: }
1743: return server;
1744: }
1745:
1746: public static MBeanServer registerMBean(final MBeanServer server,
1747: final Object objToRegister, final String name,
1748: final String type) throws InstanceAlreadyExistsException,
1749: MBeanRegistrationException, NotCompliantMBeanException {
1750: try {
1751: Hashtable<String, String> ht = new Hashtable<String, String>();
1752: ht.put(JmxUtils.NAME, name);
1753: ht.put(JmxUtils.TYPE, type);
1754: registerMBean(server, objToRegister, new ObjectName(
1755: CRAWLER_PACKAGE, ht));
1756: } catch (MalformedObjectNameException e) {
1757: e.printStackTrace();
1758: }
1759: return server;
1760: }
1761:
1762: public static MBeanServer registerMBean(final MBeanServer server,
1763: final Object objToRegister, final ObjectName objName)
1764: throws InstanceAlreadyExistsException,
1765: MBeanRegistrationException, NotCompliantMBeanException {
1766: server.registerMBean(objToRegister, objName);
1767: return server;
1768: }
1769:
1770: public static void unregisterMBean(final MBeanServer server,
1771: final String name, final String type) {
1772: if (server == null) {
1773: return;
1774: }
1775: try {
1776: unregisterMBean(server, getJmxObjectName(name, type));
1777: } catch (MalformedObjectNameException e) {
1778: e.printStackTrace();
1779: }
1780: }
1781:
1782: public static void unregisterMBean(final MBeanServer server,
1783: final ObjectName name) {
1784: try {
1785: server.unregisterMBean(name);
1786: logger.info("Unregistered bean " + name.getCanonicalName());
1787: } catch (InstanceNotFoundException e) {
1788: e.printStackTrace();
1789: } catch (MBeanRegistrationException e) {
1790: e.printStackTrace();
1791: } catch (NullPointerException e) {
1792: e.printStackTrace();
1793: }
1794: }
1795:
1796: /**
1797: * @return Name to use when no JMX agent available.
1798: */
1799: protected String getNoJmxName() {
1800: return this .getClass().getName();
1801: }
1802:
1803: public static ObjectName getJmxObjectName()
1804: throws MalformedObjectNameException, NullPointerException {
1805: return getJmxObjectName("Heritrix", JmxUtils.SERVICE);
1806: }
1807:
1808: public static ObjectName getJmxObjectName(final String name)
1809: throws MalformedObjectNameException, NullPointerException {
1810: return getJmxObjectName(name, JmxUtils.SERVICE);
1811: }
1812:
1813: public static ObjectName getJmxObjectName(final String name,
1814: final String type) throws MalformedObjectNameException,
1815: NullPointerException {
1816: Hashtable<String, String> ht = new Hashtable<String, String>();
1817: ht.put(JmxUtils.NAME, name);
1818: ht.put(JmxUtils.TYPE, type);
1819: return new ObjectName(CRAWLER_PACKAGE, ht);
1820: }
1821:
1822: /**
1823: * @return Returns true if Heritrix was launched from the command line.
1824: * (When launched from command line, we do stuff like put up a web server
1825: * to manage our web interface and we register ourselves with the first
1826: * available jmx agent).
1827: */
1828: public static boolean isCommandLine() {
1829: return Heritrix.commandLine;
1830: }
1831:
1832: /**
1833: * @return True if heritrix has been started.
1834: */
1835: public boolean isStarted() {
1836: return this .jobHandler != null;
1837: }
1838:
1839: public String getStatus() {
1840: StringBuffer buffer = new StringBuffer();
1841: if (this .getJobHandler() != null) {
1842: buffer.append("isRunning=");
1843: buffer.append(this .getJobHandler().isRunning());
1844: buffer.append(" isCrawling=");
1845: buffer.append(this .getJobHandler().isCrawling());
1846: buffer.append(" alertCount=");
1847: buffer.append(getAlertsCount());
1848: buffer.append(" newAlertCount=");
1849: buffer.append(getNewAlertsCount());
1850: if (this .getJobHandler().isCrawling()) {
1851: buffer.append(" currentJob=");
1852: buffer.append(this .getJobHandler().getCurrentJob()
1853: .getJmxJobName());
1854: }
1855: }
1856: return buffer.toString();
1857: }
1858:
1859: // Alert methods.
1860: public int getAlertsCount() {
1861: return this .alertManager.getCount();
1862: }
1863:
1864: public int getNewAlertsCount() {
1865: return this .alertManager.getNewCount();
1866: }
1867:
1868: public Vector getAlerts() {
1869: return this .alertManager.getAll();
1870: }
1871:
1872: public Vector getNewAlerts() {
1873: return this .alertManager.getNewAll();
1874: }
1875:
1876: public SinkHandlerLogRecord getAlert(final String id) {
1877: return this .alertManager.get(id);
1878: }
1879:
1880: public void readAlert(final String id) {
1881: this .alertManager.read(id);
1882: }
1883:
1884: public void removeAlert(final String id) {
1885: this .alertManager.remove(id);
1886: }
1887:
1888: /**
1889: * Start Heritrix.
1890: *
1891: * Used by JMX and webapp initialization for starting Heritrix.
1892: * Not by the cmdline launched Heritrix. Idempotent.
1893: * If start is called by JMX, then new instance of Heritrix is automatically
1894: * registered w/ JMX Agent. If started by webapp, need to register the new
1895: * Heritrix instance.
1896: */
1897: public void start() {
1898: // Don't start if we've been launched from the command line.
1899: // Don't start if already started.
1900: if (!Heritrix.isCommandLine() && !isStarted()) {
1901: try {
1902: logger.info(launch());
1903: } catch (Exception e) {
1904: e.printStackTrace();
1905: }
1906: }
1907: }
1908:
1909: /**
1910: * Stop Heritrix.
1911: *
1912: * Used by JMX and webapp initialization for stopping Heritrix.
1913: */
1914: public void stop() {
1915: if (this .jobHandler != null) {
1916: this .jobHandler.stop();
1917: }
1918: }
1919:
1920: public String interrupt(String threadName) {
1921: String result = "Thread " + threadName + " not found";
1922: ThreadGroup group = Thread.currentThread().getThreadGroup();
1923: if (group == null) {
1924: return result;
1925: }
1926: // Back up to the root threadgroup before starting
1927: // to iterate over threads.
1928: ThreadGroup parent = null;
1929: while ((parent = group.getParent()) != null) {
1930: group = parent;
1931: }
1932: // Do an array that is twice the size of active
1933: // thread count. That should be big enough.
1934: final int max = group.activeCount() * 2;
1935: Thread[] threads = new Thread[max];
1936: int threadCount = group.enumerate(threads, true);
1937: if (threadCount >= max) {
1938: logger.info("Some threads not found...array too small: "
1939: + max);
1940: }
1941: for (int j = 0; j < threadCount; j++) {
1942: if (threads[j].getName().equals(threadName)) {
1943: threads[j].interrupt();
1944: result = "Interrupt sent to " + threadName;
1945: break;
1946: }
1947: }
1948: return result;
1949: }
1950:
1951: // OpenMBean implementation.
1952:
1953: /**
1954: * Build up the MBean info for Heritrix main.
1955: * @return Return created mbean info instance.
1956: */
1957: protected OpenMBeanInfoSupport buildMBeanInfo() {
1958: OpenMBeanAttributeInfoSupport[] attributes = new OpenMBeanAttributeInfoSupport[Heritrix.ATTRIBUTE_LIST
1959: .size()];
1960: OpenMBeanConstructorInfoSupport[] constructors = new OpenMBeanConstructorInfoSupport[1];
1961: OpenMBeanOperationInfoSupport[] operations = new OpenMBeanOperationInfoSupport[Heritrix.OPERATION_LIST
1962: .size()];
1963: MBeanNotificationInfo[] notifications = new MBeanNotificationInfo[0];
1964:
1965: // Attributes.
1966: attributes[0] = new OpenMBeanAttributeInfoSupport(
1967: Heritrix.STATUS_ATTR, "Short basic status message",
1968: SimpleType.STRING, true, false, false);
1969: // Attributes.
1970: attributes[1] = new OpenMBeanAttributeInfoSupport(
1971: Heritrix.VERSION_ATTR, "Heritrix version",
1972: SimpleType.STRING, true, false, false);
1973: // Attributes.
1974: attributes[2] = new OpenMBeanAttributeInfoSupport(
1975: Heritrix.ISRUNNING_ATTR,
1976: "Whether the crawler is running", SimpleType.BOOLEAN,
1977: true, false, false);
1978: // Attributes.
1979: attributes[3] = new OpenMBeanAttributeInfoSupport(
1980: Heritrix.ISCRAWLING_ATTR,
1981: "Whether the crawler is crawling", SimpleType.BOOLEAN,
1982: true, false, false);
1983: // Attributes.
1984: attributes[4] = new OpenMBeanAttributeInfoSupport(
1985: Heritrix.ALERTCOUNT_ATTR, "The number of alerts",
1986: SimpleType.INTEGER, true, false, false);
1987: // Attributes.
1988: attributes[5] = new OpenMBeanAttributeInfoSupport(
1989: Heritrix.NEWALERTCOUNT_ATTR,
1990: "The number of new alerts", SimpleType.INTEGER, true,
1991: false, false);
1992: // Attributes.
1993: attributes[6] = new OpenMBeanAttributeInfoSupport(
1994: Heritrix.CURRENTJOB_ATTR,
1995: "The name of the job currently being crawled",
1996: SimpleType.STRING, true, false, false);
1997:
1998: // Constructors.
1999: constructors[0] = new OpenMBeanConstructorInfoSupport(
2000: "HeritrixOpenMBean",
2001: "Constructs Heritrix OpenMBean instance ",
2002: new OpenMBeanParameterInfoSupport[0]);
2003:
2004: // Operations.
2005: operations[0] = new OpenMBeanOperationInfoSupport(
2006: Heritrix.START_OPER, "Start Heritrix instance", null,
2007: SimpleType.VOID, MBeanOperationInfo.ACTION);
2008:
2009: operations[1] = new OpenMBeanOperationInfoSupport(
2010: Heritrix.STOP_OPER, "Stop Heritrix instance", null,
2011: SimpleType.VOID, MBeanOperationInfo.ACTION);
2012:
2013: OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[1];
2014: args[0] = new OpenMBeanParameterInfoSupport("threadName",
2015: "Name of thread to send interrupt", SimpleType.STRING);
2016: operations[2] = new OpenMBeanOperationInfoSupport(
2017: Heritrix.INTERRUPT_OPER, "Send thread an interrupt "
2018: + "(Used debugging)", args, SimpleType.STRING,
2019: MBeanOperationInfo.ACTION_INFO);
2020:
2021: operations[3] = new OpenMBeanOperationInfoSupport(
2022: Heritrix.START_CRAWLING_OPER, "Set Heritrix instance "
2023: + "into crawling mode", null, SimpleType.VOID,
2024: MBeanOperationInfo.ACTION);
2025:
2026: operations[4] = new OpenMBeanOperationInfoSupport(
2027: Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance "
2028: + " crawling mode", null, SimpleType.VOID,
2029: MBeanOperationInfo.ACTION);
2030:
2031: args = new OpenMBeanParameterInfoSupport[4];
2032: args[0] = new OpenMBeanParameterInfoSupport("pathOrURL",
2033: "Path/URL to order or jar of order+seed",
2034: SimpleType.STRING);
2035: args[1] = new OpenMBeanParameterInfoSupport("name",
2036: "Basename for new job", SimpleType.STRING);
2037: args[2] = new OpenMBeanParameterInfoSupport("description",
2038: "Description to save with new job", SimpleType.STRING);
2039: args[3] = new OpenMBeanParameterInfoSupport("seeds",
2040: "Initial seed(s)", SimpleType.STRING);
2041: operations[5] = new OpenMBeanOperationInfoSupport(
2042: Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args,
2043: SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2044:
2045: args = new OpenMBeanParameterInfoSupport[4];
2046: args[0] = new OpenMBeanParameterInfoSupport("uidOrName",
2047: "Job UID or profile name", SimpleType.STRING);
2048: args[1] = new OpenMBeanParameterInfoSupport("name",
2049: "Basename for new job", SimpleType.STRING);
2050: args[2] = new OpenMBeanParameterInfoSupport("description",
2051: "Description to save with new job", SimpleType.STRING);
2052: args[3] = new OpenMBeanParameterInfoSupport("seeds",
2053: "Initial seed(s)", SimpleType.STRING);
2054: operations[6] = new OpenMBeanOperationInfoSupport(
2055: Heritrix.ADD_CRAWL_JOB_BASEDON_OPER,
2056: "Add a new crawl job based on passed Job UID or profile",
2057: args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2058:
2059: args = new OpenMBeanParameterInfoSupport[1];
2060: args[0] = new OpenMBeanParameterInfoSupport("UID", "Job UID",
2061: SimpleType.STRING);
2062: operations[7] = new OpenMBeanOperationInfoSupport(
2063: DELETE_CRAWL_JOB_OPER, "Delete/stop this crawl job",
2064: args, SimpleType.VOID, MBeanOperationInfo.ACTION);
2065:
2066: args = new OpenMBeanParameterInfoSupport[1];
2067: args[0] = new OpenMBeanParameterInfoSupport("index",
2068: "Zero-based index into array of alerts",
2069: SimpleType.INTEGER);
2070: operations[8] = new OpenMBeanOperationInfoSupport(
2071: Heritrix.ALERT_OPER, "Return alert at passed index",
2072: args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2073:
2074: try {
2075: this .jobCompositeType = new CompositeType("job",
2076: "Job attributes", JOB_KEYS,
2077: new String[] { "Job unique ID", "Job name",
2078: "Job status" }, new OpenType[] {
2079: SimpleType.STRING, SimpleType.STRING,
2080: SimpleType.STRING });
2081: this .jobsTabularType = new TabularType("jobs",
2082: "List of jobs", this .jobCompositeType,
2083: new String[] { "uid" });
2084: } catch (OpenDataException e) {
2085: // This should never happen.
2086: throw new RuntimeException(e);
2087: }
2088: operations[9] = new OpenMBeanOperationInfoSupport(
2089: Heritrix.PENDING_JOBS_OPER,
2090: "List of pending jobs (or null if none)", null,
2091: this .jobsTabularType, MBeanOperationInfo.INFO);
2092: operations[10] = new OpenMBeanOperationInfoSupport(
2093: Heritrix.COMPLETED_JOBS_OPER,
2094: "List of completed jobs (or null if none)", null,
2095: this .jobsTabularType, MBeanOperationInfo.INFO);
2096:
2097: args = new OpenMBeanParameterInfoSupport[2];
2098: args[0] = new OpenMBeanParameterInfoSupport("uid",
2099: "Job unique ID", SimpleType.STRING);
2100: args[1] = new OpenMBeanParameterInfoSupport("name",
2101: "Report name (e.g. crawl-report, etc.)",
2102: SimpleType.STRING);
2103: operations[11] = new OpenMBeanOperationInfoSupport(
2104: Heritrix.CRAWLEND_REPORT_OPER,
2105: "Return crawl-end report", args, SimpleType.STRING,
2106: MBeanOperationInfo.ACTION_INFO);
2107:
2108: operations[12] = new OpenMBeanOperationInfoSupport(
2109: Heritrix.SHUTDOWN_OPER, "Shutdown container", null,
2110: SimpleType.VOID, MBeanOperationInfo.ACTION);
2111:
2112: args = new OpenMBeanParameterInfoSupport[2];
2113: args[0] = new OpenMBeanParameterInfoSupport("level",
2114: "Log level: e.g. SEVERE, WARNING, etc.",
2115: SimpleType.STRING);
2116: args[1] = new OpenMBeanParameterInfoSupport("message",
2117: "Log message", SimpleType.STRING);
2118: operations[13] = new OpenMBeanOperationInfoSupport(
2119: Heritrix.LOG_OPER, "Add a log message", args,
2120: SimpleType.VOID, MBeanOperationInfo.ACTION);
2121:
2122: operations[14] = new OpenMBeanOperationInfoSupport(
2123: Heritrix.DESTROY_OPER, "Destroy Heritrix instance",
2124: null, SimpleType.VOID, MBeanOperationInfo.ACTION);
2125:
2126: operations[15] = new OpenMBeanOperationInfoSupport(
2127: Heritrix.TERMINATE_CRAWL_JOB_OPER,
2128: "Returns false if no current job", null,
2129: SimpleType.BOOLEAN, MBeanOperationInfo.ACTION);
2130:
2131: operations[16] = new OpenMBeanOperationInfoSupport(
2132: Heritrix.REBIND_JNDI_OPER,
2133: "Rebinds this Heritrix with JNDI.", null,
2134: SimpleType.VOID, MBeanOperationInfo.ACTION);
2135:
2136: // Build the info object.
2137: return new OpenMBeanInfoSupport(this .getClass().getName(),
2138: "Heritrix Main OpenMBean", attributes, constructors,
2139: operations, notifications);
2140: }
2141:
2142: public Object getAttribute(String attribute_name)
2143: throws AttributeNotFoundException {
2144: if (attribute_name == null) {
2145: throw new RuntimeOperationsException(
2146: new IllegalArgumentException(
2147: "Attribute name cannot be null"),
2148: "Cannot call getAttribute with null attribute name");
2149: }
2150: if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) {
2151: throw new AttributeNotFoundException("Attribute "
2152: + attribute_name + " is unimplemented.");
2153: }
2154: // The pattern in the below is to match an attribute and when found
2155: // do a return out of if clause. Doing it this way, I can fall
2156: // on to the AttributeNotFoundException for case where we've an
2157: // attribute but no handler.
2158: if (attribute_name.equals(STATUS_ATTR)) {
2159: return getStatus();
2160: }
2161: if (attribute_name.equals(VERSION_ATTR)) {
2162: return getVersion();
2163: }
2164:
2165: if (attribute_name.equals(ISRUNNING_ATTR)) {
2166: return new Boolean(this .getJobHandler().isRunning());
2167: }
2168: if (attribute_name.equals(ISCRAWLING_ATTR)) {
2169: return new Boolean(this .getJobHandler().isCrawling());
2170: }
2171: if (attribute_name.equals(ALERTCOUNT_ATTR)) {
2172: return new Integer(getAlertsCount());
2173: }
2174: if (attribute_name.equals(NEWALERTCOUNT_ATTR)) {
2175: return new Integer(getNewAlertsCount());
2176: }
2177: if (attribute_name.equals(CURRENTJOB_ATTR)) {
2178: if (this .getJobHandler().isCrawling()) {
2179: return this .getJobHandler().getCurrentJob()
2180: .getJmxJobName();
2181: }
2182: return null;
2183: }
2184: throw new AttributeNotFoundException("Attribute "
2185: + attribute_name + " not found.");
2186: }
2187:
2188: public void setAttribute(Attribute attribute)
2189: throws AttributeNotFoundException {
2190: throw new AttributeNotFoundException(
2191: "No attribute can be set in " + "this MBean");
2192: }
2193:
2194: public AttributeList getAttributes(String[] attributeNames) {
2195: if (attributeNames == null) {
2196: throw new RuntimeOperationsException(
2197: new IllegalArgumentException(
2198: "attributeNames[] cannot be " + "null"),
2199: "Cannot call getAttributes with null attribute "
2200: + "names");
2201: }
2202: AttributeList resultList = new AttributeList();
2203: if (attributeNames.length == 0) {
2204: return resultList;
2205: }
2206: for (int i = 0; i < attributeNames.length; i++) {
2207: try {
2208: Object value = getAttribute(attributeNames[i]);
2209: resultList.add(new Attribute(attributeNames[i], value));
2210: } catch (Exception e) {
2211: e.printStackTrace();
2212: }
2213: }
2214: return (resultList);
2215: }
2216:
2217: public AttributeList setAttributes(AttributeList attributes) {
2218: return new AttributeList(); // always empty
2219: }
2220:
2221: public Object invoke(final String operationName,
2222: final Object[] params, final String[] signature)
2223: throws ReflectionException {
2224: if (operationName == null) {
2225: throw new RuntimeOperationsException(
2226: new IllegalArgumentException(
2227: "Operation name cannot be null"),
2228: "Cannot call invoke with null operation name");
2229: }
2230: // The pattern in the below is to match an operation and when found
2231: // do a return out of if clause. Doing it this way, I can fall
2232: // on to the MethodNotFoundException for case where we've an
2233: // attribute but no handler.
2234: if (operationName.equals(START_OPER)) {
2235: JmxUtils.checkParamsCount(START_OPER, params, 0);
2236: start();
2237: return null;
2238: }
2239: if (operationName.equals(STOP_OPER)) {
2240: JmxUtils.checkParamsCount(STOP_OPER, params, 0);
2241: stop();
2242: return null;
2243: }
2244: if (operationName.equals(DESTROY_OPER)) {
2245: JmxUtils.checkParamsCount(DESTROY_OPER, params, 0);
2246: destroy();
2247: return null;
2248: }
2249: if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) {
2250: JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params,
2251: 0);
2252: return new Boolean(this .jobHandler.terminateCurrentJob());
2253: }
2254: if (operationName.equals(REBIND_JNDI_OPER)) {
2255: JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0);
2256: try {
2257: registerContainerJndi();
2258: } catch (MalformedObjectNameException e) {
2259: throw new RuntimeOperationsException(
2260: new RuntimeException(e));
2261: } catch (UnknownHostException e) {
2262: throw new RuntimeOperationsException(
2263: new RuntimeException(e));
2264: } catch (NamingException e) {
2265: throw new RuntimeOperationsException(
2266: new RuntimeException(e));
2267: }
2268: return null;
2269: }
2270: if (operationName.equals(SHUTDOWN_OPER)) {
2271: JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0);
2272: Heritrix.shutdown();
2273: return null;
2274: }
2275: if (operationName.equals(LOG_OPER)) {
2276: JmxUtils.checkParamsCount(LOG_OPER, params, 2);
2277: logger.log(Level.parse((String) params[0]),
2278: (String) params[1]);
2279: return null;
2280: }
2281: if (operationName.equals(INTERRUPT_OPER)) {
2282: JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1);
2283: return interrupt((String) params[0]);
2284: }
2285: if (operationName.equals(START_CRAWLING_OPER)) {
2286: JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0);
2287: startCrawling();
2288: return null;
2289: }
2290: if (operationName.equals(STOP_CRAWLING_OPER)) {
2291: JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0);
2292: stopCrawling();
2293: return null;
2294: }
2295: if (operationName.equals(ADD_CRAWL_JOB_OPER)) {
2296: JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4);
2297: try {
2298: return addCrawlJob((String) params[0],
2299: (String) params[1],
2300: checkForEmptyPlaceHolder((String) params[2]),
2301: checkForEmptyPlaceHolder((String) params[3]));
2302: } catch (IOException e) {
2303: throw new RuntimeOperationsException(
2304: new RuntimeException(e));
2305: } catch (FatalConfigurationException e) {
2306: throw new RuntimeOperationsException(
2307: new RuntimeException(e));
2308: }
2309: }
2310: if (operationName.equals(DELETE_CRAWL_JOB_OPER)) {
2311: JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1);
2312: this .jobHandler.deleteJob((String) params[0]);
2313: return null;
2314: }
2315:
2316: if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) {
2317: JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER,
2318: params, 4);
2319: return addCrawlJobBasedOn((String) params[0],
2320: (String) params[1],
2321: checkForEmptyPlaceHolder((String) params[2]),
2322: checkForEmptyPlaceHolder((String) params[3]));
2323: }
2324: if (operationName.equals(ALERT_OPER)) {
2325: JmxUtils.checkParamsCount(ALERT_OPER, params, 1);
2326: SinkHandlerLogRecord slr = null;
2327: if (this .alertManager.getCount() > 0) {
2328: // This is creating a vector of all alerts just so I can then
2329: // use passed index into resultant vector -- needs to be
2330: // improved.
2331: slr = (SinkHandlerLogRecord) this .alertManager.getAll()
2332: .get(((Integer) params[0]).intValue());
2333: }
2334: return (slr != null) ? slr.toString() : null;
2335: }
2336:
2337: if (operationName.equals(PENDING_JOBS_OPER)) {
2338: JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0);
2339: try {
2340: return makeJobsTabularData(getJobHandler()
2341: .getPendingJobs());
2342: } catch (OpenDataException e) {
2343: throw new RuntimeOperationsException(
2344: new RuntimeException(e));
2345: }
2346: }
2347:
2348: if (operationName.equals(COMPLETED_JOBS_OPER)) {
2349: JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0);
2350: try {
2351: return makeJobsTabularData(getJobHandler()
2352: .getCompletedJobs());
2353: } catch (OpenDataException e) {
2354: throw new RuntimeOperationsException(
2355: new RuntimeException(e));
2356: }
2357: }
2358:
2359: if (operationName.equals(CRAWLEND_REPORT_OPER)) {
2360: JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2);
2361: try {
2362: return getCrawlendReport((String) params[0],
2363: (String) params[1]);
2364: } catch (IOException e) {
2365: throw new RuntimeOperationsException(
2366: new RuntimeException(e));
2367: }
2368: }
2369:
2370: throw new ReflectionException(new NoSuchMethodException(
2371: operationName), "Cannot find the operation "
2372: + operationName);
2373: }
2374:
2375: /**
2376: * Return named crawl end report for job with passed uid.
2377: * Crawler makes reports when its finished its crawl. Use this method
2378: * to get a String version of one of these files.
2379: * @param jobUid The unique ID for the job whose reports you want to see
2380: * (Must be a completed job).
2381: * @param reportName Name of report minus '.txt' (e.g. crawl-report).
2382: * @return String version of the on-disk report.
2383: * @throws IOException
2384: */
2385: protected String getCrawlendReport(String jobUid, String reportName)
2386: throws IOException {
2387: CrawlJob job = getJobHandler().getJob(jobUid);
2388: if (job == null) {
2389: throw new IOException("No such job: " + jobUid);
2390: }
2391: File report = new File(job.getDirectory(), reportName + ".txt");
2392: if (!report.exists()) {
2393: throw new FileNotFoundException(report.getAbsolutePath());
2394: }
2395: return FileUtils.readFileAsString(report);
2396: }
2397:
2398: protected TabularData makeJobsTabularData(List jobs)
2399: throws OpenDataException {
2400: if (jobs == null || jobs.size() == 0) {
2401: return null;
2402: }
2403: TabularData td = new TabularDataSupport(this .jobsTabularType);
2404: for (Iterator i = jobs.iterator(); i.hasNext();) {
2405: CrawlJob job = (CrawlJob) i.next();
2406: CompositeData cd = new CompositeDataSupport(
2407: this .jobCompositeType, JOB_KEYS, new String[] {
2408: job.getUID(), job.getJobName(),
2409: job.getStatus() });
2410: td.put(cd);
2411: }
2412: return td;
2413: }
2414:
2415: /**
2416: * If passed str has placeholder for the empty string, return the empty
2417: * string else return orginal.
2418: * Dumb jmx clients can't pass empty string so they'll pass a representation
2419: * of empty string such as ' ' or '-'. Convert such strings to empty
2420: * string.
2421: * @param str String to check.
2422: * @return Original <code>str</code> or empty string if <code>str</code>
2423: * contains a placeholder for the empty-string (e.g. '-', or ' ').
2424: */
2425: protected String checkForEmptyPlaceHolder(String str) {
2426: return TextUtils.matches("-| +", str) ? "" : str;
2427: }
2428:
2429: public MBeanInfo getMBeanInfo() {
2430: return this .openMBeanInfo;
2431: }
2432:
2433: /**
2434: * @return Name this instance registered in JMX (Only available after JMX
2435: * registration).
2436: */
2437: public ObjectName getMBeanName() {
2438: return this .mbeanName;
2439: }
2440:
2441: public ObjectName preRegister(MBeanServer server, ObjectName name)
2442: throws Exception {
2443: this .mbeanServer = server;
2444: @SuppressWarnings("unchecked")
2445: Hashtable<String, String> ht = name.getKeyPropertyList();
2446: if (!ht.containsKey(JmxUtils.NAME)) {
2447: throw new IllegalArgumentException("Name property required"
2448: + name.getCanonicalName());
2449: }
2450: if (!ht.containsKey(JmxUtils.TYPE)) {
2451: ht.put(JmxUtils.TYPE, JmxUtils.SERVICE);
2452: name = new ObjectName(name.getDomain(), ht);
2453: }
2454: this .mbeanName = addGuiPort(addVitals(name));
2455: Heritrix.instances.put(this .mbeanName
2456: .getCanonicalKeyPropertyListString(), this );
2457: return this .mbeanName;
2458: }
2459:
2460: /**
2461: * Add vital stats to passed in ObjectName.
2462: * @param name ObjectName to add to.
2463: * @return name with host, guiport, and jmxport added.
2464: * @throws UnknownHostException
2465: * @throws MalformedObjectNameException
2466: * @throws NullPointerException
2467: */
2468: protected static ObjectName addVitals(ObjectName name)
2469: throws UnknownHostException, MalformedObjectNameException,
2470: NullPointerException {
2471: @SuppressWarnings("unchecked")
2472: Hashtable<String, String> ht = name.getKeyPropertyList();
2473: if (!ht.containsKey(JmxUtils.HOST)) {
2474: ht.put(JmxUtils.HOST, InetAddress.getLocalHost()
2475: .getHostName());
2476: name = new ObjectName(name.getDomain(), ht);
2477: }
2478: if (!ht.containsKey(JmxUtils.JMX_PORT)) {
2479: // Add jdk jmx-port. This will be present if we've attached
2480: // ourselves to the jdk jmx agent. Otherwise, we've been
2481: // deployed in a j2ee container with its own jmx agent. In
2482: // this case we won't know how to get jmx port.
2483: String p = System
2484: .getProperty("com.sun.management.jmxremote.port");
2485: if (p != null && p.length() > 0) {
2486: ht.put(JmxUtils.JMX_PORT, p);
2487: name = new ObjectName(name.getDomain(), ht);
2488: }
2489: }
2490: return name;
2491: }
2492:
2493: protected static ObjectName addGuiPort(ObjectName name)
2494: throws MalformedObjectNameException, NullPointerException {
2495: @SuppressWarnings("unchecked")
2496: Hashtable<String, String> ht = name.getKeyPropertyList();
2497: if (!ht.containsKey(JmxUtils.GUI_PORT)) {
2498: // Add gui port if this instance was started with a gui.
2499: if (Heritrix.gui) {
2500: ht.put(JmxUtils.GUI_PORT, Integer
2501: .toString(Heritrix.guiPort));
2502: name = new ObjectName(name.getDomain(), ht);
2503: }
2504: }
2505: return name;
2506: }
2507:
2508: public void postRegister(Boolean registrationDone) {
2509: if (logger.isLoggable(Level.INFO)) {
2510: logger.info(JmxUtils.getLogRegistrationMsg(this .mbeanName
2511: .getCanonicalName(), this .mbeanServer,
2512: registrationDone.booleanValue()));
2513: }
2514: try {
2515: registerJndi(this .mbeanName);
2516: } catch (Exception e) {
2517: logger.log(Level.SEVERE, "Failed jndi registration", e);
2518: }
2519: }
2520:
2521: public void preDeregister() throws Exception {
2522: deregisterJndi(this .mbeanName);
2523: }
2524:
2525: public void postDeregister() {
2526: Heritrix.instances.remove(this .mbeanName
2527: .getCanonicalKeyPropertyListString());
2528: if (logger.isLoggable(Level.INFO)) {
2529: logger.info(JmxUtils.getLogUnregistrationMsg(this .mbeanName
2530: .getCanonicalName(), this .mbeanServer));
2531: }
2532: }
2533:
2534: protected static void registerContainerJndi()
2535: throws MalformedObjectNameException, NullPointerException,
2536: UnknownHostException, NamingException {
2537: registerJndi(getJndiContainerName());
2538: }
2539:
2540: protected static void registerJndi(final ObjectName name)
2541: throws NullPointerException, NamingException {
2542: Context c = getJndiContext();
2543: if (c == null) {
2544: return;
2545: }
2546: CompoundName key = JndiUtils.bindObjectName(c, name);
2547: if (logger.isLoggable(Level.FINE)) {
2548: logger.fine("Bound '"
2549: + key
2550: + "' to '"
2551: + JndiUtils.getCompoundName(c.getNameInNamespace())
2552: .toString() + "' jndi context");
2553: }
2554: }
2555:
2556: protected static void deregisterJndi(final ObjectName name)
2557: throws NullPointerException, NamingException {
2558: Context c = getJndiContext();
2559: if (c == null) {
2560: return;
2561: }
2562: CompoundName key = JndiUtils.unbindObjectName(c, name);
2563: if (logger.isLoggable(Level.FINE)) {
2564: logger.fine("Unbound '"
2565: + key
2566: + "' from '"
2567: + JndiUtils.getCompoundName(c.getNameInNamespace())
2568: .toString() + "' jndi context");
2569: }
2570: }
2571:
2572: /**
2573: * @return Jndi context for the crawler or null if none found.
2574: * @throws NamingException
2575: */
2576: protected static Context getJndiContext() throws NamingException {
2577: Context c = null;
2578: try {
2579: c = JndiUtils.getSubContext(CRAWLER_PACKAGE);
2580: } catch (NoInitialContextException e) {
2581: logger.fine("No JNDI Context: " + e.toString());
2582: }
2583: return c;
2584: }
2585:
2586: /**
2587: * @return Jndi container name -- the name to use for the 'container' that
2588: * can host zero or more heritrix instances (Return a JMX ObjectName. We
2589: * use ObjectName because then we're sync'd with JMX naming and ObjectName
2590: * has nice parsing).
2591: * @throws NullPointerException
2592: * @throws MalformedObjectNameException
2593: * @throws UnknownHostException
2594: */
2595: protected static ObjectName getJndiContainerName()
2596: throws MalformedObjectNameException, NullPointerException,
2597: UnknownHostException {
2598: ObjectName objName = new ObjectName(CRAWLER_PACKAGE, "type",
2599: "container");
2600: return addVitals(objName);
2601: }
2602:
2603: /**
2604: * @return Return all registered instances of Heritrix (Rare are there
2605: * more than one).
2606: */
2607: public static Map getInstances() {
2608: return Heritrix.instances;
2609: }
2610:
2611: /**
2612: * @return True if only one instance of Heritrix.
2613: */
2614: public static boolean isSingleInstance() {
2615: return Heritrix.instances != null
2616: && Heritrix.instances.size() == 1;
2617: }
2618:
2619: /**
2620: * @return Returns single instance or null if no instance or multiple.
2621: */
2622: public static Heritrix getSingleInstance() {
2623: return !isSingleInstance() ? null
2624: : (Heritrix) Heritrix.instances.get(Heritrix.instances
2625: .keySet().iterator().next());
2626: }
2627: }
|