0001: /* CrawlJob
0002: *
0003: * Copyright (C) 2003 Internet Archive.
0004: *
0005: * This file is part of the Heritrix web crawler (crawler.archive.org).
0006: *
0007: * Heritrix is free software; you can redistribute it and/or modify
0008: * it under the terms of the GNU Lesser Public License as published by
0009: * the Free Software Foundation; either version 2.1 of the License, or
0010: * any later version.
0011: *
0012: * Heritrix is distributed in the hope that it will be useful,
0013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0015: * GNU Lesser Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser Public License
0018: * along with Heritrix; if not, write to the Free Software
0019: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0020: */
0021: package org.archive.crawler.admin;
0022:
0023: import java.io.BufferedReader;
0024: import java.io.File;
0025: import java.io.FileNotFoundException;
0026: import java.io.FileReader;
0027: import java.io.FileWriter;
0028: import java.io.IOException;
0029: import java.io.InputStream;
0030: import java.io.InputStreamReader;
0031: import java.io.PrintWriter;
0032: import java.io.Serializable;
0033: import java.io.StringWriter;
0034: import java.util.ArrayList;
0035: import java.util.Arrays;
0036: import java.util.Collection;
0037: import java.util.EventObject;
0038: import java.util.Hashtable;
0039: import java.util.Iterator;
0040: import java.util.List;
0041: import java.util.Map;
0042: import java.util.logging.Level;
0043: import java.util.logging.Logger;
0044:
0045: import javax.management.Attribute;
0046: import javax.management.AttributeList;
0047: import javax.management.AttributeNotFoundException;
0048: import javax.management.DynamicMBean;
0049: import javax.management.InstanceAlreadyExistsException;
0050: import javax.management.InvalidAttributeValueException;
0051: import javax.management.MBeanAttributeInfo;
0052: import javax.management.MBeanException;
0053: import javax.management.MBeanInfo;
0054: import javax.management.MBeanNotificationInfo;
0055: import javax.management.MBeanOperationInfo;
0056: import javax.management.MBeanParameterInfo;
0057: import javax.management.MBeanRegistration;
0058: import javax.management.MBeanRegistrationException;
0059: import javax.management.MBeanServer;
0060: import javax.management.NotCompliantMBeanException;
0061: import javax.management.Notification;
0062: import javax.management.NotificationBroadcasterSupport;
0063: import javax.management.ObjectName;
0064: import javax.management.ReflectionException;
0065: import javax.management.RuntimeOperationsException;
0066: import javax.management.openmbean.CompositeData;
0067: import javax.management.openmbean.CompositeDataSupport;
0068: import javax.management.openmbean.CompositeType;
0069: import javax.management.openmbean.OpenDataException;
0070: import javax.management.openmbean.OpenMBeanAttributeInfo;
0071: import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
0072: import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
0073: import javax.management.openmbean.OpenMBeanInfoSupport;
0074: import javax.management.openmbean.OpenMBeanOperationInfo;
0075: import javax.management.openmbean.OpenMBeanOperationInfoSupport;
0076: import javax.management.openmbean.OpenMBeanParameterInfo;
0077: import javax.management.openmbean.OpenMBeanParameterInfoSupport;
0078: import javax.management.openmbean.SimpleType;
0079:
0080: import org.apache.commons.httpclient.URIException;
0081: import org.archive.crawler.Heritrix;
0082: import org.archive.crawler.datamodel.CandidateURI;
0083: import org.archive.crawler.datamodel.Checkpoint;
0084: import org.archive.crawler.datamodel.CrawlOrder;
0085: import org.archive.crawler.event.CrawlStatusListener;
0086: import org.archive.crawler.framework.CrawlController;
0087: import org.archive.crawler.framework.FrontierMarker;
0088: import org.archive.crawler.framework.StatisticsTracking;
0089: import org.archive.crawler.framework.exceptions.InitializationException;
0090: import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
0091: import org.archive.crawler.frontier.AbstractFrontier;
0092: import org.archive.crawler.settings.ComplexType;
0093: import org.archive.crawler.settings.ModuleAttributeInfo;
0094: import org.archive.crawler.settings.TextField;
0095: import org.archive.crawler.settings.XMLSettingsHandler;
0096: import org.archive.crawler.util.CheckpointUtils;
0097: import org.archive.crawler.util.IoUtils;
0098: import org.archive.util.ArchiveUtils;
0099: import org.archive.util.FileUtils;
0100: import org.archive.util.JEMBeanHelper;
0101: import org.archive.util.JmxUtils;
0102: import org.archive.util.iterator.LineReadingIterator;
0103: import org.archive.util.iterator.RegexpLineIterator;
0104:
0105: import com.sleepycat.je.DatabaseException;
0106: import com.sleepycat.je.Environment;
0107:
0108: /**
0109: * A CrawlJob encapsulates a 'crawl order' with any and all information and
0110: * methods needed by a CrawlJobHandler to accept and execute them.
0111: *
0112: * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
0113: * should not be executed as a crawl but can be edited and used as a template
0114: * for creating new CrawlJobs.
0115: *
 * <p>All of its constructors are protected since only a CrawlJobHandler
 * should construct new CrawlJobs.
0118: *
0119: * @author Kristinn Sigurdsson
0120: *
0121: * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
0122: * String, String, String, int)
0123: * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
0124: * String, String, String)
0125: */
0126:
0127: public class CrawlJob extends NotificationBroadcasterSupport implements
0128: DynamicMBean, MBeanRegistration, CrawlStatusListener,
0129: Serializable {
0130: /**
0131: * Eclipse generated serial number.
0132: */
0133: private static final long serialVersionUID = 3411161000452525856L;
0134:
0135: private static final Logger logger = Logger
0136: .getLogger(CrawlJob.class.getName());
0137: /*
0138: * Possible values for Priority
0139: */
0140: /** lowest */
0141: public static final int PRIORITY_MINIMAL = 0;
0142: /** low */
0143: public static final int PRIORITY_LOW = 1;
0144: /** average */
0145: public static final int PRIORITY_AVERAGE = 2;
0146: /** high */
0147: public static final int PRIORITY_HIGH = 3;
0148: /** highest */
0149: public static final int PRIORITY_CRITICAL = 4;
0150:
0151: /*
0152: * Possible states for a Job.
0153: */
0154: /** Inital value. May not be ready to run/incomplete. */
0155: public static final String STATUS_CREATED = "Created";
0156: /** Job has been successfully submitted to a CrawlJobHandler */
0157: public static final String STATUS_PENDING = "Pending";
0158: /** Job is being crawled */
0159: public static final String STATUS_RUNNING = "Running";
0160: /** Job was deleted by user, will not be displayed in UI. */
0161: public static final String STATUS_DELETED = "Deleted";
0162: /** Job was terminted by user input while crawling */
0163: public static final String STATUS_ABORTED = "Finished - Ended by operator";
0164: /** Something went very wrong */
0165: public static final String STATUS_FINISHED_ABNORMAL = "Finished - Abnormal exit from crawling";
0166: /** Job finished normally having completed its crawl. */
0167: public static final String STATUS_FINISHED = "Finished";
0168: /** Job finished normally when the specified timelimit was hit. */
0169: public static final String STATUS_FINISHED_TIME_LIMIT = "Finished - Timelimit hit";
0170: /** Job finished normally when the specifed amount of
0171: * data (MB) had been downloaded */
0172: public static final String STATUS_FINISHED_DATA_LIMIT = "Finished - Maximum amount of data limit hit";
0173: /** Job finished normally when the specified number of documents had been
0174: * fetched.
0175: */
0176: public static final String STATUS_FINISHED_DOCUMENT_LIMIT = "Finished - Maximum number of documents limit hit";
0177: /** Job is going to be temporarly stopped after active threads are finished. */
0178: public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - "
0179: + "Waiting for threads to finish";
0180: /** Job was temporarly stopped. State is kept so it can be resumed */
0181: public static final String STATUS_PAUSED = "Paused";
0182: /**
0183: * Job is being checkpointed. When finished checkpointing, job is set
0184: * back to STATUS_PAUSED (Job must be first paused before checkpointing
0185: * will run).
0186: */
0187: public static final String STATUS_CHECKPOINTING = "Checkpointing";
0188: /** Job could not be launced due to an InitializationException */
0189: public static final String STATUS_MISCONFIGURED = "Could not launch job "
0190: + "- Fatal InitializationException";
0191: /** Job is actually a profile */
0192: public static final String STATUS_PROFILE = "Profile";
0193:
0194: public static final String STATUS_PREPARING = "Preparing";
0195:
0196: // Class variables
0197: private String UID; //A UID issued by the CrawlJobHandler.
0198: private String name;
0199: private String status;
0200: private boolean isReadOnly = false;
0201: private boolean isNew = true;
0202: private boolean isProfile = false;
0203: private boolean isRunning = false;
0204: private int priority;
0205: private int numberOfJournalEntries = 0;
0206:
0207: private String statisticsFileSave = "";
0208:
0209: private String errorMessage = null;
0210:
0211: private File jobDir = null;
0212:
0213: private transient CrawlJobErrorHandler errorHandler = null;
0214:
0215: protected transient XMLSettingsHandler settingsHandler;
0216:
0217: private transient CrawlController controller = null;
0218:
0219: private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
0220: private static final String CRAWL_LOG_STYLE = "crawlLog";
0221:
0222: // OpenMBean support.
0223:
0224: /**
0225: * Server we registered with. Maybe null.
0226: */
0227: private transient MBeanServer mbeanServer = null;
0228: private transient ObjectName mbeanName = null;
0229: private static final String CRAWLJOB_JMXMBEAN_TYPE = JmxUtils.SERVICE
0230: + ".Job";
0231: private transient JEMBeanHelper bdbjeMBeanHelper = null;
0232: private transient List<String> bdbjeAttributeNameList = null;
0233: private transient List<String> bdbjeOperationsNameList = null;
0234:
0235: /**
0236: * The MBean we've registered ourselves with (May be null
0237: * throughout life of Heritrix).
0238: */
0239: private transient OpenMBeanInfoSupport openMBeanInfo;
0240:
0241: private final static String NAME_ATTR = "Name";
0242: private final static String UID_ATTR = "UID";
0243: private final static String STATUS_ATTR = "Status";
0244: private final static String FRONTIER_SHORT_REPORT_ATTR = "FrontierShortReport";
0245: private final static String THREADS_SHORT_REPORT_ATTR = "ThreadsShortReport";
0246: private final static String TOTAL_DATA_ATTR = "TotalData";
0247: private final static String CRAWL_TIME_ATTR = "CrawlTime";
0248: private final static String DOC_RATE_ATTR = "DocRate";
0249: private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
0250: private final static String KB_RATE_ATTR = "KbRate";
0251: private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
0252: private final static String THREAD_COUNT_ATTR = "ThreadCount";
0253: private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
0254: private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
0255: private final static String[] ATTRIBUTE_ARRAY = { NAME_ATTR,
0256: UID_ATTR, STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR,
0257: THREADS_SHORT_REPORT_ATTR, TOTAL_DATA_ATTR,
0258: CRAWL_TIME_ATTR, DOC_RATE_ATTR, CURRENT_DOC_RATE_ATTR,
0259: KB_RATE_ATTR, CURRENT_KB_RATE_ATTR, THREAD_COUNT_ATTR,
0260: DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR };
0261: private final static List ATTRIBUTE_LIST = Arrays
0262: .asList(ATTRIBUTE_ARRAY);
0263:
0264: private final static String IMPORT_URI_OPER = "importUri";
0265: private final static String IMPORT_URIS_OPER = "importUris";
0266: private final static String PAUSE_OPER = "pause";
0267: private final static String RESUME_OPER = "resume";
0268: private final static String FRONTIER_REPORT_OPER = "frontierReport";
0269: private final static String THREADS_REPORT_OPER = "threadsReport";
0270: private final static String SEEDS_REPORT_OPER = "seedsReport";
0271: private final static String CHECKPOINT_OPER = "startCheckpoint";
0272: private final static String PROGRESS_STATISTICS_OPER = "progressStatistics";
0273: private final static String PROGRESS_STATISTICS_LEGEND_OPER = "progressStatisticsLegend";
0274:
0275: private final static String PROG_STATS = "progressStatistics";
0276:
0277: // Same as JEMBeanHelper.OP_DB_STAT
0278: private final static String OP_DB_STAT = "getDatabaseStats";
0279:
0280: /**
0281: * Don't add the following crawl-order items.
0282: */
0283: private final static List ORDER_EXCLUDE;
0284: static {
0285: ORDER_EXCLUDE = Arrays.asList(new String[] {
0286: "bdb-cache-percent", "extract-processors", "DNS",
0287: "uri-included-structure" });
0288: }
0289:
0290: /**
0291: * Sequence number for jmx notifications.
0292: */
0293: private static int notificationsSequenceNumber = 1;
0294:
0295: /**
0296: * A shutdown Constructor.
0297: */
0298: protected CrawlJob() {
0299: super ();
0300: }
0301:
0302: /**
0303: * A constructor for jobs.
0304: *
0305: * <p> Create, ready to crawl, jobs.
0306: * @param UID A unique ID for this job. Typically emitted by the
0307: * CrawlJobHandler.
0308: * @param name The name of the job
0309: * @param settingsHandler The associated settings
0310: * @param errorHandler The crawl jobs settings error handler.
0311: * <tt>null</tt> means none is set
0312: * @param priority job priority.
0313: * @param dir The directory that is considered this jobs working directory.
0314: */
0315: public CrawlJob(final String UID, final String name,
0316: final XMLSettingsHandler settingsHandler,
0317: final CrawlJobErrorHandler errorHandler,
0318: final int priority, final File dir) {
0319: this (UID, name, settingsHandler, errorHandler, priority, dir,
0320: null, false, true);
0321: }
0322:
0323: /**
0324: * A constructor for profiles.
0325: *
0326: * <p> Any job created with this constructor will be
0327: * considered a profile. Profiles are not stored on disk (only their
0328: * settings files are stored on disk). This is because their data is
0329: * predictible given any settings files.
0330: * @param UIDandName A unique ID for this job. For profiles this is the same
0331: * as name
0332: * @param settingsHandler The associated settings
0333: * @param errorHandler The crawl jobs settings error handler.
0334: * <tt>null</tt> means none is set
0335: */
0336: protected CrawlJob(final String UIDandName,
0337: final XMLSettingsHandler settingsHandler,
0338: final CrawlJobErrorHandler errorHandler) {
0339: this (UIDandName, UIDandName, settingsHandler, errorHandler,
0340: PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
0341: }
0342:
0343: public CrawlJob(final String UID, final String name,
0344: final XMLSettingsHandler settingsHandler,
0345: final CrawlJobErrorHandler errorHandler,
0346: final int priority, final File dir, final String status,
0347: final boolean isProfile, final boolean isNew) {
0348: super ();
0349: this .UID = UID;
0350: this .name = name;
0351: this .settingsHandler = settingsHandler;
0352: this .errorHandler = errorHandler;
0353: this .status = status;
0354: this .isProfile = isProfile;
0355: this .isNew = isNew;
0356: this .jobDir = dir;
0357: this .priority = priority;
0358: }
0359:
0360: /**
0361: * A constructor for reloading jobs from disk. Jobs (not profiles) have
0362: * their data written to persistent storage in the file system. This method
0363: * is used to load the job from such storage. This is done by the
0364: * <code>CrawlJobHandler</code>.
0365: * <p>
0366: * Proper structure of a job file (TODO: Maybe one day make this an XML file)
0367: * Line 1. UID <br>
0368: * Line 2. Job name (string) <br>
0369: * Line 3. Job status (string) <br>
0370: * Line 4. is job read only (true/false) <br>
0371: * Line 5. is job running (true/false) <br>
0372: * Line 6. job priority (int) <br>
0373: * Line 7. number of journal entries <br>
0374: * Line 8. setting file (with path) <br>
0375: * Line 9. statistics tracker file (with path) <br>
0376: * Line 10-?. error message (String, empty for null), can be many lines <br>
0377: * @param jobFile
0378: * a file containing information about the job to load.
0379: * @param errorHandler The crawl jobs settings error handler.
0380: * null means none is set
0381: * @throws InvalidJobFileException
0382: * if the specified file does not refer to a valid job file.
0383: * @throws IOException
0384: * if io operations fail
0385: */
0386: protected CrawlJob(final File jobFile,
0387: final CrawlJobErrorHandler errorHandler)
0388: throws InvalidJobFileException, IOException {
0389: this (null, null, null, errorHandler, PRIORITY_AVERAGE, null,
0390: null, false, true);
0391: this .jobDir = jobFile.getParentFile();
0392:
0393: // Check for corrupt job.state files (can be corrupt if we crash).
0394: if (jobFile.length() == 0) {
0395: throw new InvalidJobFileException(jobFile
0396: .getCanonicalPath()
0397: + " is corrupt (length is zero)");
0398: }
0399:
0400: // Open file. Read data and set up class variables accordingly...
0401: BufferedReader jobReader = new BufferedReader(new FileReader(
0402: jobFile), 4096);
0403: // UID
0404: this .UID = jobReader.readLine();
0405: // name
0406: this .name = jobReader.readLine();
0407: // status
0408: this .status = jobReader.readLine();
0409: if (status.equals(STATUS_ABORTED) == false
0410: && status.equals(STATUS_CREATED) == false
0411: && status.equals(STATUS_DELETED) == false
0412: && status.equals(STATUS_FINISHED) == false
0413: && status.equals(STATUS_FINISHED_ABNORMAL) == false
0414: && status.equals(STATUS_FINISHED_DATA_LIMIT) == false
0415: && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT) == false
0416: && status.equals(STATUS_FINISHED_TIME_LIMIT) == false
0417: && status.equals(STATUS_MISCONFIGURED) == false
0418: && status.equals(STATUS_PAUSED) == false
0419: && status.equals(STATUS_CHECKPOINTING) == false
0420: && status.equals(STATUS_PENDING) == false
0421: && status.equals(STATUS_RUNNING) == false
0422: && status.equals(STATUS_WAITING_FOR_PAUSE) == false
0423: && status.equals(STATUS_PREPARING) == false) {
0424: // status is invalid. Must be one of the above
0425: throw new InvalidJobFileException(
0426: "Status (line 3) in job file " + "is not valid: '"
0427: + status + "'");
0428: }
0429: // isReadOnly
0430: String tmp = jobReader.readLine();
0431: if (tmp.equals("true")) {
0432: isReadOnly = true;
0433: } else if (tmp.equals("false")) {
0434: isReadOnly = false;
0435: } else {
0436: throw new InvalidJobFileException(
0437: "isReadOnly (line 4) in job" + " file '"
0438: + jobFile.getAbsolutePath() + "' is not "
0439: + "valid: '" + tmp + "'");
0440: }
0441: // isRunning
0442: tmp = jobReader.readLine();
0443: if (tmp.equals("true")) {
0444: this .isRunning = true;
0445: } else if (tmp.equals("false")) {
0446: this .isRunning = false;
0447: } else {
0448: throw new InvalidJobFileException(
0449: "isRunning (line 5) in job " + "file '"
0450: + jobFile.getAbsolutePath()
0451: + "' is not valid: " + "'" + tmp + "'");
0452: }
0453: // priority
0454: tmp = jobReader.readLine();
0455: try {
0456: this .priority = Integer.parseInt(tmp);
0457: } catch (NumberFormatException e) {
0458: throw new InvalidJobFileException(
0459: "priority (line 5) in job " + "file '"
0460: + jobFile.getAbsolutePath()
0461: + "' is not valid: " + "'" + tmp + "'");
0462: }
0463: // numberOfJournalEntries
0464: tmp = jobReader.readLine();
0465: try {
0466: this .numberOfJournalEntries = Integer.parseInt(tmp);
0467: } catch (NumberFormatException e) {
0468: throw new InvalidJobFileException("numberOfJournalEntries "
0469: + "(line 5) in job file '"
0470: + jobFile.getAbsolutePath() + "' is not valid: "
0471: + "'" + tmp + "'");
0472: }
0473: // settingsHandler
0474: tmp = jobReader.readLine();
0475: try {
0476: File f = new File(tmp);
0477: this .settingsHandler = new XMLSettingsHandler((f
0478: .isAbsolute()) ? f : new File(jobDir, f.getName()));
0479: if (this .errorHandler != null) {
0480: this .settingsHandler
0481: .registerValueErrorHandler(errorHandler);
0482: }
0483: this .settingsHandler.initialize();
0484: } catch (InvalidAttributeValueException e1) {
0485: throw new InvalidJobFileException(
0486: "Problem reading from settings " + "file (" + tmp
0487: + ") specified in job file '"
0488: + jobFile.getAbsolutePath() + "'\n"
0489: + e1.getMessage());
0490: }
0491: // Statistics tracker.
0492: jobReader.readLine();
0493: // errorMessage
0494: // TODO: Multilines
0495: tmp = jobReader.readLine();
0496: errorMessage = "";
0497: while (tmp != null) {
0498: errorMessage += tmp + '\n';
0499: tmp = jobReader.readLine();
0500: }
0501: if (errorMessage.length() == 0) {
0502: // Empty error message should be null
0503: errorMessage = null;
0504: }
0505: // TODO: Load stattrack if needed.
0506:
0507: // TODO: This should be inside a finally block.
0508: jobReader.close();
0509: }
0510:
0511: /**
0512: * Cause the job to be written to persistent storage.
0513: * This will also save the statistics tracker if it is not null and the
0514: * job status is finished (regardless of how it's finished)
0515: */
0516: private void writeJobFile() {
0517: if (isProfile) {
0518: return;
0519: }
0520:
0521: final String jobDirAbsolute = jobDir.getAbsolutePath();
0522: if (!jobDir.exists() || !jobDir.canWrite()) {
0523: logger.warning("Can't update status on " + jobDirAbsolute
0524: + " because file does not"
0525: + " exist (or is unwriteable)");
0526: return;
0527: }
0528: File f = new File(jobDirAbsolute, "state.job");
0529:
0530: String settingsFile = getSettingsDirectory();
0531: // Make settingsFile's path relative if order.xml is somewhere in the
0532: // job's directory tree
0533: if (settingsFile.startsWith(jobDirAbsolute
0534: .concat(File.separator))) {
0535: settingsFile = settingsFile.substring(jobDirAbsolute
0536: .length() + 1);
0537: }
0538: try {
0539: FileWriter jobWriter = new FileWriter(f, false);
0540: try {
0541: jobWriter.write(UID + "\n");
0542: jobWriter.write(name + "\n");
0543: jobWriter.write(status + "\n");
0544: jobWriter.write(isReadOnly + "\n");
0545: jobWriter.write(isRunning + "\n");
0546: jobWriter.write(priority + "\n");
0547: jobWriter.write(numberOfJournalEntries + "\n");
0548: jobWriter.write(settingsFile + "\n");
0549: jobWriter.write(statisticsFileSave + "\n");// TODO: Is this
0550: // right?
0551: // Can be multiple lines so we keep it last
0552: if (errorMessage != null) {
0553: jobWriter.write(errorMessage + "\n");
0554: }
0555: } finally {
0556: if (jobWriter != null) {
0557: jobWriter.close();
0558: }
0559: }
0560: } catch (IOException e) {
0561: logger.log(Level.WARNING,
0562: "An IOException occured saving job " + name + " ("
0563: + UID + ")", e);
0564: }
0565: }
0566:
0567: /**
0568: * Returns this jobs unique ID (UID) that was issued by the
0569: * CrawlJobHandler() when this job was first created.
0570: *
0571: * @return Job This jobs UID.
0572: * @see CrawlJobHandler#getNextJobUID()
0573: */
0574: public String getUID() {
0575: return UID;
0576: }
0577:
0578: /**
0579: * Returns this job's 'name'. The name comes from the settings for this job,
0580: * need not be unique and may change. For a unique identifier use
0581: * {@link #getUID() getUID()}.
0582: * <p>
0583: * The name corrisponds to the value of the 'name' tag in the 'meta' section
0584: * of the settings file.
0585: *
0586: * @return This job's 'name'
0587: */
0588: public String getJobName() {
0589: return name;
0590: }
0591:
0592: /**
0593: * Return the combination of given name and UID most commonly
0594: * used in administrative interface.
0595: *
0596: * @return Job's name with UID notation
0597: */
0598: public String getDisplayName() {
0599: return getJobName() + " [" + getUID() + "]";
0600: }
0601:
0602: /**
0603: * Set this job's level of priority.
0604: *
0605: * @param priority The level of priority
0606: *
0607: * @see #getJobPriority()
0608: * @see #PRIORITY_MINIMAL
0609: * @see #PRIORITY_LOW
0610: * @see #PRIORITY_AVERAGE
0611: * @see #PRIORITY_HIGH
0612: * @see #PRIORITY_CRITICAL
0613: */
0614: public void setJobPriority(int priority) {
0615: this .priority = priority;
0616: }
0617:
0618: /**
0619: * Get this job's level of priority.
0620: *
0621: * @return this job's priority
0622: * @see #setJobPriority(int)
0623: * @see #PRIORITY_MINIMAL
0624: * @see #PRIORITY_LOW
0625: * @see #PRIORITY_AVERAGE
0626: * @see #PRIORITY_HIGH
0627: * @see #PRIORITY_CRITICAL
0628: */
0629: public int getJobPriority() {
0630: return priority;
0631: }
0632:
0633: /**
0634: * Once called no changes can be made to the settings for this job.
0635: * Typically this is done once a crawl is completed and further changes
0636: * to the crawl order are therefor meaningless.
0637: */
0638: public void setReadOnly() {
0639: isReadOnly = true;
0640: writeJobFile(); //Save changes
0641: }
0642:
0643: /**
0644: * Is job read only?
0645: * @return false until setReadOnly has been invoked, after that it returns true.
0646: */
0647: public boolean isReadOnly() {
0648: return isReadOnly;
0649: }
0650:
0651: /**
0652: * Set the status of this CrawlJob.
0653: *
0654: * @param status Current status of CrawlJob
0655: * (see constants defined here beginning with STATUS)
0656: */
0657: public void setStatus(String status) {
0658: this .status = status;
0659: writeJobFile(); //Save changes
0660: // TODO: If job finished, save StatisticsTracker!
0661: }
0662:
0663: /**
0664: * @return Status of the crawler (Used by JMX).
0665: */
0666: public String getCrawlStatus() {
0667: return this .controller != null ? this .controller.getState()
0668: .toString() : "Illegal State";
0669: }
0670:
0671: /**
0672: * Get the current status of this CrawlJob
0673: *
0674: * @return The current status of this CrawlJob
0675: * (see constants defined here beginning with STATUS)
0676: */
0677: public String getStatus() {
0678: return this .status;
0679: }
0680:
0681: /**
0682: * Returns the settings handler for this job. It will have been initialized.
0683: * @return the settings handler for this job.
0684: */
0685: public XMLSettingsHandler getSettingsHandler() {
0686: return this .settingsHandler;
0687: }
0688:
0689: /**
0690: * Is this a new job?
0691: * @return True if is new.
0692: */
0693: public boolean isNew() {
0694: return isNew;
0695: }
0696:
0697: /**
0698: * Set if the job is considered to be a profile
0699: * @return True if is a profile.
0700: */
0701: public boolean isProfile() {
0702: return isProfile;
0703: }
0704:
0705: /**
0706: * Set if the job is considered a new job or not.
0707: * @param b Is the job considered to be new.
0708: */
0709: public void setNew(boolean b) {
0710: isNew = b;
0711: writeJobFile(); //Save changes
0712: }
0713:
0714: /**
0715: * Returns true if the job is being crawled.
0716: * @return true if the job is being crawled
0717: */
0718: public boolean isRunning() {
0719: return isRunning;
0720: }
0721:
0722: /**
0723: * Set if job is being crawled.
0724: * @param b Is job being crawled.
0725: */
0726: protected void setRunning(boolean b) {
0727: isRunning = b;
0728: writeJobFile(); // Save changes
0729: //TODO: Job ending -> Save statistics tracker.
0730: //TODO: This is likely to happen as the CrawlEnding event occurs,
0731: // need to ensure that the StatisticsTracker is saved to disk on
0732: // CrawlEnded. Maybe move responsibility for this into the
0733: // StatisticsTracker?
0734: }
0735:
0736: protected void unregisterMBean() {
0737: // Unregister current job from JMX agent, if there one.
0738: if (this .mbeanServer == null) {
0739: return;
0740: }
0741: try {
0742: this .mbeanServer.unregisterMBean(this .mbeanName);
0743: this .mbeanServer = null;
0744: } catch (Exception e) {
0745: logger
0746: .log(Level.SEVERE, "Failed with " + this .mbeanName,
0747: e);
0748: }
0749: }
0750:
0751: /**
0752: * Subclass of crawlcontroller that unregisters beans when stopped.
0753: * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
0754: * pollution, so for sure CrawlJob is unregistered with JMX and so any
0755: * listeners on the CrawlJob get a chance to get crawl ended message
0756: * (These latter notifications may not actually be getting through -- TBD).
0757: * <p>TODO: This override dirtys the data model since CC knows about CJs.
0758: * The facility provided by this class emitting events and statistics so
0759: * they can be read by JMX needs to go back into CC. Probably best to
0760: * registering in JMX the CC, rather than CJ. Lets do this in Heritrix 2.0
0761: * since means changing the JMX API some.
0762: */
0763: public class MBeanCrawlController extends CrawlController implements
0764: Serializable {
0765: private static final long serialVersionUID = -4608537998168407222L;
0766: private CrawlJob cj = null;
0767: private CompositeType ct = null;
0768:
0769: public CrawlJob getCrawlJob() {
0770: return this .cj;
0771: }
0772:
0773: public void setCrawlJob(CrawlJob cj) {
0774: this .cj = cj;
0775: }
0776:
0777: public void progressStatisticsEvent(final EventObject e) {
0778: super .progressStatisticsEvent(e);
0779: if (this .cj.getMbeanName() == null) {
0780: // Can be null around job startup. Return w/o doing anything.
0781: return;
0782: }
0783:
0784: Map s = ((StatisticsTracking) e.getSource())
0785: .getProgressStatistics();
0786: // Convert the statistics to OpenType CompositeData and add as
0787: // user data to Notification.
0788: CompositeData cd = null;
0789: try {
0790: if (this .ct == null) {
0791: this .ct = JmxUtils.createCompositeType(s,
0792: PROG_STATS, PROG_STATS + " for "
0793: + this .cj.getMbeanName());
0794: }
0795: cd = new CompositeDataSupport(this .ct, s);
0796: } catch (OpenDataException ode) {
0797: ode.printStackTrace();
0798: }
0799: if (cd != null) {
0800: Notification n = new Notification(PROG_STATS, this .cj
0801: .getMbeanName(),
0802: getNotificationsSequenceNumber(),
0803: ((StatisticsTracking) e.getSource())
0804: .getProgressStatisticsLine());
0805: n.setUserData(cd);
0806: this .cj.sendNotification(n);
0807: }
0808: }
0809:
0810: protected void completeStop() {
0811: try {
0812: super .completeStop();
0813: } finally {
0814: if (this .cj != null) {
0815: this .cj.unregisterMBean();
0816: }
0817: this .cj = null;
0818: }
0819: }
0820: }
0821:
0822: protected CrawlController setupCrawlController()
0823: throws InitializationException {
0824: CrawlController controller = null;
0825:
0826: // Check if we're to do a checkpoint recover. If so, deserialize
0827: // the checkpoint's CrawlController and use that in place of a new
0828: // CrawlController instance.
0829: Checkpoint cp = CrawlController
0830: .getCheckpointRecover(getSettingsHandler().getOrder());
0831: if (cp != null) {
0832: try {
0833: controller = (MBeanCrawlController) CheckpointUtils
0834: .readObjectFromFile(MBeanCrawlController.class,
0835: cp.getDirectory());
0836: } catch (FileNotFoundException e) {
0837: throw new InitializationException(e);
0838: } catch (IOException e) {
0839: throw new InitializationException(e);
0840: } catch (ClassNotFoundException e) {
0841: throw new InitializationException(e);
0842: }
0843: } else {
0844: controller = new MBeanCrawlController();
0845: }
0846: return controller;
0847: }
0848:
    /**
     * Factory for the controller used by this job.
     * @return a fresh MBeanCrawlController (no checkpoint recovery).
     */
    protected CrawlController createCrawlController() {
        return new MBeanCrawlController();
    }
0852:
    /**
     * Prepare this job for crawling: create/recover the controller,
     * subscribe for crawl-status events, initialize settings, and register
     * this job as a JMX MBean. On success, status is set to STATUS_RUNNING
     * and the job is flagged running. On failure, status is set to
     * STATUS_MISCONFIGURED, the controller is dropped, and the
     * InitializationException is rethrown.
     *
     * <p>NOTE(review): the statement order below matters (listener must be
     * added before initialize; MBean registration needs the built
     * openMBeanInfo) -- do not reorder.
     *
     * @throws InitializationException if controller setup, settings
     * initialization, or MBean registration fails.
     */
    public void setupForCrawlStart() throws InitializationException {
        try {
            this .controller = setupCrawlController();
            // Register as listener to get job finished notice.
            this .controller.addCrawlStatusListener(this );
            this .controller.initialize(getSettingsHandler());
            // Set the crawl job this MBeanCrawlController needs to worry about.
            ((MBeanCrawlController) this .controller).setCrawlJob(this );
            // Create our mbean description and register our crawljob.
            this .openMBeanInfo = buildMBeanInfo();
            try {
                Heritrix.registerMBean(this , getJmxJobName(),
                        CRAWLJOB_JMXMBEAN_TYPE);
            } catch (InstanceAlreadyExistsException e) {
                throw new InitializationException(e);
            } catch (MBeanRegistrationException e) {
                throw new InitializationException(e);
            } catch (NotCompliantMBeanException e) {
                throw new InitializationException(e);
            }
        } catch (InitializationException e) {
            // Can't load current job since it is misconfigured.
            setStatus(CrawlJob.STATUS_MISCONFIGURED);
            setErrorMessage("A fatal InitializationException occured when "
                    + "loading job:\n" + e.getMessage());
            // Log to stdout so its seen in logs as well as in UI.
            e.printStackTrace();
            this .controller = null;
            throw e;
        }
        setStatus(CrawlJob.STATUS_RUNNING);
        setRunning(true);
    }
0886:
0887: public void stopCrawling() {
0888: if (this .controller != null) {
0889: this .controller.requestCrawlStop();
0890: }
0891: }
0892:
0893: /**
0894: * @return One-line Frontier report.
0895: */
0896: public String getFrontierOneLine() {
0897: if (this .controller == null
0898: || this .controller.getFrontier() == null) {
0899: return "Crawler not running";
0900: }
0901: return this .controller.getFrontier().singleLineReport();
0902: }
0903:
0904: /**
0905: * @param reportName Name of report to write.
0906: * @return A report of the frontier's status.
0907: */
0908: public String getFrontierReport(final String reportName) {
0909: if (this .controller == null
0910: || this .controller.getFrontier() == null) {
0911: return "Crawler not running";
0912: }
0913: return ArchiveUtils.writeReportToString(this .controller
0914: .getFrontier(), reportName);
0915: }
0916:
0917: /**
0918: * Write the requested frontier report to the given PrintWriter
0919: * @param reportName Name of report to write.
0920: * @param writer Where to write to.
0921: */
0922: public void writeFrontierReport(String reportName,
0923: PrintWriter writer) {
0924: if (this .controller == null
0925: || this .controller.getFrontier() == null) {
0926: writer.println("Crawler not running.");
0927: return;
0928: }
0929: this .controller.getFrontier().reportTo(reportName, writer);
0930: }
0931:
0932: /**
0933: * @return One-line threads report.
0934: */
0935: public String getThreadOneLine() {
0936: if (this .controller == null) {
0937: return "Crawler not running";
0938: }
0939: return this .controller.oneLineReportThreads();
0940: }
0941:
0942: /**
0943: * Get the CrawlControllers ToeThreads report for the running crawl.
0944: * @return The CrawlControllers ToeThreads report
0945: */
0946: public String getThreadsReport() {
0947: if (this .controller == null) {
0948: return "Crawler not running";
0949: }
0950: return ArchiveUtils.writeReportToString(this .controller
0951: .getToePool(), null);
0952: }
0953:
0954: /**
0955: * Write the requested threads report to the given PrintWriter
0956: * @param reportName Name of report to write.
0957: * @param writer Where to write to.
0958: */
0959: public void writeThreadsReport(String reportName, PrintWriter writer) {
0960: if (this .controller == null
0961: || this .controller.getFrontier() == null) {
0962: writer.println("Crawler not running.");
0963: return;
0964: }
0965: this .controller.getToePool().reportTo(reportName, writer);
0966: }
0967:
0968: /**
0969: * Kills a thread. For details see
0970: * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
0971: * ToePool.killThread(int, boolean)}.
0972: * @param threadNumber Thread to kill.
0973: * @param replace Should thread be replaced.
0974: * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
0975: */
0976: public void killThread(int threadNumber, boolean replace) {
0977: if (this .controller == null) {
0978: return;
0979: }
0980: this .controller.killThread(threadNumber, replace);
0981: }
0982:
0983: /**
0984: * Get the Processors report for the running crawl.
0985: * @return The Processors report for the running crawl.
0986: */
0987: public String getProcessorsReport() {
0988: if (this .controller == null) {
0989: return "Crawler not running";
0990: }
0991: return ArchiveUtils.writeReportToString(this .controller,
0992: CrawlController.PROCESSORS_REPORT);
0993: }
0994:
0995: /**
0996: * Returns the directory where the configuration files for this job are
0997: * located.
0998: *
0999: * @return the directory where the configuration files for this job are
1000: * located
1001: */
    /**
     * Returns the directory where the configuration files for this job are
     * located.
     *
     * NOTE(review): despite the name and the sentence above, this returns
     * the path of the order *file* (settingsHandler.getOrderFile()), not
     * its parent directory -- callers appear to rely on this; confirm
     * before changing.
     *
     * @return path of this job's order file.
     */
    public String getSettingsDirectory() {
        return settingsHandler.getOrderFile().getPath();
    }
1005:
1006: /**
1007: * Returns the path of the job's base directory. For profiles this is always
1008: * equal to <code>new File(getSettingsDirectory())</code>.
1009: * @return the path of the job's base directory.
1010: */
1011: public File getDirectory() {
1012: return isProfile ? new File(getSettingsDirectory()) : jobDir;
1013: }
1014:
1015: /**
1016: * Get the error message associated with this job. Will return null if there
1017: * is no error message.
1018: * @return the error message associated with this job
1019: */
    /**
     * Get the error message associated with this job. Will return null if
     * there is no error message.
     * @return the error message associated with this job, or null.
     */
    public String getErrorMessage() {
        return errorMessage;
    }
1023:
1024: /**
1025: * Set an error message for this job. Generally this only occurs if the job
1026: * is misconfigured.
1027: * @param string the error message associated with this job
1028: */
    /**
     * Set an error message for this job. Generally this only occurs if the
     * job is misconfigured. Persists the change via writeJobFile().
     * @param string the error message associated with this job
     */
    public void setErrorMessage(String string) {
        errorMessage = string;
        writeJobFile(); // Save changes to disk immediately.
    }
1033:
1034: /**
1035: * @return Returns the number of journal entries.
1036: */
    /**
     * @return Returns the number of journal entries.
     */
    public int getNumberOfJournalEntries() {
        return numberOfJournalEntries;
    }
1040:
1041: /**
1042: * @param numberOfJournalEntries The number of journal entries to set.
1043: */
    /**
     * Set the number of journal entries; persists the change via
     * writeJobFile().
     * @param numberOfJournalEntries The number of journal entries to set.
     */
    public void setNumberOfJournalEntries(int numberOfJournalEntries) {
        this.numberOfJournalEntries = numberOfJournalEntries;
        writeJobFile(); // Save changes to disk immediately.
    }
1048:
1049: /**
1050: * @return Returns the error handler for this crawl job
1051: */
    /**
     * @return Returns the error handler for this crawl job.
     */
    public CrawlJobErrorHandler getErrorHandler() {
        return errorHandler;
    }
1055:
1056: /**
1057: * Read all the checkpoints found in the job's checkpoints
1058: * directory into Checkpoint instances
1059: * @return Collection containing list of all checkpoints.
1060: */
1061: public Collection scanCheckpoints() {
1062: File checkpointsDirectory = settingsHandler.getOrder()
1063: .getCheckpointsDirectory();
1064: File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1065: Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1066: if (perCheckpointDirs != null) {
1067: for (int i = 0; i < perCheckpointDirs.length; i++) {
1068: Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1069: checkpoints.add(cp);
1070: }
1071: }
1072: return checkpoints;
1073: }
1074:
1075: /**
1076: * Returns the absolute path of the specified log.
1077: * Note: If crawl has not begun, this file may not exist.
1078: * @param log
1079: * @return the absolute path for the specified log.
1080: * @throws AttributeNotFoundException
1081: * @throws ReflectionException
1082: * @throws MBeanException
1083: */
    /**
     * Returns the absolute path of the specified log.
     * A relative logs path is resolved against the crawl's disk
     * directory (CrawlOrder.ATTR_DISK_PATH relative to the working
     * directory).
     * Note: If crawl has not begun, this file may not exist.
     * @param log Log file name (e.g. "crawl.log").
     * @return the absolute path for the specified log.
     * @throws AttributeNotFoundException
     * @throws ReflectionException
     * @throws MBeanException
     */
    public String getLogPath(String log)
            throws AttributeNotFoundException, MBeanException,
            ReflectionException {
        String logsPath = (String) settingsHandler.getOrder()
            .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
        CrawlOrder order = settingsHandler.getOrder();
        // NOTE(review): different getAttribute overload here (explicit
        // null context) than the call above -- presumably equivalent for
        // these order-level attributes; confirm.
        String diskPath = (String) order.getAttribute(null,
            CrawlOrder.ATTR_DISK_PATH);
        File disk = settingsHandler
            .getPathRelativeToWorkingDirectory(diskPath);
        File f = new File(logsPath, log);
        // Root relative logs paths in the crawl's disk directory.
        if (!f.isAbsolute()) {
            f = new File(disk.getPath(), f.getPath());
        }
        return f.getAbsolutePath();
    }
1100:
1101: // OpenMBean implementation.
1102:
1103: protected void pause() {
1104: if (this .controller != null
1105: && this .controller.isPaused() == false) {
1106: this .controller.requestCrawlPause();
1107: }
1108: }
1109:
1110: protected void resume() {
1111: if (this .controller != null) {
1112: this .controller.requestCrawlResume();
1113: }
1114: }
1115:
1116: /**
1117: * @throws IllegalStateException Thrown if crawl is not paused.
1118: */
    /**
     * Request a checkpoint of the running crawl. Delegates to the
     * controller; no-op when no crawl is running.
     * @throws IllegalStateException Thrown if crawl is not paused.
     */
    protected void checkpoint() throws IllegalStateException {
        if (this.controller != null) {
            this.controller.requestCrawlCheckpoint();
        }
    }
1124:
1125: /**
1126: * @return True if checkpointing.
1127: */
1128: public boolean isCheckpointing() {
1129: return this .controller != null ? this .controller
1130: .isCheckpointing() : false;
1131: }
1132:
1133: /**
1134: * If its a HostQueuesFrontier, needs to be flushed for the queued.
1135: */
    /**
     * If its a HostQueuesFrontier, needs to be flushed for the queued.
     * Intentionally a no-op here; kept as a hook called after URI
     * imports (see importUri/importUris).
     */
    protected void flush() {
        // Nothing to do.
    }
1139:
1140: /**
1141: * Delete any URI from the frontier of the current (paused) job that match
1142: * the specified regular expression. If the current job is not paused (or
1143: * there is no current job) nothing will be done.
1144: * @param regexpr Regular expression to delete URIs by.
1145: * @return the number of URIs deleted
1146: */
1147: public long deleteURIsFromPending(String regexpr) {
1148: return (this .controller != null
1149: && this .controller.getFrontier() != null && this .controller
1150: .isPaused()) ? this .controller.getFrontier()
1151: .deleteURIs(regexpr) : 0;
1152: }
1153:
    /**
     * String-argument convenience overload (e.g. for JMX invocation).
     * @param file Path or URL of file with URIs.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param force "true" (exactly, case-sensitive) to force revisit.
     * @return A display string with a count of all added.
     */
    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }
1157:
    /**
     * Import URIs, treating none of them as seeds.
     * @param fileOrUrl Path or URL of file with URIs.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string with a count of all added.
     */
    public String importUris(final String fileOrUrl,
            final String style, final boolean forceRevisit) {
        return importUris(fileOrUrl, style, forceRevisit, false);
    }
1162:
1163: /**
1164: * @param fileOrUrl Name of file w/ seeds.
1165: * @param style What style of seeds -- crawl log, recovery journal, or
1166: * seeds file.
1167: * @param forceRevisit Should we revisit even if seen before?
1168: * @param areSeeds Is the file exclusively seeds?
1169: * @return A display string that has a count of all added.
1170: */
1171: public String importUris(final String fileOrUrl,
1172: final String style, final boolean forceRevisit,
1173: final boolean areSeeds) {
1174: InputStream is = IoUtils.getInputStream(this .controller
1175: .getDisk(), fileOrUrl);
1176: String message = null;
1177: // Do we have an inputstream?
1178: if (is == null) {
1179: message = "Failed to get inputstream from " + fileOrUrl;
1180: logger.severe(message);
1181: } else {
1182: int addedCount = importUris(is, style, forceRevisit,
1183: areSeeds);
1184: message = Integer.toString(addedCount)
1185: + " URIs added from " + fileOrUrl;
1186: }
1187: return message;
1188: }
1189:
    /**
     * Import URIs from a stream, treating none of them as seeds.
     * @param is Stream to use as URI source.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param forceRevisit Should we revisit even if seen before?
     * @return Count of added URIs.
     */
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return importUris(is, style, forceRevisit, false);
    }
1194:
1195: /**
1196: * Import URIs.
1197: * @param is Stream to use as URI source.
1198: * @param style Style in which URIs are rendored. Currently support for
1199: * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1200: * format (i.e <code>default</code>) where <code>default</code> style is
1201: * a UURI per line (comments allowed).
1202: * @param forceRevisit Whether we should revisit this URI even if we've
1203: * visited it previously.
1204: * @param areSeeds Are the imported URIs seeds?
1205: * @return Count of added URIs.
1206: */
1207: protected int importUris(InputStream is, String style,
1208: boolean forceRevisit, final boolean areSeeds) {
1209: // Figure the regex to use parsing each line of input stream.
1210: String extractor;
1211: String output;
1212: if (CRAWL_LOG_STYLE.equals(style)) {
1213: // Skip first 3 fields
1214: extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
1215: output = "$1";
1216: } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1217: // Skip the begin-of-line directive
1218: extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*";
1219: output = "$1";
1220: } else {
1221: extractor = RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1222: output = RegexpLineIterator.ENTRY;
1223: }
1224:
1225: // Read the input stream.
1226: BufferedReader br = null;
1227: int addedCount = 0;
1228: try {
1229: br = new BufferedReader(new InputStreamReader(is));
1230: Iterator iter = new RegexpLineIterator(
1231: new LineReadingIterator(br),
1232: RegexpLineIterator.COMMENT_LINE, extractor, output);
1233: while (iter.hasNext()) {
1234: try {
1235: importUri((String) iter.next(), forceRevisit,
1236: areSeeds, false);
1237: addedCount++;
1238: } catch (URIException e) {
1239: e.printStackTrace();
1240: }
1241: }
1242: br.close();
1243: flush();
1244: } catch (IOException e) {
1245: e.printStackTrace();
1246: }
1247: return addedCount;
1248: }
1249:
1250: /**
1251: * Schedule a uri.
1252: * @param uri Uri to schedule.
1253: * @param forceFetch Should it be forcefetched.
1254: * @param isSeed True if seed.
1255: * @throws URIException
1256: */
    /**
     * Schedule a uri, flushing the frontier afterwards.
     * @param uri Uri to schedule.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed) throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }
1261:
1262: /**
1263: * Schedule a uri.
1264: * @param str String that can be: 1. a UURI, 2. a snippet of the
1265: * crawl.log line, or 3. a snippet from recover log. See
1266: * {@link #importUris(InputStream, String, boolean)} for how it subparses
1267: * the lines from crawl.log and recover.log.
1268: * @param forceFetch Should it be forcefetched.
1269: * @param isSeed True if seed.
1270: * @param isFlush If true, flush the frontier IF it implements
1271: * flushing.
1272: * @throws URIException
1273: */
    /**
     * Schedule a uri.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException on malformed input.
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
            throws URIException {
        CandidateURI caUri = CandidateURI.fromString(str);
        caUri.setForceFetch(forceFetch);
        if (isSeed) {
            caUri.setIsSeed(isSeed);
            if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
                // Danger of double-add of seeds because of this code here.
                // Only call addSeed if no via. If a via, the schedule will
                // take care of updating scope.
                this.controller.getScope().addSeed(caUri);
            }
        }
        // Always schedule, seed or not.
        this.controller.getFrontier().schedule(caUri);
        if (isFlush) {
            flush();
        }
    }
1293:
1294: /**
1295: * @return Our mbean info (Needed for CrawlJob to qualify as a
1296: * DynamicMBean).
1297: */
    /**
     * @return Our mbean info (Needed for CrawlJob to qualify as a
     * DynamicMBean). Built by buildMBeanInfo() during crawl setup.
     */
    public MBeanInfo getMBeanInfo() {
        return this.openMBeanInfo;
    }
1301:
1302: /**
1303: * Build up the MBean info for Heritrix main.
1304: * @return Return created mbean info instance.
1305: * @throws InitializationException
1306: */
    /**
     * Build up the OpenMBean info describing this crawl job: its own
     * attributes, the crawl-order attributes (recursively), a subset of
     * bdbje environment attributes/operations, the JMX operations, and
     * the notifications it emits.
     *
     * Requires this.controller to be initialized (reads its order and
     * bdbje environment).
     *
     * @return Return created mbean info instance.
     * @throws InitializationException if the bdbje helper cannot be
     * created.
     */
    protected OpenMBeanInfoSupport buildMBeanInfo()
            throws InitializationException {
        // Start adding my attributes.
        List<OpenMBeanAttributeInfo> attributes = new ArrayList<OpenMBeanAttributeInfo>();

        // Attributes.
        attributes
            .add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
                "Crawl job name", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
            "Short basic status message", SimpleType.STRING, true,
            false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            FRONTIER_SHORT_REPORT_ATTR, "Short frontier report",
            SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            THREADS_SHORT_REPORT_ATTR, "Short threads report",
            SimpleType.STRING, true, false, false));
        attributes
            .add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
                "Crawl job UID", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            TOTAL_DATA_ATTR, "Total data received",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CRAWL_TIME_ATTR, "Crawl time", SimpleType.LONG, true,
            false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CURRENT_DOC_RATE_ATTR,
            "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CURRENT_KB_RATE_ATTR, "Current crawling rate (Kb/sec)",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            THREAD_COUNT_ATTR, "Active thread count",
            SimpleType.INTEGER, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
            "Crawling rate (Docs/sec)", SimpleType.DOUBLE, true,
            false, false));
        // NOTE(review): description says "Current" but by symmetry with
        // DOC_RATE_ATTR this is the overall rate -- confirm.
        attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            DOWNLOAD_COUNT_ATTR, "Count of downloaded documents",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            DISCOVERED_COUNT_ATTR, "Count of discovered documents",
            SimpleType.LONG, true, false, false));

        // Add in the crawl order attributes.
        addCrawlOrderAttributes(this.getController().getOrder(),
            attributes);

        // Add the bdbje attributes. Convert to open mbean attributes.
        // First do bdbeje setup. Then add a subset of the bdbje attributes.
        // Keep around the list of names as a convenience for when it comes
        // time to test if attribute is supported.
        Environment env = this.controller.getBdbEnvironment();
        try {
            this.bdbjeMBeanHelper = new JEMBeanHelper(env.getConfig(),
                env.getHome(), true);
        } catch (DatabaseException e) {
            e.printStackTrace();
            InitializationException ie = new InitializationException(e
                .getMessage());
            // Carry the original stack trace on the wrapper.
            ie.setStackTrace(e.getStackTrace());
            throw ie;
        }
        this.bdbjeAttributeNameList = Arrays.asList(new String[] {
            JEMBeanHelper.ATT_ENV_HOME, JEMBeanHelper.ATT_OPEN,
            JEMBeanHelper.ATT_IS_READ_ONLY,
            JEMBeanHelper.ATT_IS_TRANSACTIONAL,
            JEMBeanHelper.ATT_CACHE_SIZE,
            JEMBeanHelper.ATT_CACHE_PERCENT,
            JEMBeanHelper.ATT_LOCK_TIMEOUT,
            JEMBeanHelper.ATT_IS_SERIALIZABLE,
            JEMBeanHelper.ATT_SET_READ_ONLY, });
        addBdbjeAttributes(attributes, this.bdbjeMBeanHelper
            .getAttributeList(env), this.bdbjeAttributeNameList);

        // Operations.
        List<OpenMBeanOperationInfo> operations = new ArrayList<OpenMBeanOperationInfo>();
        OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
        args[0] = new OpenMBeanParameterInfoSupport("url",
            "URL to add to the frontier", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URL is to be force fetched",
            SimpleType.BOOLEAN);
        args[2] = new OpenMBeanParameterInfoSupport("seed",
            "True if URL is a seed", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(
            IMPORT_URI_OPER, "Add passed URL to the frontier",
            args, SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
            "Path or URL to file of URLs", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("style",
            "Format format:default|crawlLog|recoveryJournal",
            SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URLs are to be force fetched",
            SimpleType.BOOLEAN);
        args[3] = new OpenMBeanParameterInfoSupport("seed",
            "True if all content are seeds.", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(
            IMPORT_URIS_OPER,
            "Add file of passed URLs to the frontier", args,
            SimpleType.STRING, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
            "Pause crawling (noop if already paused)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
            "Resume crawling (noop if already resumed)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[1];
        args[0] = new OpenMBeanParameterInfoSupport("name",
            "Name of report ('all', 'standard', etc.).",
            SimpleType.STRING);
        operations.add(new OpenMBeanOperationInfoSupport(
            FRONTIER_REPORT_OPER, "Full frontier report", args,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            THREADS_REPORT_OPER, "Full thread report", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            SEEDS_REPORT_OPER, "Seeds report", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_OPER,
            "Progress statistics at time of invocation", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_LEGEND_OPER,
            "Progress statistics legend", null, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            CHECKPOINT_OPER, "Start a checkpoint", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        // Add bdbje operations. Add subset only. Keep around the list so have
        // it to hand when figuring what operations are supported. Usual actual
        // Strings because not accessible from JEMBeanHelper.
        this.bdbjeOperationsNameList = Arrays.asList(new String[] {
            "cleanLog", "evictMemory", "checkpoint", "sync",
            "getEnvironmentStatsToString", "getLockStatsToString",
            "getDatabaseNames", OP_DB_STAT });
        addBdbjeOperations(operations, this.bdbjeMBeanHelper
            .getOperationList(env), this.bdbjeOperationsNameList);

        // Register notifications
        List<MBeanNotificationInfo> notifications = new ArrayList<MBeanNotificationInfo>();
        notifications.add(new MBeanNotificationInfo(new String[] {
            "crawlStarted", "crawlEnding", "crawlPaused",
            "crawlResuming", PROG_STATS }, this.getClass()
            .getName()
            + ".notifications",
            "CrawlStatusListener events and progress statistics as "
                + "notifications"));
        MBeanNotificationInfo[] notificationsArray = new MBeanNotificationInfo[notifications
            .size()];
        notifications.toArray(notificationsArray);

        // Build the info object.
        OpenMBeanAttributeInfoSupport[] attributesArray = new OpenMBeanAttributeInfoSupport[attributes
            .size()];
        attributes.toArray(attributesArray);
        OpenMBeanOperationInfoSupport[] operationsArray = new OpenMBeanOperationInfoSupport[operations
            .size()];
        operations.toArray(operationsArray);
        return new OpenMBeanInfoSupport(this.getClass().getName(),
            "Current Crawl Job as OpenMBean", attributesArray,
            new OpenMBeanConstructorInfoSupport[] {},
            operationsArray, notificationsArray);
    }
1493:
1494: protected void addBdbjeAttributes(
1495: final List<OpenMBeanAttributeInfo> attributes,
1496: final List<MBeanAttributeInfo> bdbjeAttributes,
1497: final List<String> bdbjeNamesToAdd) {
1498: for (MBeanAttributeInfo info : bdbjeAttributes) {
1499: if (bdbjeNamesToAdd.contains(info.getName())) {
1500: attributes.add(JmxUtils
1501: .convertToOpenMBeanAttribute(info));
1502: }
1503: }
1504: }
1505:
    /**
     * Convert the named subset of bdbje operations to OpenMBean
     * operations and append them to the given list. The dbStat
     * operation gets special treatment: its published signature is
     * wrong, so an extra "name" parameter is appended and its return
     * type is forced to STRING.
     * @param operations Destination list.
     * @param bdbjeOperations All operations published by the bdbje helper.
     * @param bdbjeNamesToAdd Names of the operations to keep.
     */
    protected void addBdbjeOperations(
            final List<OpenMBeanOperationInfo> operations,
            final List<MBeanOperationInfo> bdbjeOperations,
            final List<String> bdbjeNamesToAdd) {
        for (MBeanOperationInfo info : bdbjeOperations) {
            if (bdbjeNamesToAdd.contains(info.getName())) {
                OpenMBeanOperationInfo omboi = null;
                if (info.getName().equals(OP_DB_STAT)) {
                    // Db stats needs special handling. The published
                    // signature is wrong and its return type is awkward.
                    // Handle it.
                    omboi = JmxUtils.convertToOpenMBeanOperation(info,
                        null, SimpleType.STRING);
                    MBeanParameterInfo[] params = omboi.getSignature();
                    // Rebuild the signature with a trailing database-name
                    // parameter the published signature omits.
                    OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[params.length + 1];
                    for (int ii = 0; ii < params.length; ii++) {
                        args[ii] = (OpenMBeanParameterInfo) params[ii];
                    }
                    args[params.length] = new OpenMBeanParameterInfoSupport(
                        "name", "Database name", SimpleType.STRING);
                    omboi = new OpenMBeanOperationInfoSupport(omboi
                        .getName(), omboi.getDescription(), args,
                        omboi.getReturnOpenType(), omboi
                            .getImpact());
                } else {
                    omboi = JmxUtils.convertToOpenMBeanOperation(info);
                }
                operations.add(omboi);
            }
        }
    }
1537:
    /**
     * Recursively walk a settings ComplexType (starting from the crawl
     * order) and append an OpenMBean attribute for every leaf setting
     * whose type maps to an OpenType. Names in ORDER_EXCLUDE are
     * skipped; TextField settings are exposed as STRING; nested
     * ComplexTypes are recursed into; anything else (e.g. StringList)
     * is only logged.
     * @param type Settings node to walk.
     * @param attributes Destination list of OpenMBean attributes.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null); i
                .hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo) i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Skip.
                continue;
            }
            // Attribute is published under its absolute settings path.
            String absoluteName = type.getAbsoluteName() + "/"
                + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // Description can't be empty.
                    description = info.getName();
                }
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description, JmxUtils
                        .getOpenType(info.getType()), true,
                    true, false));
            } else if (info.isComplexType()) {
                try {
                    ComplexType c = (ComplexType) type
                        .getAttribute(info.getName());
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // Special handling for TextField. Use the STRING OpenType.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, info.getDescription(),
                    SimpleType.STRING, true, true, false));
            } else {
                // Looks like only type we don't currently handle is StringList.
                // Figure how to do it. Add as AttributeList?
                logger.fine(info.getType());
            }
        }
    }
1586:
1587: public Object getAttribute(String attribute_name)
1588: throws AttributeNotFoundException {
1589: if (attribute_name == null) {
1590: throw new RuntimeOperationsException(
1591: new IllegalArgumentException(
1592: "Attribute name cannot be null"),
1593: "Cannot call getAttribute with null attribute name");
1594: }
1595:
1596: // If no controller, we can't do any work in here.
1597: if (this .controller == null) {
1598: throw new RuntimeOperationsException(
1599: new NullPointerException("Controller is null"),
1600: "Controller is null");
1601: }
1602:
1603: // Is it a bdbje attribute?
1604: if (this .bdbjeAttributeNameList.contains(attribute_name)) {
1605: try {
1606: return this .bdbjeMBeanHelper.getAttribute(
1607: this .controller.getBdbEnvironment(),
1608: attribute_name);
1609: } catch (MBeanException e) {
1610: throw new RuntimeOperationsException(
1611: new RuntimeException(e));
1612: }
1613: }
1614:
1615: // Is it a crawl-order attribute?
1616: if (attribute_name.startsWith(this .controller.getOrder()
1617: .getAbsoluteName())) {
1618: return getCrawlOrderAttribute(attribute_name);
1619: }
1620:
1621: if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1622: throw new AttributeNotFoundException("Attribute "
1623: + attribute_name + " is unimplemented.");
1624: }
1625:
1626: // The pattern in the below is to match an attribute and when found
1627: // do a return out of if clause. Doing it this way, I can fall
1628: // on to the AttributeNotFoundException for case where we've an
1629: // attribute but no handler.
1630: if (attribute_name.equals(STATUS_ATTR)) {
1631: return getCrawlStatus();
1632: }
1633: if (attribute_name.equals(NAME_ATTR)) {
1634: return getJobName();
1635: }
1636: if (attribute_name.equals(UID_ATTR)) {
1637: return getUID();
1638: }
1639: if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1640: return new Long(this .controller == null
1641: && this .controller.getStatistics() != null ? 0
1642: : this .controller.getStatistics()
1643: .totalBytesWritten());
1644: }
1645: if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1646: return new Long(this .controller == null
1647: && this .controller.getStatistics() != null ? 0
1648: : this .controller.getStatistics()
1649: .getCrawlerTotalElapsedTime() / 1000);
1650: }
1651: if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1652: return new Double(this .controller == null
1653: && this .controller.getStatistics() != null ? 0
1654: : this .controller.getStatistics()
1655: .currentProcessedDocsPerSec());
1656: }
1657: if (attribute_name.equals(DOC_RATE_ATTR)) {
1658: return new Double(this .controller == null
1659: && this .controller.getStatistics() != null ? 0
1660: : this .controller.getStatistics()
1661: .processedDocsPerSec());
1662: }
1663: if (attribute_name.equals(KB_RATE_ATTR)) {
1664: return new Long(this .controller == null
1665: && this .controller.getStatistics() != null ? 0
1666: : this .controller.getStatistics()
1667: .currentProcessedKBPerSec());
1668: }
1669: if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1670: return new Long(this .controller == null
1671: && this .controller.getStatistics() != null ? 0
1672: : this .controller.getStatistics()
1673: .processedKBPerSec());
1674: }
1675: if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1676: return new Integer(this .controller == null
1677: && this .controller.getStatistics() != null ? 0
1678: : this .controller.getStatistics()
1679: .activeThreadCount());
1680: }
1681: if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1682: return getFrontierOneLine();
1683: }
1684: if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1685: return getThreadOneLine();
1686: }
1687: if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1688: return new Long(this .controller == null
1689: && this .controller.getStatistics() != null ? 0
1690: : this .controller.getStatistics().totalCount());
1691: }
1692: if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1693: return new Long(this .controller == null
1694: && this .controller.getStatistics() != null ? 0
1695: : this .controller.getStatistics()
1696: .successfullyFetchedCount());
1697: }
1698:
1699: throw new AttributeNotFoundException("Attribute "
1700: + attribute_name + " not found.");
1701: }
1702:
    /**
     * Look up a crawl-order attribute given its absolute settings path.
     * Strips the order's own absolute-name prefix and delegates to the
     * recursive two-argument overload. Failures are logged and null is
     * returned rather than propagated (this feeds JMX getAttributes,
     * which is best-effort).
     * @param attribute_name Absolute settings path of the attribute.
     * @return Attribute value, or null on failure.
     */
    protected Object getCrawlOrderAttribute(final String attribute_name) {
        CrawlOrder order = this.getController().getOrder();
        Object result = null;
        try {
            result = getCrawlOrderAttribute(attribute_name
                .substring(order.getAbsoluteName().length()), order);
        } catch (NullPointerException e) {
            // NOTE(review): catching NPE is deliberate here -- missing
            // intermediate nodes surface as NPEs from the recursion.
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        }
        return result;
    }
1724:
1725: protected Object getCrawlOrderAttribute(
1726: final String attribute_name, final ComplexType ct)
1727: throws AttributeNotFoundException, MBeanException,
1728: ReflectionException {
1729: String subName = attribute_name.startsWith("/") ? attribute_name
1730: .substring(1)
1731: : attribute_name;
1732: int index = subName.indexOf("/");
1733: if (index <= 0) {
1734: MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1735: // Special handling for TextField.
1736: return info.getType().equals(TextField.class.getName()) ? ct
1737: .getAttribute(subName).toString()
1738: : ct.getAttribute(subName);
1739: }
1740: return getCrawlOrderAttribute(subName.substring(index + 1),
1741: (ComplexType) ct.getAttribute(subName.substring(0,
1742: index)));
1743: }
1744:
1745: public AttributeList getAttributes(String[] attributeNames) {
1746: if (attributeNames == null) {
1747: throw new RuntimeOperationsException(
1748: new IllegalArgumentException(
1749: "attributeNames[] cannot be " + "null"),
1750: "Cannot call getAttributes with null attribute "
1751: + "names");
1752: }
1753:
1754: // If no controller, we can't do any work in here.
1755: if (this .controller == null) {
1756: throw new RuntimeOperationsException(
1757: new NullPointerException("Controller is null"),
1758: "Controller is null");
1759: }
1760:
1761: AttributeList resultList = new AttributeList();
1762: if (attributeNames.length == 0) {
1763: return resultList;
1764: }
1765: for (int i = 0; i < attributeNames.length; i++) {
1766: try {
1767: Object value = getAttribute(attributeNames[i]);
1768: resultList.add(new Attribute(attributeNames[i], value));
1769: } catch (Exception e) {
1770: e.printStackTrace();
1771: }
1772: }
1773: return (resultList);
1774: }
1775:
1776: public void setAttribute(Attribute attribute)
1777: throws AttributeNotFoundException {
1778: // Is it a crawl order attribute?
1779: CrawlOrder order = this .getController().getOrder();
1780: String attName = attribute.getName();
1781: if (attName.startsWith(order.getAbsoluteName())) {
1782: try {
1783: setCrawlOrderAttribute(attribute.getName().substring(
1784: order.getAbsoluteName().length()), order,
1785: attribute);
1786: } catch (NullPointerException e) {
1787: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1788: } catch (AttributeNotFoundException e) {
1789: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1790: } catch (MBeanException e) {
1791: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1792: } catch (ReflectionException e) {
1793: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1794: } catch (InvalidAttributeValueException e) {
1795: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1796: }
1797: return;
1798: }
1799:
1800: // Is it a bdbje attribute?
1801: if (this .bdbjeAttributeNameList.contains(attName)) {
1802: try {
1803: this .bdbjeMBeanHelper.setAttribute(this .controller
1804: .getBdbEnvironment(), attribute);
1805: } catch (AttributeNotFoundException e) {
1806: throw new RuntimeOperationsException(
1807: new RuntimeException(e));
1808: } catch (InvalidAttributeValueException e) {
1809: throw new RuntimeOperationsException(
1810: new RuntimeException(e));
1811: }
1812: return;
1813: }
1814:
1815: // Else, we don't know how to handle this attribute.
1816: throw new AttributeNotFoundException("Attribute " + attName
1817: + " can not be set.");
1818: }
1819:
1820: protected void setCrawlOrderAttribute(final String attribute_name,
1821: final ComplexType ct, final Attribute attribute)
1822: throws AttributeNotFoundException,
1823: InvalidAttributeValueException, MBeanException,
1824: ReflectionException {
1825: String subName = attribute_name.startsWith("/") ? attribute_name
1826: .substring(1)
1827: : attribute_name;
1828: int index = subName.indexOf("/");
1829: if (index <= 0) {
1830: ct
1831: .setAttribute(new Attribute(subName, attribute
1832: .getValue()));
1833: return;
1834: }
1835: setCrawlOrderAttribute(subName.substring(index + 1),
1836: (ComplexType) ct.getAttribute(subName.substring(0,
1837: index)), attribute);
1838: }
1839:
1840: public AttributeList setAttributes(AttributeList attributes) {
1841: if (attributes == null) {
1842: throw new RuntimeOperationsException(
1843: new IllegalArgumentException(
1844: "attributeNames[] cannot be " + "null"),
1845: "Cannot call getAttributes with null attribute "
1846: + "names");
1847: }
1848:
1849: AttributeList resultList = new AttributeList();
1850: if (attributes.size() == 0) {
1851: return resultList;
1852: }
1853: for (int i = 0; i < attributes.size(); i++) {
1854: try {
1855: Attribute attr = (Attribute) attributes.get(i);
1856: setAttribute(attr);
1857: String an = attr.getName();
1858: Object newValue = getAttribute(an);
1859: resultList.add(new Attribute(an, newValue));
1860: } catch (Exception e) {
1861: e.printStackTrace();
1862: }
1863: }
1864: return resultList;
1865: }
1866:
    /**
     * DynamicMBean invoke: dispatches a named JMX operation.
     *
     * Bdbje operations are delegated to the bdbje MBean helper against the
     * controller's bdbje environment; the remaining operations map onto
     * methods of this job and its statistics tracker.
     *
     * @param operationName Name of the operation to run; must not be null.
     * @param params Operation parameters (counts checked per operation).
     * @param signature Parameter signature (currently unexploited; see
     * TODO below).
     * @return Operation result, or null for void-like operations.
     * @throws ReflectionException wrapping NoSuchMethodException when the
     * operation name is unknown.
     */
    public Object invoke(String operationName, Object[] params,
            String[] signature) throws ReflectionException {
        if (operationName == null) {
            throw new RuntimeOperationsException(
                new IllegalArgumentException(
                    "Operation name cannot be null"),
                "Cannot call invoke with null operation name");
        }

        if (this.bdbjeOperationsNameList.contains(operationName)) {
            try {
                Object o = this.bdbjeMBeanHelper.invoke(this.controller
                    .getBdbEnvironment(), operationName, params,
                    signature);
                // If OP_DB_STAT, return String version of result.
                if (operationName.equals(OP_DB_STAT)) {
                    return o.toString();
                }
                return o;
            } catch (MBeanException e) {
                throw new RuntimeOperationsException(
                    new RuntimeException(e));
            }
        }

        // TODO: Exploit passed signature.

        // The pattern in the below is to match an operation and when found
        // do a return out of if clause. Doing it this way, I can fall
        // on to the final ReflectionException for case where we've an
        // attribute but no handler.
        if (operationName.equals(IMPORT_URI_OPER)) {
            // params: (uri, forceFetch, isSeed)
            JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
            mustBeCrawling();
            try {
                importUri((String) params[0], ((Boolean) params[1])
                    .booleanValue(), ((Boolean) params[2])
                    .booleanValue());
            } catch (URIException e) {
                throw new RuntimeOperationsException(
                    new RuntimeException(e));
            }
            return null;
        }

        if (operationName.equals(IMPORT_URIS_OPER)) {
            // params: (file, style, forceFetch, isSeed)
            JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
            mustBeCrawling();
            return importUris((String) params[0], ((String) params[1])
                .toString(), ((Boolean) params[2]).booleanValue(),
                ((Boolean) params[3]).booleanValue());
        }

        if (operationName.equals(PAUSE_OPER)) {
            JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
            mustBeCrawling();
            pause();
            return null;
        }

        if (operationName.equals(RESUME_OPER)) {
            JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
            mustBeCrawling();
            resume();
            return null;
        }

        if (operationName.equals(FRONTIER_REPORT_OPER)) {
            // params: (report name)
            JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
            mustBeCrawling();
            return getFrontierReport((String) params[0]);
        }

        if (operationName.equals(THREADS_REPORT_OPER)) {
            JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
            mustBeCrawling();
            return getThreadsReport();
        }

        if (operationName.equals(SEEDS_REPORT_OPER)) {
            JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
            mustBeCrawling();
            // Only StatisticsTracker knows how to write a seeds report;
            // other trackers get an "Unsupported" placeholder.
            StringWriter sw = new StringWriter();
            if (getStatisticsTracking() != null
                    && getStatisticsTracking() instanceof StatisticsTracker) {
                ((StatisticsTracker) getStatisticsTracking())
                    .writeSeedsReportTo(new PrintWriter(sw));
            } else {
                sw.write("Unsupported");
            }
            return sw.toString();
        }

        if (operationName.equals(CHECKPOINT_OPER)) {
            JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
            mustBeCrawling();
            try {
                checkpoint();
            } catch (IllegalStateException e) {
                throw new RuntimeOperationsException(e);
            }
            return null;
        }

        if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params,
                0);
            mustBeCrawling();
            return getStatisticsTracking().getProgressStatisticsLine();
        }

        if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
            // NOTE(review): unlike its sibling above, this branch does not
            // call mustBeCrawling() -- presumably the legend is static;
            // confirm before relying on it while no crawl is active.
            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
                params, 0);
            return getStatisticsTracking().progressStatisticsLegend();
        }

        throw new ReflectionException(new NoSuchMethodException(
            operationName), "Cannot find the operation "
            + operationName);
    }
1988:
1989: public void mustBeCrawling() {
1990: if (!isCrawling()) {
1991: throw new RuntimeOperationsException(
1992: new IllegalArgumentException("Not "
1993: + "crawling (Shouldn't ever be the case)"),
1994: "Not current crawling job?");
1995: }
1996: }
1997:
1998: public boolean isCrawling() {
1999: return this .controller != null;
2000: }
2001:
2002: /**
2003: * Utility method to get the stored list of ignored seed items (if any),
2004: * from the last time the seeds were imported to the frontier.
2005: *
2006: * @return String of all ignored seed items, or null if none
2007: */
2008: public String getIgnoredSeeds() {
2009: File ignoredFile = new File(getDirectory(),
2010: AbstractFrontier.IGNORED_SEEDS_FILENAME);
2011: if (!ignoredFile.exists()) {
2012: return null;
2013: }
2014: try {
2015: return FileUtils.readFileAsString(ignoredFile);
2016: } catch (IOException e) {
2017: // TODO Auto-generated catch block
2018: e.printStackTrace();
2019: return null;
2020: }
2021: }
2022:
2023: /**
2024: * Forward a 'kick' update to current controller if any.
2025: * @see CrawlController#kickUpdate()
2026: */
2027: public void kickUpdate() {
2028: if (this .controller != null) {
2029: this .controller.kickUpdate();
2030: }
2031: }
2032:
2033: /**
2034: * Returns a URIFrontierMarker for the current, paused, job. If there is no
2035: * current job or it is not paused null will be returned.
2036: *
2037: * @param regexpr A regular expression that each URI must match in order to
2038: * be considered 'within' the marker.
2039: * @param inCacheOnly Limit marker scope to 'cached' URIs.
2040: * @return a URIFrontierMarker for the current job.
2041: * @see #getPendingURIsList(FrontierMarker, int, boolean)
2042: * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2043: * boolean)
2044: * @see org.archive.crawler.framework.FrontierMarker
2045: */
2046: public FrontierMarker getInitialMarker(String regexpr,
2047: boolean inCacheOnly) {
2048: return (this .controller != null && this .controller.isPaused()) ? this .controller
2049: .getFrontier().getInitialMarker(regexpr, inCacheOnly)
2050: : null;
2051: }
2052:
2053: /**
2054: * Returns the frontiers URI list based on the provided marker. This method
2055: * will return null if there is not current job or if the current job is
2056: * not paused. Only when there is a paused current job will this method
2057: * return a URI list.
2058: *
2059: * @param marker URIFrontier marker
2060: * @param numberOfMatches Maximum number of matches to return
2061: * @param verbose Should detailed info be provided on each URI?
2062: * @return the frontiers URI list based on the provided marker
2063: * @throws InvalidFrontierMarkerException
2064: * When marker is inconsistent with the current state of the
2065: * frontier.
2066: * @see #getInitialMarker(String, boolean)
2067: * @see org.archive.crawler.framework.FrontierMarker
2068: */
2069: public ArrayList getPendingURIsList(FrontierMarker marker,
2070: int numberOfMatches, boolean verbose)
2071: throws InvalidFrontierMarkerException {
2072: return (this .controller != null && this .controller.isPaused()) ? this .controller
2073: .getFrontier().getURIsList(marker, numberOfMatches,
2074: verbose)
2075: : null;
2076: }
2077:
2078: public void crawlStarted(String message) {
2079: if (this .mbeanName != null) {
2080: // Can be null around job startup.
2081: sendNotification(new Notification("crawlStarted",
2082: this .mbeanName, getNotificationsSequenceNumber(),
2083: message));
2084: }
2085: }
2086:
2087: public void crawlEnding(String sExitMessage) {
2088: setRunning(false);
2089: setStatus(sExitMessage);
2090: setReadOnly();
2091: if (this .mbeanName != null) {
2092: sendNotification(new Notification("crawlEnding",
2093: this .mbeanName, getNotificationsSequenceNumber(),
2094: sExitMessage));
2095: }
2096: }
2097:
    /**
     * CrawlStatusListener callback: intentionally a no-op.
     *
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        // Let the settings handler be cleaned up by the crawl controller
        // completeStop. Just let go of our reference in here.
        // if (this.settingsHandler != null) {
        // this.settingsHandler.cleanup();
        // }

        // We used to zero-out datamembers but no longer needed now CrawlJobs
        // no longer persist after completion (They used to be kept around in
        // a list so operator could view CrawlJob finish state and reports --
        // but we now dump actual job and create a new uninitialized CrawlJob
        // that points at old CrawlJob data.
    }
2111:
2112: public void crawlPausing(String statusMessage) {
2113: setStatus(statusMessage);
2114: }
2115:
2116: public void crawlPaused(String statusMessage) {
2117: setStatus(statusMessage);
2118: if (this .mbeanName != null) {
2119: // Can be null around job startup.
2120: sendNotification(new Notification("crawlPaused",
2121: this .mbeanName, getNotificationsSequenceNumber(),
2122: statusMessage));
2123: }
2124: }
2125:
2126: public void crawlResuming(String statusMessage) {
2127: setStatus(statusMessage);
2128: if (this .mbeanName != null) {
2129: // Can be null around job startup.
2130: sendNotification(new Notification("crawlResuming",
2131: this .mbeanName, getNotificationsSequenceNumber(),
2132: statusMessage));
2133: }
2134: }
2135:
2136: public void crawlCheckpoint(File checkpointDir) throws Exception {
2137: setStatus(CrawlJob.STATUS_CHECKPOINTING);
2138: }
2139:
2140: public CrawlController getController() {
2141: return this .controller;
2142: }
2143:
    /**
     * MBeanRegistration callback run before JMX registration.
     *
     * Decorates the proposed ObjectName with key/values taken from the
     * hosting Heritrix's ObjectName (mother name, jmx port, host, type) so
     * the CrawlJob can be tied back to its Heritrix by examination of the
     * CrawlJob ObjectName alone.
     *
     * @param server MBeanServer we are being registered with (retained).
     * @param on Proposed ObjectName; must carry a 'name' key property.
     * @return The decorated ObjectName actually used for registration.
     * @throws Exception if the name property is missing or no registered
     * hosting Heritrix can be found.
     */
    public ObjectName preRegister(final MBeanServer server,
            ObjectName on) throws Exception {
        this.mbeanServer = server;
        @SuppressWarnings("unchecked")
        Hashtable<String, String> ht = on.getKeyPropertyList();
        if (!ht.containsKey(JmxUtils.NAME)) {
            throw new IllegalArgumentException("Name property required"
                + on.getCanonicalName());
        }
        // Now append key/values from hosting heritrix JMX ObjectName so it can be
        // found just by examination of the CrawlJob JMX ObjectName. Add heritrix
        // name attribute as 'mother' attribute.
        Heritrix h = getHostingHeritrix();
        if (h == null || h.getMBeanName() == null) {
            throw new IllegalArgumentException(
                "Hosting heritrix not found "
                    + "or not registered with JMX: "
                    + on.getCanonicalName());
        }
        @SuppressWarnings("unchecked")
        Map<String, String> hht = h.getMBeanName().getKeyPropertyList();
        ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
        // Port is optional on the mother; copy it only when present.
        String port = hht.get(JmxUtils.JMX_PORT);
        if (port != null) {
            ht.put(JmxUtils.JMX_PORT, port);
        }
        ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
        if (!ht.containsKey(JmxUtils.TYPE)) {
            ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
        }
        this.mbeanName = new ObjectName(on.getDomain(), ht);
        return this.mbeanName;
    }
2177:
2178: public void postRegister(Boolean registrationDone) {
2179: if (logger.isLoggable(Level.INFO)) {
2180: logger.info(JmxUtils.getLogRegistrationMsg(this .mbeanName
2181: .getCanonicalName(), this .mbeanServer,
2182: registrationDone.booleanValue()));
2183: }
2184: }
2185:
2186: public void preDeregister() throws Exception {
2187: // Nothing to do.
2188: }
2189:
2190: public void postDeregister() {
2191: if (mbeanName == null) {
2192: return;
2193: }
2194: if (logger.isLoggable(Level.INFO)) {
2195: logger.info(JmxUtils.getLogUnregistrationMsg(this .mbeanName
2196: .getCanonicalName(), this .mbeanServer));
2197: }
2198: this .mbeanName = null;
2199: }
2200:
2201: /**
2202: * @return Heritrix that is hosting this job.
2203: */
2204: protected Heritrix getHostingHeritrix() {
2205: Heritrix hostingHeritrix = null;
2206: Map heritrice = Heritrix.getInstances();
2207: for (final Iterator i = heritrice.keySet().iterator(); i
2208: .hasNext();) {
2209: Heritrix h = (Heritrix) heritrice.get(i.next());
2210: if (h.getJobHandler().getCurrentJob() == this ) {
2211: hostingHeritrix = h;
2212: break;
2213: }
2214: }
2215: return hostingHeritrix;
2216: }
2217:
2218: /**
2219: * @return Unique name for job that is safe to use in jmx (Like display
2220: * name but without spaces).
2221: */
2222: public String getJmxJobName() {
2223: return getJobName() + "-" + getUID();
2224: }
2225:
2226: /**
2227: * @return Notification sequence number (Does increment after each access).
2228: */
2229: protected static int getNotificationsSequenceNumber() {
2230: return notificationsSequenceNumber++;
2231: }
2232:
2233: protected ObjectName getMbeanName() {
2234: return this .mbeanName;
2235: }
2236:
2237: /**
2238: * @return the statistics tracking instance (of null if none yet available).
2239: */
2240: public StatisticsTracking getStatisticsTracking() {
2241: return this.controller == null
2242: || this.controller.getStatistics() == null ? null
2243: : this.controller.getStatistics();
2244: }
2245: }
|