0001: /* CrawlJob
0002: *
0003: * Copyright (C) 2003 Internet Archive.
0004: *
0005: * This file is part of the Heritrix web crawler (crawler.archive.org).
0006: *
0007: * Heritrix is free software; you can redistribute it and/or modify
0008: * it under the terms of the GNU Lesser Public License as published by
0009: * the Free Software Foundation; either version 2.1 of the License, or
0010: * any later version.
0011: *
0012: * Heritrix is distributed in the hope that it will be useful,
0013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0015: * GNU Lesser Public License for more details.
0016: *
0017: * You should have received a copy of the GNU Lesser Public License
0018: * along with Heritrix; if not, write to the Free Software
0019: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0020: */
0021: package org.archive.crawler.admin;
0022:
0023: import java.io.BufferedReader;
0024: import java.io.File;
0025: import java.io.FileNotFoundException;
0026: import java.io.FileReader;
0027: import java.io.FileWriter;
0028: import java.io.IOException;
0029: import java.io.InputStream;
0030: import java.io.InputStreamReader;
0031: import java.io.PrintWriter;
0032: import java.io.Serializable;
0033: import java.io.StringWriter;
0034: import java.util.ArrayList;
0035: import java.util.Arrays;
0036: import java.util.Collection;
0037: import java.util.EventObject;
0038: import java.util.Hashtable;
0039: import java.util.Iterator;
0040: import java.util.List;
0041: import java.util.Map;
0042: import java.util.logging.Level;
0043: import java.util.logging.Logger;
0044:
0045: import javax.management.Attribute;
0046: import javax.management.AttributeList;
0047: import javax.management.AttributeNotFoundException;
0048: import javax.management.DynamicMBean;
0049: import javax.management.InstanceAlreadyExistsException;
0050: import javax.management.InvalidAttributeValueException;
0051: import javax.management.MBeanAttributeInfo;
0052: import javax.management.MBeanException;
0053: import javax.management.MBeanInfo;
0054: import javax.management.MBeanNotificationInfo;
0055: import javax.management.MBeanOperationInfo;
0056: import javax.management.MBeanParameterInfo;
0057: import javax.management.MBeanRegistration;
0058: import javax.management.MBeanRegistrationException;
0059: import javax.management.MBeanServer;
0060: import javax.management.NotCompliantMBeanException;
0061: import javax.management.Notification;
0062: import javax.management.NotificationBroadcasterSupport;
0063: import javax.management.ObjectName;
0064: import javax.management.ReflectionException;
0065: import javax.management.RuntimeOperationsException;
0066: import javax.management.openmbean.CompositeData;
0067: import javax.management.openmbean.CompositeDataSupport;
0068: import javax.management.openmbean.CompositeType;
0069: import javax.management.openmbean.OpenDataException;
0070: import javax.management.openmbean.OpenMBeanAttributeInfo;
0071: import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
0072: import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
0073: import javax.management.openmbean.OpenMBeanInfoSupport;
0074: import javax.management.openmbean.OpenMBeanOperationInfo;
0075: import javax.management.openmbean.OpenMBeanOperationInfoSupport;
0076: import javax.management.openmbean.OpenMBeanParameterInfo;
0077: import javax.management.openmbean.OpenMBeanParameterInfoSupport;
0078: import javax.management.openmbean.SimpleType;
0079:
0080: import org.apache.commons.httpclient.URIException;
0081: import org.archive.crawler.Heritrix;
0082: import org.archive.crawler.datamodel.CandidateURI;
0083: import org.archive.crawler.datamodel.Checkpoint;
0084: import org.archive.crawler.datamodel.CrawlOrder;
0085: import org.archive.crawler.event.CrawlStatusListener;
0086: import org.archive.crawler.framework.CrawlController;
0087: import org.archive.crawler.framework.FrontierMarker;
0088: import org.archive.crawler.framework.StatisticsTracking;
0089: import org.archive.crawler.framework.exceptions.InitializationException;
0090: import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
0091: import org.archive.crawler.frontier.AbstractFrontier;
0092: import org.archive.crawler.settings.ComplexType;
0093: import org.archive.crawler.settings.ModuleAttributeInfo;
0094: import org.archive.crawler.settings.TextField;
0095: import org.archive.crawler.settings.XMLSettingsHandler;
0096: import org.archive.crawler.util.CheckpointUtils;
0097: import org.archive.crawler.util.IoUtils;
0098: import org.archive.util.ArchiveUtils;
0099: import org.archive.util.FileUtils;
0100: import org.archive.util.JEMBeanHelper;
0101: import org.archive.util.JmxUtils;
0102: import org.archive.util.iterator.LineReadingIterator;
0103: import org.archive.util.iterator.RegexpLineIterator;
0104:
0105: import com.sleepycat.je.DatabaseException;
0106: import com.sleepycat.je.Environment;
0107:
0108: /**
0109: * A CrawlJob encapsulates a 'crawl order' with any and all information and
0110: * methods needed by a CrawlJobHandler to accept and execute them.
0111: *
0112: * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
0113: * should not be executed as a crawl but can be edited and used as a template
0114: * for creating new CrawlJobs.
0115: *
 * <p>All of its constructors are protected since only a CrawlJobHandler
 * should construct new CrawlJobs.
0118: *
0119: * @author Kristinn Sigurdsson
0120: *
0121: * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
0122: * String, String, String, int)
0123: * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
0124: * String, String, String)
0125: */
0126:
0127: public class CrawlJob extends NotificationBroadcasterSupport implements
0128: DynamicMBean, MBeanRegistration, CrawlStatusListener,
0129: Serializable {
0130: /**
0131: * Eclipse generated serial number.
0132: */
0133: private static final long serialVersionUID = 3411161000452525856L;
0134:
0135: private static final Logger logger = Logger
0136: .getLogger(CrawlJob.class.getName());
0137: /*
0138: * Possible values for Priority
0139: */
0140: /** lowest */
0141: public static final int PRIORITY_MINIMAL = 0;
0142: /** low */
0143: public static final int PRIORITY_LOW = 1;
0144: /** average */
0145: public static final int PRIORITY_AVERAGE = 2;
0146: /** high */
0147: public static final int PRIORITY_HIGH = 3;
0148: /** highest */
0149: public static final int PRIORITY_CRITICAL = 4;
0150:
0151: /*
0152: * Possible states for a Job.
0153: */
0154: /** Inital value. May not be ready to run/incomplete. */
0155: public static final String STATUS_CREATED = "Created";
0156: /** Job has been successfully submitted to a CrawlJobHandler */
0157: public static final String STATUS_PENDING = "Pending";
0158: /** Job is being crawled */
0159: public static final String STATUS_RUNNING = "Running";
0160: /** Job was deleted by user, will not be displayed in UI. */
0161: public static final String STATUS_DELETED = "Deleted";
0162: /** Job was terminted by user input while crawling */
0163: public static final String STATUS_ABORTED = "Finished - Ended by operator";
0164: /** Something went very wrong */
0165: public static final String STATUS_FINISHED_ABNORMAL = "Finished - Abnormal exit from crawling";
0166: /** Job finished normally having completed its crawl. */
0167: public static final String STATUS_FINISHED = "Finished";
0168: /** Job finished normally when the specified timelimit was hit. */
0169: public static final String STATUS_FINISHED_TIME_LIMIT = "Finished - Timelimit hit";
0170: /** Job finished normally when the specifed amount of
0171: * data (MB) had been downloaded */
0172: public static final String STATUS_FINISHED_DATA_LIMIT = "Finished - Maximum amount of data limit hit";
0173: /** Job finished normally when the specified number of documents had been
0174: * fetched.
0175: */
0176: public static final String STATUS_FINISHED_DOCUMENT_LIMIT = "Finished - Maximum number of documents limit hit";
0177: /** Job is going to be temporarly stopped after active threads are finished. */
0178: public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - "
0179: + "Waiting for threads to finish";
0180: /** Job was temporarly stopped. State is kept so it can be resumed */
0181: public static final String STATUS_PAUSED = "Paused";
0182: /**
0183: * Job is being checkpointed. When finished checkpointing, job is set
0184: * back to STATUS_PAUSED (Job must be first paused before checkpointing
0185: * will run).
0186: */
0187: public static final String STATUS_CHECKPOINTING = "Checkpointing";
0188: /** Job could not be launced due to an InitializationException */
0189: public static final String STATUS_MISCONFIGURED = "Could not launch job "
0190: + "- Fatal InitializationException";
0191: /** Job is actually a profile */
0192: public static final String STATUS_PROFILE = "Profile";
0193:
0194: public static final String STATUS_PREPARING = "Preparing";
0195:
0196: // Class variables
0197: private String UID; //A UID issued by the CrawlJobHandler.
0198: private String name;
0199: private String status;
0200: private boolean isReadOnly = false;
0201: private boolean isNew = true;
0202: private boolean isProfile = false;
0203: private boolean isRunning = false;
0204: private int priority;
0205: private int numberOfJournalEntries = 0;
0206:
0207: private String statisticsFileSave = "";
0208:
0209: private String errorMessage = null;
0210:
0211: private File jobDir = null;
0212:
0213: private transient CrawlJobErrorHandler errorHandler = null;
0214:
0215: protected transient XMLSettingsHandler settingsHandler;
0216:
0217: private transient CrawlController controller = null;
0218:
0219: private static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
0220: private static final String CRAWL_LOG_STYLE = "crawlLog";
0221:
0222: // OpenMBean support.
0223:
0224: /**
0225: * Server we registered with. Maybe null.
0226: */
0227: private transient MBeanServer mbeanServer = null;
0228: private transient ObjectName mbeanName = null;
0229: private static final String CRAWLJOB_JMXMBEAN_TYPE = JmxUtils.SERVICE
0230: + ".Job";
0231: private transient JEMBeanHelper bdbjeMBeanHelper = null;
0232: private transient List<String> bdbjeAttributeNameList = null;
0233: private transient List<String> bdbjeOperationsNameList = null;
0234:
0235: /**
0236: * The MBean we've registered ourselves with (May be null
0237: * throughout life of Heritrix).
0238: */
0239: private transient OpenMBeanInfoSupport openMBeanInfo;
0240:
0241: private final static String NAME_ATTR = "Name";
0242: private final static String UID_ATTR = "UID";
0243: private final static String STATUS_ATTR = "Status";
0244: private final static String FRONTIER_SHORT_REPORT_ATTR = "FrontierShortReport";
0245: private final static String THREADS_SHORT_REPORT_ATTR = "ThreadsShortReport";
0246: private final static String TOTAL_DATA_ATTR = "TotalData";
0247: private final static String CRAWL_TIME_ATTR = "CrawlTime";
0248: private final static String DOC_RATE_ATTR = "DocRate";
0249: private final static String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
0250: private final static String KB_RATE_ATTR = "KbRate";
0251: private final static String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
0252: private final static String THREAD_COUNT_ATTR = "ThreadCount";
0253: private final static String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
0254: private final static String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
0255: private final static String[] ATTRIBUTE_ARRAY = { NAME_ATTR,
0256: UID_ATTR, STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR,
0257: THREADS_SHORT_REPORT_ATTR, TOTAL_DATA_ATTR,
0258: CRAWL_TIME_ATTR, DOC_RATE_ATTR, CURRENT_DOC_RATE_ATTR,
0259: KB_RATE_ATTR, CURRENT_KB_RATE_ATTR, THREAD_COUNT_ATTR,
0260: DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR };
0261: private final static List ATTRIBUTE_LIST = Arrays
0262: .asList(ATTRIBUTE_ARRAY);
0263:
0264: private final static String IMPORT_URI_OPER = "importUri";
0265: private final static String IMPORT_URIS_OPER = "importUris";
0266: private final static String PAUSE_OPER = "pause";
0267: private final static String RESUME_OPER = "resume";
0268: private final static String FRONTIER_REPORT_OPER = "frontierReport";
0269: private final static String THREADS_REPORT_OPER = "threadsReport";
0270: private final static String SEEDS_REPORT_OPER = "seedsReport";
0271: private final static String CHECKPOINT_OPER = "startCheckpoint";
0272: private final static String PROGRESS_STATISTICS_OPER = "progressStatistics";
0273: private final static String PROGRESS_STATISTICS_LEGEND_OPER = "progressStatisticsLegend";
0274:
0275: private final static String PROG_STATS = "progressStatistics";
0276:
0277: // Same as JEMBeanHelper.OP_DB_STAT
0278: private final static String OP_DB_STAT = "getDatabaseStats";
0279:
0280: /**
0281: * Don't add the following crawl-order items.
0282: */
0283: private final static List ORDER_EXCLUDE;
0284: static {
0285: ORDER_EXCLUDE = Arrays.asList(new String[] {
0286: "bdb-cache-percent", "extract-processors", "DNS",
0287: "uri-included-structure" });
0288: }
0289:
0290: /**
0291: * Sequence number for jmx notifications.
0292: */
0293: private static int notificationsSequenceNumber = 1;
0294:
0295: /**
0296: * A shutdown Constructor.
0297: */
0298: protected CrawlJob() {
0299: super ();
0300: }
0301:
0302: /**
0303: * A constructor for jobs.
0304: *
0305: * <p> Create, ready to crawl, jobs.
0306: * @param UID A unique ID for this job. Typically emitted by the
0307: * CrawlJobHandler.
0308: * @param name The name of the job
0309: * @param settingsHandler The associated settings
0310: * @param errorHandler The crawl jobs settings error handler.
0311: * <tt>null</tt> means none is set
0312: * @param priority job priority.
0313: * @param dir The directory that is considered this jobs working directory.
0314: */
0315: public CrawlJob(final String UID, final String name,
0316: final XMLSettingsHandler settingsHandler,
0317: final CrawlJobErrorHandler errorHandler,
0318: final int priority, final File dir) {
0319: this (UID, name, settingsHandler, errorHandler, priority, dir,
0320: null, false, true);
0321: }
0322:
0323: /**
0324: * A constructor for profiles.
0325: *
0326: * <p> Any job created with this constructor will be
0327: * considered a profile. Profiles are not stored on disk (only their
0328: * settings files are stored on disk). This is because their data is
0329: * predictible given any settings files.
0330: * @param UIDandName A unique ID for this job. For profiles this is the same
0331: * as name
0332: * @param settingsHandler The associated settings
0333: * @param errorHandler The crawl jobs settings error handler.
0334: * <tt>null</tt> means none is set
0335: */
0336: protected CrawlJob(final String UIDandName,
0337: final XMLSettingsHandler settingsHandler,
0338: final CrawlJobErrorHandler errorHandler) {
0339: this (UIDandName, UIDandName, settingsHandler, errorHandler,
0340: PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
0341: }
0342:
0343: public CrawlJob(final String UID, final String name,
0344: final XMLSettingsHandler settingsHandler,
0345: final CrawlJobErrorHandler errorHandler,
0346: final int priority, final File dir, final String status,
0347: final boolean isProfile, final boolean isNew) {
0348: super ();
0349: this .UID = UID;
0350: this .name = name;
0351: this .settingsHandler = settingsHandler;
0352: this .errorHandler = errorHandler;
0353: this .status = status;
0354: this .isProfile = isProfile;
0355: this .isNew = isNew;
0356: this .jobDir = dir;
0357: this .priority = priority;
0358: }
0359:
0360: /**
0361: * A constructor for reloading jobs from disk. Jobs (not profiles) have
0362: * their data written to persistent storage in the file system. This method
0363: * is used to load the job from such storage. This is done by the
0364: * <code>CrawlJobHandler</code>.
0365: * <p>
0366: * Proper structure of a job file (TODO: Maybe one day make this an XML file)
0367: * Line 1. UID <br>
0368: * Line 2. Job name (string) <br>
0369: * Line 3. Job status (string) <br>
0370: * Line 4. is job read only (true/false) <br>
0371: * Line 5. is job running (true/false) <br>
0372: * Line 6. job priority (int) <br>
0373: * Line 7. number of journal entries <br>
0374: * Line 8. setting file (with path) <br>
0375: * Line 9. statistics tracker file (with path) <br>
0376: * Line 10-?. error message (String, empty for null), can be many lines <br>
0377: * @param jobFile
0378: * a file containing information about the job to load.
0379: * @param errorHandler The crawl jobs settings error handler.
0380: * null means none is set
0381: * @throws InvalidJobFileException
0382: * if the specified file does not refer to a valid job file.
0383: * @throws IOException
0384: * if io operations fail
0385: */
0386: protected CrawlJob(final File jobFile,
0387: final CrawlJobErrorHandler errorHandler)
0388: throws InvalidJobFileException, IOException {
0389: this (null, null, null, errorHandler, PRIORITY_AVERAGE, null,
0390: null, false, true);
0391: this .jobDir = jobFile.getParentFile();
0392:
0393: // Check for corrupt job.state files (can be corrupt if we crash).
0394: if (jobFile.length() == 0) {
0395: throw new InvalidJobFileException(jobFile
0396: .getCanonicalPath()
0397: + " is corrupt (length is zero)");
0398: }
0399:
0400: // Open file. Read data and set up class variables accordingly...
0401: BufferedReader jobReader = new BufferedReader(new FileReader(
0402: jobFile), 4096);
0403: // UID
0404: this .UID = jobReader.readLine();
0405: // name
0406: this .name = jobReader.readLine();
0407: // status
0408: this .status = jobReader.readLine();
0409: if (status.equals(STATUS_ABORTED) == false
0410: && status.equals(STATUS_CREATED) == false
0411: && status.equals(STATUS_DELETED) == false
0412: && status.equals(STATUS_FINISHED) == false
0413: && status.equals(STATUS_FINISHED_ABNORMAL) == false
0414: && status.equals(STATUS_FINISHED_DATA_LIMIT) == false
0415: && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT) == false
0416: && status.equals(STATUS_FINISHED_TIME_LIMIT) == false
0417: && status.equals(STATUS_MISCONFIGURED) == false
0418: && status.equals(STATUS_PAUSED) == false
0419: && status.equals(STATUS_CHECKPOINTING) == false
0420: && status.equals(STATUS_PENDING) == false
0421: && status.equals(STATUS_RUNNING) == false
0422: && status.equals(STATUS_WAITING_FOR_PAUSE) == false
0423: && status.equals(STATUS_PREPARING) == false) {
0424: // status is invalid. Must be one of the above
0425: throw new InvalidJobFileException(
0426: "Status (line 3) in job file " + "is not valid: '"
0427: + status + "'");
0428: }
0429: // isReadOnly
0430: String tmp = jobReader.readLine();
0431: if (tmp.equals("true")) {
0432: isReadOnly = true;
0433: } else if (tmp.equals("false")) {
0434: isReadOnly = false;
0435: } else {
0436: throw new InvalidJobFileException(
0437: "isReadOnly (line 4) in job" + " file '"
0438: + jobFile.getAbsolutePath() + "' is not "
0439: + "valid: '" + tmp + "'");
0440: }
0441: // isRunning
0442: tmp = jobReader.readLine();
0443: if (tmp.equals("true")) {
0444: this .isRunning = true;
0445: } else if (tmp.equals("false")) {
0446: this .isRunning = false;
0447: } else {
0448: throw new InvalidJobFileException(
0449: "isRunning (line 5) in job " + "file '"
0450: + jobFile.getAbsolutePath()
0451: + "' is not valid: " + "'" + tmp + "'");
0452: }
0453: // priority
0454: tmp = jobReader.readLine();
0455: try {
0456: this .priority = Integer.parseInt(tmp);
0457: } catch (NumberFormatException e) {
0458: throw new InvalidJobFileException(
0459: "priority (line 5) in job " + "file '"
0460: + jobFile.getAbsolutePath()
0461: + "' is not valid: " + "'" + tmp + "'");
0462: }
0463: // numberOfJournalEntries
0464: tmp = jobReader.readLine();
0465: try {
0466: this .numberOfJournalEntries = Integer.parseInt(tmp);
0467: } catch (NumberFormatException e) {
0468: throw new InvalidJobFileException("numberOfJournalEntries "
0469: + "(line 5) in job file '"
0470: + jobFile.getAbsolutePath() + "' is not valid: "
0471: + "'" + tmp + "'");
0472: }
0473: // settingsHandler
0474: tmp = jobReader.readLine();
0475: try {
0476: File f = new File(tmp);
0477: this .settingsHandler = new XMLSettingsHandler((f
0478: .isAbsolute()) ? f : new File(jobDir, f.getName()));
0479: if (this .errorHandler != null) {
0480: this .settingsHandler
0481: .registerValueErrorHandler(errorHandler);
0482: }
0483: this .settingsHandler.initialize();
0484: } catch (InvalidAttributeValueException e1) {
0485: throw new InvalidJobFileException(
0486: "Problem reading from settings " + "file (" + tmp
0487: + ") specified in job file '"
0488: + jobFile.getAbsolutePath() + "'\n"
0489: + e1.getMessage());
0490: }
0491: // Statistics tracker.
0492: jobReader.readLine();
0493: // errorMessage
0494: // TODO: Multilines
0495: tmp = jobReader.readLine();
0496: errorMessage = "";
0497: while (tmp != null) {
0498: errorMessage += tmp + '\n';
0499: tmp = jobReader.readLine();
0500: }
0501: if (errorMessage.length() == 0) {
0502: // Empty error message should be null
0503: errorMessage = null;
0504: }
0505: // TODO: Load stattrack if needed.
0506:
0507: // TODO: This should be inside a finally block.
0508: jobReader.close();
0509: }
0510:
0511: /**
0512: * Cause the job to be written to persistent storage.
0513: * This will also save the statistics tracker if it is not null and the
0514: * job status is finished (regardless of how it's finished)
0515: */
0516: private void writeJobFile() {
0517: if (isProfile) {
0518: return;
0519: }
0520:
0521: final String jobDirAbsolute = jobDir.getAbsolutePath();
0522: if (!jobDir.exists() || !jobDir.canWrite()) {
0523: logger.warning("Can't update status on " + jobDirAbsolute
0524: + " because file does not"
0525: + " exist (or is unwriteable)");
0526: return;
0527: }
0528: File f = new File(jobDirAbsolute, "state.job");
0529:
0530: String settingsFile = getSettingsDirectory();
0531: // Make settingsFile's path relative if order.xml is somewhere in the
0532: // job's directory tree
0533: if (settingsFile.startsWith(jobDirAbsolute
0534: .concat(File.separator))) {
0535: settingsFile = settingsFile.substring(jobDirAbsolute
0536: .length() + 1);
0537: }
0538: try {
0539: FileWriter jobWriter = new FileWriter(f, false);
0540: try {
0541: jobWriter.write(UID + "\n");
0542: jobWriter.write(name + "\n");
0543: jobWriter.write(status + "\n");
0544: jobWriter.write(isReadOnly + "\n");
0545: jobWriter.write(isRunning + "\n");
0546: jobWriter.write(priority + "\n");
0547: jobWriter.write(numberOfJournalEntries + "\n");
0548: jobWriter.write(settingsFile + "\n");
0549: jobWriter.write(statisticsFileSave + "\n");// TODO: Is this
0550: // right?
0551: // Can be multiple lines so we keep it last
0552: if (errorMessage != null) {
0553: jobWriter.write(errorMessage + "\n");
0554: }
0555: } finally {
0556: if (jobWriter != null) {
0557: jobWriter.close();
0558: }
0559: }
0560: } catch (IOException e) {
0561: logger.log(Level.WARNING,
0562: "An IOException occured saving job " + name + " ("
0563: + UID + ")", e);
0564: }
0565: }
0566:
0567: /**
0568: * Returns this jobs unique ID (UID) that was issued by the
0569: * CrawlJobHandler() when this job was first created.
0570: *
0571: * @return Job This jobs UID.
0572: * @see CrawlJobHandler#getNextJobUID()
0573: */
0574: public String getUID() {
0575: return UID;
0576: }
0577:
0578: /**
0579: * Returns this job's 'name'. The name comes from the settings for this job,
0580: * need not be unique and may change. For a unique identifier use
0581: * {@link #getUID() getUID()}.
0582: * <p>
0583: * The name corrisponds to the value of the 'name' tag in the 'meta' section
0584: * of the settings file.
0585: *
0586: * @return This job's 'name'
0587: */
0588: public String getJobName() {
0589: return name;
0590: }
0591:
0592: /**
0593: * Return the combination of given name and UID most commonly
0594: * used in administrative interface.
0595: *
0596: * @return Job's name with UID notation
0597: */
0598: public String getDisplayName() {
0599: return getJobName() + " [" + getUID() + "]";
0600: }
0601:
0602: /**
0603: * Set this job's level of priority.
0604: *
0605: * @param priority The level of priority
0606: *
0607: * @see #getJobPriority()
0608: * @see #PRIORITY_MINIMAL
0609: * @see #PRIORITY_LOW
0610: * @see #PRIORITY_AVERAGE
0611: * @see #PRIORITY_HIGH
0612: * @see #PRIORITY_CRITICAL
0613: */
0614: public void setJobPriority(int priority) {
0615: this .priority = priority;
0616: }
0617:
0618: /**
0619: * Get this job's level of priority.
0620: *
0621: * @return this job's priority
0622: * @see #setJobPriority(int)
0623: * @see #PRIORITY_MINIMAL
0624: * @see #PRIORITY_LOW
0625: * @see #PRIORITY_AVERAGE
0626: * @see #PRIORITY_HIGH
0627: * @see #PRIORITY_CRITICAL
0628: */
0629: public int getJobPriority() {
0630: return priority;
0631: }
0632:
0633: /**
0634: * Once called no changes can be made to the settings for this job.
0635: * Typically this is done once a crawl is completed and further changes
0636: * to the crawl order are therefor meaningless.
0637: */
0638: public void setReadOnly() {
0639: isReadOnly = true;
0640: writeJobFile(); //Save changes
0641: }
0642:
0643: /**
0644: * Is job read only?
0645: * @return false until setReadOnly has been invoked, after that it returns true.
0646: */
0647: public boolean isReadOnly() {
0648: return isReadOnly;
0649: }
0650:
0651: /**
0652: * Set the status of this CrawlJob.
0653: *
0654: * @param status Current status of CrawlJob
0655: * (see constants defined here beginning with STATUS)
0656: */
0657: public void setStatus(String status) {
0658: this .status = status;
0659: writeJobFile(); //Save changes
0660: // TODO: If job finished, save StatisticsTracker!
0661: }
0662:
0663: /**
0664: * @return Status of the crawler (Used by JMX).
0665: */
0666: public String getCrawlStatus() {
0667: return this .controller != null ? this .controller.getState()
0668: .toString() : "Illegal State";
0669: }
0670:
0671: /**
0672: * Get the current status of this CrawlJob
0673: *
0674: * @return The current status of this CrawlJob
0675: * (see constants defined here beginning with STATUS)
0676: */
0677: public String getStatus() {
0678: return this .status;
0679: }
0680:
0681: /**
0682: * Returns the settings handler for this job. It will have been initialized.
0683: * @return the settings handler for this job.
0684: */
0685: public XMLSettingsHandler getSettingsHandler() {
0686: return this .settingsHandler;
0687: }
0688:
0689: /**
0690: * Is this a new job?
0691: * @return True if is new.
0692: */
0693: public boolean isNew() {
0694: return isNew;
0695: }
0696:
0697: /**
0698: * Set if the job is considered to be a profile
0699: * @return True if is a profile.
0700: */
0701: public boolean isProfile() {
0702: return isProfile;
0703: }
0704:
0705: /**
0706: * Set if the job is considered a new job or not.
0707: * @param b Is the job considered to be new.
0708: */
0709: public void setNew(boolean b) {
0710: isNew = b;
0711: writeJobFile(); //Save changes
0712: }
0713:
0714: /**
0715: * Returns true if the job is being crawled.
0716: * @return true if the job is being crawled
0717: */
0718: public boolean isRunning() {
0719: return isRunning;
0720: }
0721:
0722: /**
0723: * Set if job is being crawled.
0724: * @param b Is job being crawled.
0725: */
0726: protected void setRunning(boolean b) {
0727: isRunning = b;
0728: writeJobFile(); // Save changes
0729: //TODO: Job ending -> Save statistics tracker.
0730: //TODO: This is likely to happen as the CrawlEnding event occurs,
0731: // need to ensure that the StatisticsTracker is saved to disk on
0732: // CrawlEnded. Maybe move responsibility for this into the
0733: // StatisticsTracker?
0734: }
0735:
0736: protected void unregisterMBean() {
0737: // Unregister current job from JMX agent, if there one.
0738: if (this .mbeanServer == null) {
0739: return;
0740: }
0741: try {
0742: this .mbeanServer.unregisterMBean(this .mbeanName);
0743: this .mbeanServer = null;
0744: } catch (Exception e) {
0745: logger
0746: .log(Level.SEVERE, "Failed with " + this .mbeanName,
0747: e);
0748: }
0749: }
0750:
0751: /**
0752: * Subclass of crawlcontroller that unregisters beans when stopped.
0753: * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
0754: * pollution, so for sure CrawlJob is unregistered with JMX and so any
0755: * listeners on the CrawlJob get a chance to get crawl ended message
0756: * (These latter notifications may not actually be getting through -- TBD).
0757: * <p>TODO: This override dirtys the data model since CC knows about CJs.
0758: * The facility provided by this class emitting events and statistics so
0759: * they can be read by JMX needs to go back into CC. Probably best to
0760: * registering in JMX the CC, rather than CJ. Lets do this in Heritrix 2.0
0761: * since means changing the JMX API some.
0762: */
0763: public class MBeanCrawlController extends CrawlController implements
0764: Serializable {
0765: private static final long serialVersionUID = -4608537998168407222L;
0766: private CrawlJob cj = null;
0767: private CompositeType ct = null;
0768:
0769: public CrawlJob getCrawlJob() {
0770: return this .cj;
0771: }
0772:
0773: public void setCrawlJob(CrawlJob cj) {
0774: this .cj = cj;
0775: }
0776:
0777: public void progressStatisticsEvent(final EventObject e) {
0778: super .progressStatisticsEvent(e);
0779: if (this .cj.getMbeanName() == null) {
0780: // Can be null around job startup. Return w/o doing anything.
0781: return;
0782: }
0783:
0784: Map s = ((StatisticsTracking) e.getSource())
0785: .getProgressStatistics();
0786: // Convert the statistics to OpenType CompositeData and add as
0787: // user data to Notification.
0788: CompositeData cd = null;
0789: try {
0790: if (this .ct == null) {
0791: this .ct = JmxUtils.createCompositeType(s,
0792: PROG_STATS, PROG_STATS + " for "
0793: + this .cj.getMbeanName());
0794: }
0795: cd = new CompositeDataSupport(this .ct, s);
0796: } catch (OpenDataException ode) {
0797: ode.printStackTrace();
0798: }
0799: if (cd != null) {
0800: Notification n = new Notification(PROG_STATS, this .cj
0801: .getMbeanName(),
0802: getNotificationsSequenceNumber(),
0803: ((StatisticsTracking) e.getSource())
0804: .getProgressStatisticsLine());
0805: n.setUserData(cd);
0806: this .cj.sendNotification(n);
0807: }
0808: }
0809:
0810: protected void completeStop() {
0811: try {
0812: super .completeStop();
0813: } finally {
0814: if (this .cj != null) {
0815: this .cj.unregisterMBean();
0816: }
0817: this .cj = null;
0818: }
0819: }
0820: }
0821:
0822: protected CrawlController setupCrawlController()
0823: throws InitializationException {
0824: CrawlController controller = null;
0825:
0826: // Check if we're to do a checkpoint recover. If so, deserialize
0827: // the checkpoint's CrawlController and use that in place of a new
0828: // CrawlController instance.
0829: Checkpoint cp = CrawlController
0830: .getCheckpointRecover(getSettingsHandler().getOrder());
0831: if (cp != null) {
0832: try {
0833: controller = (MBeanCrawlController) CheckpointUtils
0834: .readObjectFromFile(MBeanCrawlController.class,
0835: cp.getDirectory());
0836: } catch (FileNotFoundException e) {
0837: throw new InitializationException(e);
0838: } catch (IOException e) {
0839: throw new InitializationException(e);
0840: } catch (ClassNotFoundException e) {
0841: throw new InitializationException(e);
0842: }
0843: } else {
0844: controller = new MBeanCrawlController();
0845: }
0846: return controller;
0847: }
0848:
    /**
     * Factory for the controller used by this job.
     * @return a fresh MBeanCrawlController (no checkpoint recovery).
     */
    protected CrawlController createCrawlController() {
        return new MBeanCrawlController();
    }
0852:
    /**
     * Prepare this job for crawling: create/recover the controller,
     * subscribe for crawl-status events, initialize settings, and register
     * this job as a JMX MBean. On success, status is set to STATUS_RUNNING
     * and the job is flagged running. On failure, status is set to
     * STATUS_MISCONFIGURED, the controller is dropped, and the
     * InitializationException is rethrown.
     *
     * <p>NOTE(review): the statement order below matters (listener must be
     * added before initialize; MBean registration needs the built
     * openMBeanInfo) -- do not reorder.
     *
     * @throws InitializationException if controller setup, settings
     * initialization, or MBean registration fails.
     */
    public void setupForCrawlStart() throws InitializationException {
        try {
            this .controller = setupCrawlController();
            // Register as listener to get job finished notice.
            this .controller.addCrawlStatusListener(this );
            this .controller.initialize(getSettingsHandler());
            // Set the crawl job this MBeanCrawlController needs to worry about.
            ((MBeanCrawlController) this .controller).setCrawlJob(this );
            // Create our mbean description and register our crawljob.
            this .openMBeanInfo = buildMBeanInfo();
            try {
                Heritrix.registerMBean(this , getJmxJobName(),
                        CRAWLJOB_JMXMBEAN_TYPE);
            } catch (InstanceAlreadyExistsException e) {
                throw new InitializationException(e);
            } catch (MBeanRegistrationException e) {
                throw new InitializationException(e);
            } catch (NotCompliantMBeanException e) {
                throw new InitializationException(e);
            }
        } catch (InitializationException e) {
            // Can't load current job since it is misconfigured.
            setStatus(CrawlJob.STATUS_MISCONFIGURED);
            setErrorMessage("A fatal InitializationException occured when "
                    + "loading job:\n" + e.getMessage());
            // Log to stdout so its seen in logs as well as in UI.
            e.printStackTrace();
            this .controller = null;
            throw e;
        }
        setStatus(CrawlJob.STATUS_RUNNING);
        setRunning(true);
    }
0886:
0887: public void stopCrawling() {
0888: if (this .controller != null) {
0889: this .controller.requestCrawlStop();
0890: }
0891: }
0892:
0893: /**
0894: * @return One-line Frontier report.
0895: */
0896: public String getFrontierOneLine() {
0897: if (this .controller == null
0898: || this .controller.getFrontier() == null) {
0899: return "Crawler not running";
0900: }
0901: return this .controller.getFrontier().singleLineReport();
0902: }
0903:
0904: /**
0905: * @param reportName Name of report to write.
0906: * @return A report of the frontier's status.
0907: */
0908: public String getFrontierReport(final String reportName) {
0909: if (this .controller == null
0910: || this .controller.getFrontier() == null) {
0911: return "Crawler not running";
0912: }
0913: return ArchiveUtils.writeReportToString(this .controller
0914: .getFrontier(), reportName);
0915: }
0916:
0917: /**
0918: * Write the requested frontier report to the given PrintWriter
0919: * @param reportName Name of report to write.
0920: * @param writer Where to write to.
0921: */
0922: public void writeFrontierReport(String reportName,
0923: PrintWriter writer) {
0924: if (this .controller == null
0925: || this .controller.getFrontier() == null) {
0926: writer.println("Crawler not running.");
0927: return;
0928: }
0929: this .controller.getFrontier().reportTo(reportName, writer);
0930: }
0931:
0932: /**
0933: * @return One-line threads report.
0934: */
0935: public String getThreadOneLine() {
0936: if (this .controller == null) {
0937: return "Crawler not running";
0938: }
0939: return this .controller.oneLineReportThreads();
0940: }
0941:
0942: /**
0943: * Get the CrawlControllers ToeThreads report for the running crawl.
0944: * @return The CrawlControllers ToeThreads report
0945: */
0946: public String getThreadsReport() {
0947: if (this .controller == null) {
0948: return "Crawler not running";
0949: }
0950: return ArchiveUtils.writeReportToString(this .controller
0951: .getToePool(), null);
0952: }
0953:
0954: /**
0955: * Write the requested threads report to the given PrintWriter
0956: * @param reportName Name of report to write.
0957: * @param writer Where to write to.
0958: */
0959: public void writeThreadsReport(String reportName, PrintWriter writer) {
0960: if (this .controller == null
0961: || this .controller.getFrontier() == null) {
0962: writer.println("Crawler not running.");
0963: return;
0964: }
0965: this .controller.getToePool().reportTo(reportName, writer);
0966: }
0967:
0968: /**
0969: * Kills a thread. For details see
0970: * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
0971: * ToePool.killThread(int, boolean)}.
0972: * @param threadNumber Thread to kill.
0973: * @param replace Should thread be replaced.
0974: * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
0975: */
0976: public void killThread(int threadNumber, boolean replace) {
0977: if (this .controller == null) {
0978: return;
0979: }
0980: this .controller.killThread(threadNumber, replace);
0981: }
0982:
0983: /**
0984: * Get the Processors report for the running crawl.
0985: * @return The Processors report for the running crawl.
0986: */
0987: public String getProcessorsReport() {
0988: if (this .controller == null) {
0989: return "Crawler not running";
0990: }
0991: return ArchiveUtils.writeReportToString(this .controller,
0992: CrawlController.PROCESSORS_REPORT);
0993: }
0994:
0995: /**
0996: * Returns the directory where the configuration files for this job are
0997: * located.
0998: *
0999: * @return the directory where the configuration files for this job are
1000: * located
1001: */
    /**
     * Returns the directory where the configuration files for this job are
     * located.
     *
     * NOTE(review): despite the name and the sentence above, this returns
     * the path of the order *file* (settingsHandler.getOrderFile()), not
     * its parent directory -- callers appear to rely on this; confirm
     * before changing.
     *
     * @return path of this job's order file.
     */
    public String getSettingsDirectory() {
        return settingsHandler.getOrderFile().getPath();
    }
1005:
1006: /**
1007: * Returns the path of the job's base directory. For profiles this is always
1008: * equal to <code>new File(getSettingsDirectory())</code>.
1009: * @return the path of the job's base directory.
1010: */
1011: public File getDirectory() {
1012: return isProfile ? new File(getSettingsDirectory()) : jobDir;
1013: }
1014:
1015: /**
1016: * Get the error message associated with this job. Will return null if there
1017: * is no error message.
1018: * @return the error message associated with this job
1019: */
    /**
     * Get the error message associated with this job. Will return null if
     * there is no error message.
     * @return the error message associated with this job, or null.
     */
    public String getErrorMessage() {
        return errorMessage;
    }
1023:
1024: /**
1025: * Set an error message for this job. Generally this only occurs if the job
1026: * is misconfigured.
1027: * @param string the error message associated with this job
1028: */
    /**
     * Set an error message for this job. Generally this only occurs if the
     * job is misconfigured. Persists the change via writeJobFile().
     * @param string the error message associated with this job
     */
    public void setErrorMessage(String string) {
        errorMessage = string;
        writeJobFile(); // Save changes to disk immediately.
    }
1033:
1034: /**
1035: * @return Returns the number of journal entries.
1036: */
    /**
     * @return Returns the number of journal entries.
     */
    public int getNumberOfJournalEntries() {
        return numberOfJournalEntries;
    }
1040:
1041: /**
1042: * @param numberOfJournalEntries The number of journal entries to set.
1043: */
    /**
     * Set the number of journal entries; persists the change via
     * writeJobFile().
     * @param numberOfJournalEntries The number of journal entries to set.
     */
    public void setNumberOfJournalEntries(int numberOfJournalEntries) {
        this.numberOfJournalEntries = numberOfJournalEntries;
        writeJobFile(); // Save changes to disk immediately.
    }
1048:
1049: /**
1050: * @return Returns the error handler for this crawl job
1051: */
    /**
     * @return Returns the error handler for this crawl job.
     */
    public CrawlJobErrorHandler getErrorHandler() {
        return errorHandler;
    }
1055:
1056: /**
1057: * Read all the checkpoints found in the job's checkpoints
1058: * directory into Checkpoint instances
1059: * @return Collection containing list of all checkpoints.
1060: */
1061: public Collection scanCheckpoints() {
1062: File checkpointsDirectory = settingsHandler.getOrder()
1063: .getCheckpointsDirectory();
1064: File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1065: Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1066: if (perCheckpointDirs != null) {
1067: for (int i = 0; i < perCheckpointDirs.length; i++) {
1068: Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1069: checkpoints.add(cp);
1070: }
1071: }
1072: return checkpoints;
1073: }
1074:
1075: /**
1076: * Returns the absolute path of the specified log.
1077: * Note: If crawl has not begun, this file may not exist.
1078: * @param log
1079: * @return the absolute path for the specified log.
1080: * @throws AttributeNotFoundException
1081: * @throws ReflectionException
1082: * @throws MBeanException
1083: */
    /**
     * Returns the absolute path of the specified log.
     * A relative logs path is resolved against the crawl's disk
     * directory (CrawlOrder.ATTR_DISK_PATH relative to the working
     * directory).
     * Note: If crawl has not begun, this file may not exist.
     * @param log Log file name (e.g. "crawl.log").
     * @return the absolute path for the specified log.
     * @throws AttributeNotFoundException
     * @throws ReflectionException
     * @throws MBeanException
     */
    public String getLogPath(String log)
            throws AttributeNotFoundException, MBeanException,
            ReflectionException {
        String logsPath = (String) settingsHandler.getOrder()
            .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
        CrawlOrder order = settingsHandler.getOrder();
        // NOTE(review): different getAttribute overload here (explicit
        // null context) than the call above -- presumably equivalent for
        // these order-level attributes; confirm.
        String diskPath = (String) order.getAttribute(null,
            CrawlOrder.ATTR_DISK_PATH);
        File disk = settingsHandler
            .getPathRelativeToWorkingDirectory(diskPath);
        File f = new File(logsPath, log);
        // Root relative logs paths in the crawl's disk directory.
        if (!f.isAbsolute()) {
            f = new File(disk.getPath(), f.getPath());
        }
        return f.getAbsolutePath();
    }
1100:
1101: // OpenMBean implementation.
1102:
1103: protected void pause() {
1104: if (this .controller != null
1105: && this .controller.isPaused() == false) {
1106: this .controller.requestCrawlPause();
1107: }
1108: }
1109:
1110: protected void resume() {
1111: if (this .controller != null) {
1112: this .controller.requestCrawlResume();
1113: }
1114: }
1115:
1116: /**
1117: * @throws IllegalStateException Thrown if crawl is not paused.
1118: */
    /**
     * Request a checkpoint of the running crawl. Delegates to the
     * controller; no-op when no crawl is running.
     * @throws IllegalStateException Thrown if crawl is not paused.
     */
    protected void checkpoint() throws IllegalStateException {
        if (this.controller != null) {
            this.controller.requestCrawlCheckpoint();
        }
    }
1124:
1125: /**
1126: * @return True if checkpointing.
1127: */
1128: public boolean isCheckpointing() {
1129: return this .controller != null ? this .controller
1130: .isCheckpointing() : false;
1131: }
1132:
1133: /**
1134: * If its a HostQueuesFrontier, needs to be flushed for the queued.
1135: */
    /**
     * If its a HostQueuesFrontier, needs to be flushed for the queued.
     * Intentionally a no-op here; kept as a hook called after URI
     * imports (see importUri/importUris).
     */
    protected void flush() {
        // Nothing to do.
    }
1139:
1140: /**
1141: * Delete any URI from the frontier of the current (paused) job that match
1142: * the specified regular expression. If the current job is not paused (or
1143: * there is no current job) nothing will be done.
1144: * @param regexpr Regular expression to delete URIs by.
1145: * @return the number of URIs deleted
1146: */
1147: public long deleteURIsFromPending(String regexpr) {
1148: return (this .controller != null
1149: && this .controller.getFrontier() != null && this .controller
1150: .isPaused()) ? this .controller.getFrontier()
1151: .deleteURIs(regexpr) : 0;
1152: }
1153:
    /**
     * String-argument convenience overload (e.g. for JMX invocation).
     * @param file Path or URL of file with URIs.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param force "true" (exactly, case-sensitive) to force revisit.
     * @return A display string with a count of all added.
     */
    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }
1157:
    /**
     * Import URIs, treating none of them as seeds.
     * @param fileOrUrl Path or URL of file with URIs.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string with a count of all added.
     */
    public String importUris(final String fileOrUrl,
            final String style, final boolean forceRevisit) {
        return importUris(fileOrUrl, style, forceRevisit, false);
    }
1162:
1163: /**
1164: * @param fileOrUrl Name of file w/ seeds.
1165: * @param style What style of seeds -- crawl log, recovery journal, or
1166: * seeds file.
1167: * @param forceRevisit Should we revisit even if seen before?
1168: * @param areSeeds Is the file exclusively seeds?
1169: * @return A display string that has a count of all added.
1170: */
1171: public String importUris(final String fileOrUrl,
1172: final String style, final boolean forceRevisit,
1173: final boolean areSeeds) {
1174: InputStream is = IoUtils.getInputStream(this .controller
1175: .getDisk(), fileOrUrl);
1176: String message = null;
1177: // Do we have an inputstream?
1178: if (is == null) {
1179: message = "Failed to get inputstream from " + fileOrUrl;
1180: logger.severe(message);
1181: } else {
1182: int addedCount = importUris(is, style, forceRevisit,
1183: areSeeds);
1184: message = Integer.toString(addedCount)
1185: + " URIs added from " + fileOrUrl;
1186: }
1187: return message;
1188: }
1189:
    /**
     * Import URIs from a stream, treating none of them as seeds.
     * @param is Stream to use as URI source.
     * @param style Import style: default, crawlLog, or recoveryJournal.
     * @param forceRevisit Should we revisit even if seen before?
     * @return Count of added URIs.
     */
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return importUris(is, style, forceRevisit, false);
    }
1194:
1195: /**
1196: * Import URIs.
1197: * @param is Stream to use as URI source.
1198: * @param style Style in which URIs are rendored. Currently support for
1199: * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1200: * format (i.e <code>default</code>) where <code>default</code> style is
1201: * a UURI per line (comments allowed).
1202: * @param forceRevisit Whether we should revisit this URI even if we've
1203: * visited it previously.
1204: * @param areSeeds Are the imported URIs seeds?
1205: * @return Count of added URIs.
1206: */
1207: protected int importUris(InputStream is, String style,
1208: boolean forceRevisit, final boolean areSeeds) {
1209: // Figure the regex to use parsing each line of input stream.
1210: String extractor;
1211: String output;
1212: if (CRAWL_LOG_STYLE.equals(style)) {
1213: // Skip first 3 fields
1214: extractor = "\\S+\\s+\\S+\\s+\\S+\\s+(\\S+\\s+\\S+\\s+\\S+\\s+).*";
1215: output = "$1";
1216: } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1217: // Skip the begin-of-line directive
1218: extractor = "\\S+\\s+((\\S+)(?:\\s+\\S+\\s+\\S+)?)\\s*";
1219: output = "$1";
1220: } else {
1221: extractor = RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1222: output = RegexpLineIterator.ENTRY;
1223: }
1224:
1225: // Read the input stream.
1226: BufferedReader br = null;
1227: int addedCount = 0;
1228: try {
1229: br = new BufferedReader(new InputStreamReader(is));
1230: Iterator iter = new RegexpLineIterator(
1231: new LineReadingIterator(br),
1232: RegexpLineIterator.COMMENT_LINE, extractor, output);
1233: while (iter.hasNext()) {
1234: try {
1235: importUri((String) iter.next(), forceRevisit,
1236: areSeeds, false);
1237: addedCount++;
1238: } catch (URIException e) {
1239: e.printStackTrace();
1240: }
1241: }
1242: br.close();
1243: flush();
1244: } catch (IOException e) {
1245: e.printStackTrace();
1246: }
1247: return addedCount;
1248: }
1249:
1250: /**
1251: * Schedule a uri.
1252: * @param uri Uri to schedule.
1253: * @param forceFetch Should it be forcefetched.
1254: * @param isSeed True if seed.
1255: * @throws URIException
1256: */
    /**
     * Schedule a uri, flushing the frontier afterwards.
     * @param uri Uri to schedule.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed) throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }
1261:
1262: /**
1263: * Schedule a uri.
1264: * @param str String that can be: 1. a UURI, 2. a snippet of the
1265: * crawl.log line, or 3. a snippet from recover log. See
1266: * {@link #importUris(InputStream, String, boolean)} for how it subparses
1267: * the lines from crawl.log and recover.log.
1268: * @param forceFetch Should it be forcefetched.
1269: * @param isSeed True if seed.
1270: * @param isFlush If true, flush the frontier IF it implements
1271: * flushing.
1272: * @throws URIException
1273: */
    /**
     * Schedule a uri.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException on malformed input.
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
            throws URIException {
        CandidateURI caUri = CandidateURI.fromString(str);
        caUri.setForceFetch(forceFetch);
        if (isSeed) {
            caUri.setIsSeed(isSeed);
            if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
                // Danger of double-add of seeds because of this code here.
                // Only call addSeed if no via. If a via, the schedule will
                // take care of updating scope.
                this.controller.getScope().addSeed(caUri);
            }
        }
        // Always schedule, seed or not.
        this.controller.getFrontier().schedule(caUri);
        if (isFlush) {
            flush();
        }
    }
1293:
1294: /**
1295: * @return Our mbean info (Needed for CrawlJob to qualify as a
1296: * DynamicMBean).
1297: */
    /**
     * @return Our mbean info (Needed for CrawlJob to qualify as a
     * DynamicMBean). Built by buildMBeanInfo() during crawl setup.
     */
    public MBeanInfo getMBeanInfo() {
        return this.openMBeanInfo;
    }
1301:
1302: /**
1303: * Build up the MBean info for Heritrix main.
1304: * @return Return created mbean info instance.
1305: * @throws InitializationException
1306: */
    /**
     * Build up the OpenMBean info describing this crawl job: its own
     * attributes, the crawl-order attributes (recursively), a subset of
     * bdbje environment attributes/operations, the JMX operations, and
     * the notifications it emits.
     *
     * Requires this.controller to be initialized (reads its order and
     * bdbje environment).
     *
     * @return Return created mbean info instance.
     * @throws InitializationException if the bdbje helper cannot be
     * created.
     */
    protected OpenMBeanInfoSupport buildMBeanInfo()
            throws InitializationException {
        // Start adding my attributes.
        List<OpenMBeanAttributeInfo> attributes = new ArrayList<OpenMBeanAttributeInfo>();

        // Attributes.
        attributes
            .add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
                "Crawl job name", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
            "Short basic status message", SimpleType.STRING, true,
            false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            FRONTIER_SHORT_REPORT_ATTR, "Short frontier report",
            SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            THREADS_SHORT_REPORT_ATTR, "Short threads report",
            SimpleType.STRING, true, false, false));
        attributes
            .add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
                "Crawl job UID", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            TOTAL_DATA_ATTR, "Total data received",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CRAWL_TIME_ATTR, "Crawl time", SimpleType.LONG, true,
            false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CURRENT_DOC_RATE_ATTR,
            "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            CURRENT_KB_RATE_ATTR, "Current crawling rate (Kb/sec)",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            THREAD_COUNT_ATTR, "Active thread count",
            SimpleType.INTEGER, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
            "Crawling rate (Docs/sec)", SimpleType.DOUBLE, true,
            false, false));
        // NOTE(review): description says "Current" but by symmetry with
        // DOC_RATE_ATTR this is the overall rate -- confirm.
        attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            DOWNLOAD_COUNT_ATTR, "Count of downloaded documents",
            SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(
            DISCOVERED_COUNT_ATTR, "Count of discovered documents",
            SimpleType.LONG, true, false, false));

        // Add in the crawl order attributes.
        addCrawlOrderAttributes(this.getController().getOrder(),
            attributes);

        // Add the bdbje attributes. Convert to open mbean attributes.
        // First do bdbeje setup. Then add a subset of the bdbje attributes.
        // Keep around the list of names as a convenience for when it comes
        // time to test if attribute is supported.
        Environment env = this.controller.getBdbEnvironment();
        try {
            this.bdbjeMBeanHelper = new JEMBeanHelper(env.getConfig(),
                env.getHome(), true);
        } catch (DatabaseException e) {
            e.printStackTrace();
            InitializationException ie = new InitializationException(e
                .getMessage());
            // Carry the original stack trace on the wrapper.
            ie.setStackTrace(e.getStackTrace());
            throw ie;
        }
        this.bdbjeAttributeNameList = Arrays.asList(new String[] {
            JEMBeanHelper.ATT_ENV_HOME, JEMBeanHelper.ATT_OPEN,
            JEMBeanHelper.ATT_IS_READ_ONLY,
            JEMBeanHelper.ATT_IS_TRANSACTIONAL,
            JEMBeanHelper.ATT_CACHE_SIZE,
            JEMBeanHelper.ATT_CACHE_PERCENT,
            JEMBeanHelper.ATT_LOCK_TIMEOUT,
            JEMBeanHelper.ATT_IS_SERIALIZABLE,
            JEMBeanHelper.ATT_SET_READ_ONLY, });
        addBdbjeAttributes(attributes, this.bdbjeMBeanHelper
            .getAttributeList(env), this.bdbjeAttributeNameList);

        // Operations.
        List<OpenMBeanOperationInfo> operations = new ArrayList<OpenMBeanOperationInfo>();
        OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
        args[0] = new OpenMBeanParameterInfoSupport("url",
            "URL to add to the frontier", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URL is to be force fetched",
            SimpleType.BOOLEAN);
        args[2] = new OpenMBeanParameterInfoSupport("seed",
            "True if URL is a seed", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(
            IMPORT_URI_OPER, "Add passed URL to the frontier",
            args, SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
            "Path or URL to file of URLs", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("style",
            "Format format:default|crawlLog|recoveryJournal",
            SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URLs are to be force fetched",
            SimpleType.BOOLEAN);
        args[3] = new OpenMBeanParameterInfoSupport("seed",
            "True if all content are seeds.", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(
            IMPORT_URIS_OPER,
            "Add file of passed URLs to the frontier", args,
            SimpleType.STRING, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
            "Pause crawling (noop if already paused)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
            "Resume crawling (noop if already resumed)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[1];
        args[0] = new OpenMBeanParameterInfoSupport("name",
            "Name of report ('all', 'standard', etc.).",
            SimpleType.STRING);
        operations.add(new OpenMBeanOperationInfoSupport(
            FRONTIER_REPORT_OPER, "Full frontier report", args,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            THREADS_REPORT_OPER, "Full thread report", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            SEEDS_REPORT_OPER, "Seeds report", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_OPER,
            "Progress statistics at time of invocation", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_LEGEND_OPER,
            "Progress statistics legend", null, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            CHECKPOINT_OPER, "Start a checkpoint", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        // Add bdbje operations. Add subset only. Keep around the list so have
        // it to hand when figuring what operations are supported. Usual actual
        // Strings because not accessible from JEMBeanHelper.
        this.bdbjeOperationsNameList = Arrays.asList(new String[] {
            "cleanLog", "evictMemory", "checkpoint", "sync",
            "getEnvironmentStatsToString", "getLockStatsToString",
            "getDatabaseNames", OP_DB_STAT });
        addBdbjeOperations(operations, this.bdbjeMBeanHelper
            .getOperationList(env), this.bdbjeOperationsNameList);

        // Register notifications
        List<MBeanNotificationInfo> notifications = new ArrayList<MBeanNotificationInfo>();
        notifications.add(new MBeanNotificationInfo(new String[] {
            "crawlStarted", "crawlEnding", "crawlPaused",
            "crawlResuming", PROG_STATS }, this.getClass()
            .getName()
            + ".notifications",
            "CrawlStatusListener events and progress statistics as "
                + "notifications"));
        MBeanNotificationInfo[] notificationsArray = new MBeanNotificationInfo[notifications
            .size()];
        notifications.toArray(notificationsArray);

        // Build the info object.
        OpenMBeanAttributeInfoSupport[] attributesArray = new OpenMBeanAttributeInfoSupport[attributes
            .size()];
        attributes.toArray(attributesArray);
        OpenMBeanOperationInfoSupport[] operationsArray = new OpenMBeanOperationInfoSupport[operations
            .size()];
        operations.toArray(operationsArray);
        return new OpenMBeanInfoSupport(this.getClass().getName(),
            "Current Crawl Job as OpenMBean", attributesArray,
            new OpenMBeanConstructorInfoSupport[] {},
            operationsArray, notificationsArray);
    }
1493:
1494: protected void addBdbjeAttributes(
1495: final List<OpenMBeanAttributeInfo> attributes,
1496: final List<MBeanAttributeInfo> bdbjeAttributes,
1497: final List<String> bdbjeNamesToAdd) {
1498: for (MBeanAttributeInfo info : bdbjeAttributes) {
1499: if (bdbjeNamesToAdd.contains(info.getName())) {
1500: attributes.add(JmxUtils
1501: .convertToOpenMBeanAttribute(info));
1502: }
1503: }
1504: }
1505:
    /**
     * Convert the named subset of bdbje operations to OpenMBean
     * operations and append them to the given list. The dbStat
     * operation gets special treatment: its published signature is
     * wrong, so an extra "name" parameter is appended and its return
     * type is forced to STRING.
     * @param operations Destination list.
     * @param bdbjeOperations All operations published by the bdbje helper.
     * @param bdbjeNamesToAdd Names of the operations to keep.
     */
    protected void addBdbjeOperations(
            final List<OpenMBeanOperationInfo> operations,
            final List<MBeanOperationInfo> bdbjeOperations,
            final List<String> bdbjeNamesToAdd) {
        for (MBeanOperationInfo info : bdbjeOperations) {
            if (bdbjeNamesToAdd.contains(info.getName())) {
                OpenMBeanOperationInfo omboi = null;
                if (info.getName().equals(OP_DB_STAT)) {
                    // Db stats needs special handling. The published
                    // signature is wrong and its return type is awkward.
                    // Handle it.
                    omboi = JmxUtils.convertToOpenMBeanOperation(info,
                        null, SimpleType.STRING);
                    MBeanParameterInfo[] params = omboi.getSignature();
                    // Rebuild the signature with a trailing database-name
                    // parameter the published signature omits.
                    OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[params.length + 1];
                    for (int ii = 0; ii < params.length; ii++) {
                        args[ii] = (OpenMBeanParameterInfo) params[ii];
                    }
                    args[params.length] = new OpenMBeanParameterInfoSupport(
                        "name", "Database name", SimpleType.STRING);
                    omboi = new OpenMBeanOperationInfoSupport(omboi
                        .getName(), omboi.getDescription(), args,
                        omboi.getReturnOpenType(), omboi
                            .getImpact());
                } else {
                    omboi = JmxUtils.convertToOpenMBeanOperation(info);
                }
                operations.add(omboi);
            }
        }
    }
1537:
    /**
     * Recursively walk a settings ComplexType (starting from the crawl
     * order) and append an OpenMBean attribute for every leaf setting
     * whose type maps to an OpenType. Names in ORDER_EXCLUDE are
     * skipped; TextField settings are exposed as STRING; nested
     * ComplexTypes are recursed into; anything else (e.g. StringList)
     * is only logged.
     * @param type Settings node to walk.
     * @param attributes Destination list of OpenMBean attributes.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null); i
                .hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo) i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Skip.
                continue;
            }
            // Attribute is published under its absolute settings path.
            String absoluteName = type.getAbsoluteName() + "/"
                + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // Description can't be empty.
                    description = info.getName();
                }
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description, JmxUtils
                        .getOpenType(info.getType()), true,
                    true, false));
            } else if (info.isComplexType()) {
                try {
                    ComplexType c = (ComplexType) type
                        .getAttribute(info.getName());
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute",
                        e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // Special handling for TextField. Use the STRING OpenType.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, info.getDescription(),
                    SimpleType.STRING, true, true, false));
            } else {
                // Looks like only type we don't currently handle is StringList.
                // Figure how to do it. Add as AttributeList?
                logger.fine(info.getType());
            }
        }
    }
1586:
1587: public Object getAttribute(String attribute_name)
1588: throws AttributeNotFoundException {
1589: if (attribute_name == null) {
1590: throw new RuntimeOperationsException(
1591: new IllegalArgumentException(
1592: "Attribute name cannot be null"),
1593: "Cannot call getAttribute with null attribute name");
1594: }
1595:
1596: // If no controller, we can't do any work in here.
1597: if (this .controller == null) {
1598: throw new RuntimeOperationsException(
1599: new NullPointerException("Controller is null"),
1600: "Controller is null");
1601: }
1602:
1603: // Is it a bdbje attribute?
1604: if (this .bdbjeAttributeNameList.contains(attribute_name)) {
1605: try {
1606: return this .bdbjeMBeanHelper.getAttribute(
1607: this .controller.getBdbEnvironment(),
1608: attribute_name);
1609: } catch (MBeanException e) {
1610: throw new RuntimeOperationsException(
1611: new RuntimeException(e));
1612: }
1613: }
1614:
1615: // Is it a crawl-order attribute?
1616: if (attribute_name.startsWith(this .controller.getOrder()
1617: .getAbsoluteName())) {
1618: return getCrawlOrderAttribute(attribute_name);
1619: }
1620:
1621: if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1622: throw new AttributeNotFoundException("Attribute "
1623: + attribute_name + " is unimplemented.");
1624: }
1625:
1626: // The pattern in the below is to match an attribute and when found
1627: // do a return out of if clause. Doing it this way, I can fall
1628: // on to the AttributeNotFoundException for case where we've an
1629: // attribute but no handler.
1630: if (attribute_name.equals(STATUS_ATTR)) {
1631: return getCrawlStatus();
1632: }
1633: if (attribute_name.equals(NAME_ATTR)) {
1634: return getJobName();
1635: }
1636: if (attribute_name.equals(UID_ATTR)) {
1637: return getUID();
1638: }
1639: if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1640: return new Long(this .controller == null
1641: && this .controller.getStatistics() != null ? 0
1642: : this .controller.getStatistics()
1643: .totalBytesWritten());
1644: }
1645: if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1646: return new Long(this .controller == null
1647: && this .controller.getStatistics() != null ? 0
1648: : this .controller.getStatistics()
1649: .getCrawlerTotalElapsedTime() / 1000);
1650: }
1651: if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1652: return new Double(this .controller == null
1653: && this .controller.getStatistics() != null ? 0
1654: : this .controller.getStatistics()
1655: .currentProcessedDocsPerSec());
1656: }
1657: if (attribute_name.equals(DOC_RATE_ATTR)) {
1658: return new Double(this .controller == null
1659: && this .controller.getStatistics() != null ? 0
1660: : this .controller.getStatistics()
1661: .processedDocsPerSec());
1662: }
1663: if (attribute_name.equals(KB_RATE_ATTR)) {
1664: return new Long(this .controller == null
1665: && this .controller.getStatistics() != null ? 0
1666: : this .controller.getStatistics()
1667: .currentProcessedKBPerSec());
1668: }
1669: if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1670: return new Long(this .controller == null
1671: && this .controller.getStatistics() != null ? 0
1672: : this .controller.getStatistics()
1673: .processedKBPerSec());
1674: }
1675: if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1676: return new Integer(this .controller == null
1677: && this .controller.getStatistics() != null ? 0
1678: : this .controller.getStatistics()
1679: .activeThreadCount());
1680: }
1681: if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1682: return getFrontierOneLine();
1683: }
1684: if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1685: return getThreadOneLine();
1686: }
1687: if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1688: return new Long(this .controller == null
1689: && this .controller.getStatistics() != null ? 0
1690: : this .controller.getStatistics().totalCount());
1691: }
1692: if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1693: return new Long(this .controller == null
1694: && this .controller.getStatistics() != null ? 0
1695: : this .controller.getStatistics()
1696: .successfullyFetchedCount());
1697: }
1698:
1699: throw new AttributeNotFoundException("Attribute "
1700: + attribute_name + " not found.");
1701: }
1702:
    /**
     * Look up a crawl-order attribute given its absolute settings path.
     * Strips the order's own absolute-name prefix and delegates to the
     * recursive two-argument overload. Failures are logged and null is
     * returned rather than propagated (this feeds JMX getAttributes,
     * which is best-effort).
     * @param attribute_name Absolute settings path of the attribute.
     * @return Attribute value, or null on failure.
     */
    protected Object getCrawlOrderAttribute(final String attribute_name) {
        CrawlOrder order = this.getController().getOrder();
        Object result = null;
        try {
            result = getCrawlOrderAttribute(attribute_name
                .substring(order.getAbsoluteName().length()), order);
        } catch (NullPointerException e) {
            // NOTE(review): catching NPE is deliberate here -- missing
            // intermediate nodes surface as NPEs from the recursion.
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name,
                e);
        }
        return result;
    }
1724:
1725: protected Object getCrawlOrderAttribute(
1726: final String attribute_name, final ComplexType ct)
1727: throws AttributeNotFoundException, MBeanException,
1728: ReflectionException {
1729: String subName = attribute_name.startsWith("/") ? attribute_name
1730: .substring(1)
1731: : attribute_name;
1732: int index = subName.indexOf("/");
1733: if (index <= 0) {
1734: MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1735: // Special handling for TextField.
1736: return info.getType().equals(TextField.class.getName()) ? ct
1737: .getAttribute(subName).toString()
1738: : ct.getAttribute(subName);
1739: }
1740: return getCrawlOrderAttribute(subName.substring(index + 1),
1741: (ComplexType) ct.getAttribute(subName.substring(0,
1742: index)));
1743: }
1744:
1745: public AttributeList getAttributes(String[] attributeNames) {
1746: if (attributeNames == null) {
1747: throw new RuntimeOperationsException(
1748: new IllegalArgumentException(
1749: "attributeNames[] cannot be " + "null"),
1750: "Cannot call getAttributes with null attribute "
1751: + "names");
1752: }
1753:
1754: // If no controller, we can't do any work in here.
1755: if (this .controller == null) {
1756: throw new RuntimeOperationsException(
1757: new NullPointerException("Controller is null"),
1758: "Controller is null");
1759: }
1760:
1761: AttributeList resultList = new AttributeList();
1762: if (attributeNames.length == 0) {
1763: return resultList;
1764: }
1765: for (int i = 0; i < attributeNames.length; i++) {
1766: try {
1767: Object value = getAttribute(attributeNames[i]);
1768: resultList.add(new Attribute(attributeNames[i], value));
1769: } catch (Exception e) {
1770: e.printStackTrace();
1771: }
1772: }
1773: return (resultList);
1774: }
1775:
1776: public void setAttribute(Attribute attribute)
1777: throws AttributeNotFoundException {
1778: // Is it a crawl order attribute?
1779: CrawlOrder order = this .getController().getOrder();
1780: String attName = attribute.getName();
1781: if (attName.startsWith(order.getAbsoluteName())) {
1782: try {
1783: setCrawlOrderAttribute(attribute.getName().substring(
1784: order.getAbsoluteName().length()), order,
1785: attribute);
1786: } catch (NullPointerException e) {
1787: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1788: } catch (AttributeNotFoundException e) {
1789: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1790: } catch (MBeanException e) {
1791: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1792: } catch (ReflectionException e) {
1793: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1794: } catch (InvalidAttributeValueException e) {
1795: logger.log(Level.SEVERE, "Failed set of " + attName, e);
1796: }
1797: return;
1798: }
1799:
1800: // Is it a bdbje attribute?
1801: if (this .bdbjeAttributeNameList.contains(attName)) {
1802: try {
1803: this .bdbjeMBeanHelper.setAttribute(this .controller
1804: .getBdbEnvironment(), attribute);
1805: } catch (AttributeNotFoundException e) {
1806: throw new RuntimeOperationsException(
1807: new RuntimeException(e));
1808: } catch (InvalidAttributeValueException e) {
1809: throw new RuntimeOperationsException(
1810: new RuntimeException(e));
1811: }
1812: return;
1813: }
1814:
1815: // Else, we don't know how to handle this attribute.
1816: throw new AttributeNotFoundException("Attribute " + attName
1817: + " can not be set.");
1818: }
1819:
1820: protected void setCrawlOrderAttribute(final String attribute_name,
1821: final ComplexType ct, final Attribute attribute)
1822: throws AttributeNotFoundException,
1823: InvalidAttributeValueException, MBeanException,
1824: ReflectionException {
1825: String subName = attribute_name.startsWith("/") ? attribute_name
1826: .substring(1)
1827: : attribute_name;
1828: int index = subName.indexOf("/");
1829: if (index <= 0) {
1830: ct
1831: .setAttribute(new Attribute(subName, attribute
1832: .getValue()));
1833: return;
1834: }
1835: setCrawlOrderAttribute(subName.substring(index + 1),
1836: (ComplexType) ct.getAttribute(subName.substring(0,
1837: index)), attribute);
1838: }
1839:
1840: public AttributeList setAttributes(AttributeList attributes) {
1841: if (attributes == null) {
1842: throw new RuntimeOperationsException(
1843: new IllegalArgumentException(
1844: "attributeNames[] cannot be " + "null"),
1845: "Cannot call getAttributes with null attribute "
1846: + "names");
1847: }
1848:
1849: AttributeList resultList = new AttributeList();
1850: if (attributes.size() == 0) {
1851: return resultList;
1852: }
1853: for (int i = 0; i < attributes.size(); i++) {
1854: try {
1855: Attribute attr = (Attribute) attributes.get(i);
1856: setAttribute(attr);
1857: String an = attr.getName();
1858: Object newValue = getAttribute(an);
1859: resultList.add(new Attribute(an, newValue));
1860: } catch (Exception e) {
1861: e.printStackTrace();
1862: }
1863: }
1864: return resultList;
1865: }
1866:
    /**
     * DynamicMBean invoke: dispatches a named JMX operation.
     *
     * Bdbje operations are delegated to the bdbje MBean helper against the
     * controller's bdbje environment; the remaining operations map onto
     * methods of this job and its statistics tracker.
     *
     * @param operationName Name of the operation to run; must not be null.
     * @param params Operation parameters (counts checked per operation).
     * @param signature Parameter signature (currently unexploited; see
     * TODO below).
     * @return Operation result, or null for void-like operations.
     * @throws ReflectionException wrapping NoSuchMethodException when the
     * operation name is unknown.
     */
    public Object invoke(String operationName, Object[] params,
            String[] signature) throws ReflectionException {
        if (operationName == null) {
            throw new RuntimeOperationsException(
                new IllegalArgumentException(
                    "Operation name cannot be null"),
                "Cannot call invoke with null operation name");
        }

        if (this.bdbjeOperationsNameList.contains(operationName)) {
            try {
                Object o = this.bdbjeMBeanHelper.invoke(this.controller
                    .getBdbEnvironment(), operationName, params,
                    signature);
                // If OP_DB_STAT, return String version of result.
                if (operationName.equals(OP_DB_STAT)) {
                    return o.toString();
                }
                return o;
            } catch (MBeanException e) {
                throw new RuntimeOperationsException(
                    new RuntimeException(e));
            }
        }

        // TODO: Exploit passed signature.

        // The pattern in the below is to match an operation and when found
        // do a return out of if clause. Doing it this way, I can fall
        // on to the final ReflectionException for case where we've an
        // attribute but no handler.
        if (operationName.equals(IMPORT_URI_OPER)) {
            // params: (uri, forceFetch, isSeed)
            JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
            mustBeCrawling();
            try {
                importUri((String) params[0], ((Boolean) params[1])
                    .booleanValue(), ((Boolean) params[2])
                    .booleanValue());
            } catch (URIException e) {
                throw new RuntimeOperationsException(
                    new RuntimeException(e));
            }
            return null;
        }

        if (operationName.equals(IMPORT_URIS_OPER)) {
            // params: (file, style, forceFetch, isSeed)
            JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
            mustBeCrawling();
            return importUris((String) params[0], ((String) params[1])
                .toString(), ((Boolean) params[2]).booleanValue(),
                ((Boolean) params[3]).booleanValue());
        }

        if (operationName.equals(PAUSE_OPER)) {
            JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
            mustBeCrawling();
            pause();
            return null;
        }

        if (operationName.equals(RESUME_OPER)) {
            JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
            mustBeCrawling();
            resume();
            return null;
        }

        if (operationName.equals(FRONTIER_REPORT_OPER)) {
            // params: (report name)
            JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
            mustBeCrawling();
            return getFrontierReport((String) params[0]);
        }

        if (operationName.equals(THREADS_REPORT_OPER)) {
            JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
            mustBeCrawling();
            return getThreadsReport();
        }

        if (operationName.equals(SEEDS_REPORT_OPER)) {
            JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
            mustBeCrawling();
            // Only StatisticsTracker knows how to write a seeds report;
            // other trackers get an "Unsupported" placeholder.
            StringWriter sw = new StringWriter();
            if (getStatisticsTracking() != null
                    && getStatisticsTracking() instanceof StatisticsTracker) {
                ((StatisticsTracker) getStatisticsTracking())
                    .writeSeedsReportTo(new PrintWriter(sw));
            } else {
                sw.write("Unsupported");
            }
            return sw.toString();
        }

        if (operationName.equals(CHECKPOINT_OPER)) {
            JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
            mustBeCrawling();
            try {
                checkpoint();
            } catch (IllegalStateException e) {
                throw new RuntimeOperationsException(e);
            }
            return null;
        }

        if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params,
                0);
            mustBeCrawling();
            return getStatisticsTracking().getProgressStatisticsLine();
        }

        if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
            // NOTE(review): unlike its sibling above, this branch does not
            // call mustBeCrawling() -- presumably the legend is static;
            // confirm before relying on it while no crawl is active.
            JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
                params, 0);
            return getStatisticsTracking().progressStatisticsLegend();
        }

        throw new ReflectionException(new NoSuchMethodException(
            operationName), "Cannot find the operation "
            + operationName);
    }
1988:
1989: public void mustBeCrawling() {
1990: if (!isCrawling()) {
1991: throw new RuntimeOperationsException(
1992: new IllegalArgumentException("Not "
1993: + "crawling (Shouldn't ever be the case)"),
1994: "Not current crawling job?");
1995: }
1996: }
1997:
1998: public boolean isCrawling() {
1999: return this .controller != null;
2000: }
2001:
2002: /**
2003: * Utility method to get the stored list of ignored seed items (if any),
2004: * from the last time the seeds were imported to the frontier.
2005: *
2006: * @return String of all ignored seed items, or null if none
2007: */
2008: public String getIgnoredSeeds() {
2009: File ignoredFile = new File(getDirectory(),
2010: AbstractFrontier.IGNORED_SEEDS_FILENAME);
2011: if (!ignoredFile.exists()) {
2012: return null;
2013: }
2014: try {
2015: return FileUtils.readFileAsString(ignoredFile);
2016: } catch (IOException e) {
2017: // TODO Auto-generated catch block
2018: e.printStackTrace();
2019: return null;
2020: }
2021: }
2022:
2023: /**
2024: * Forward a 'kick' update to current controller if any.
2025: * @see CrawlController#kickUpdate()
2026: */
2027: public void kickUpdate() {
2028: if (this .controller != null) {
2029: this .controller.kickUpdate();
2030: }
2031: }
2032:
2033: /**
2034: * Returns a URIFrontierMarker for the current, paused, job. If there is no
2035: * current job or it is not paused null will be returned.
2036: *
2037: * @param regexpr A regular expression that each URI must match in order to
2038: * be considered 'within' the marker.
2039: * @param inCacheOnly Limit marker scope to 'cached' URIs.
2040: * @return a URIFrontierMarker for the current job.
2041: * @see #getPendingURIsList(FrontierMarker, int, boolean)
2042: * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2043: * boolean)
2044: * @see org.archive.crawler.framework.FrontierMarker
2045: */
2046: public FrontierMarker getInitialMarker(String regexpr,
2047: boolean inCacheOnly) {
2048: return (this .controller != null && this .controller.isPaused()) ? this .controller
2049: .getFrontier().getInitialMarker(regexpr, inCacheOnly)
2050: : null;
2051: }
2052:
2053: /**
2054: * Returns the frontiers URI list based on the provided marker. This method
2055: * will return null if there is not current job or if the current job is
2056: * not paused. Only when there is a paused current job will this method
2057: * return a URI list.
2058: *
2059: * @param marker URIFrontier marker
2060: * @param numberOfMatches Maximum number of matches to return
2061: * @param verbose Should detailed info be provided on each URI?
2062: * @return the frontiers URI list based on the provided marker
2063: * @throws InvalidFrontierMarkerException
2064: * When marker is inconsistent with the current state of the
2065: * frontier.
2066: * @see #getInitialMarker(String, boolean)
2067: * @see org.archive.crawler.framework.FrontierMarker
2068: */
2069: public ArrayList getPendingURIsList(FrontierMarker marker,
2070: int numberOfMatches, boolean verbose)
2071: throws InvalidFrontierMarkerException {
2072: return (this .controller != null && this .controller.isPaused()) ? this .controller
2073: .getFrontier().getURIsList(marker, numberOfMatches,
2074: verbose)
2075: : null;
2076: }
2077:
2078: public void crawlStarted(String message) {
2079: if (this .mbeanName != null) {
2080: // Can be null around job startup.
2081: sendNotification(new Notification("crawlStarted",
2082: this .mbeanName, getNotificationsSequenceNumber(),
2083: message));
2084: }
2085: }
2086:
2087: public void crawlEnding(String sExitMessage) {
2088: setRunning(false);
2089: setStatus(sExitMessage);
2090: setReadOnly();
2091: if (this .mbeanName != null) {
2092: sendNotification(new Notification("crawlEnding",
2093: this .mbeanName, getNotificationsSequenceNumber(),
2094: sExitMessage));
2095: }
2096: }
2097:
    /**
     * CrawlStatusListener callback: intentionally a no-op.
     *
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        // Let the settings handler be cleaned up by the crawl controller
        // completeStop. Just let go of our reference in here.
        // if (this.settingsHandler != null) {
        // this.settingsHandler.cleanup();
        // }

        // We used to zero-out datamembers but no longer needed now CrawlJobs
        // no longer persist after completion (They used to be kept around in
        // a list so operator could view CrawlJob finish state and reports --
        // but we now dump actual job and create a new uninitialized CrawlJob
        // that points at old CrawlJob data.
    }
2111:
2112: public void crawlPausing(String statusMessage) {
2113: setStatus(statusMessage);
2114: }
2115:
2116: public void crawlPaused(String statusMessage) {
2117: setStatus(statusMessage);
2118: if (this .mbeanName != null) {
2119: // Can be null around job startup.
2120: sendNotification(new Notification("crawlPaused",
2121: this .mbeanName, getNotificationsSequenceNumber(),
2122: statusMessage));
2123: }
2124: }
2125:
2126: public void crawlResuming(String statusMessage) {
2127: setStatus(statusMessage);
2128: if (this .mbeanName != null) {
2129: // Can be null around job startup.
2130: sendNotification(new Notification("crawlResuming",
2131: this .mbeanName, getNotificationsSequenceNumber(),
2132: statusMessage));
2133: }
2134: }
2135:
2136: public void crawlCheckpoint(File checkpointDir) throws Exception {
2137: setStatus(CrawlJob.STATUS_CHECKPOINTING);
2138: }
2139:
2140: public CrawlController getController() {
2141: return this .controller;
2142: }
2143:
    /**
     * MBeanRegistration callback run before JMX registration.
     *
     * Decorates the proposed ObjectName with key/values taken from the
     * hosting Heritrix's ObjectName (mother name, jmx port, host, type) so
     * the CrawlJob can be tied back to its Heritrix by examination of the
     * CrawlJob ObjectName alone.
     *
     * @param server MBeanServer we are being registered with (retained).
     * @param on Proposed ObjectName; must carry a 'name' key property.
     * @return The decorated ObjectName actually used for registration.
     * @throws Exception if the name property is missing or no registered
     * hosting Heritrix can be found.
     */
    public ObjectName preRegister(final MBeanServer server,
            ObjectName on) throws Exception {
        this.mbeanServer = server;
        @SuppressWarnings("unchecked")
        Hashtable<String, String> ht = on.getKeyPropertyList();
        if (!ht.containsKey(JmxUtils.NAME)) {
            throw new IllegalArgumentException("Name property required"
                + on.getCanonicalName());
        }
        // Now append key/values from hosting heritrix JMX ObjectName so it can be
        // found just by examination of the CrawlJob JMX ObjectName. Add heritrix
        // name attribute as 'mother' attribute.
        Heritrix h = getHostingHeritrix();
        if (h == null || h.getMBeanName() == null) {
            throw new IllegalArgumentException(
                "Hosting heritrix not found "
                    + "or not registered with JMX: "
                    + on.getCanonicalName());
        }
        @SuppressWarnings("unchecked")
        Map<String, String> hht = h.getMBeanName().getKeyPropertyList();
        ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
        // Port is optional on the mother; copy it only when present.
        String port = hht.get(JmxUtils.JMX_PORT);
        if (port != null) {
            ht.put(JmxUtils.JMX_PORT, port);
        }
        ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
        if (!ht.containsKey(JmxUtils.TYPE)) {
            ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
        }
        this.mbeanName = new ObjectName(on.getDomain(), ht);
        return this.mbeanName;
    }
2177:
2178: public void postRegister(Boolean registrationDone) {
2179: if (logger.isLoggable(Level.INFO)) {
2180: logger.info(JmxUtils.getLogRegistrationMsg(this .mbeanName
2181: .getCanonicalName(), this .mbeanServer,
2182: registrationDone.booleanValue()));
2183: }
2184: }
2185:
2186: public void preDeregister() throws Exception {
2187: // Nothing to do.
2188: }
2189:
2190: public void postDeregister() {
2191: if (mbeanName == null) {
2192: return;
2193: }
2194: if (logger.isLoggable(Level.INFO)) {
2195: logger.info(JmxUtils.getLogUnregistrationMsg(this .mbeanName
2196: .getCanonicalName(), this .mbeanServer));
2197: }
2198: this .mbeanName = null;
2199: }
2200:
2201: /**
2202: * @return Heritrix that is hosting this job.
2203: */
2204: protected Heritrix getHostingHeritrix() {
2205: Heritrix hostingHeritrix = null;
2206: Map heritrice = Heritrix.getInstances();
2207: for (final Iterator i = heritrice.keySet().iterator(); i
2208: .hasNext();) {
2209: Heritrix h = (Heritrix) heritrice.get(i.next());
2210: if (h.getJobHandler().getCurrentJob() == this ) {
2211: hostingHeritrix = h;
2212: break;
2213: }
2214: }
2215: return hostingHeritrix;
2216: }
2217:
2218: /**
2219: * @return Unique name for job that is safe to use in jmx (Like display
2220: * name but without spaces).
2221: */
2222: public String getJmxJobName() {
2223: return getJobName() + "-" + getUID();
2224: }
2225:
2226: /**
2227: * @return Notification sequence number (Does increment after each access).
2228: */
2229: protected static int getNotificationsSequenceNumber() {
2230: return notificationsSequenceNumber++;
2231: }
2232:
2233: protected ObjectName getMbeanName() {
2234: return this .mbeanName;
2235: }
2236:
2237: /**
2238: * @return the statistics tracking instance (of null if none yet available).
2239: */
2240: public StatisticsTracking getStatisticsTracking() {
2241: return this.controller == null
2242: || this.controller.getStatistics() == null ? null
2243: : this.controller.getStatistics();
2244: }
2245: }
|