/* CrawlJobHandler
 *
 * $Id: CrawlJobHandler.java 5055 2007-04-10 22:12:56Z gojomo $
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 * <li><code>Pending</code> - Jobs that are ready to run and are waiting their
 *     turn. These can be edited, viewed, deleted, etc.
 * <li><code>Running</code> - Only one job can be running at a time. There may
 *     be no job running. The running job can be viewed and edited to some
 *     extent. It can also be terminated. This job should have a
 *     StatisticsTracking module attached to it for more details on the crawl.
 * <li><code>Completed</code> - Jobs that have finished crawling or have been
 *     deleted from the pending queue or terminated while running. They cannot
 *     be edited but can be viewed. They retain the StatisticsTracking module
 *     from their run.
 * <li><code>New job</code> - At any given time there can be one 'new job'.
 *     The new job is not considered ready to run. It can be edited or
 *     discarded (in which case it will be totally destroyed, including any
 *     files on disk). Once an operator deems the job ready to run it can be
 *     moved to the pending queue.
 * <li><code>Profiles</code> - Jobs under profiles are not actual jobs. They
 *     can be edited normally but cannot be submitted to the pending queue.
 *     New jobs can be created using a profile as a template.
 * </ul>
 *
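 * <p>A minimal usage sketch (hedged; the variable names, seeds text, and job
 * name below are illustrative, not part of this class):
 * <pre>
 * CrawlJobHandler handler = new CrawlJobHandler(new File("jobs"));
 * handler.startCrawler(); // allow queued jobs to be crawled
 * CrawlJob job = handler.newJob(handler.getDefaultProfile(), null,
 *     "my-crawl", "An example crawl", "http://www.example.com/",
 *     CrawlJob.PRIORITY_AVERAGE);
 * handler.addJob(job); // queue it; it starts when the crawler is free
 * </pre>
 *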
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */

public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of the system property whose value, if set, overrides the default
     * profile used.
     *
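     * <p>For example, launching the JVM with
     * <code>-Dheritrix.default.profile=quickcrawl</code> (a hypothetical
     * profile name) would select that profile as the default.
     *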
     */
    public static final String DEFAULT_PROFILE_NAME =
        "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    /**
     * Name of the order file within a job or profile directory.
     */
    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread to start the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    // The UIDs of profiles should NOT be timestamps. A descriptive name is
    // ideal.
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * based on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir, final boolean loadJobs,
            final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Make a comparator for CrawlJobs.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority, use UID (which should be a timestamp).
                    // Lower UID (string compare) means earlier time.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        // Profiles always have the same priority, so they sort by UID,
        // which for profiles is their (descriptive) name.
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }
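
    /*
     * Illustrative sketch (not part of the class): with the comparator above,
     * a numerically lower priority value sorts first, and ties fall back to
     * UID string order; job UIDs are 17-digit timestamps, so ties resolve
     * oldest-first. The values below are hypothetical:
     *
     *   priority 0, UID "20070410221257000" sorts before
     *   priority 0, UID "20070410221258000", which sorts before
     *   priority 1, UID "20070410221200000"
     */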

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Need to find job file ('state.job').
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job")
                    && (new File(dir, name)).canRead();
            }
        });
        return (jobFiles != null && jobFiles.length == 1) ? jobFiles[0] : null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directories containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        if (jobs == null) {
            // Not a readable directory; nothing to load.
            return;
        }
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and put
     * into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName()
                + ", " + job.getAbsolutePath(), e);
            return;
        }

        // TODO: Move test into CrawlJob.
        // Check job status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(
                    CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // Was a running job.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore for now. TODO: Add to 'recycle bin'.
        } else {
            // Must have been completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in the conf dir for a profiles dir.
     * @return The directory where profiles are stored, or null if none is
     * available.
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/"
            + PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // e.g. "profiles" within a jar file;
                // try Heritrix.getConfdir() in this case.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null) ? null :
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each directory in the profiles directory should
                        // contain the file order.xml.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            if (loadProfile(profile)) {
                                loadedDefault = true;
                            }
                        }
                    }
                }
            }
        }
        // Now add in the default profile. It's on the CLASSPATH and needs
        // special handling. Don't add it if a default is already present.
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        // Look to see if a default profile system property has been
        // supplied. If so, use it instead.
        // TODO: Try and read default profile from some permanent storage.
        defaultProfile = DEFAULT_PROFILE;
    }

    /**
     * Load one profile.
     * @param profile Profile to load.
     * @return True if the loaded profile was the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        // Ok, got the order file for this profile.
        try {
            // The directory name denotes the profile's UID and name.
            XMLSettingsHandler newSettingsHandler =
                new XMLSettingsHandler(profile);
            CrawlJobErrorHandler cjseh =
                new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().equals(
                DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '"
                + profile.getParentFile().getName()
                + "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a new profile.
     * @param profile The new profile.
     */
    public synchronized void addProfile(CrawlJob profile) {
        profileJobs.add(profile);
    }

    /**
     * Delete a profile, removing its directory on disk as well as its entry
     * in the list of profiles.
     * @param cj The profile to delete.
     * @throws IOException If no directory exists for the named profile.
     */
    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName()
                + " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a List of all known profiles.
     * @return A List of all known profiles.
     */
    public synchronized List<CrawlJob> getProfiles() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the handler. The job will be scheduled for crawling. At
     * present it will not take the job's priority into consideration.
     *
     * @param job A new job for the handler.
     * @return The CrawlJob that was added, or null.
     */
    public CrawlJob addJob(CrawlJob job) {
        if (job.isProfile()) {
            return null; // Can't crawl profiles.
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if (job.isNew()) {
            // We're adding the new job to the pending queue.
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if (!isCrawling() && isRunning()) {
            // Start crawling.
            startNextJob();
        }
        return job;
    }

    /**
     * Returns the default profile. If no default profile has been set it will
     * return the first profile that was set/loaded and still exists. If no
     * profiles exist it will return null.
     * @return The default profile.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if (defaultProfile != null) {
            for (CrawlJob item: profileJobs) {
                if (item.getJobName().equals(defaultProfile)) {
                    // Found it.
                    return item;
                }
            }
        }
        if (profileJobs.size() > 0) {
            return profileJobs.first();
        }
        return null;
    }

    /**
     * Set the default profile.
     * @param profile The new default profile. It must satisfy the following:
     * <code>profile.isProfile()</code> should return true and
     * <code>this.getProfiles()</code> should contain it.
     */
    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
        // TODO: Make changes to default profile durable across restarts.
    }

    /**
     * A List of all pending jobs.
     *
     * @return A List of all pending jobs.
     * No promises are made about the order of the list.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp =
            new ArrayList<CrawlJob>(pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    /**
     * @return The job currently being crawled.
     */
    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * @return A List of all finished jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp =
            new ArrayList<CrawlJob>(completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Return a job with the given UID.
     * It doesn't matter whether the job is pending, currently running,
     * has finished running, is new, or is a profile.
     *
     * @param jobUID The unique ID of the job.
     * @return The job with the given UID, or null if no such job is found.
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null) {
            return null; // UID can't be null.
        }
        // First check the currently running job.
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            // Then check the 'new job'.
            return newJob;
        } else {
            // Then check pending jobs.
            for (CrawlJob cj: pendingCrawlJobs) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // Next check completed jobs.
            for (CrawlJob cj: completedCrawlJobs) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // And finally check the profiles.
            for (CrawlJob cj: getProfiles()) {
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null; // Nothing found, return null.
    }

    /**
     * @return True if we terminated a current job (false if there was no job
     * to terminate).
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // requestCrawlStop will cause crawlEnding to be invoked.
        // It will handle the clean up.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Take a few moments so that the controller can change
                // states before the UI updates. The CrawlEnding event
                // will wake us if it occurs sooner than this.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore.
            }
        }
        return true;
    }

    /**
     * The specified job will be removed from the pending queue or aborted if
     * currently running. It will be placed in the list of completed jobs with
     * appropriate status info. If the job is already in the completed list or
     * no job with the given UID is found, no action will be taken.
     *
     * @param jobUID The UID (unique ID) of the job that is to be deleted.
     */
    public void deleteJob(String jobUID) {
        // First check to see if we are deleting the current job.
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return; // We're not going to find another job with the same UID.
        }

        // Ok, it isn't the current job, let's check the pending jobs.
        for (Iterator<CrawlJob> it = pendingCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // We're not going to find another job with the same UID.
            }
        }

        // And finally the completed jobs.
        for (Iterator<CrawlJob> it = completedCrawlJobs.iterator();
                it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // No other job will have the same UID.
            }
        }
    }

    /**
     * Cause the current job to pause. If no current job is crawling this
     * method will have no effect.
     */
    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    /**
     * Cause the current job to resume crawling if it was paused. Will have no
     * effect if the current job was not paused or if there is no current job.
     * If the current job is still waiting to pause, this will not take effect
     * until the job has actually paused, at which time it will immediately
     * resume crawling.
     */
    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    /**
     * Cause the current job to write a checkpoint to disk. Currently
     * requires the job to already be paused.
     * @throws IllegalStateException Thrown if crawl is not paused.
     */
    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job ID.
     * <p>
     * No two calls to this method (on the same instance of this class) can
     * ever return the same value. <br>
     * Currently implemented to return a time stamp. That is subject to change
     * though.
     *
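     * <p>For example, a returned UID might look like
     * <code>20070410221256093</code>; the exact value is illustrative, but
     * the 17-digit timestamp form matches
     * {@link ArchiveUtils#get17DigitDate()}.
     *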
     * @return A unique job ID.
     *
     * @see ArchiveUtils#TIMESTAMP17
     */
    public String getNextJobUID() {
        return ArchiveUtils.get17DigitDate();
    }

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new job.
     * @param recovery Whether to preinitialize the new job as a recovery of
     *            the <code>baseOn</code> job. The string holds RECOVER_LOG if
     *            we are to do the recovery based off the recover.gz log --
     *            see RecoveryJournal in the frontier package -- or it holds
     *            the name of the checkpoint we're to use for recovery.
     * @param name
     *            The name of the new job.
     * @param description
     *            Description of the job.
     * @param seeds
     *            The contents of the new settings' seed file.
     * @param priority
     *            The priority of the new job.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
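     *
     * <p>A hedged usage sketch (the handler, old job, names, and seeds text
     * are hypothetical):
     * <pre>
     * CrawlJob recovered = handler.newJob(oldJob, CrawlJobHandler.RECOVER_LOG,
     *     "recovered-crawl", "Recovery of an earlier crawl", seedsText,
     *     CrawlJob.PRIORITY_AVERAGE);
     * </pre>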
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,
            String description, String seeds, int priority)
            throws FatalConfigurationException {
        // See what the recover story is.
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Then we're to do a recovery based off the RecoveryJournal
                // recover.gz log.
                File dir = baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                // Add name of recover file. We're hardcoding it as
                // 'recover.gz'.
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER);
            } else if (recovery != null && recovery.length() > 0) {
                // Must be the name of a checkpoint to use.
                recover = new File(baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                    recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up"
                + " new job/profile " + name + " \n" + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler().getOrderFile(),
            name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     * @param orderFile Order file to use as the template for the new job.
     * @param name The name of the new job.
     * @param description Description of the job.
     * @param seeds The contents of the new settings' seed file.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(final File orderFile, final String name,
            final String description, final String seeds)
            throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
            CrawlJob.PRIORITY_AVERAGE);
    }

    /**
     * Check that a directory exists and is readable. A null directory is
     * silently accepted.
     * @param dir Directory to check.
     * @throws FatalConfigurationException If the directory does not exist or
     *             is unreadable.
     */
    protected void checkDirectory(File dir)
            throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath()
                + " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile, final String name,
            final String description, final String seeds, final int priority)
            throws FatalConfigurationException {
        if (newJob != null) {
            // There already is a new job. Discard it.
            discardNewJob();
        }
        String UID = getNextJobUID();
        File jobDir = new File(this.jobsDir, name + "-" + UID);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler = createSettingsHandler(orderFile, name,
            description, seeds, jobDir, errorHandler, ORDER_FILE_NAME,
            "seeds.txt");
        this.newJob = new CrawlJob(UID, name, handler, errorHandler, priority,
            jobDir);
        return this.newJob;
    }

    /**
     * Creates a new profile. The new profile will be returned and also
     * registered in the handler's list of profiles. The new profile will be
     * based on the settings provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new profile.
     * @param name
     *            The name of the new profile.
     * @param description
     *            Description of the new profile.
     * @param seeds
     *            The contents of the new profile's seed file.
     * @return The new profile.
     * @throws FatalConfigurationException
     * @throws IOException
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name,
            String description, String seeds)
            throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile()
            + File.separator + name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name,
            createSettingsHandler(baseOn.getSettingsHandler().getOrderFile(),
                name, description, seeds, profileDir, cjseh, ORDER_FILE_NAME,
                "seeds.txt"),
            cjseh);
        addProfile(newProfile);
        return newProfile;
    }

    /**
     * Creates a new settings handler based on an existing job. Basically all
     * the settings files for the 'based on' job will be copied to the
     * specified directory.
     *
     * @param orderFile Order file to base the new order file on. Cannot be
     *            null.
     * @param name Name for the new settings.
     * @param description Description of the new settings.
     * @param seeds The contents of the new settings' seed file.
     * @param newSettingsDir Directory to write the new settings into.
     * @param errorHandler Error handler for the new settings, or null.
     * @param filename Name of the new order file.
     * @param seedfile Name of the new seeds file.
     *
     * @return The new settings handler.
     * @throws FatalConfigurationException
     *             If there are problems with reading the 'based on'
     *             configuration, or with writing the new configuration or its
     *             seed file.
     */
    protected XMLSettingsHandler createSettingsHandler(final File orderFile,
            final String name, final String description, final String seeds,
            final File newSettingsDir, final CrawlJobErrorHandler errorHandler,
            final String filename, final String seedfile)
            throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if (errorHandler != null) {
                newHandler.registerValueErrorHandler(errorHandler);
                newHandler.setErrorReportingLevel(errorHandler.getLevel());
            }
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while creating"
                + " new settings handler for new job/profile\n"
                + e2.getMessage());
        }

        // Make sure the directory exists.
        newSettingsDir.mkdirs();

        try {
            // Set the seed file.
            ((ComplexType) newHandler.getOrder().getAttribute("scope"))
                .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up"
                + " new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting"
                + " up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up new"
                + " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up"
                + " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);

        try {
            newHandler.copySettings(newFile, (String) newHandler.getOrder()
                .getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            // Print stack trace to help debug issue where cannot create
            // new job from an old one that has overrides.
            e3.printStackTrace();
            throw new FatalConfigurationException(
                "IOException occurred while writing new settings files"
                + " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while writing new"
                + " settings files for new job/profile\n" + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                "MBeanException occurred while writing new settings files"
                + " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while writing new settings"
                + " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);

        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(newHandler
                    .getPathRelativeToWorkingDirectory(seedfile)));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "IOException occurred while writing the seed file for new"
                    + " job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }

    /**
     * Set up the new job to run in recovery mode.
     * @param recover
     *            Source to use for recovery. Can be a full path to a recovery
     *            log or a full path to a checkpoint src dir.
     * @param sh
     *            Settings handler to update.
     * @param jobName
     *            Name of this job.
     * @throws FatalConfigurationException
     */
    protected void updateRecoveryPaths(final File recover,
            final SettingsHandler sh, final String jobName)
            throws FatalConfigurationException {
        if (recover == null) {
            return;
        }
        checkDirectory(recover);
        try {
            // Set 'recover-path' to be the old job's recovery log path.
            updateRecoveryPaths(recover, sh);
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up"
                + " new job/profile " + jobName + " \n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting up"
                + " new job/profile " + jobName + " \n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up"
                + " new job/profile " + jobName + " \n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up"
                + " new job/profile " + jobName + " \n" + e1.getMessage());
        } catch (IOException e) {
            throw new FatalConfigurationException(
                "IOException occurred while setting up"
                + " new job/profile " + jobName + " \n" + e.getMessage());
        }
    }

    /**
     * Points the new job's 'recover-path' at the given recovery source and
     * ensures its 'logs' and 'state' directories do not overlap with those of
     * the job being recovered.
     * @param recover
     *            Source to use for recovery. Can be a full path to a recovery
     *            log or a full path to a checkpoint src dir.
     * @param newHandler Settings handler of the new job.
     * @throws ReflectionException
     * @throws MBeanException
     * @throws InvalidAttributeValueException
     * @throws AttributeNotFoundException
     * @throws IOException
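     *
     * <p>For example (the paths are illustrative): if the configured logs
     * path is <code>logs</code> and that directory is nonempty, the setting
     * is rewritten to <code>logs-R</code>, then <code>logs-R-R</code>, and
     * so on, until it resolves to an empty (or new) directory.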
     */
    private void updateRecoveryPaths(final File recover,
            SettingsHandler newHandler)
            throws AttributeNotFoundException, InvalidAttributeValueException,
            MBeanException, ReflectionException, IOException {
        if (recover == null || !recover.exists()) {
            throw new IOException("Recovery src does not exist: " + recover);
        }
        newHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                recover.getAbsolutePath()));

        // Now, ensure that 'logs' and 'state' don't overlap with the
        // previous job's files (ok for 'arcs' and 'scratch' to overlap).
        File newLogsDisk = null;
        final String RECOVERY_SUFFIX = "-R";
        while (true) {
            try {
                newLogsDisk = newHandler.getOrder().getSettingsDir(
                    CrawlOrder.ATTR_LOGS_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get logs directory", e);
                break; // Cannot resolve the directory; give up renaming.
            }
            if (newLogsDisk.list().length > 0) {
                // 'new' directory is nonempty; rename with trailing '-R'.
                String logsPath = (String) newHandler.getOrder()
                    .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                if (logsPath.endsWith("/")) {
                    logsPath = logsPath.substring(0, logsPath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                        logsPath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit loop.
                break;
            }
        }
        File newStateDisk = null;
        while (true) {
            try {
                newStateDisk = newHandler.getOrder().getSettingsDir(
                    CrawlOrder.ATTR_STATE_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get state directory", e);
                break; // Cannot resolve the directory; give up renaming.
            }
            if (newStateDisk.list().length > 0) {
                // 'new' directory is nonempty; rename with trailing '-R'.
                String statePath = (String) newHandler.getOrder()
                    .getAttribute(CrawlOrder.ATTR_STATE_PATH);
                if (statePath.endsWith("/")) {
                    statePath = statePath.substring(0,
                        statePath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_STATE_PATH,
                        statePath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit loop.
                break;
            }
        }
    }

    /**
     * Discard the handler's 'new job'. This will remove any files/directories
     * written to disk.
     */
    public void discardNewJob() {
        if (newJob == null) {
            return;
        }
        FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
    }

    /**
     * Get the handler's 'new job'.
     * @return The handler's 'new job'.
     */
    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     * @return True if the next available CrawlJob will be crawled. False
     * otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     * @return True if a job is actually being crawled (even if it is paused).
     * False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
            // Ok, can just start the next job.
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }

    /**
     * Start the next crawl job.
     *
     * If a job is already running this method will do nothing.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs. Also, crawlEnded can start up the next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // Now, actually start.
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }

    /**
     * Forward a 'kick' update to the current job, if any.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration;
     * for example Processors, Frontiers, Filters, etc. Leading and trailing
     * spaces are trimmed from each line.
     *
     * <p>Options are loaded from the CLASSPATH.
     * @param file The name of the option file (without path!).
     * @return The option file with each option line as a separate entry in
     * the ArrayList.
     * @throws IOException when there is trouble reading the file.
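     *
     * <p>A hedged usage sketch (the option file name is hypothetical):
     * <pre>
     * ArrayList&lt;String&gt; options =
     *     CrawlJobHandler.loadOptions("Processor.options");
     * </pre>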
     */
    public static ArrayList<String> loadOptions(String file)
            throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration<URL> resources = CrawlJob.class.getClassLoader()
            .getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = resources.nextElement().openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // Looks like a valid line.
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file
                + " from the CLASSPATH");
        }

        return ret;
    }

    /**
     * Returns a URIFrontierMarker for the current, paused, job. If there is
     * no current job or it is not paused, null will be returned.
     *
     * @param regexpr
     *            A regular expression that each URI must match in order to be
     *            considered 'within' the marker.
     * @param inCacheOnly
     *            Limit marker scope to 'cached' URIs.
     * @return A URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     *      boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null) ?
            this.currentJob.getInitialMarker(regexpr, inCacheOnly) : null;
    }

    /**
     * Returns the frontier's URI list based on the provided marker. This
     * method will return null if there is no current job or if the current
     * job is not paused. Only when there is a paused current job will this
     * method return a URI list.
     *
     * @param marker
     *            URIFrontier marker.
     * @param numberOfMatches
     *            Maximum number of matches to return.
     * @param verbose
     *            Should detailed info be provided on each URI?
     * @return The frontier's URI list based on the provided marker.
     * @throws InvalidFrontierMarkerException
     *             When marker is inconsistent with the current state of the
     *             frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
            throws InvalidFrontierMarkerException {
        return (this.currentJob != null) ?
            this.currentJob.getPendingURIsList(marker, numberOfMatches,
                verbose) : null;
    }

    /**
     * Delete any URIs from the frontier of the current (paused) job that
     * match the specified regular expression. If the current job is not
     * paused (or there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return The number of URIs deleted.
     */
    public long deleteURIsFromPending(String regexpr) {
        return (this.currentJob != null) ?
            this.currentJob.deleteURIsFromPending(regexpr) : 0;
    }

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style) or recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
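     *
     * <p>A hedged usage sketch (the handler and file name are hypothetical):
     * <pre>
     * String report =
     *     handler.importUris("recover.gz", "recoveryJournal", false);
     * </pre>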
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null) ?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit) : null;
    }

    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null) ?
            this.currentJob.importUris(is, style, forceRevisit) : 0;
    }

    /**
     * Schedule a URI.
     * @param uri URI to schedule.
     * @param forceFetch Should it be force-fetched?
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed) throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a URI.
     * @param str String that can be: 1. a UURI, 2. a snippet of a
     * crawl.log line, or 3. a snippet from a recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be force-fetched?
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
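     *
     * <p>A hedged example (the handler and URI are illustrative):
     * <pre>
     * handler.importUri("http://www.example.com/", false, true, true);
     * </pre>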
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
            throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }

    /**
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued
     * URIs.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure the order file with the new name/description is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     * @param newJob Newly created job.
     * @param metaname Metaname for the new job.
     * @param description Description for the new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob,
            String metaname, String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlEnding(String sExitMessage) {
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            // If the GUI terminated the job then it is waiting for this
            // event.
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // TODO Auto-generated method stub
    }
}
|