Source Code Cross Referenced for CrawlJobHandler.java » Web Crawler » heritrix » org.archive.crawler.admin



/* CrawlJobHandler
 *
 * $Id: CrawlJobHandler.java 5055 2007-04-10 22:12:56Z gojomo $
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URI;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.frontier.RecoveryJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 *  <li> <code>Pending</code> - Jobs that are ready to run and are waiting their
 *                              turn. These can be edited, viewed, deleted etc.
 *  <li> <code>Running</code> - Only one job can be running at a time. There may
 *                              be no job running. The running job can be viewed
 *                              and edited to some extent. It can also be
 *                              terminated. This job should have a
 *                              StatisticsTracking module attached to it for more
 *                              details on the crawl.
 *  <li> <code>Completed</code> - Jobs that have finished crawling or have been
 *                              deleted from the pending queue or terminated
 *                              while running. They cannot be edited but can be
 *                              viewed. They retain the StatisticsTracking
 *                              module from their run.
 *  <li> <code>New job</code> - At any given time there can be one 'new job'.
 *                              The new job is not considered ready to run. It
 *                              can be edited or discarded (in which case it
 *                              will be totally destroyed, including any files
 *                              on disk). Once an operator deems the job ready
 *                              to run it can be moved to the pending queue.
 *  <li> <code>Profiles</code> - Jobs under profiles are not actual jobs. They
 *                              can be edited normally but cannot be submitted
 *                              to the pending queue. New jobs can be created
 *                              using a profile as their template.
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */
public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger = Logger
            .getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of the system property whose specification overrides the default
     * profile used.
     */
    public static final String DEFAULT_PROFILE_NAME = "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread to start the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    // The UIDs of profiles should NOT be timestamps. A descriptive name is
    // ideal.
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir, final boolean loadJobs,
            final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Make a comparator for CrawlJobs.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>() {
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority, use UID (which should be a timestamp).
                    // Lower UID (string compare) means earlier time.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        // Profiles always have the same priority so they will be sorted by name.
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles) {
            loadProfiles();
        }
        if (loadJobs) {
            loadJobs();
        }
    }
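    /*
     * Illustrative sketch (not part of the original source): how the
     * comparator above orders the pending queue. The CrawlJob construction
     * is elided; the priorities and UIDs shown are hypothetical.
     *
     *   TreeSet<CrawlJob> queue = new TreeSet<CrawlJob>(comp);
     *   queue.add(jobA);  // priority 1, UID "20070410120000000"
     *   queue.add(jobB);  // priority 1, UID "20070410130000000"
     *   queue.add(jobC);  // priority 0
     *   // Iteration order: jobC, jobA, jobB -- lower priority value first;
     *   // among equal priorities, the earlier (string-smaller) UID wins.
     */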
    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Need to find job file ('state.job').
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job")
                        && (new File(dir, name)).canRead();
            }
        });
        return (jobFiles.length == 1) ? jobFiles[0] : null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directories containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or the pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and be
     * put into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO, "Invalid job file for "
                    + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName()
                    + ", " + job.getAbsolutePath(), e);
            return;
        }

        // TODO: Move test into CrawlJob.
        // Check job status and place it accordingly.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // Was a running job.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore for now. TODO: Add to 'recycle bin'.
        } else {
            // Must have been completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in the conf dir for a profiles dir.
     * @return the directory where profiles are stored, or null if none is
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/"
                + PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // e.g. "profiles" within a jar file;
                // try Heritrix.getConfdir() in this case.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null) ? null : new File(
                Heritrix.getConfdir().getAbsolutePath(), PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each directory in the profiles directory should
                        // contain the file order.xml.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            if (loadProfile(profile)) {
                                loadedDefault = true;
                            }
                        }
                    }
                }
            }
        }
        // Now add in the default profile. It's on the CLASSPATH and needs
        // special handling. Don't add if a default is already present.
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        // Look to see if a default profile system property has been
        // supplied. If so, use it instead.
        // TODO: Try and read default profile from some permanent storage.
        defaultProfile = DEFAULT_PROFILE;
    }
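    /*
     * Illustrative sketch (not part of the original source): the on-disk
     * layout loadProfiles() walks. Each subdirectory of the profiles
     * directory is one profile, identified by the directory name, and must
     * contain a readable order.xml. The profile names here are hypothetical.
     *
     *   profiles/
     *     default/
     *       order.xml
     *     wide-crawl/
     *       order.xml
     */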
    /**
     * Load one profile.
     * @param profile Profile to load.
     * @return True if the loaded profile was the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        // Ok, got the order file for this profile.
        try {
            // The directory name denotes the profile's UID and name.
            XMLSettingsHandler newSettingsHandler = new XMLSettingsHandler(
                    profile);
            CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                    newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().equals(
                    DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '"
                    + profile.getParentFile().getName()
                    + "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a new profile.
     * @param profile The new profile.
     */
    public synchronized void addProfile(CrawlJob profile) {
        profileJobs.add(profile);
    }

    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName()
                    + " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a List of all known profiles.
     * @return a List of all known profiles.
     */
    public synchronized List<CrawlJob> getProfiles() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the handler. The job will be scheduled for crawling. At
     * present it will not take the job's priority into consideration.
     *
     * @param job A new job for the handler.
     * @return The CrawlJob that was added, or null.
     */
    public CrawlJob addJob(CrawlJob job) {
        if (job.isProfile()) {
            return null; // Can't crawl profiles.
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if (job.isNew()) {
            // Adding the new job to the pending queue.
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if (!isCrawling() && isRunning()) {
            // Start crawling.
            startNextJob();
        }
        return job;
    }
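    /*
     * Illustrative sketch (not part of the original source): a typical
     * submit flow against this handler, using the newJob() method and
     * constants defined further down in this file. The job name, description
     * and seed list are hypothetical.
     *
     *   CrawlJobHandler handler = new CrawlJobHandler(new File("jobs"));
     *   handler.startCrawler();  // accept jobs; see startCrawler() below
     *   CrawlJob job = handler.newJob(handler.getDefaultProfile(),
     *           null,            // no recovery
     *           "myCrawl", "a test crawl", "http://example.com/\n",
     *           CrawlJob.PRIORITY_AVERAGE);
     *   handler.addJob(job);     // queued; starts at once if nothing is crawling
     */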
    /**
     * Returns the default profile. If no default profile has been set, it will
     * return the first profile that was set/loaded and still exists. If no
     * profiles exist it will return null.
     * @return the default profile.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if (defaultProfile != null) {
            for (Iterator it = profileJobs.iterator(); it.hasNext();) {
                CrawlJob item = (CrawlJob) it.next();
                if (item.getJobName().equals(defaultProfile)) {
                    // Found it.
                    return item;
                }
            }
        }
        if (profileJobs.size() > 0) {
            return (CrawlJob) profileJobs.first();
        }
        return null;
    }

    /**
     * Set the default profile.
     * @param profile The new default profile. The following must apply to it:
     *                profile.isProfile() should return true and
     *                this.getProfiles() should contain it.
     */
    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
        // TODO: Make changes to default profile durable across restarts.
    }

    /**
     * A List of all pending jobs.
     *
     * @return A List of all pending jobs.
     * No promises are made about the order of the list.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(
                pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    /**
     * @return The job currently being crawled.
     */
    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * @return A List of all finished jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(
                completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Return a job with the given UID.
     * It doesn't matter if the job is pending, currently running, has
     * finished running, is new, or is a profile.
     *
     * @param jobUID The unique ID of the job.
     * @return The job with the UID, or null if no such job is found.
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null) {
            return null; // UID can't be null.
        }
        // First check the currently running job.
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            // Then check the 'new job'.
            return newJob;
        } else {
            // Then check pending jobs.
            Iterator itPend = pendingCrawlJobs.iterator();
            while (itPend.hasNext()) {
                CrawlJob cj = (CrawlJob) itPend.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // Next check completed jobs.
            Iterator itComp = completedCrawlJobs.iterator();
            while (itComp.hasNext()) {
                CrawlJob cj = (CrawlJob) itComp.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // And finally check the profiles.
            for (Iterator i = getProfiles().iterator(); i.hasNext();) {
                CrawlJob cj = (CrawlJob) i.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null; // Nothing found, return null.
    }

    /**
     * @return True if we terminated a current job (false if there was no job
     * to terminate).
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // requestCrawlStop will cause crawlEnding to be invoked.
        // It will handle the clean up.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Take a few moments so that the controller can change
                // states before the UI updates. The CrawlEnding event
                // will wake us if it occurs sooner than this.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore.
            }
        }
        return true;
    }

    /**
     * The specified job will be removed from the pending queue or aborted if
     * currently running. It will be placed in the list of completed jobs with
     * appropriate status info. If the job is already in the completed list or
     * no job with the given UID is found, no action will be taken.
     *
     * @param jobUID The UID (unique ID) of the job that is to be deleted.
     */
    public void deleteJob(String jobUID) {
        // First check to see if we are deleting the current job.
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return; // We're not going to find another job with the same UID.
        }

        // Ok, it isn't the current job; let's check the pending jobs.
        for (Iterator it = pendingCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = (CrawlJob) it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // We're not going to find another job with the same UID.
            }
        }

        // And finally the completed jobs.
        for (Iterator it = completedCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = (CrawlJob) it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return; // No other job will have the same UID.
            }
        }
    }

    /**
     * Cause the current job to pause. If no current job is crawling, this
     * method will have no effect.
     */
    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    /**
     * Cause the current job to resume crawling if it was paused. Will have no
     * effect if the current job was not paused or if there is no current job.
     * If the current job is still waiting to pause, this will not take effect
     * until the job has actually paused, at which time it will immediately
     * resume crawling.
     */
    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    /**
     * Cause the current job to write a checkpoint to disk. Currently
     * requires the job to already be paused.
     * @throws IllegalStateException Thrown if the crawl is not paused.
     */
    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job ID.
     * <p>
     * No two calls to this method (on the same instance of this class) can
     * ever return the same value.<br>
     * Currently implemented to return a time stamp. That is subject to change
     * though.
     *
     * @return A unique job ID.
     *
     * @see ArchiveUtils#TIMESTAMP17
     */
    public String getNextJobUID() {
        return ArchiveUtils.get17DigitDate();
    }
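    /*
     * Illustrative note (not part of the original source): per the
     * ArchiveUtils#TIMESTAMP17 reference above, the UID is a 17-digit
     * timestamp of the form yyyyMMddHHmmssSSS, e.g.
     *
     *   String uid = getNextJobUID();  // e.g. "20070410221256000" (made-up value)
     *
     * Because string order matches chronological order for this format, the
     * job comparator's UID tie-break sorts equal-priority jobs oldest first.
     */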
    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new job.
     * @param recovery Whether to preinitialize the new job as a recovery of
     * the <code>baseOn</code> job. The String holds RECOVER_LOG if we are to
     * do the recovery based off the recover.gz log -- see RecoveryJournal in
     * the frontier package -- or it holds the name of
     * the checkpoint we're to use when recovering.
     * @param name
     *            The name of the new job.
     * @param description
     *            Description of the job.
     * @param seeds
     *            The contents of the new settings' seed file.
     * @param priority
     *            The priority of the new job.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery,
            String name, String description, String seeds, int priority)
            throws FatalConfigurationException {
        // See what the recover story is.
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Then we're to do a recovery based off the RecoveryJournal
                // recover.gz log.
                File dir = baseOn.getSettingsHandler().getOrder()
                        .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                // Add name of recover file.  We're hardcoding it as
                // 'recover.gz'.
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER);
            } else if (recovery != null && recovery.length() > 0) {
                // Must be the name of a checkpoint to use.
                recover = new File(baseOn.getSettingsHandler().getOrder()
                        .getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                        recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while setting up"
                            + " new job/profile " + name + "\n"
                            + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler()
                .getOrderFile(), name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     * @param orderFile Order file to use as the template for the new job.
     * @param name The name of the new job.
     * @param description Description of the job.
     * @param seeds The contents of the new settings' seed file.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(final File orderFile, final String name,
            final String description, final String seeds)
            throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
                CrawlJob.PRIORITY_AVERAGE);
    }

    protected void checkDirectory(File dir)
            throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        // Reject the directory if it is missing or unreadable.
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath()
                    + " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile,
            final String name, final String description,
            final String seeds, final int priority)
            throws FatalConfigurationException {
        if (newJob != null) {
            // There already is a new job. Discard it.
            discardNewJob();
        }
        String UID = getNextJobUID();
        File jobDir = new File(this.jobsDir, name + "-" + UID);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler = createSettingsHandler(orderFile,
                name, description, seeds, jobDir, errorHandler,
                "order.xml", "seeds.txt");
        this.newJob = new CrawlJob(UID, name, handler, errorHandler,
                priority, jobDir);
        return this.newJob;
    }

    /**
     * Creates a new profile. The new profile will be returned and also
     * registered as the handler's 'new job'. The new profile will be based on
     * the settings provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new profile.
     * @param name
     *            The name of the new profile.
     * @param description
     *            Description of the new profile.
     * @param seeds
     *            The contents of the new profile's seed file.
     * @return The new profile.
     * @throws FatalConfigurationException
     * @throws IOException
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name,
            String description, String seeds)
            throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile(),
                name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name, createSettingsHandler(
                baseOn.getSettingsHandler().getOrderFile(), name,
                description, seeds, profileDir, cjseh, "order.xml",
                "seeds.txt"), cjseh);
        addProfile(newProfile);
        return newProfile;
    }

    /**
     * Creates a new settings handler based on an existing job. Basically all
     * the settings files for the 'based on' job will be copied to the
     * specified directory.
     *
     * @param orderFile Order file to base the new order file on. Cannot be
     *            null.
     * @param name Name for the new settings.
     * @param description Description of the new settings.
     * @param seeds The contents of the new settings' seed file.
     * @param newSettingsDir Directory the new settings will be written into.
     * @param errorHandler Error handler for the new settings.
     * @param filename Name of the new order file.
     * @param seedfile Name of the new seeds file.
     *
     * @return The new settings handler.
     * @throws FatalConfigurationException
     *             If there are problems with reading the 'based on'
     *             configuration, or with writing the new configuration or its
     *             seed file.
     */
    protected XMLSettingsHandler createSettingsHandler(
            final File orderFile, final String name,
            final String description, final String seeds,
            final File newSettingsDir,
            final CrawlJobErrorHandler errorHandler,
            final String filename, final String seedfile)
            throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if (errorHandler != null) {
                newHandler.registerValueErrorHandler(errorHandler);
                newHandler.setErrorReportingLevel(errorHandler.getLevel());
            }
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                    "InvalidAttributeValueException occurred while creating"
                            + " new settings handler for new job/profile\n"
                            + e2.getMessage());
        }

        // Make sure the directory exists.
        newSettingsDir.mkdirs();

        try {
            // Set the seed file.
            ((ComplexType) newHandler.getOrder().getAttribute("scope"))
                    .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while setting up"
                            + " new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                    "InvalidAttributeValueException occurred while setting"
                            + " up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                    "MBeanException occurred while setting up new"
                            + " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                    "ReflectionException occurred while setting up"
                            + " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);

        try {
            newHandler.copySettings(newFile, (String) newHandler
                    .getOrder().getAttribute(
                            CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            // Print stack trace to help debug issue where cannot create
            // new job from an old one that has overrides.
            e3.printStackTrace();
            throw new FatalConfigurationException(
                    "IOException occurred while writing new settings files"
                            + " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                    "AttributeNotFoundException occurred while writing new"
                            + " settings files for new job/profile\n"
                            + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                    "MBeanException occurred while writing new settings files"
                            + " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                    "ReflectionException occurred while writing new settings"
                            + " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);

        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(newHandler
                        .getPathRelativeToWorkingDirectory(seedfile)));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                        "IOException occurred while writing the seed file for"
                                + " new job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }
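    /*
     * Illustrative sketch (not part of the original source): what
     * createSettingsHandler() and createNewJob() leave on disk for a new
     * job, given the default filenames passed in above. The job name and
     * UID are hypothetical, and the override directory's actual name comes
     * from CrawlOrder.ATTR_SETTINGS_DIRECTORY ("settings" is assumed here).
     *
     *   jobs/
     *     myCrawl-20070410221256000/
     *       order.xml    <- copied and adapted from the template order file
     *       seeds.txt    <- written from the 'seeds' String, if non-null
     *       settings/    <- override sheets copied by copySettings(), if any
     */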
0949:
0950:            /**
0951:             * @param recover
0952:             *            Source to use recovering. Can be full path to a recovery log
0953:             *            or full path to a checkpoint src dir.
0954:             * @param sh
0955:             *            Settings Handler to update.
0956:             * @param jobName
0957:             *            Name of this job.
0958:             * @throws FatalConfigurationException 
0959:             */
0960:            protected void updateRecoveryPaths(final File recover,
0961:                    final SettingsHandler sh, final String jobName)
0962:                    throws FatalConfigurationException {
0963:                if (recover == null) {
0964:                    return;
0965:                }
0966:                checkDirectory(recover);
0967:                try {
0968:                    // Set 'recover-path' to be old job's recovery log path
0969:                    updateRecoveryPaths(recover, sh);
0970:                } catch (AttributeNotFoundException e1) {
0971:                    throw new FatalConfigurationException(
0972:                            "AttributeNotFoundException occured while setting up"
0973:                                    + "new job/profile " + jobName + " \n"
0974:                                    + e1.getMessage());
0975:                } catch (InvalidAttributeValueException e1) {
0976:                    throw new FatalConfigurationException(
0977:                            "InvalidAttributeValueException occured while setting"
0978:                                    + "new job/profile " + jobName + " \n"
0979:                                    + e1.getMessage());
0980:                } catch (MBeanException e1) {
0981:                    throw new FatalConfigurationException(
0982:                            "MBeanException occured while setting up new"
0983:                                    + "new job/profile " + jobName + " \n"
0984:                                    + e1.getMessage());
0985:                } catch (ReflectionException e1) {
0986:                    throw new FatalConfigurationException(
0987:                            "ReflectionException occured while setting up"
0988:                                    + "new job/profile " + jobName + " \n"
0989:                                    + e1.getMessage());
0990:                } catch (IOException e) {
0991:                    throw new FatalConfigurationException(
0992:                            "IOException occured while setting up"
0993:                                    + "new job/profile " + jobName + " \n"
0994:                                    + e.getMessage());
0995:                }
0996:            }
0997:
0998:            /**
0999:             * @param recover
1000:             *            Source to use recovering. Can be full path to a recovery log
1001:             *            or full path to a checkpoint src dir.
1002:             * @param newHandler
1003:             * @throws ReflectionException
1004:             * @throws MBeanException
1005:             * @throws InvalidAttributeValueException
1006:             * @throws AttributeNotFoundException
1007:             * @throws IOException
1008:             */
            private void updateRecoveryPaths(final File recover,
                    SettingsHandler newHandler)
                    throws AttributeNotFoundException,
                    InvalidAttributeValueException, MBeanException,
                    ReflectionException, IOException {
                if (recover == null || !recover.exists()) {
                    throw new IOException("Recovery source does not exist: "
                            + recover);
                }
                newHandler.getOrder().setAttribute(
                        new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                                recover.getAbsolutePath()));

                // Now, ensure that 'logs' and 'state' don't overlap with the
                // previous job's files (ok for 'arcs' and 'scratch' to overlap).
                File newLogsDisk = null;
                final String RECOVERY_SUFFIX = "-R";
                while (true) {
                    try {
                        newLogsDisk = newHandler.getOrder().getSettingsDir(
                                CrawlOrder.ATTR_LOGS_PATH);
                    } catch (AttributeNotFoundException e) {
                        logger.log(Level.SEVERE,
                                "Failed to get logs directory", e);
                        // Cannot safely continue without a logs directory.
                        return;
                    }
                    if (newLogsDisk.list().length > 0) {
                        // 'new' directory is nonempty; rename with trailing '-R'.
                        String logsPath = (String) newHandler.getOrder()
                                .getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                        if (logsPath.endsWith("/")) {
                            logsPath = logsPath.substring(0,
                                    logsPath.length() - 1);
                        }
                        newHandler.getOrder().setAttribute(
                                new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                                        logsPath + RECOVERY_SUFFIX));
                    } else {
                        // Directory is suitably empty; exit loop.
                        break;
                    }
                }
                File newStateDisk = null;
                while (true) {
                    try {
                        newStateDisk = newHandler.getOrder().getSettingsDir(
                                CrawlOrder.ATTR_STATE_PATH);
                    } catch (AttributeNotFoundException e) {
                        logger.log(Level.SEVERE,
                                "Failed to get state directory", e);
                        // Cannot safely continue without a state directory.
                        return;
                    }
                    if (newStateDisk.list().length > 0) {
                        // 'new' directory is nonempty; rename with trailing '-R'.
                        String statePath = (String) newHandler.getOrder()
                                .getAttribute(CrawlOrder.ATTR_STATE_PATH);
                        if (statePath.endsWith("/")) {
                            statePath = statePath.substring(0,
                                    statePath.length() - 1);
                        }
                        newHandler.getOrder().setAttribute(
                                new Attribute(CrawlOrder.ATTR_STATE_PATH,
                                        statePath + RECOVERY_SUFFIX));
                    } else {
                        // Directory is suitably empty; exit loop.
                        break;
                    }
                }
            }
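
            /*
             * A simplified sketch of the renaming scheme above (illustrative
             * only, not Heritrix API): a nonempty directory name gains one
             * "-R" suffix per pass until an unused or empty directory is
             * found.
             *
             *   String path = "logs";
             *   while (new File(path).exists()
             *           && new File(path).list().length > 0) {
             *       path = path + "-R"; // "logs" -> "logs-R" -> "logs-R-R" ...
             *   }
             */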

            /**
             * Discard the handler's 'new job'. This will remove any
             * files/directories written to disk.
             */
            public void discardNewJob() {
                FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
            }

            /**
             * Get the handler's 'new job'.
             * @return the handler's 'new job'
             */
            public CrawlJob getNewJob() {
                return newJob;
            }

            /**
             * Is the crawler accepting crawl jobs to run?
             * @return True if the next available CrawlJob will be crawled. False otherwise.
             */
            public boolean isRunning() {
                return running;
            }

            /**
             * Is a crawl job being crawled?
             * @return True if a job is actually being crawled (even if it is paused).
             *         False if no job is being crawled.
             */
            public boolean isCrawling() {
                return this.currentJob != null;
            }

            /**
             * Allow jobs to be crawled.
             */
            public void startCrawler() {
                running = true;
                if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
                    // Ok, can just start the next job.
                    startNextJob();
                }
            }

            /**
             * Stop future jobs from being crawled.
             *
             * This action will not affect the current job.
             */
            public void stopCrawler() {
                running = false;
            }
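
            /*
             * A minimal usage sketch (illustrative only; 'handler' is an
             * assumed, already-configured CrawlJobHandler): jobs are queued
             * elsewhere, and these two calls toggle whether pending jobs get
             * crawled.
             *
             *   handler.startCrawler(); // starts the next pending job, if any
             *   // ... jobs run one at a time ...
             *   handler.stopCrawler();  // current job finishes; no new jobs start
             */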

            /**
             * Start the next crawl job.
             *
             * If a job is already running, this method will do nothing.
             */
            protected final void startNextJob() {
                synchronized (this) {
                    if (startingNextJob != null) {
                        try {
                            startingNextJob.join();
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                            return;
                        }
                    }
                    startingNextJob = new Thread(new Runnable() {
                        public void run() {
                            startNextJobInternal();
                        }
                    }, "StartNextJob");
                    startingNextJob.start();
                }
            }

            protected void startNextJobInternal() {
                if (pendingCrawlJobs.size() == 0 || isCrawling()) {
                    // No job ready or already crawling.
                    return;
                }
                this.currentJob = (CrawlJob) pendingCrawlJobs.first();
                assert pendingCrawlJobs.contains(currentJob) : "pendingCrawlJobs is in an illegal state";
                pendingCrawlJobs.remove(currentJob);
                try {
                    this.currentJob.setupForCrawlStart();
                    // This is ugly, but needed so the currentJob reference can
                    // be cleared in crawlEnding and the list of completed jobs
                    // updated. Also, crawlEnded can start up the next job.
                    this.currentJob.getController()
                            .addCrawlStatusListener(this);
                    // Now, actually start.
                    this.currentJob.getController().requestCrawlStart();
                } catch (InitializationException e) {
                    loadJob(getStateJobFile(this.currentJob.getDirectory()));
                    this.currentJob = null;
                    startNextJobInternal(); // Load the next job if there is one.
                }
            }

            /**
             * Forward a 'kick' update to the current job, if any.
             */
            public void kickUpdate() {
                if (this.currentJob != null) {
                    this.currentJob.kickUpdate();
                }
            }

            /**
             * Loads options from a file. Typically these are a list of available
             * modules that can be plugged into some part of the configuration.
             * For example: Processors, Frontiers, Filters, etc. Leading and
             * trailing spaces are trimmed from each line.
             *
             * <p>Options are loaded from the CLASSPATH.
             * @param file the name of the option file (without path!)
             * @return The option file with each option line as a separate entry
             *         in the ArrayList.
             * @throws IOException when there is trouble reading the file.
             */
            public static ArrayList<String> loadOptions(String file)
                    throws IOException {
                ArrayList<String> ret = new ArrayList<String>();
                Enumeration<URL> resources = CrawlJob.class.getClassLoader()
                        .getResources("modules/" + file);

                boolean noFileFound = true;
                while (resources.hasMoreElements()) {
                    InputStream is = resources.nextElement().openStream();
                    noFileFound = false;

                    String line = null;
                    BufferedReader bf = new BufferedReader(
                            new InputStreamReader(is), 8192);
                    try {
                        while ((line = bf.readLine()) != null) {
                            line = line.trim();
                            if (line.indexOf('#') < 0 && line.length() > 0) {
                                // Looks like a valid option line.
                                ret.add(line);
                            }
                        }
                    } finally {
                        bf.close();
                    }
                }

                if (noFileFound) {
                    throw new IOException("Failed to get " + file
                            + " from the CLASSPATH");
                }

                return ret;
            }
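
            /*
             * Usage sketch for loadOptions ("Processor.options" is a
             * hypothetical name; any "modules/<name>" resource on the
             * CLASSPATH works): each non-comment, non-blank line becomes one
             * list entry.
             *
             *   ArrayList<String> options =
             *           CrawlJobHandler.loadOptions("Processor.options");
             *   for (String option : options) {
             *       System.out.println(option);
             *   }
             */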

            /**
             * Returns a URIFrontierMarker for the current, paused, job. If
             * there is no current job, or it is not paused, null will be
             * returned.
             *
             * @param regexpr
             *            A regular expression that each URI must match in order to be
             *            considered 'within' the marker.
             * @param inCacheOnly
             *            Limit marker scope to 'cached' URIs.
             * @return a URIFrontierMarker for the current job.
             * @see #getPendingURIsList(FrontierMarker, int, boolean)
             * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
             *      boolean)
             * @see org.archive.crawler.framework.FrontierMarker
             */
            public FrontierMarker getInitialMarker(String regexpr,
                    boolean inCacheOnly) {
                return (this.currentJob != null) ? this.currentJob
                        .getInitialMarker(regexpr, inCacheOnly) : null;
            }

            /**
             * Returns the frontier's URI list based on the provided marker. This
             * method will return null if there is no current job or if the
             * current job is not paused. Only when there is a paused current job
             * will this method return a URI list.
             *
             * @param marker
             *            URIFrontier marker
             * @param numberOfMatches
             *            maximum number of matches to return
             * @param verbose
             *            should detailed info be provided on each URI?
             * @return the frontier's URI list based on the provided marker
             * @throws InvalidFrontierMarkerException
             *             When marker is inconsistent with the current state of the
             *             frontier.
             * @see #getInitialMarker(String, boolean)
             * @see org.archive.crawler.framework.FrontierMarker
             */
            public ArrayList getPendingURIsList(FrontierMarker marker,
                    int numberOfMatches, boolean verbose)
                    throws InvalidFrontierMarkerException {
                return (this.currentJob != null) ? this.currentJob
                        .getPendingURIsList(marker, numberOfMatches, verbose)
                        : null;
            }
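
            /*
             * A sketch of marker-based paging (assumes a current, paused job;
             * 'handler', the ".*" expression, and the batch size of 100 are
             * illustrative): obtain a marker over all pending URIs, then
             * fetch up to 100 matches.
             *
             *   FrontierMarker marker = handler.getInitialMarker(".*", false);
             *   if (marker != null) {
             *       ArrayList batch =
             *               handler.getPendingURIsList(marker, 100, false);
             *   }
             */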

            /**
             * Delete any URIs from the frontier of the current (paused) job that
             * match the specified regular expression. If the current job is not
             * paused (or there is no current job) nothing will be done.
             * @param regexpr Regular expression to delete URIs by.
             * @return the number of URIs deleted
             */
            public long deleteURIsFromPending(String regexpr) {
                return (this.currentJob != null) ? this.currentJob
                        .deleteURIsFromPending(regexpr) : 0;
            }
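
            /*
             * A sketch of regex-based deletion (the expression is a
             * hypothetical example; assumes a current, paused job): drop all
             * queued URIs under one host.
             *
             *   long removed =
             *           handler.deleteURIsFromPending("^http://example\\.com/.*");
             */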

            public String importUris(String file, String style, String force) {
                return importUris(file, style, "true".equals(force));
            }

            /**
             * @param fileOrUrl Name of file with seeds.
             * @param style What style of seeds -- crawl log (<code>crawlLog</code>
             * style), recovery journal (<code>recoveryJournal</code> style), or
             * seeds file style (pass <code>default</code> style).
             * @param forceRevisit Should we revisit even if seen before?
             * @return A display string that has a count of all added.
             */
            public String importUris(final String fileOrUrl,
                    final String style, final boolean forceRevisit) {
                return (this.currentJob != null) ? this.currentJob.importUris(
                        fileOrUrl, style, forceRevisit) : null;
            }

            protected int importUris(InputStream is, String style,
                    boolean forceRevisit) {
                return (this.currentJob != null) ? this.currentJob.importUris(
                        is, style, forceRevisit) : 0;
            }
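
            /*
             * A usage sketch for importUris (the path is hypothetical):
             * re-import URIs recorded in an earlier crawl's recovery journal,
             * revisiting URIs even if they were seen before.
             *
             *   String report = handler.importUris(
             *           "/crawls/old/logs/recover.gz", "recoveryJournal", true);
             */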

            /**
             * Schedule a URI.
             * @param uri URI to schedule.
             * @param forceFetch Whether it should be force-fetched.
             * @param isSeed True if seed.
             * @throws URIException
             */
            public void importUri(final String uri, final boolean forceFetch,
                    final boolean isSeed) throws URIException {
                importUri(uri, forceFetch, isSeed, true);
            }

            /**
             * Schedule a URI.
             * @param str String that can be: 1. a UURI, 2. a snippet of a
             * crawl.log line, or 3. a snippet from a recovery log. See
             * {@link #importUris(InputStream, String, boolean)} for how it
             * subparses the lines from crawl.log and recover.log.
             * @param forceFetch Whether it should be force-fetched.
             * @param isSeed True if seed.
             * @param isFlush If true, flush the frontier IF it implements
             * flushing.
             * @throws URIException
             */
            public void importUri(final String str, final boolean forceFetch,
                    final boolean isSeed, final boolean isFlush)
                    throws URIException {
                if (this.currentJob != null) {
                    this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
                }
            }
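
            /*
             * A usage sketch for importUri (the URI is illustrative):
             * schedule a single seed, force-fetch it, and flush the frontier
             * if flushing is supported.
             *
             *   handler.importUri("http://example.com/", true, true, true);
             */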

            /**
             * If it's a HostQueuesFrontier, it needs to be flushed for the
             * queued URIs.
             */
            protected void doFlush() {
                if (this.currentJob != null) {
                    this.currentJob.flush();
                }
            }

            public void stop() {
                if (isCrawling()) {
                    deleteJob(getCurrentJob().getUID());
                }
            }

            public void requestCrawlStop() {
                if (this.currentJob != null) {
                    this.currentJob.stopCrawling();
                }
            }

            /**
             * Ensure the order file is written with the new name/description.
             * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
             * @param newJob Newly created job.
             * @param metaname Metaname for new job.
             * @param description Description for new job.
             * @return <code>newJob</code>
             */
            public static CrawlJob ensureNewJobWritten(CrawlJob newJob,
                    String metaname, String description) {
                XMLSettingsHandler settingsHandler = newJob
                        .getSettingsHandler();
                CrawlerSettings orderfile = settingsHandler
                        .getSettingsObject(null);
                orderfile.setName(metaname);
                orderfile.setDescription(description);
                settingsHandler.writeSettingsObject(orderfile);
                return newJob;
            }
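
            /*
             * A usage sketch for ensureNewJobWritten (names are
             * illustrative): after basing a job on an existing one, persist
             * the new name and description so the order file does not keep
             * the older job's name.
             *
             *   job = CrawlJobHandler.ensureNewJobWritten(job, "weekly-news",
             *           "Weekly crawl of news sites");
             */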

            public void crawlStarted(String message) {
                // TODO Auto-generated method stub
            }

            public void crawlEnding(String sExitMessage) {
                loadJob(getStateJobFile(this.currentJob.getDirectory()));
                currentJob = null;
                synchronized (this) {
                    // If the GUI terminated the job then it is waiting for this event.
                    notifyAll();
                }
            }

            public void crawlEnded(String sExitMessage) {
                if (this.running) {
                    startNextJob();
                }
            }

            public void crawlPausing(String statusMessage) {
                // TODO Auto-generated method stub
            }

            public void crawlPaused(String statusMessage) {
                // TODO Auto-generated method stub
            }

            public void crawlResuming(String statusMessage) {
                // TODO Auto-generated method stub
            }

            public void crawlCheckpoint(File checkpointDir) throws Exception {
                // TODO Auto-generated method stub
            }
        }