/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import rcm.util.PriorityQueue;
import rcm.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
//#endif JDK1.1

/**
 * Web crawler.
 * <P>
 * To write your own crawler, extend this class and override
 * shouldVisit() and visit().
 * <P>
 * To use a crawler:
 * <OL>
 * <LI>Initialize the crawler by calling
 * setRoot() (or one of its variants) and setting other
 * crawl parameters.
 * <LI>Register any classifiers you need with addClassifier().
 * <LI>Connect event listeners to monitor the crawler,
 * such as websphinx.EventLog, websphinx.workbench.WebGraph,
 * or websphinx.workbench.Statistics.
 * <LI>Call run() to start the crawler.
 * </OL>
 * A running crawler consists of a priority queue of
 * Links waiting to be visited and a set of threads
 * retrieving pages in parallel. When a page is downloaded,
 * it is processed as follows:
 * <OL>
 * <LI><B>classify()</B>: The page is passed to the classify() method of
 * every registered classifier, in increasing order of
 * their priority values. Classifiers typically attach
 * informative labels to the page and its links, such as "homepage"
 * or "root page".
 * <LI><B>visit()</B>: The page is passed to the crawler's
 * visit() method for user-defined processing.
 * <LI><B>expand()</B>: The page is passed to the crawler's
 * expand() method to be expanded. The default implementation
 * tests every unvisited hyperlink on the page with shouldVisit(),
 * and puts each link approved by shouldVisit() into the crawling queue.
 * </OL>
 * By default, when expanding the links of a page, the crawler
 * only considers hyperlinks (not applets or inline images, for instance) that
 * point to Web pages (not mailto: links, for instance). If you want
 * shouldVisit() to test every link on the page, use setLinkType(Crawler.ALL_LINKS).
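 * <P>
 * For example, a minimal crawler that prints the URL of every page it
 * visits might look like the following sketch (the class name, starting
 * URL, and depth limit are illustrative only; Page.getURL() is assumed
 * as the page's URL accessor):
 * <PRE>
 * public class PrintingCrawler extends Crawler {
 *     public void visit (Page page) {
 *         // print each visited page's URL (getURL() assumed on Page)
 *         System.out.println (page.getURL ());
 *     }
 *
 *     public static void main (String[] args) throws Exception {
 *         PrintingCrawler crawler = new PrintingCrawler ();
 *         crawler.setRoot (new Link ("http://www.example.com/"));
 *         crawler.setDomain (Crawler.SERVER); // stay on the starting server
 *         crawler.setMaxDepth (3);
 *         EventLog.monitor (crawler);         // optional: log crawl events
 *         crawler.run ();
 *     }
 * }
 * </PRE>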
 *
 */

public class Crawler implements Runnable
//#ifdef JDK1.1
    , Serializable
//#endif JDK1.1
{

    //#ifdef JDK1.1
    private static final long serialVersionUID = -3757789861952010450L;
    //#endif JDK1.1

    /**
     * Specify WEB as the crawl domain to allow the crawler
     * to visit any page on the World Wide Web.
     */
    public static final String[] WEB = null;

    /**
     * Specify SERVER as the crawl domain to limit the crawler
     * to visit only pages on the same Web server (hostname
     * and port number) as the root link from which it started.
     */
    public static final String[] SERVER = { "local" };

    /**
     * Specify SUBTREE as the crawl domain to limit the crawler
     * to visit only pages which are descendants of the root link
     * from which it started.
     */
    public static final String[] SUBTREE = { "sibling", "descendent" };

    /**
     * Specify HYPERLINKS as the link type to allow the crawler
     * to visit only hyperlinks (A, AREA, and FRAME tags which
     * point to http:, ftp:, file:, or gopher: URLs).
     */
    public static final String[] HYPERLINKS = { "hyperlink" };

    /**
     * Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler
     * to visit only hyperlinks and inline images.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = { "hyperlink", "image" };

    /**
     * Specify ALL_LINKS as the link type to allow the crawler
     * to visit any kind of link.
     */
    public static final String[] ALL_LINKS = null;

    // Crawler parameters
    private String name = getClass().getName(); // crawler's name
    private transient Link[] roots = null;
    private String[] rootHrefs = null; // exists only when serializing crawler
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters().changeUserAgent(name);
    private Vector classifiers = new Vector();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient state

    private transient Link[] crawledRoots = null;

    private transient int state = CrawlEvent.CLEARED;

    private transient Worm[] worms; // background threads

    private transient PriorityQueue fetchQueue; // links waiting to be downloaded
    private transient PriorityQueue crawlQueue; // all links that have been expanded but not
                                                // processed (used only if crawler is in synchronous mode)

    private transient int numLinksTested;  // number of links tested by shouldVisit()
    private transient int numPagesVisited; // number of pages passed to visit()
    private transient int numPagesLeft;    // all links that have been expanded but not processed
                                           // == crawlQueue.size ()

    // FIX: convert to immutable linked lists
    private transient Vector crawlListeners; // list of CrawlListeners
    private transient Vector linkListeners;  // list of LinkListeners

    private transient Hashtable visitedPages; // visited pages (a set of URLs)

    private transient RobotExclusion robotExclusion; // robot exclusion cache

    /**
     * Make a new Crawler.
     */
    public Crawler() {
        addClassifier(new StandardClassifier());
        init();
    }

    /*
     * Initialize the transient fields of the crawler.
     */
    private void init() {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector();
        linkListeners = new Vector();

        visitedPages = new Hashtable();
        robotExclusion = new RobotExclusion(getName());
    }

    /*
     * Write a Crawler to an output stream.
     */
    //#ifdef JDK1.1
    private void writeObject(ObjectOutputStream out) throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i = 0; i < roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        } else
            rootHrefs = null;

        out.defaultWriteObject();

        rootHrefs = null;
    }

    //#endif JDK1.1

    /*
     * Read a Crawler from an input stream.
     */
    //#ifdef JDK1.1
    private void readObject(ObjectInputStream in) throws IOException,
            ClassNotFoundException {
        in.defaultReadObject();

        if (rootHrefs != null) {
            roots = new Link[rootHrefs.length];
            for (int i = 0; i < rootHrefs.length; ++i)
                roots[i] = new Link(rootHrefs[i]);
        } else
            roots = null;

        domain = useStandard(WEB, domain);
        domain = useStandard(SERVER, domain);
        domain = useStandard(SUBTREE, domain);

        type = useStandard(HYPERLINKS, type);
        type = useStandard(HYPERLINKS_AND_IMAGES, type);
        type = useStandard(ALL_LINKS, type);

        init();

        if (linkPredicate != null)
            linkPredicate.connected(this);
        if (pagePredicate != null)
            pagePredicate.connected(this);
        if (action != null)
            action.connected(this);
    }

    private static String[] useStandard(String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i = 0; i < s.length; ++i)
            if (!s[i].equals(standard[i]))
                return s;
        return standard;
    }

    //#endif JDK1.1

    /**
     * Start crawling. Returns either when the crawl is done, or
     * when pause() or stop() is called. Because this method implements the
     * java.lang.Runnable interface, a crawler can be run in a
     * background thread.
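     * <P>
     * For example, a sketch of running the crawler in a background thread
     * and pausing it later (the thread name here is arbitrary):
     * <PRE>
     * Thread thread = new Thread (crawler, crawler.getName ());
     * thread.start ();
     * // ... later, from any other thread:
     * crawler.pause ();
     * </PRE>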
     */
    public void run() {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // give each root a default priority based on its position in the array
            float priority = 0;
            float increment = 1.0f / crawledRoots.length;
            for (int i = 0; i < crawledRoots.length; ++i) {
                crawledRoots[i].setPriority(priority);
                priority += increment;
            }
            submit(crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent(state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer(this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set(timeout * 1000, false);

            int nWorms = Math.max(dp.getMaxThreads(), 1);
            worms = new Worm[nWorms];
            for (int i = 0; i < nWorms; ++i) {
                worms[i] = new Worm(this, i);
                worms[i].start();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // ran out of links to crawl
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent(state);
                    } else if (synchronous) {
                        // Synchronous mode.
                        // Main thread calls process() on each link
                        // in crawlQueue, in priority order.
                        Link link = (Link) crawlQueue.getMin();
                        if (link.getStatus() == LinkEvent.DOWNLOADED)
                            process(link);
                        else
                            crawlQueue.wait();
                    } else {
                        // Asynchronous crawling.
                        // Main thread does nothing but wait, while
                        // background threads call process().
                        crawlQueue.wait();
                    }
                }
            } catch (InterruptedException e) {
            }

            timer.cancel();

            for (int i = 0; i < worms.length; ++i)
                worms[i].die();
            if (state == CrawlEvent.PAUSED) {
                // put partly-processed links back in fetchQueue
                synchronized (fetchQueue) {
                    for (int i = 0; i < worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put(worms[i].link);
                }
            }
            worms = null;
        }
    }

    /**
     * Initialize the crawler for a fresh crawl. Clears the crawling queue
     * and sets all crawling statistics to 0. Stops the crawler
     * if it is currently running.
     */
    public void clear() {
        stop();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited();
        if (crawledRoots != null)
            for (int i = 0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent(state);
    }

    /**
     * Pause the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns. The queues remain as-is,
     * so calling run() again will resume the crawl exactly where it left off.
     * pause() can be called from any thread.
     */
    public void pause() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify();
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Stop the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns.
     * Empties the crawling queue.
     */
    public void stop() {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /*
     * Timeout the crawl in progress. Used internally by
     * the CrawlTimer.
     */
    void timedOut() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Get state of crawler.
     * @return one of CrawlEvent.STARTED, CrawlEvent.PAUSED, CrawlEvent.STOPPED,
     * CrawlEvent.CLEARED, or CrawlEvent.TIMED_OUT.
     */
    public int getState() {
        return state;
    }

    /**
     * Callback for visiting a page. Default version does nothing.
     *
     * @param page Page retrieved by the crawler
     */
    public void visit(Page page) {
    }

    /**
     * Callback for testing whether a link should be traversed.
     * Default version returns true for all links. Override this method
     * for more interesting behavior.
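     * <P>
     * A minimal sketch of an override that restricts the crawl to URLs
     * containing a particular substring (the substring is illustrative only):
     * <PRE>
     * public boolean shouldVisit (Link l) {
     *     return l.getURL ().toString ().indexOf ("/docs/") != -1;
     * }
     * </PRE>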
     *
     * @param l Link encountered by the crawler
     * @return true if link should be followed, false if it should be ignored.
     */
    public boolean shouldVisit(Link l) {
        return true;
    }

    /**
     * Expand the crawl from a page. The default implementation of this
     * method tests every link on the page using shouldVisit(), and
     * submit()s the links that are approved. A subclass may want to override
     * this method if it's inconvenient to consider the links individually
     * with shouldVisit().
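     * <P>
     * For example, a sketch of an override that expands only the root pages
     * (depth 0) and leaves deeper pages unexpanded:
     * <PRE>
     * public void expand (Page page) {
     *     if (page.getDepth () == 0)
     *         super.expand (page);
     * }
     * </PRE>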
     * @param page Page to expand
     */
    public void expand(Page page) {
        // examine each link on the page
        Link[] links = page.getLinks();

        if (links != null && links.length > 0) {
            // give each link a default priority based on its page
            // and position on page
            float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
            float increment = 1.0f / links.length;

            for (int i = 0; i < links.length; ++i) {
                Link l = links[i];

                // set default download parameters
                l.setPriority(priority);
                priority += increment;
                l.setDownloadParameters(dp);

                ++numLinksTested;
                if (ignoreVisitedLinks && visited(l))
                    // FIX: use atomic test-and-set
                    // FIX: set l.page somehow?
                    sendLinkEvent(l, LinkEvent.ALREADY_VISITED);
                else if (!((type == null || l.hasAnyLabels(type))
                        && (domain == null || l.hasAnyLabels(domain))
                        && (linkPredicate == null || linkPredicate.shouldVisit(l))
                        && shouldVisit(l)))
                    sendLinkEvent(l, LinkEvent.SKIPPED);
                else if (page.getDepth() >= maxDepth)
                    sendLinkEvent(l, LinkEvent.TOO_DEEP);
                else
                    submit(l);
            }
        }
    }

    /*
     * Crawl statistics
     */

    /**
     * Get number of pages visited.
     * @return number of pages passed to visit() so far in this crawl
     */
    public int getPagesVisited() {
        return numPagesVisited;
    }

    /**
     * Get number of links tested.
     * @return number of links passed to shouldVisit() so far in this crawl
     */
    public int getLinksTested() {
        return numLinksTested;
    }

    /**
     * Get number of pages left to be visited.
     * @return number of links approved by shouldVisit() but not yet visited
     */
    public int getPagesLeft() {
        return numPagesLeft;
    }

    /**
     * Get number of threads currently working.
     * @return number of threads downloading pages
     */
    public int getActiveThreads() {
        Worm[] w = worms;

        if (w == null)
            return 0;

        int n = 0;
        for (int i = 0; i < w.length; ++i)
            if (w[i] != null && w[i].link != null)
                ++n;
        return n;
    }

    /*
     * Crawler parameters
     */

    /**
     * Get human-readable name of crawler. Default value is the
     * class name, e.g., "Crawler". Useful for identifying the crawler in a
     * user interface; also used as the default User-agent for identifying
     * the crawler to a remote Web server. (The User-agent can be
     * changed independently of the crawler name with setDownloadParameters().)
     * @return human-readable name of crawler
     */
    public String getName() {
        return name;
    }

    /**
     * Set human-readable name of crawler.
     * @param name new name for crawler
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * Convert the crawler to a String.
     * @return Human-readable name of crawler.
     */
    public String toString() {
        return getName();
    }

    /**
     * Get starting points of crawl as an array of Link objects.
     * @return array of Links from which crawler will start its next crawl.
     */
    public Link[] getRoots() {
        if (roots == null)
            return new Link[0];

        Link[] result = new Link[roots.length];
        System.arraycopy(roots, 0, result, 0, roots.length);
        return result;
    }

    /**
     * Get roots of last crawl. May differ from getRoots()
     * if new roots have been set.
     * @return array of Links from which crawler started its last crawl,
     * or null if the crawler was cleared.
     */
    public Link[] getCrawledRoots() {
        if (crawledRoots == null)
            return null;

        Link[] result = new Link[crawledRoots.length];
        System.arraycopy(crawledRoots, 0, result, 0, crawledRoots.length);
        return result;
    }

    /**
     * Get starting points of crawl as a String of newline-delimited URLs.
     * @return URLs where crawler will start, separated by newlines.
     */
    public String getRootHrefs() {
        StringBuffer buf = new StringBuffer();
        if (roots != null) {
            for (int i = 0; i < roots.length; ++i) {
                if (buf.length() > 0)
                    buf.append('\n');
                buf.append(roots[i].getURL().toExternalForm());
            }
        }
        return buf.toString();
    }

    /**
     * Set starting points of crawl as a string of whitespace-delimited URLs.
     * @param hrefs URLs of starting point, separated by space, \t, or \n
     * @exception java.net.MalformedURLException if any of the URLs is invalid,
     * leaving starting points unchanged
     */
    public void setRootHrefs(String hrefs) throws MalformedURLException {
        Vector v = new Vector();
        StringTokenizer tok = new StringTokenizer(hrefs);
        while (tok.hasMoreElements())
            v.addElement(new Link(tok.nextToken()));
        roots = new Link[v.size()];
        v.copyInto(roots);
    }

    /**
     * Set starting point of crawl as a single Link.
     * @param link starting point
     */
    public void setRoot(Link link) {
        roots = new Link[1];
        roots[0] = link;
    }

    /**
     * Set starting points of crawl as an array of Links.
     * @param links starting points
     */
    public void setRoots(Link[] links) {
        roots = new Link[links.length];
        System.arraycopy(links, 0, roots, 0, links.length);
    }

    /**
     * Add a root to the existing set of roots.
     * @param link starting point to add
     */
    public void addRoot(Link link) {
        if (roots == null)
            setRoot(link);
        else {
            Link newroots[] = new Link[roots.length + 1];
            System.arraycopy(roots, 0, newroots, 0, roots.length);
            newroots[newroots.length - 1] = link;
            roots = newroots;
        }
    }

    /**
     * Get crawl domain. Default value is WEB.
     * @return WEB, SERVER, or SUBTREE.
     */
    public String[] getDomain() {
        return domain;
    }

    /**
     * Set crawl domain.
     * @param domain one of WEB, SERVER, or SUBTREE.
     */
    public void setDomain(String[] domain) {
        this.domain = domain;
    }

    /**
     * Get legal link types to crawl. Default value is HYPERLINKS.
     * @return HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public String[] getLinkType() {
        return type;
    }

    /**
     * Set legal link types to crawl.
     * @param type one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public void setLinkType(String[] type) {
        this.type = type;
    }

    /**
     * Get depth-first search flag. Default value is true.
     * @return true if search is depth-first, false if search is breadth-first.
     */
    public boolean getDepthFirst() {
        return depthFirst;
    }

    /**
     * Set depth-first search flag. If neither depth-first nor breadth-first
     * is desired, then override shouldVisit() to set a custom priority on
     * each link.
     * @param useDFS true if search should be depth-first, false if search should be breadth-first.
     */
    public void setDepthFirst(boolean useDFS) {
        depthFirst = useDFS;
    }

    /**
     * Get synchronous flag. Default value is false.
     * @return true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public boolean getSynchronous() {
        return synchronous;
    }

    /**
     * Set synchronous flag.
     * @param f true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public void setSynchronous(boolean f) {
        synchronous = f;
    }

    /**
     * Get ignore-visited-links flag. Default value is true.
     * @return true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public boolean getIgnoreVisitedLinks() {
        return ignoreVisitedLinks;
    }

    /**
     * Set ignore-visited-links flag.
     * @param f true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public void setIgnoreVisitedLinks(boolean f) {
        ignoreVisitedLinks = f;
    }

    /**
     * Get maximum depth. Default value is 5.
     * @return maximum depth of crawl, in hops from starting point.
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * Set maximum depth.
     * @param maxDepth maximum depth of crawl, in hops from starting point
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /**
     * Get download parameters (such as number of threads, timeouts, maximum
     * page size, etc.) used by this crawler.
     * @return current download parameters
     */
    public DownloadParameters getDownloadParameters() {
        return dp;
    }

    /**
     * Set download parameters (such as number of threads, timeouts, maximum
     * page size, etc.)
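     * <P>
     * For example, a sketch of giving the crawler a custom User-agent
     * (changeUserAgent() is the only DownloadParameters mutator referenced
     * in this class; the agent string is illustrative only):
     * <PRE>
     * crawler.setDownloadParameters (
     *     crawler.getDownloadParameters ().changeUserAgent ("MyCrawler/1.0"));
     * </PRE>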
     * @param dp Download parameters
     */
    public void setDownloadParameters(DownloadParameters dp) {
        this.dp = dp;
    }

    /**
     * Set link predicate. This is an alternative way to
     * specify the links to walk. If the link predicate is
     * non-null, then only links that satisfy
     * the link predicate AND shouldVisit() are crawled.
     * @param pred Link predicate
     */
    public void setLinkPredicate(LinkPredicate pred) {
        if (pred == linkPredicate
                || (pred != null && pred.equals(linkPredicate)))
            return;
        if (linkPredicate != null)
            linkPredicate.disconnected(this);
        linkPredicate = pred;
        if (linkPredicate != null)
            linkPredicate.connected(this);
    }

    /**
     * Get link predicate.
     * @return current link predicate
     */
    public LinkPredicate getLinkPredicate() {
        return linkPredicate;
    }

    /**
     * Set page predicate. This is a way to filter the pages
     * passed to visit(). If the page predicate is
     * non-null, then only pages that satisfy it are passed to visit().
     * @param pred Page predicate
     */
    public void setPagePredicate(PagePredicate pred) {
        if (pred == pagePredicate
                || (pred != null && pred.equals(pagePredicate)))
            return;
        if (pagePredicate != null)
            pagePredicate.disconnected(this);
        pagePredicate = pred;
        if (pagePredicate != null)
            pagePredicate.connected(this);
    }

    /**
     * Get page predicate.
     * @return current page predicate
     */
    public PagePredicate getPagePredicate() {
        return pagePredicate;
    }

    /**
     * Set the action. This is an alternative way to specify
     * an action performed on every page. If act is non-null,
     * then every page passed to visit() is also passed to this
     * action.
     * @param act Action
     */
    public void setAction(Action act) {
        if (act == action || (act != null && act.equals(action)))
            return;
        if (action != null)
            action.disconnected(this);
        action = act;
        if (action != null)
            action.connected(this);
    }

    /**
     * Get action.
     * @return current action
     */
    public Action getAction() {
        return action;
    }

    /*
     * Link queue management
     *
     */

    /**
     * Puts a link into the crawling queue. If the crawler is running, the
     * link will eventually be retrieved and passed to visit().
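     * <P>
     * For example, a visit() implementation might queue an extra, hand-built
     * link (the URL here is illustrative only):
     * <PRE>
     * try {
     *     submit (new Link ("http://www.example.com/extra.html"));
     * } catch (MalformedURLException e) {
     *     // skip the badly-formed URL
     * }
     * </PRE>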
     * @param link Link to put in queue
     */
    public void submit(Link link) {
        markVisited(link); // FIX: need atomic test-and-set of visited flag
        sendLinkEvent(link, LinkEvent.QUEUED);
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                crawlQueue.put(link);
                ++numPagesLeft;
                fetchQueue.put(link);
                fetchQueue.notifyAll(); // wake up worms
            }
        }
    }

    /**
     * Submit an array of Links for crawling. If the crawler is running,
     * these links will eventually be retrieved and passed to visit().
     * @param links Links to put in queue
     */
    public void submit(Link[] links) {
        for (int i = 0; i < links.length; ++i)
            submit(links[i]);
    }

    /**
     * Enumerate crawling queue.
     * @return an enumeration of Link objects which are waiting to be visited.
     */
    // FIX: enumerate in priority order
    public Enumeration enumerateQueue() {
        return crawlQueue.elements();
    }

    /*
     * Classifiers
     *
     */

    /**
     * Adds a classifier to this crawler. If the
     * classifier is already found in the set, does nothing.
     * @param c a classifier
     */
    public void addClassifier(Classifier c) {
        if (!classifiers.contains(c)) {
            float cpriority = c.getPriority();

            for (int i = 0; i < classifiers.size(); ++i) {
                Classifier d = (Classifier) classifiers.elementAt(i);
                if (cpriority < d.getPriority()) {
                    classifiers.insertElementAt(c, i);
                    return;
                }
            }
            classifiers.addElement(c);
        }
    }

    /**
     * Removes a classifier from the set of classifiers.
     * If c is not found in the set, does nothing.
     *
     * @param c a classifier
     */
    public void removeClassifier(Classifier c) {
        classifiers.removeElement(c);
    }

    /**
     * Clears the set of classifiers.
     */
    public void removeAllClassifiers() {
        classifiers.removeAllElements();
    }

    /**
     * Enumerates the set of classifiers.
     *
     * @return An enumeration of the classifiers.
     */
    public Enumeration enumerateClassifiers() {
        return classifiers.elements();
    }

    /**
     * Get the set of classifiers
     *
     * @return An array containing the registered classifiers.
     */
    public Classifier[] getClassifiers() {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto(c);
        return c;
    }

    /*
     * Event listeners
     *
     */

    /**
     * Adds a listener to the set of CrawlListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addCrawlListener(CrawlListener listen) {
        if (!crawlListeners.contains(listen))
            crawlListeners.addElement(listen);
    }

    /**
     * Removes a listener from the set of CrawlListeners. If it is not found
     * in the set, does nothing.
     *
     * @param listen a listener
     */
    public void removeCrawlListener(CrawlListener listen) {
        crawlListeners.removeElement(listen);
    }

    /**
     * Adds a listener to the set of LinkListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addLinkListener(LinkListener listen) {
        if (!linkListeners.contains(listen))
            linkListeners.addElement(listen);
    }

    /**
     * Removes a listener from the set of LinkListeners. If it is not found
     * in the set, does nothing.
     *
     * @param listen a listener
     */
    public void removeLinkListener(LinkListener listen) {
        linkListeners.removeElement(listen);
    }

    /**
     * Send a CrawlEvent to all CrawlListeners registered with this crawler.
     * @param id Event id
     */
    protected void sendCrawlEvent(int id) {
        CrawlEvent evt = new CrawlEvent(this, id);
        for (int j = 0, len = crawlListeners.size(); j < len; ++j) {
            CrawlListener listen = (CrawlListener) crawlListeners.elementAt(j);
            switch (id) {
            case CrawlEvent.STARTED:
                listen.started(evt);
                break;
            case CrawlEvent.STOPPED:
                listen.stopped(evt);
                break;
            case CrawlEvent.CLEARED:
                listen.cleared(evt);
                break;
            case CrawlEvent.TIMED_OUT:
                listen.timedOut(evt);
                break;
            case CrawlEvent.PAUSED:
                listen.paused(evt);
                break;
            }
        }
    }

    /**
     * Send a LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     */
    protected void sendLinkEvent(Link l, int id) {
        LinkEvent evt = new LinkEvent(this, id, l);
        l.setStatus(id);
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /**
     * Send an exceptional LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     * @param exception Exception associated with event
     */
    protected void sendLinkEvent(Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent(this, id, l, exception);
        l.setStatus(id);
        l.setLabel("exception", exception.toString());
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /*
     * Visited pages table
     *
     */

    /**
     * Test whether the page corresponding to a link has been visited
     * (or queued for visiting).
     * @param link Link to test
     * @return true if the link's page has been visited or queued for
     * visiting during this crawl
     */
    public boolean visited(Link link) {
        return visitedPages.containsKey(link.getPageURL().toString());
    }

    /**
     * Register that a link has been visited.
     * @param link Link that has been visited
     */
    protected void markVisited(Link link) {
        visitedPages.put(link.getPageURL().toString(), this);
    }

    /**
     * Clear the set of visited links.
     */
    protected void clearVisited() {
        visitedPages.clear();
    }

    /*
     * Fetch loop
     *
     */

    void fetch(Worm w) {
        Timer timer = new WormTimer(w);

        while (!w.dead) {
            //System.err.println (w + ": fetching a link");

            // pull the highest-priority link from the fetch queue
            synchronized (fetchQueue) {
                while (!w.dead
                        && (w.link = (Link) fetchQueue.deleteMin()) == null) {
                    try {
                        fetchQueue.wait();
                    } catch (InterruptedException e) {
                    }
                }
            }

            if (w.dead)
                return;

            //System.err.println (w + ": processing " + w.link.toDescription());

            try {
                // download the link to get a page
                DownloadParameters dp;
                Page page;

                dp = w.link.getDownloadParameters();
                if (dp == null)
                    dp = this.dp;
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent(w.link, LinkEvent.RETRIEVING);
                try {

                    if (timeout > 0)
                        timer.set(timeout * 1000, false);

                    if (dp.getObeyRobotExclusion()
                            && robotExclusion.disallowed(w.link.getURL()))
                        throw new IOException(
                                "disallowed by Robot Exclusion Standard (robots.txt)");

                    page = new Page(w.link, dp);

                } finally {
                    timer.cancel();
                }

                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // Synchronous mode.
                    // Main thread will call process() when
                    // this link's turn arrives (in priority order).
                    // Wake up the main thread.
                    synchronized (crawlQueue) {
                        crawlQueue.notify();
                    }
                } else {
                    // Asynchronous mode.
                    // Each worm calls process() on its link.
                    process(w.link);
                }

                w.link = null;

                // loop around and fetch another link

            } catch (ThreadDeath e) {
                throw e; // have to continue dying
            } catch (Throwable e) {
                // Some other exception occurred, either during the page fetch
                // or in some user code. Mark up the link with the error.
                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    crawlQueue.delete(w.link);
                    --numPagesLeft;
                    w.link = null;
                    crawlQueue.notify();
                }
            }
        }
    }

    void process(Link link) {
        Page page = link.getPage();

        // classify the page
        for (int j = 0, len = classifiers.size(); j < len; ++j) {
            Classifier cl = (Classifier) classifiers.elementAt(j);
            cl.classify(page);
        }

        // invoke callbacks on the page
        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn(page)) {
            if (action != null)
                action.visit(page);
            visit(page);
        }
        expand(page);

        // send out the event
        sendLinkEvent(link, LinkEvent.VISITED);

        // discard link
        synchronized (crawlQueue) {
            crawlQueue.delete(link);
            --numPagesLeft;
            crawlQueue.notify();
        }
    }

    void fetchTimedOut(Worm w, int interval) {
        if (w.dead)
            return;

        w.die();
        sendLinkEvent(w.link, LinkEvent.ERROR, new IOException(
                "Timeout after " + interval + " seconds"));

        synchronized (crawlQueue) {
            crawlQueue.delete(w.link);
            --numPagesLeft;

            worms[w.i] = new Worm(this, w.i);
            worms[w.i].start();

            crawlQueue.notify();
        }
    }

    //#ifdef JDK1.1
    // FIX: more error checking here
    public static void main(String[] args) throws Exception {
        java.io.ObjectInputStream in = new java.io.ObjectInputStream(
                new java.io.FileInputStream(args[0]));
        Crawler loadedCrawler = (Crawler) in.readObject();
        in.close();

        EventLog.monitor(loadedCrawler).setOnlyNetworkEvents(false);
        loadedCrawler.run();
    }
    //#endif JDK1.1

}

/* Simple Thread subclass that invokes a crawler's fetch loop. */
class Worm extends Thread {
    Crawler crawler;      // crawler in charge of this worm
    int i;                // index of this worm in crawler.worms[]
    Link link;            // link this worm is currently working on
    boolean dead = false; // true if this worm has been killed

    public Worm(Crawler crawler, int i) {
        super(crawler.getName() + " worm " + i);
        setDaemon(true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run() {
        crawler.fetch(this);
    }

    public void die() {
        dead = true;
        stop();
    }
}

class WormTimer extends Timer {
    Worm worm;

    public WormTimer(Worm worm) {
        this.worm = worm;
    }

    protected void alarm() {
        worm.crawler.fetchTimedOut(worm, getInterval() / 1000);
    }
}

class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer(Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm() {
        crawler.timedOut();
    }
}