/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx;

import rcm.util.PriorityQueue;
import rcm.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
//#ifdef JDK1.1
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
//#endif JDK1.1

/**
 * Web crawler.
 * <P>
 * To write your own crawler, extend this class and override
 * shouldVisit() and visit().
 * <P>
 * To use a crawler:
 * <OL>
 * <LI>Initialize the crawler by calling
 * setRoot() (or one of its variants) and setting other
 * crawl parameters.
 * <LI>Register any classifiers you need with addClassifier().
 * <LI>Connect event listeners to monitor the crawler,
 * such as websphinx.EventLog, websphinx.workbench.WebGraph,
 * or websphinx.workbench.Statistics.
 * <LI>Call run() to start the crawler.
 * </OL>
 * A running crawler consists of a priority queue of
 * Links waiting to be visited and a set of threads
 * retrieving pages in parallel. When a page is downloaded,
 * it is processed as follows:
 * <OL>
 * <LI><B>classify()</B>: The page is passed to the classify() method of
 * every registered classifier, in increasing order of
 * their priority values. Classifiers typically attach
 * informative labels to the page and its links, such as "homepage"
 * or "root page".
 * <LI><B>visit()</B>: The page is passed to the crawler's
 * visit() method for user-defined processing.
 * <LI><B>expand()</B>: The page is passed to the crawler's
 * expand() method to be expanded. The default implementation
 * tests every unvisited hyperlink on the page with shouldVisit(),
 * and puts each link approved by shouldVisit() into the crawling queue.
 * </OL>
 * By default, when expanding the links of a page, the crawler
 * only considers hyperlinks (not applets or inline images, for instance) that
 * point to Web pages (not mailto: links, for instance). If you want
 * shouldVisit() to test every link on the page, use setLinkType(Crawler.ALL_LINKS).
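 * <P>
 * For example, a minimal crawler that prints the URL of every page it
 * visits might look like the following sketch (the class name, starting
 * URL, and depth limit are illustrative only; Page.getURL() is assumed
 * as the page's URL accessor):
 * <PRE>
 * public class PrintingCrawler extends Crawler {
 *     public void visit (Page page) {
 *         // print each visited page's URL (getURL() assumed on Page)
 *         System.out.println (page.getURL ());
 *     }
 *
 *     public static void main (String[] args) throws Exception {
 *         PrintingCrawler crawler = new PrintingCrawler ();
 *         crawler.setRoot (new Link ("http://www.example.com/"));
 *         crawler.setDomain (Crawler.SERVER); // stay on the starting server
 *         crawler.setMaxDepth (3);
 *         EventLog.monitor (crawler);         // optional: log crawl events
 *         crawler.run ();
 *     }
 * }
 * </PRE>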
 *
 */

public class Crawler implements Runnable
//#ifdef JDK1.1
    , Serializable
//#endif JDK1.1
{

    //#ifdef JDK1.1
    private static final long serialVersionUID = -3757789861952010450L;
    //#endif JDK1.1

    /**
     * Specify WEB as the crawl domain to allow the crawler
     * to visit any page on the World Wide Web.
     */
    public static final String[] WEB = null;

    /**
     * Specify SERVER as the crawl domain to limit the crawler
     * to visit only pages on the same Web server (hostname
     * and port number) as the root link from which it started.
     */
    public static final String[] SERVER = { "local" };

    /**
     * Specify SUBTREE as the crawl domain to limit the crawler
     * to visit only pages which are descendants of the root link
     * from which it started.
     */
    public static final String[] SUBTREE = { "sibling", "descendent" };

    /**
     * Specify HYPERLINKS as the link type to allow the crawler
     * to visit only hyperlinks (A, AREA, and FRAME tags which
     * point to http:, ftp:, file:, or gopher: URLs).
     */
    public static final String[] HYPERLINKS = { "hyperlink" };

    /**
     * Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler
     * to visit only hyperlinks and inline images.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = { "hyperlink", "image" };

    /**
     * Specify ALL_LINKS as the link type to allow the crawler
     * to visit any kind of link.
     */
    public static final String[] ALL_LINKS = null;

    // Crawler parameters
    private String name = getClass().getName(); // crawler's name
    private transient Link[] roots = null;
    private String[] rootHrefs = null; // exists only when serializing crawler
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters().changeUserAgent(name);
    private Vector classifiers = new Vector();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // Transient state

    private transient Link[] crawledRoots = null;

    private transient int state = CrawlEvent.CLEARED;

    private transient Worm[] worms; // background threads

    private transient PriorityQueue fetchQueue; // links waiting to be downloaded
    private transient PriorityQueue crawlQueue; // all links that have been expanded but not
                                                // processed (used only if crawler is in synchronous mode)

    private transient int numLinksTested;  // number of links tested by shouldVisit()
    private transient int numPagesVisited; // number of pages passed to visit()
    private transient int numPagesLeft;    // all links that have been expanded but not processed
                                           // == crawlQueue.size ()

    // FIX: convert to immutable linked lists
    private transient Vector crawlListeners; // list of CrawlListeners
    private transient Vector linkListeners;  // list of LinkListeners

    private transient Hashtable visitedPages; // visited pages (a set of URLs)

    private transient RobotExclusion robotExclusion; // robot exclusion cache

    /**
     * Make a new Crawler.
     */
    public Crawler() {
        addClassifier(new StandardClassifier());
        init();
    }

    /*
     * Initialize the transient fields of the crawler.
     */
    private void init() {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector();
        linkListeners = new Vector();

        visitedPages = new Hashtable();
        robotExclusion = new RobotExclusion(getName());
    }

    /*
     * Write a Crawler to an output stream.
     */
    //#ifdef JDK1.1
    private void writeObject(ObjectOutputStream out) throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i = 0; i < roots.length; ++i)
                rootHrefs[i] = roots[i].getURL().toString();
        } else
            rootHrefs = null;

        out.defaultWriteObject();

        rootHrefs = null;
    }

    //#endif JDK1.1

    /*
     * Read a Crawler from an input stream.
     */
    //#ifdef JDK1.1
    private void readObject(ObjectInputStream in) throws IOException,
            ClassNotFoundException {
        in.defaultReadObject();

        if (rootHrefs != null) {
            roots = new Link[rootHrefs.length];
            for (int i = 0; i < rootHrefs.length; ++i)
                roots[i] = new Link(rootHrefs[i]);
        } else
            roots = null;

        domain = useStandard(WEB, domain);
        domain = useStandard(SERVER, domain);
        domain = useStandard(SUBTREE, domain);

        type = useStandard(HYPERLINKS, type);
        type = useStandard(HYPERLINKS_AND_IMAGES, type);
        type = useStandard(ALL_LINKS, type);

        init();

        if (linkPredicate != null)
            linkPredicate.connected(this);
        if (pagePredicate != null)
            pagePredicate.connected(this);
        if (action != null)
            action.connected(this);
    }

    private static String[] useStandard(String[] standard, String[] s) {
        if (s == null || standard == null || standard == s)
            return s;
        if (s.length != standard.length)
            return s;
        for (int i = 0; i < s.length; ++i)
            if (!s[i].equals(standard[i]))
                return s;
        return standard;
    }

    //#endif JDK1.1

    /**
     * Start crawling. Returns either when the crawl is done, or
     * when pause() or stop() is called. Because this method implements the
     * java.lang.Runnable interface, a crawler can be run in a
     * background thread.
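     * <P>
     * For example, a sketch of running the crawler in a background thread
     * and pausing it later (the thread name here is arbitrary):
     * <PRE>
     * Thread thread = new Thread (crawler, crawler.getName ());
     * thread.start ();
     * // ... later, from any other thread:
     * crawler.pause ();
     * </PRE>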
     */
    public void run() {
        crawledRoots = roots;

        if (state == CrawlEvent.STOPPED)
            clear();

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            // give each root a default priority based on its position in the array
            float priority = 0;
            float increment = 1.0f / crawledRoots.length;
            for (int i = 0; i < crawledRoots.length; ++i) {
                crawledRoots[i].setPriority(priority);
                priority += increment;
            }
            submit(crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent(state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer(this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0)
                timer.set(timeout * 1000, false);

            int nWorms = Math.max(dp.getMaxThreads(), 1);
            worms = new Worm[nWorms];
            for (int i = 0; i < nWorms; ++i) {
                worms[i] = new Worm(this, i);
                worms[i].start();
            }

            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // ran out of links to crawl
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent(state);
                    } else if (synchronous) {
                        // Synchronous mode.
                        // Main thread calls process() on each link
                        // in crawlQueue, in priority order.
                        Link link = (Link) crawlQueue.getMin();
                        if (link.getStatus() == LinkEvent.DOWNLOADED)
                            process(link);
                        else
                            crawlQueue.wait();
                    } else {
                        // Asynchronous crawling.
                        // Main thread does nothing but wait, while
                        // background threads call process().
                        crawlQueue.wait();
                    }
                }
            } catch (InterruptedException e) {
            }

            timer.cancel();

            for (int i = 0; i < worms.length; ++i)
                worms[i].die();
            if (state == CrawlEvent.PAUSED) {
                // put partly-processed links back in fetchQueue
                synchronized (fetchQueue) {
                    for (int i = 0; i < worms.length; ++i)
                        if (worms[i].link != null)
                            fetchQueue.put(worms[i].link);
                }
            }
            worms = null;
        }
    }

    /**
     * Initialize the crawler for a fresh crawl. Clears the crawling queue
     * and sets all crawling statistics to 0. Stops the crawler
     * if it is currently running.
     */
    public void clear() {
        stop();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited();
        if (crawledRoots != null)
            for (int i = 0; i < crawledRoots.length; ++i)
                crawledRoots[i].disconnect();
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent(state);
    }

    /**
     * Pause the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns. The queues remain as-is,
     * so calling run() again will resume the crawl exactly where it left off.
     * pause() can be called from any thread.
     */
    public void pause() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify();
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Stop the crawl in progress. If the crawler is running, then
     * it finishes processing the current page, then returns.
     * Empties the crawling queue.
     */
    public void stop() {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.STOPPED;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /*
     * Timeout the crawl in progress. Used internally by
     * the CrawlTimer.
     */
    void timedOut() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                synchronized (fetchQueue) {
                    state = CrawlEvent.TIMED_OUT;
                    fetchQueue.clear();
                    crawlQueue.clear();
                    numPagesLeft = 0;
                    crawlQueue.notify();
                }
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Get state of crawler.
     * @return one of CrawlEvent.STARTED, CrawlEvent.PAUSED, CrawlEvent.STOPPED,
     * CrawlEvent.CLEARED, or CrawlEvent.TIMED_OUT.
     */
    public int getState() {
        return state;
    }

    /**
     * Callback for visiting a page. Default version does nothing.
     *
     * @param page Page retrieved by the crawler
     */
    public void visit(Page page) {
    }

    /**
     * Callback for testing whether a link should be traversed.
     * Default version returns true for all links. Override this method
     * for more interesting behavior.
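     * <P>
     * A minimal sketch of an override that restricts the crawl to URLs
     * containing a particular substring (the substring is illustrative only):
     * <PRE>
     * public boolean shouldVisit (Link l) {
     *     return l.getURL ().toString ().indexOf ("/docs/") != -1;
     * }
     * </PRE>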
     *
     * @param l Link encountered by the crawler
     * @return true if link should be followed, false if it should be ignored.
     */
    public boolean shouldVisit(Link l) {
        return true;
    }

    /**
     * Expand the crawl from a page. The default implementation of this
     * method tests every link on the page using shouldVisit(), and
     * submit()s the links that are approved. A subclass may want to override
     * this method if it's inconvenient to consider the links individually
     * with shouldVisit().
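     * <P>
     * For example, a sketch of an override that expands only the root pages
     * (depth 0) and leaves deeper pages unexpanded:
     * <PRE>
     * public void expand (Page page) {
     *     if (page.getDepth () == 0)
     *         super.expand (page);
     * }
     * </PRE>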
     * @param page Page to expand
     */
    public void expand(Page page) {
        // examine each link on the page
        Link[] links = page.getLinks();

        if (links != null && links.length > 0) {
            // give each link a default priority based on its page
            // and position on page
            float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
            float increment = 1.0f / links.length;

            for (int i = 0; i < links.length; ++i) {
                Link l = links[i];

                // set default download parameters
                l.setPriority(priority);
                priority += increment;
                l.setDownloadParameters(dp);

                ++numLinksTested;
                if (ignoreVisitedLinks && visited(l))
                    // FIX: use atomic test-and-set
                    // FIX: set l.page somehow?
                    sendLinkEvent(l, LinkEvent.ALREADY_VISITED);
                else if (!((type == null || l.hasAnyLabels(type))
                        && (domain == null || l.hasAnyLabels(domain))
                        && (linkPredicate == null || linkPredicate.shouldVisit(l))
                        && shouldVisit(l)))
                    sendLinkEvent(l, LinkEvent.SKIPPED);
                else if (page.getDepth() >= maxDepth)
                    sendLinkEvent(l, LinkEvent.TOO_DEEP);
                else
                    submit(l);
            }
        }
    }

    /*
     * Crawl statistics
     */

    /**
     * Get number of pages visited.
     * @return number of pages passed to visit() so far in this crawl
     */
    public int getPagesVisited() {
        return numPagesVisited;
    }

    /**
     * Get number of links tested.
     * @return number of links passed to shouldVisit() so far in this crawl
     */
    public int getLinksTested() {
        return numLinksTested;
    }

    /**
     * Get number of pages left to be visited.
     * @return number of links approved by shouldVisit() but not yet visited
     */
    public int getPagesLeft() {
        return numPagesLeft;
    }

    /**
     * Get number of threads currently working.
     * @return number of threads downloading pages
     */
    public int getActiveThreads() {
        Worm[] w = worms;

        if (w == null)
            return 0;

        int n = 0;
        for (int i = 0; i < w.length; ++i)
            if (w[i] != null && w[i].link != null)
                ++n;
        return n;
    }

    /*
     * Crawler parameters
     */

    /**
     * Get human-readable name of crawler. Default value is the
     * class name, e.g., "Crawler". Useful for identifying the crawler in a
     * user interface; also used as the default User-agent for identifying
     * the crawler to a remote Web server. (The User-agent can be
     * changed independently of the crawler name with setDownloadParameters().)
     * @return human-readable name of crawler
     */
    public String getName() {
        return name;
    }

    /**
     * Set human-readable name of crawler.
     * @param name new name for crawler
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * Convert the crawler to a String.
     * @return Human-readable name of crawler.
     */
    public String toString() {
        return getName();
    }

    /**
     * Get starting points of crawl as an array of Link objects.
     * @return array of Links from which crawler will start its next crawl.
     */
    public Link[] getRoots() {
        if (roots == null)
            return new Link[0];

        Link[] result = new Link[roots.length];
        System.arraycopy(roots, 0, result, 0, roots.length);
        return result;
    }

    /**
     * Get roots of last crawl. May differ from getRoots()
     * if new roots have been set.
     * @return array of Links from which crawler started its last crawl,
     * or null if the crawler was cleared.
     */
    public Link[] getCrawledRoots() {
        if (crawledRoots == null)
            return null;

        Link[] result = new Link[crawledRoots.length];
        System.arraycopy(crawledRoots, 0, result, 0, crawledRoots.length);
        return result;
    }

    /**
     * Get starting points of crawl as a String of newline-delimited URLs.
     * @return URLs where crawler will start, separated by newlines.
     */
    public String getRootHrefs() {
        StringBuffer buf = new StringBuffer();
        if (roots != null) {
            for (int i = 0; i < roots.length; ++i) {
                if (buf.length() > 0)
                    buf.append('\n');
                buf.append(roots[i].getURL().toExternalForm());
            }
        }
        return buf.toString();
    }

    /**
     * Set starting points of crawl as a string of whitespace-delimited URLs.
     * @param hrefs URLs of starting point, separated by space, \t, or \n
     * @exception java.net.MalformedURLException if any of the URLs is invalid,
     * leaving starting points unchanged
     */
    public void setRootHrefs(String hrefs) throws MalformedURLException {
        Vector v = new Vector();
        StringTokenizer tok = new StringTokenizer(hrefs);
        while (tok.hasMoreElements())
            v.addElement(new Link(tok.nextToken()));
        roots = new Link[v.size()];
        v.copyInto(roots);
    }

    /**
     * Set starting point of crawl as a single Link.
     * @param link starting point
     */
    public void setRoot(Link link) {
        roots = new Link[1];
        roots[0] = link;
    }

    /**
     * Set starting points of crawl as an array of Links.
     * @param links starting points
     */
    public void setRoots(Link[] links) {
        roots = new Link[links.length];
        System.arraycopy(links, 0, roots, 0, links.length);
    }

    /**
     * Add a root to the existing set of roots.
     * @param link starting point to add
     */
    public void addRoot(Link link) {
        if (roots == null)
            setRoot(link);
        else {
            Link newroots[] = new Link[roots.length + 1];
            System.arraycopy(roots, 0, newroots, 0, roots.length);
            newroots[newroots.length - 1] = link;
            roots = newroots;
        }
    }

    /**
     * Get crawl domain. Default value is WEB.
     * @return WEB, SERVER, or SUBTREE.
     */
    public String[] getDomain() {
        return domain;
    }

    /**
     * Set crawl domain.
     * @param domain one of WEB, SERVER, or SUBTREE.
     */
    public void setDomain(String[] domain) {
        this.domain = domain;
    }

    /**
     * Get legal link types to crawl. Default value is HYPERLINKS.
     * @return HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public String[] getLinkType() {
        return type;
    }

    /**
     * Set legal link types to crawl.
     * @param type one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.
     */
    public void setLinkType(String[] type) {
        this.type = type;
    }

    /**
     * Get depth-first search flag. Default value is true.
     * @return true if search is depth-first, false if search is breadth-first.
     */
    public boolean getDepthFirst() {
        return depthFirst;
    }

    /**
     * Set depth-first search flag. If neither depth-first nor breadth-first
     * is desired, then override shouldVisit() to set a custom priority on
     * each link.
     * @param useDFS true if search should be depth-first, false if search should be breadth-first.
     */
    public void setDepthFirst(boolean useDFS) {
        depthFirst = useDFS;
    }

    /**
     * Get synchronous flag. Default value is false.
     * @return true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public boolean getSynchronous() {
        return synchronous;
    }

    /**
     * Set synchronous flag.
     * @param f true if crawler must visit the pages in priority order; false if crawler can visit
     * pages in any order.
     */
    public void setSynchronous(boolean f) {
        synchronous = f;
    }

    /**
     * Get ignore-visited-links flag. Default value is true.
     * @return true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public boolean getIgnoreVisitedLinks() {
        return ignoreVisitedLinks;
    }

    /**
     * Set ignore-visited-links flag.
     * @param f true if search skips links whose URLs have already been visited
     * (or queued for visiting).
     */
    public void setIgnoreVisitedLinks(boolean f) {
        ignoreVisitedLinks = f;
    }

    /**
     * Get maximum depth. Default value is 5.
     * @return maximum depth of crawl, in hops from starting point.
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * Set maximum depth.
     * @param maxDepth maximum depth of crawl, in hops from starting point
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /**
     * Get download parameters (such as number of threads, timeouts, maximum
     * page size, etc.) used by this crawler.
     * @return current download parameters
     */
    public DownloadParameters getDownloadParameters() {
        return dp;
    }

    /**
     * Set download parameters (such as number of threads, timeouts, maximum
     * page size, etc.)
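     * <P>
     * For example, a sketch of giving the crawler a custom User-agent
     * (changeUserAgent() is the only DownloadParameters mutator referenced
     * in this class; the agent string is illustrative only):
     * <PRE>
     * crawler.setDownloadParameters (
     *     crawler.getDownloadParameters ().changeUserAgent ("MyCrawler/1.0"));
     * </PRE>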
     * @param dp Download parameters
     */
    public void setDownloadParameters(DownloadParameters dp) {
        this.dp = dp;
    }

    /**
     * Set link predicate. This is an alternative way to
     * specify the links to walk. If the link predicate is
     * non-null, then only links that satisfy
     * the link predicate AND shouldVisit() are crawled.
     * @param pred Link predicate
     */
    public void setLinkPredicate(LinkPredicate pred) {
        if (pred == linkPredicate
                || (pred != null && pred.equals(linkPredicate)))
            return;
        if (linkPredicate != null)
            linkPredicate.disconnected(this);
        linkPredicate = pred;
        if (linkPredicate != null)
            linkPredicate.connected(this);
    }

    /**
     * Get link predicate.
     * @return current link predicate
     */
    public LinkPredicate getLinkPredicate() {
        return linkPredicate;
    }

    /**
     * Set page predicate. This is a way to filter the pages
     * passed to visit(). If the page predicate is
     * non-null, then only pages that satisfy it are passed to visit().
     * @param pred Page predicate
     */
    public void setPagePredicate(PagePredicate pred) {
        if (pred == pagePredicate
                || (pred != null && pred.equals(pagePredicate)))
            return;
        if (pagePredicate != null)
            pagePredicate.disconnected(this);
        pagePredicate = pred;
        if (pagePredicate != null)
            pagePredicate.connected(this);
    }

    /**
     * Get page predicate.
     * @return current page predicate
     */
    public PagePredicate getPagePredicate() {
        return pagePredicate;
    }

    /**
     * Set the action. This is an alternative way to specify
     * an action performed on every page. If act is non-null,
     * then every page passed to visit() is also passed to this
     * action.
     * @param act Action
     */
    public void setAction(Action act) {
        if (act == action || (act != null && act.equals(action)))
            return;
        if (action != null)
            action.disconnected(this);
        action = act;
        if (action != null)
            action.connected(this);
    }

    /**
     * Get action.
     * @return current action
     */
    public Action getAction() {
        return action;
    }

    /*
     * Link queue management
     *
     */

    /**
     * Puts a link into the crawling queue. If the crawler is running, the
     * link will eventually be retrieved and passed to visit().
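     * <P>
     * For example, a visit() implementation might queue an extra, hand-built
     * link (the URL here is illustrative only):
     * <PRE>
     * try {
     *     submit (new Link ("http://www.example.com/extra.html"));
     * } catch (MalformedURLException e) {
     *     // skip the badly-formed URL
     * }
     * </PRE>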
     * @param link Link to put in queue
     */
    public void submit(Link link) {
        markVisited(link); // FIX: need atomic test-and-set of visited flag
        sendLinkEvent(link, LinkEvent.QUEUED);
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                crawlQueue.put(link);
                ++numPagesLeft;
                fetchQueue.put(link);
                fetchQueue.notifyAll(); // wake up worms
            }
        }
    }

    /**
     * Submit an array of Links for crawling. If the crawler is running,
     * these links will eventually be retrieved and passed to visit().
     * @param links Links to put in queue
     */
    public void submit(Link[] links) {
        for (int i = 0; i < links.length; ++i)
            submit(links[i]);
    }

    /**
     * Enumerate crawling queue.
     * @return an enumeration of Link objects which are waiting to be visited.
     */
    // FIX: enumerate in priority order
    public Enumeration enumerateQueue() {
        return crawlQueue.elements();
    }

    /*
     * Classifiers
     *
     */

    /**
     * Adds a classifier to this crawler. If the
     * classifier is already found in the set, does nothing.
     * @param c a classifier
     */
    public void addClassifier(Classifier c) {
        if (!classifiers.contains(c)) {
            float cpriority = c.getPriority();

            for (int i = 0; i < classifiers.size(); ++i) {
                Classifier d = (Classifier) classifiers.elementAt(i);
                if (cpriority < d.getPriority()) {
                    classifiers.insertElementAt(c, i);
                    return;
                }
            }
            classifiers.addElement(c);
        }
    }

    /**
     * Removes a classifier from the set of classifiers.
     * If c is not found in the set, does nothing.
     *
     * @param c a classifier
     */
    public void removeClassifier(Classifier c) {
        classifiers.removeElement(c);
    }

    /**
     * Clears the set of classifiers.
     */
    public void removeAllClassifiers() {
        classifiers.removeAllElements();
    }

    /**
     * Enumerates the set of classifiers.
     *
     * @return An enumeration of the classifiers.
     */
    public Enumeration enumerateClassifiers() {
        return classifiers.elements();
    }

    /**
     * Get the set of classifiers
     *
     * @return An array containing the registered classifiers.
     */
    public Classifier[] getClassifiers() {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto(c);
        return c;
    }

    /*
     * Event listeners
     *
     */

    /**
     * Adds a listener to the set of CrawlListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addCrawlListener(CrawlListener listen) {
        if (!crawlListeners.contains(listen))
            crawlListeners.addElement(listen);
    }

    /**
     * Removes a listener from the set of CrawlListeners. If it is not found
     * in the set, does nothing.
     *
     * @param listen a listener
     */
    public void removeCrawlListener(CrawlListener listen) {
        crawlListeners.removeElement(listen);
    }

    /**
     * Adds a listener to the set of LinkListeners for this crawler.
     * If the listener is already found in the set, does nothing.
     *
     * @param listen a listener
     */
    public void addLinkListener(LinkListener listen) {
        if (!linkListeners.contains(listen))
            linkListeners.addElement(listen);
    }

    /**
     * Removes a listener from the set of LinkListeners. If it is not found
     * in the set, does nothing.
     *
     * @param listen a listener
     */
    public void removeLinkListener(LinkListener listen) {
        linkListeners.removeElement(listen);
    }

    /**
     * Send a CrawlEvent to all CrawlListeners registered with this crawler.
     * @param id Event id
     */
    protected void sendCrawlEvent(int id) {
        CrawlEvent evt = new CrawlEvent(this, id);
        for (int j = 0, len = crawlListeners.size(); j < len; ++j) {
            CrawlListener listen = (CrawlListener) crawlListeners.elementAt(j);
            switch (id) {
            case CrawlEvent.STARTED:
                listen.started(evt);
                break;
            case CrawlEvent.STOPPED:
                listen.stopped(evt);
                break;
            case CrawlEvent.CLEARED:
                listen.cleared(evt);
                break;
            case CrawlEvent.TIMED_OUT:
                listen.timedOut(evt);
                break;
            case CrawlEvent.PAUSED:
                listen.paused(evt);
                break;
            }
        }
    }

    /**
     * Send a LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     */
    protected void sendLinkEvent(Link l, int id) {
        LinkEvent evt = new LinkEvent(this, id, l);
        l.setStatus(id);
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /**
     * Send an exceptional LinkEvent to all LinkListeners registered with this crawler.
     * @param l Link related to event
     * @param id Event id
     * @param exception Exception associated with event
     */
    protected void sendLinkEvent(Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent(this, id, l, exception);
        l.setStatus(id);
        l.setLabel("exception", exception.toString());
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /*
     * Visited pages table
     *
     */

    /**
     * Test whether the page corresponding to a link has been visited
     * (or queued for visiting).
     * @param link Link to test
     * @return true if the link's page has been visited or queued for
     * visiting during this crawl
     */
    public boolean visited(Link link) {
        return visitedPages.containsKey(link.getPageURL().toString());
    }

    /**
     * Register that a link has been visited.
     * @param link Link that has been visited
     */
    protected void markVisited(Link link) {
        visitedPages.put(link.getPageURL().toString(), this);
    }

    /**
     * Clear the set of visited links.
     */
    protected void clearVisited() {
        visitedPages.clear();
    }

    /*
     * Fetch loop
     *
     */

    void fetch(Worm w) {
        Timer timer = new WormTimer(w);

        while (!w.dead) {
            //System.err.println (w + ": fetching a link");

            // pull the highest-priority link from the fetch queue
            synchronized (fetchQueue) {
                while (!w.dead
                        && (w.link = (Link) fetchQueue.deleteMin()) == null) {
                    try {
                        fetchQueue.wait();
                    } catch (InterruptedException e) {
                    }
                }
            }

            if (w.dead)
                return;

            //System.err.println (w + ": processing " + w.link.toDescription());

            try {
                // download the link to get a page
                DownloadParameters dp;
                Page page;

                dp = w.link.getDownloadParameters();
                if (dp == null)
                    dp = this.dp;
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent(w.link, LinkEvent.RETRIEVING);
                try {

                    if (timeout > 0)
                        timer.set(timeout * 1000, false);

                    if (dp.getObeyRobotExclusion()
                            && robotExclusion.disallowed(w.link.getURL()))
                        throw new IOException(
                                "disallowed by Robot Exclusion Standard (robots.txt)");

                    page = new Page(w.link, dp);

                } finally {
                    timer.cancel();
                }

                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // Synchronous mode.
                    // Main thread will call process() when
                    // this link's turn arrives (in priority order).
                    // Wake up the main thread.
                    synchronized (crawlQueue) {
                        crawlQueue.notify();
                    }
                } else {
                    // Asynchronous mode.
                    // Each worm calls process() on its link.
                    process(w.link);
                }

                w.link = null;

                // loop around and fetch another link

            } catch (ThreadDeath e) {
                throw e; // have to continue dying
            } catch (Throwable e) {
                // Some other exception occurred, either during the page fetch
                // or in some user code. Mark up the link with the error.
                if (w.dead)
                    return;

                sendLinkEvent(w.link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    crawlQueue.delete(w.link);
                    --numPagesLeft;
                    w.link = null;
                    crawlQueue.notify();
                }
            }
        }
    }

    void process(Link link) {
        Page page = link.getPage();

        // classify the page
        for (int j = 0, len = classifiers.size(); j < len; ++j) {
            Classifier cl = (Classifier) classifiers.elementAt(j);
            cl.classify(page);
        }

        // invoke callbacks on the page
        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn(page)) {
            if (action != null)
                action.visit(page);
            visit(page);
        }
        expand(page);

        // send out the event
        sendLinkEvent(link, LinkEvent.VISITED);

        // discard link
        synchronized (crawlQueue) {
            crawlQueue.delete(link);
            --numPagesLeft;
            crawlQueue.notify();
        }
    }

    void fetchTimedOut(Worm w, int interval) {
        if (w.dead)
            return;

        w.die();
        sendLinkEvent(w.link, LinkEvent.ERROR, new IOException(
                "Timeout after " + interval + " seconds"));

        synchronized (crawlQueue) {
            crawlQueue.delete(w.link);
            --numPagesLeft;

            worms[w.i] = new Worm(this, w.i);
            worms[w.i].start();

            crawlQueue.notify();
        }
    }

    //#ifdef JDK1.1
    // FIX: more error checking here
    public static void main(String[] args) throws Exception {
        java.io.ObjectInputStream in = new java.io.ObjectInputStream(
                new java.io.FileInputStream(args[0]));
        Crawler loadedCrawler = (Crawler) in.readObject();
        in.close();

        EventLog.monitor(loadedCrawler).setOnlyNetworkEvents(false);
        loadedCrawler.run();
    }
    //#endif JDK1.1

}

/* Simple Thread subclass that invokes a crawler's fetch loop. */
class Worm extends Thread {
    Crawler crawler;      // crawler in charge of this worm
    int i;                // index of this worm in crawler.worms[]
    Link link;            // link this worm is currently working on
    boolean dead = false; // true if this worm has been killed

    public Worm(Crawler crawler, int i) {
        super(crawler.getName() + " worm " + i);
        setDaemon(true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run() {
        crawler.fetch(this);
    }

    public void die() {
        dead = true;
        stop();
    }
}

class WormTimer extends Timer {
    Worm worm;

    public WormTimer(Worm worm) {
        this.worm = worm;
    }

    protected void alarm() {
        worm.crawler.fetchTimedOut(worm, getInterval() / 1000);
    }
}

class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer(Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm() {
        crawler.timedOut();
    }
}