0001: package net.matuschek.spider;
0002:
0003: /**
0004: * This class implements a web robot that performs a search through
0005: * the web starting from a given start document up to a given
0006: * search depth.
0007: *
0008: * @author Daniel Matuschek / Oliver Schmidt
0009: * @version $Revision: 1.35 $
0010: */
0011:
0012: import java.io.File;
0013: import java.io.FileInputStream;
0014: import java.io.IOException;
0015: import java.lang.reflect.Field;
0016: import java.lang.reflect.Modifier;
0017: import java.net.MalformedURLException;
0018: import java.net.URL;
0019: import java.util.Date;
0020: import java.util.HashMap;
0021: import java.util.HashSet;
0022: import java.util.StringTokenizer;
0023: import java.util.Vector;
0024:
0025: import net.matuschek.html.FormFiller;
0026: import net.matuschek.html.HtmlDocument;
0027: import net.matuschek.http.DocManagerException;
0028: import net.matuschek.http.DownloadRuleSet;
0029: import net.matuschek.http.ExtendedURL;
0030: import net.matuschek.http.HttpConstants;
0031: import net.matuschek.http.HttpDoc;
0032: import net.matuschek.http.HttpDocManager;
0033: import net.matuschek.http.HttpException;
0034: import net.matuschek.http.HttpHeader;
0035: import net.matuschek.http.HttpTool;
0036: import net.matuschek.http.HttpToolCallback;
0037: import net.matuschek.http.NTLMAuthorization;
0038: import net.matuschek.http.cookie.CookieManager;
0039: import net.matuschek.spider.docfilter.FilterChain;
0040: import net.matuschek.spider.docfilter.FilterException;
0041:
0042: import org.apache.log4j.Category;
0043: import org.w3c.dom.Element;
0044:
0045: public class WebRobot implements Runnable, Cloneable {
0046:
0047: /** the name of the robot */
0048: private final static String ROBOT_NAME = "JoBo";
0049:
0050: /** the default agent name */
0051: private final static String AGENT_NAME = ROBOT_NAME
0052: + "/1.4 (http://www.matuschek.net/jobo.html)";
0053:
0054: /** the robot exception handler*/
0055: protected RobotExceptionHandler exceptionHandler = new DefaultRobotExceptionHandler();
0056:
0057: /** default maximal search depth */
0058: private final static int DEFAULT_DEPTH = 10;
0059:
0060: /** the URL where the robot walk starts from */
0061: protected URL startURL = null;
0062:
0063: /** the host and directory where retrieval started from */
0064: protected String startDir = "";
0065:
0066: /** maximal search depth */
0067: protected int maxDepth = DEFAULT_DEPTH;
0068:
0069: /** is it allowed to walk to other hosts than the starting host? */
0070: protected boolean walkToOtherHosts = false;
0071:
0072: /** DocManager will store or process retrieved documents */
0073: protected HttpDocManager docManager;
0074:
0075: /** HttpTool will be used to retrieve documents from a web server */
0076: protected HttpTool httpTool = new HttpTool();
0077:
0078: /** Log4J category for logging */
0079: protected Category log;
0080:
0081: /** Referer used to retrieve the first document */
0082: protected String startReferer = "-";
0083:
0084: /** test for robots.txt */
0085: protected NoRobots robCheck;
0086:
0087: /** current tasks */
0088: protected TaskList todo = null;
0089:
0090: /** a list of all URLs we got already */
0091: protected TaskList visited = null;
0092:
0093: /** ignore settings in /robots.txt ? */
0094: protected boolean ignoreRobotsTxt = false;
0095:
0096: /** sleep that number of seconds after every retrieved document */
0097: protected int sleepTime = 1;
0098:
0099: /** fill out forms */
0100: protected FormFiller formFiller = new FormFiller();
0101:
0102: /** these URLs can be visited more than once */
0103: protected Vector visitMany = new Vector();
0104:
0105: /** for callback to the user interface **/
0106: protected WebRobotCallback webRobotCallback = null;
0107:
0108: /** should we stop robot operation ? **/
0109: protected boolean stopIt = false;
0110:
0111: /** to check if it is allowed to travel to a given URL **/
0112: protected URLCheck urlCheck = null;
0113:
0114: /** should the robot suspend the current walk() **/
0115: protected boolean sleep;
0116:
0117: /** list of allowed URLs (even if walkToOtherHosts is false) **/
0118: protected Vector allowedURLs = new Vector();
0119:
0120: /** allow travelling the whole host ? */
0121: protected boolean allowWholeHost = true;
0122:
0123: /**
0124: * maximum document age in seconds, negative value means
0125: * no limit
0126: */
0127: protected long maxDocumentAge = -1; // no limit
0128:
0129: /**
0130: * allow travelling to all subdomains of the start host ?
0131: * @see #setAllowWholeDomain(boolean)
0132: */
0133: protected boolean allowWholeDomain = true;
0134:
0135: /**
0136: * do more flexible tests if the new URL is on the same host
0137: * @see #basicURLCheck(URL)
0138: */
0139: protected boolean flexibleHostCheck = false;
0140:
0141: /**
0142: * FilterChain to filter the document before storing it
0143: */
0144: protected FilterChain filters = null;
0145:
0146: /**
0147: * don't retrieve pages again that are already stored in the DocManager
0148: */
0149: protected boolean allowCaching = true;
0150:
0151: /**
0152: * Check for documents with the same content
0153: */
0154: protected boolean duplicateCheck = false;
0155:
0156: /**
0157: * initializes the robot with the default implementation
0158: * of the TaskList interface
0159: *
0160: * @param expectedDocumentCount the expected number of documents to retrieve
0161: */
0162: public WebRobot(int expectedDocumentCount) {
0163: log = Category.getInstance(getClass().getName());
0164: content2UrlMap = new HashMap(expectedDocumentCount);
0165: registerVisitedList(new HashedMemoryTaskList(false,
0166: expectedDocumentCount));
0167: registerToDoList(new HashedMemoryTaskList(true,
0168: expectedDocumentCount));
0169: this .expectedDocumentCount = expectedDocumentCount;
0170: this .setAgentName(AGENT_NAME);
0171: }
0172:
0173: /**
0174: * initializes the robot with the default implementation of the TaskList
0175: * interface
0176: */
0177: public WebRobot() {
0178: this (DEFAULT_EXPECTED_DOCUMENT_COUNT);
0179: }
0180:
0181: /**
0182: * Sets the implementation class for the backend task list storage.
0183: * WebRobot uses the TaskList interface to store future tasks.
0184: *
0185: * If you want to use your own TaskList implementation, just call
0186: * this method.
0187: *
0188: * @param todo TaskList to be used for the "to do" list
0189: */
0190: public void registerToDoList(TaskList todo) {
0191: this .todo = todo;
0192: }
0193:
0194: /**
0195: * Sets the implementation class for the backend task list storage.
0196: * WebRobot uses the TaskList interface to store URLs that have
0197: * been retrieved before.
0198: *
0199: * If you want to use your own TaskList implementation, just call
0200: * this method.
0201: *
0202: * @param visited TaskList to be used for the list of visited URLs
0203: */
0204: public void registerVisitedList(TaskList visited) {
0205: this .visited = visited;
0206: }
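
/*
 * Illustrative sketch (not part of the original API documentation): a custom
 * TaskList backend can be plugged in before starting the robot.
 * "DatabaseTaskList" is a hypothetical implementation of the TaskList
 * interface, used here only for illustration.
 *
 *   WebRobot robot = new WebRobot();
 *   robot.registerToDoList(new DatabaseTaskList("todo"));
 *   robot.registerVisitedList(new DatabaseTaskList("visited"));
 */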
0207:
0208: /**
0209: * @return the start URL for this robot
0210: */
0211: public URL getStartURL() {
0212: return startURL;
0213: }
0214:
0215: /**
0216: * Sets the start URL for this robot
0217: * @param startURL the start URL
0218: */
0219: public void setStartURL(URL startURL) {
0220: String path = startURL.getPath();
0221: this .startURL = startURL;
0222:
0223: // is it a directory ?
0224: if (path.endsWith("/")) {
0225: this .startDir = startURL.getHost() + path;
0226: } else {
0227: int pos = path.lastIndexOf("/");
0228: if (pos < 0) {
0229: // this happens for URLs without a path
0230: this .startDir = startURL.getHost() + "/";
0231: } else {
0232: this .startDir = startURL.getHost()
0233: + path.substring(0, pos + 1);
0234: }
0235: }
0236: }
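
/*
 * Worked example for the startDir computation above (illustrative only,
 * www.example.com is a placeholder host):
 *
 *   setStartURL(new URL("http://www.example.com/docs/index.html"));
 *     -> startDir = "www.example.com/docs/"
 *   setStartURL(new URL("http://www.example.com/docs/"));
 *     -> startDir = "www.example.com/docs/"
 *   setStartURL(new URL("http://www.example.com"));
 *     -> startDir = "www.example.com/"
 */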
0237:
0238: /**
0239: * @return the maximal allowed search depth
0240: */
0241: public int getMaxDepth() {
0242: return maxDepth;
0243: }
0244:
0245: /**
0246: * sets the maximal search depth
0247: * @param maxDepth
0248: */
0249: public void setMaxDepth(int maxDepth) {
0250: this .maxDepth = maxDepth;
0251: }
0252:
0253: /**
0254: * Get the value of bandwidth of the used HttpTool
0255: * @return value of bandwidth.
0256: */
0257: public int getBandwidth() {
0258: return httpTool.getBandwidth();
0259: }
0260:
0261: /**
0262: * Set the value of bandwidth of the used HttpTool
0263: * @param bandwidth Value to assign to bandwidth.
0264: */
0265: public void setBandwidth(int bandwidth) {
0266: httpTool.setBandwidth(bandwidth);
0267: }
0268:
0269: /**
0270: * gets the WalkToOtherHosts status
0271: * @return true if the Robot is allowed to travel to other
0272: * hosts than the start host, false otherwise
0273: */
0274: public boolean getWalkToOtherHosts() {
0275: return walkToOtherHosts;
0276: }
0277:
0278: /**
0279: * sets the WalkToOtherHosts status
0280: * @param walkToOtherHosts true if the Robot is allowed to travel to other
0281: * hosts than the start host, false otherwise
0282: */
0283: public void setWalkToOtherHosts(boolean walkToOtherHosts) {
0284: this .walkToOtherHosts = walkToOtherHosts;
0285: }
0286:
0287: /**
0288: * gets the AllowWholeHost value
0289: * @return true if the Robot is allowed to travel to the whole
0290: * host where it started from, false otherwise. If false, it is only
0291: * allowed to travel to URLs below the start URL
0292: */
0293: public boolean getAllowWholeHost() {
0294: return allowWholeHost;
0295: }
0296:
0297: /**
0298: * sets the AllowWholeHost status
0299: * @param allowWholeHost if true, the Robot is allowed to
0300: * travel to the whole host where it started from. Otherwise it is only
0301: * allowed to travel to URLs below the start URL.
0302: */
0303: public void setAllowWholeHost(boolean allowWholeHost) {
0304: this .allowWholeHost = allowWholeHost;
0305: }
0306:
0307: /**
0308: * Gets the AllowWholeDomain value.
0309: * @return true if the Robot is allowed to travel to the whole
0310: * domain of the start host, false otherwise.
0311: * @see #setAllowWholeDomain(boolean)
0312: */
0313: public boolean getAllowWholeDomain() {
0314: return allowWholeDomain;
0315: }
0316:
0317: /**
0318: * Sets the AllowWholeDomain status
0319: * @param allowWholeDomain if true, the Robot is allowed to travel
0320: * to all hosts in the same domain as the starting host. E.g. if you
0321: * start at www.apache.org, it is also allowed to travel to
0322: * jakarta.apache.org, xml.apache.org ...
0323: */
0324: public void setAllowWholeDomain(boolean allowWholeDomain) {
0325: this .allowWholeDomain = allowWholeDomain;
0326: }
0327:
0328: /**
0329: * Gets the state of flexible host checking (enabled or disabled).
0330: *
0331: * To find out if a new URL is on the same host, the robot usually
0332: * compares the host part of both. Some web servers have an inconsistent
0333: * addressing scheme and use the hostname www.domain.com and domain.com.
0334: * With flexible host check enabled, the robot will consider both
0335: * hosts as equal.
0336: *
0337: * @return true, if flexible host checking is enabled
0338: */
0339: public boolean getFlexibleHostCheck() {
0340: return flexibleHostCheck;
0341: }
0342:
0343: /**
0344: * Defines if the host test should be more flexible.
0345: *
0346: * To find out if a new URL is on the same host, the robot usually
0347: * compares the host part of both. Some web servers have an inconsistent
0348: * addressing scheme and use the hostname www.domain.com and domain.com.
0349: * With flexible host check enabled, the robot will consider both
0350: * hosts as equal.
0351: *
0352: * @param flexibleHostCheck set this true, to enable flexible host checking
0353: * (disabled by default)
0354: */
0355: public void setFlexibleHostCheck(boolean flexibleHostCheck) {
0356: this .flexibleHostCheck = flexibleHostCheck;
0357: }
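
/*
 * Illustrative configuration sketch (assumed values, not taken from the
 * original source): restrict the crawl to the start host and its
 * sub-domains while treating "www.example.com" and "example.com" as the
 * same host.
 *
 *   WebRobot robot = new WebRobot();
 *   robot.setWalkToOtherHosts(false);  // never leave the start site
 *   robot.setAllowWholeHost(true);     // whole start host, not only below the start URL
 *   robot.setAllowWholeDomain(true);   // sub-domains of the start host as well
 *   robot.setFlexibleHostCheck(true);  // ignore a leading "www."
 */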
0358:
0359: /**
0360: * Gets the AllowCaching value.
0361: * @return true if the Robot is allowed to cache documents in the
0362: * docManager
0363: * @see #setAllowCaching(boolean)
0364: */
0365: public boolean getAllowCaching() {
0366: return allowCaching;
0367: }
0368:
0369: /**
0370: * Sets the AllowCaching status
0371: *
0372: * @param allowCaching if true, the Robot is allowed to use
0373: * cached documents. That means it will first try to get the document
0374: * from the docManager cache and will only retrieve it if it is
0375: * not found in the cache. If the cache returns a document, the robot
0376: * will NEVER retrieve it again. Therefore, expiration mechanisms have
0377: * to be included in the HttpDocManager method retrieveFromCache.
0378: * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
0379: */
0380: public void setAllowCaching(boolean allowCaching) {
0381: this .allowCaching = allowCaching;
0382: }
0383:
0384: /**
0385: * @return the document manager of this robot
0386: * @see HttpDocManager
0387: */
0388: public HttpDocManager getDocManager() {
0389: return docManager;
0390: }
0391:
0392: /**
0393: * Sets the document manager for this robot <br />
0394: * Without a document manager, the robot will travel through the web but
0395: * will not do anything with the retrieved documents (it simply forgets
0396: * them).
0397: * A document manager can store them, extract information or
0398: * whatever you like.
0399: * There can be only one document manager, but you are free to combine
0400: * functionalities of available document managers in a new object (e.g.
0401: * to store the document and extract meta informations).
0402: * @param docManager
0403: */
0404: public void setDocManager(HttpDocManager docManager) {
0405: this .docManager = docManager;
0406: }
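
/*
 * Illustrative sketch (not from the original source): a document manager
 * receives every retrieved document. "DiskDocManager" is a hypothetical
 * HttpDocManager implementation used only for illustration.
 *
 *   HttpDocManager manager = new DiskDocManager("/tmp/spider");
 *   robot.setDocManager(manager);
 *   robot.setAllowCaching(true); // reuse documents already stored in the manager
 */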
0407:
0408: /**
0409: * Sets the CookieManager used by the HttpTool
0410: * By default a MemoryCookieManager will be used, but you can
0411: * use this method to use your own CookieManager implementation.
0412: *
0413: * @param cm an object that implements the CookieManager interface
0414: */
0415: public void setCookieManager(CookieManager cm) {
0416: httpTool.setCookieManager(cm);
0417: }
0418:
0419: /**
0420: * Gets the CookieManager used by the HttpTool
0421: *
0422: * @return the CookieManager that will be used by the HttpTool
0423: */
0424: public CookieManager getCookieManager() {
0425: return httpTool.getCookieManager();
0426: }
0427:
0428: /**
0429: * Sets the DownloadRuleSet
0430: * @param rules the download rule set to use
0431: */
0432: public void setDownloadRuleSet(DownloadRuleSet rules) {
0433: httpTool.setDownloadRuleSet(rules);
0434: }
0435:
0436: /**
0437: * Sets the URLCheck for this robot
0438: * @param check
0439: */
0440: public void setURLCheck(URLCheck check) {
0441: this .urlCheck = check;
0442: }
0443:
0444: /**
0445: * sets a proxy to use
0446: * @param proxyDescr the Proxy definition in the format host:port
0447: */
0448: public void setProxy(String proxyDescr) throws HttpException {
0449: httpTool.setProxy(proxyDescr);
0450: }
0451:
0452: /**
0453: * @return the current proxy setting in the format host:port
0454: */
0455: public String getProxy() {
0456: return httpTool.getProxy();
0457: }
0458:
0459: /**
0460: * @return the Referer setting for the first HTTP request
0461: */
0462: public String getStartReferer() {
0463: return startReferer;
0464: }
0465:
0466: /**
0467: * sets the Referer setting for the first HTTP request
0468: * @param startReferer a URL (e.g. http://www.matuschek.net)
0469: */
0470: public void setStartReferer(String startReferer) {
0471: this .startReferer = startReferer;
0472: }
0473:
0474: /**
0475: * should we ignore robots.txt Robot Exclusion protocol ?
0476: * @param ignoreRobotsTxt if set to true, the robot will ignore
0477: * the settings of the /robots.txt file on the webserver
0478: * <b>Know what you are doing if you change this setting</b>
0479: */
0480: public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
0481: this.ignoreRobotsTxt = ignoreRobotsTxt; robCheck.setIgnore(ignoreRobotsTxt); // also store the flag so getIgnoreRobotsTxt() reflects it
0482: }
0483:
0484: /**
0485: * @return the sleeptime setting
0486: */
0487: public int getSleepTime() {
0488: return sleepTime;
0489: }
0490:
0491: /**
0492: * set the sleep time<br />
0493: * after every retrieved document the robot will wait this time
0494: * before getting the next document. This allows it to limit the
0495: * load on the server
0496: * @param sleepTime wait time in seconds
0497: */
0498: public void setSleepTime(int sleepTime) {
0499: this .sleepTime = sleepTime;
0500: }
0501:
0502: /**
0503: * sets the From: HTTP header<br />
0504: * this should be a valid email address. It is not needed for the robot,
0505: * but you should use it, because the administrator of the web server
0506: * can contact you if the robot is doing things that he doesn't want
0507: * @param fromAddress an RFC 822 email address
0508: */
0509: public void setFromAddress(String fromAddress) {
0510: httpTool.setFromAddress(fromAddress);
0511: }
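
/*
 * Illustrative "polite robot" setup (assumed values): identify the robot,
 * give the server administrator a contact address and limit the load on
 * the server by sleeping between requests.
 *
 *   robot.setAgentName("MyRobot/1.0 (+http://www.example.com/robot.html)");
 *   robot.setFromAddress("webmaster@example.com");
 *   robot.setSleepTime(2); // wait two seconds between downloads
 */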
0512:
0513: /**
0514: * sets the list of form handlers
0515: * @see net.matuschek.html.FormHandler for more
0516: * information about form handlers
0517: */
0518: public void setFormHandlers(Vector handlers) {
0519: formFiller.setFormHandlers(handlers);
0520: if (handlers != null && handlers.size() > 0) {
0521: hasFormHandlers = true;
0522: }
0523: }
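
/*
 * Illustrative sketch (the construction of a FormHandler is assumed, see
 * net.matuschek.html.FormHandler for the real API): form handlers tell the
 * robot how to fill out and submit forms it encounters while spidering.
 *
 *   Vector handlers = new Vector();
 *   handlers.add(myLoginFormHandler); // a previously configured FormHandler
 *   robot.setFormHandlers(handlers);
 */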
0524:
0525: /**
0526: * @return the list of form handlers
0527: * @see net.matuschek.html.FormHandler for more information
0528: * about form handlers
0529: */
0530: public Vector getFormHandlers() {
0531: return formFiller.getFormHandlers();
0532: }
0533:
0534: /**
0535: * Gets the name of the "User-Agent" header that the robot will use
0536: * @return the user agent name
0537: */
0538: public String getAgentName() {
0539: if (httpTool != null) {
0540: return httpTool.getAgentName();
0541: } else {
0542: return null;
0543: }
0544: }
0545:
0546: /**
0547: * sets the agent name (User-Agent header) for this robot
0548: * @param name a name for this robot
0549: * (e.g. "Mozilla 4.0 (compatible; Robot)")
0550: */
0551: public void setAgentName(String name) {
0552: httpTool.setAgentName(name);
0553: // robCheck = new NoRobots(ROBOT_NAME, httpTool);
0554: robCheck = new NoRobots(name, httpTool);
0555: }
0556:
0557: /**
0558: * Gets the timeout for getting data in seconds of the used HttpTool
0559: * @return the value of the socket timeout
0560: * @see #setTimeout(int)
0561: */
0562: public int getTimeout() {
0563: if (httpTool != null) {
0564: return httpTool.getTimeout();
0565: } else {
0566: return -1;
0567: }
0568: }
0569:
0570: /**
0571: * Sets the timeout for getting data. If HttpTool can't read data from a
0572: * remote web server after this number of seconds it will stop the download
0573: * of the current file
0574: * @param timeout Timeout in seconds
0575: */
0576: public void setTimeout(int timeout) {
0577: httpTool.setTimeout(timeout);
0578: }
0579:
0580: /**
0581: * Gets the ntlmAuthentication of the robot
0582: * @return the ntlmAuthentication
0583: */
0584: public NTLMAuthorization getNtlmAuthorization() {
0585: if (httpTool != null) {
0586: return httpTool.getNtlmAuthorization();
0587: } else {
0588: return null;
0589: }
0590: }
0591:
0592: /**
0593: * sets an NTLM authorization for this robot
0594: * @param ntlmAuthorization the NTLM authorization for this robot
0595: */
0596: public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
0597: httpTool.setNtlmAuthorization(ntlmAuthorization);
0598: }
0599:
0600: /**
0601: * Gets the setting of the IgnoreRobotsTxt property
0602: * @return true if robots.txt will be ignored, false otherwise
0603: */
0604: public boolean getIgnoreRobotsTxt() {
0605: return ignoreRobotsTxt;
0606: }
0607:
0608: /**
0609: * Gets a vector of URLs that can be visited more than once
0610: * @return a vector containing URLs formatted as Strings
0611: */
0612: public Vector getVisitMany() {
0613: return visitMany;
0614: }
0615:
0616: public void setVisitMany(Vector visitMany) {
0617: this .visitMany = visitMany;
0618: }
0619:
0620: public void setHttpToolCallback(HttpToolCallback callback) {
0621: httpTool.setCallback(callback);
0622: }
0623:
0624: public WebRobotCallback getWebRobotCallback() {
0625: return webRobotCallback;
0626: }
0627:
0628: public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
0629: this .webRobotCallback = webRobotCallback;
0630: }
0631:
0632: /**
0633: * Sets the sleep status for this robot. If a WebRobot is set to sleep
0634: * after starting run(), it will pause after retrieving the current document
0635: * and wait until setSleep(false) is called
0636: */
0637: public void setSleep(boolean sleep) {
0638: this .sleep = sleep;
0639: }
0640:
0641: /**
0642: * Is the robot sleeping ?
0643: */
0644: public boolean isSleeping() {
0645: return this .sleep;
0646: }
0647:
0648: /**
0649: * Set the list of allowed URLs
0650: * @param allowed a Vector containing Strings. URLs are allowed
0651: * if they begin with a string in this vector
0652: */
0653: public void setAllowedURLs(Vector allowed) {
0654: this .allowedURLs = allowed;
0655: }
0656:
0657: /**
0658: * Gets the list of allowed URLs
0659: * @return a Vector containing Strings
0660: * @see #setAllowedURLs(Vector)
0661: */
0662: public Vector getAllowedURLs() {
0663: return this .allowedURLs;
0664: }
0665:
0666: /**
0667: * Enable/disable cookies
0668: * @param enable if true, HTTP cookies will be enabled, if false
0669: * the robot will not use cookies
0670: */
0671: public void setEnableCookies(boolean enable) {
0672: httpTool.setEnableCookies(enable);
0673: }
0674:
0675: /**
0676: * Get the status of the cookie engine
0677: * @return true, if HTTP cookies are enabled, false otherwise
0678: */
0679: public boolean getEnableCookies() {
0680: return httpTool.getEnableCookies();
0681: }
0682:
0683: /**
0684: * Set the maximum age of documents to retrieve to this number
0685: * of seconds
0686: * @param maxAge integer value of the maximum document age
0687: * (in seconds), negative value means no limit.
0688: */
0689: public void setMaxDocumentAge(long maxAge) {
0690: this .maxDocumentAge = maxAge;
0691: }
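
/*
 * Illustrative example (assumed value): with a maximum document age of one
 * hour, a cached copy older than 3600 seconds is re-fetched (using an
 * If-Modified-Since request), while younger copies are reused as they are.
 *
 *   robot.setMaxDocumentAge(3600);
 */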
0692:
0693: /**
0694: * Gets the maximum age of documents to retrieve
0695: * @return maximum document age (in seconds), negative value means
0696: * no limit.
0697: */
0698: public long getMaxDocumentAge() {
0699: return this .maxDocumentAge;
0700: }
0701:
0702: /**
0703: * Sets a FilterChain. If the WebRobot uses a FilterChain it will
0704: * process every retrieved document with this FilterChain before
0705: * storing it
0706: *
0707: * @param filters a FilterChain to use for filtering HttpDocs
0708: */
0709: public void setFilters(FilterChain filters) {
0710: this .filters = filters;
0711: }
0712:
0713: /**
0714: * Delete all cookies
0715: */
0716: public void clearCookies() {
0717: httpTool.clearCookies();
0718: }
0719:
0720: /**
0721: * thread run() method, simply calls work()
0722: * @see #work()
0723: */
0724: public void run() {
0725: work();
0726: }
0727:
0728: /**
0729: * do your job: travel through the web using the configured
0730: * parameters and retrieve documents
0731: */
0732: public void work() {
0733: RobotTask task = createRobotTask(startURL, maxDepth,
0734: startReferer);
0735: todo.add(task);
0736: walkTree();
0737: // ok, we did it, clean up dynamic data (the visited vector)
0738: cleanUp();
0739: log.info("Documents retrieved by: Web=" + countWeb + " Cache="
0740: + countCache + " Refresh=" + countRefresh
0741: + " NoRefresh=" + countNoRefresh);
0742: }
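
/*
 * Illustrative overall usage sketch (assumed URL and document manager):
 * configure the robot, then either call run()/work() directly or start it
 * in its own thread, since WebRobot implements Runnable.
 *
 *   WebRobot robot = new WebRobot();
 *   robot.setStart("http://www.example.com/docs/");
 *   robot.setMaxDepth(3);
 *   robot.setDocManager(myDocManager); // some HttpDocManager implementation
 *   new Thread(robot).start();         // or simply robot.run()
 */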
0743:
0744: /**
0745: * stop the current robot run
0746: * note that this will not abort the current download but stop after
0747: * the current download has finished
0748: */
0749: public void stopRobot() {
0750: stopIt = true;
0751: }
0752:
0753: /**
0754: * Holds information about memory status.
0755: * @see #handleMemoryError(OutOfMemoryError)
0756: */
0757: private int memoryLevel = 0;
0758:
0759: /** Can new tasks be added? (may depend on memoryLevel) */
0760: protected boolean activatedNewTasks = true;
0761:
0762: /** Are visited URLs collected? (may depend on memoryLevel) */
0763: protected boolean activatedUrlHistory = true;
0764:
0765: /** Are visited contents collected? (may depend on memoryLevel) */
0766: protected boolean activatedContentHistory = true;
0767:
0768: /** memory buffer of 200 KB to be freed in case of urgent memory needs */
0769: private byte memoryBuffer[] = new byte[200 * 1024];
0770:
0771: /**
0772: * do your job: process the task list until it is empty or the robot is stopped
0773: */
0774:
0775: public void walkTree() {
0776: while ((todo.size() > 0) && (!stopIt)) {
0777: RobotTask task;
0778: synchronized (visited) {
0779: task = todo.removeFirst();
0780: if (visited.contains(task)
0781: && (!visitMany.contains(task.getUrl()
0782: .toString()))) {
0783: log.debug("already visited: " + task.getUrl());
0784: continue;
0785: }
0786: if (activatedUrlHistory) {
0787: visited.add(task);
0788: }
0789: }
0790:
0791: boolean repeat = true;
0792: while (repeat) {
0793: try {
0794: retrieveURL(task);
0795: repeat = false;
0796: } catch (OutOfMemoryError memoryError) {
0797: handleMemoryError(memoryError);
0798: }
0799: }
0800:
0801: // sleep, if sleep is set to true
0802: while (sleep) {
0803: // callback
0804: if (webRobotCallback != null) {
0805: webRobotCallback.webRobotSleeping(true);
0806: }
0807:
0808: try {
0809: Thread.sleep(1000);
0810: } catch (InterruptedException e) {
0811: }
0813: }
0814:
0815: // callback
0816: if (webRobotCallback != null) {
0817: webRobotCallback.webRobotSleeping(false);
0818: }
0819:
0820: // callback
0821: if (webRobotCallback != null) {
0822: webRobotCallback.webRobotUpdateQueueStatus(todo.size());
0823: }
0824: spawnThread();
0825: }
0826:
0827: // callback
0828: if (webRobotCallback != null) {
0829: finishThreads();
0830: }
0831: }
0832:
0833: /**
0834: * Implements OutOfMemory handling strategies.
0835: * Action depends on memoryLevel
0836: * @param memoryError
0837: * @throws OutOfMemoryError
0838: */
0839: protected void handleMemoryError(OutOfMemoryError memoryError)
0840: throws OutOfMemoryError {
0841: memoryLevel++;
0842: log.error("OutOfMemoryError level=" + memoryLevel
0843: + "! (visited=" + visited.size() + ", todo="
0844: + todo.size() + ")");
0845: switch (memoryLevel) {
0846: case 1:
0847: // don't remember visited URLs and contents any more
0848: // and try it again
0849: visited.clear();
0850: activatedUrlHistory = false;
0851: content2UrlMap.clear();
0852: activatedContentHistory = false;
0853: System.gc();
0854: break;
0855: case 2:
0856: // stop adding new Tasks, just process todo-list.
0857: // free memory buffer
0858: // and try it again
0859: activatedNewTasks = false;
0860: memoryBuffer = null;
0861: System.gc();
0862: break;
0863: case 3:
0864: // there is nothing we can do any more.
0865: // throw exception to stop robot
0866: throw memoryError;
0867: default:
0868: // Should never be reached.
0869: if (memoryBuffer != null) {
0870: // avoid removal of memoryBuffer by compiler
0871: System.err.println(memoryBuffer[0]);
0872: }
0873: throw memoryError;
0874: }
0875: }
0876:
0877: /**
0878: * calls webRobotDone and finishes docManager if
0879: * executed in mainThread
0880: */
0881: protected void finishThreads() {
0882: webRobotCallback.webRobotDone();
0883: if (docManager != null) {
0884: docManager.finish();
0885: }
0886: }
0887:
0888: /**
0889: * Start subThreads for spidering.
0890: * WARNING: Should only be implemented and used for local
0891: * spidering purposes!
0892: */
0893: protected synchronized void spawnThread() {
0894: }
0895:
0896: /** counter for calls of retrieveURL */
0897: protected int iteration = 0;
0898:
0899: /**
0900: * retrieve the next URL, save it, extract all included links and
0901: * add those links to the tasks list
0902: * @param task task to retrieve, function does nothing if this is null
0903: */
0904: public void retrieveURL(RobotTask task) {
0905: if (task == null) {
0906: log.debug("Empty task found, ignoring");
0907: return;
0908: }
0909:
0910: long now = System.currentTimeMillis();
0911:
0912: updateProgressInfo();
0913:
0914: URL u = task.getUrl();
0915: String urlString = u.toString();
0916: String referer = task.getReferer();
0917: int depth = task.getMaxDepth();
0918:
0919: if (depth < 0) {
0920: log.info("Max search depth reached");
0921: return;
0922: }
0923:
0924: // we may need this additional check even if we
0925: // tested it during adding to the tasks list
0926: if (!isAllowed(u)) {
0927: log.info("Url '" + u + "' filtered out.");
0928: return;
0929: }
0930:
0931: if (u.getFile().equals("")) {
0932: try {
0933: urlString = urlString + "/";
0934: u = new URL(urlString);
0935: // fix for double retrieved files
0936: task.setUrl(u);
0937: } catch (MalformedURLException e) {
0938: log.error("URL not well formed: " + e.toString());
0939: // use exception handler to handle exception
0940: exceptionHandler.handleException(this , u, e);
0941: return;
0942: }
0943: }
0944:
0945: log.info("retrieving " + urlString);
0946: httpTool.setReferer(referer);
0947:
0948: HttpDoc doc = null;
0949: Vector links = null;
0950: boolean cached = false;
0951:
0952: // look in the cache first, but only for static pages
0953: boolean reScan = true;
0954: if ((docManager != null && allowCaching)
0955: && (task.getMethod() == HttpConstants.GET)
0956: && (task.getParamString() == null)) {
0957: doc = docManager.retrieveFromCache(u);
0958: /* if (doc != null) {
0959: try {
0960: links = ((UrlCollector) docManager).retrieveLinks(doc);
0961: } catch (IOException e) {
0962: log.info("Could not get links for " + u + ": " + e.getMessage());
0963: links = null;
0964: }
0965: }*/
0966:
0967: if (doc != null) {
0968: countCache++;
0969: long lastRetrieved = doc.getDateAsMilliSeconds();
0970: double ageInSeconds = (now - lastRetrieved) / 1000.0;
0971: if (ageInSeconds < 0) {
0972: log.warn("DocumentAge < 0!");
0973: }
0974: reScan = maxDocumentAge >= 0
0975: && ageInSeconds > maxDocumentAge;
0976: if (reScan) {
0977: long lastModified = doc
0978: .getLastModifiedAsMilliSeconds();
0979: Date lastModifiedDate = new Date(lastModified);
0980: httpTool.setIfModifiedSince(lastModifiedDate);
0981: }
0982: } else {
0983: httpTool.setIfModifiedSince(null);
0984: }
0985: }
0986:
0987: // if not found in cache, retrieve from the web page
0988: if (reScan) {
0989: HttpDoc newDoc;
0990: boolean error = false;
0991: try {
0992: if (u.getProtocol().equalsIgnoreCase("file")) {
0993: // retrieve from file
0994: newDoc = retrieveFileURL(u, httpTool
0995: .getIfModifiedSince());
0996: } else {
0997: // retrieve from Web
0998: newDoc = httpTool.retrieveDocument(u, task
0999: .getMethod(), task.getParamString());
1000: if (newDoc != null) {
1001: newDoc.setDate(now);
1002: }
1003: sleepNow();
1004: }
1005:
1006: if (newDoc != null && !newDoc.isNotModified()) {
1007: if (!(newDoc.isOk() || newDoc.isRedirect())) {
1008: error = true;
1009: }
1010: } else {
1011: // (newDoc == null || newDoc.isNotModified()) && doc != null
1012: // -> Not modified
1013: // -> refresh time stamp
1014: if (doc != null) {
1015: doc.setDate(now);
1016: doc.setCached(false);
1017: newDoc = null;
1018: }
1019: }
1020: } catch (HttpException hex) {
1021: error = true;
1022: newDoc = null;
1023: }
1024: if (error) {
1025: int retry = task.retry();
1026: if (retry <= maxRetries) {
1027: synchronized (visited) {
1028: todo.add(task);
1029: visited.remove(task);
1030: }
1031: log.info("Adding " + u + " for retry no. " + retry);
1032: return;
1033: } else {
1034: doc = docManager.retrieveFromCache(u);
1035: if (doc == null) {
1036: log.warn("Unsuccessfull retries for " + u);
1037: return;
1038: } else {
1039: long docDate = doc.getDateAsMilliSeconds();
1040: long age = (now - docDate);
1041: age /= 1000;
1042: if (expirationAge < 0 || age < expirationAge) {
1043: newDoc = doc;
1044: cached = true;
1045: log.info("Cached document not expired: "
1046: + u);
1047: } else {
1048: log.warn("Cached document expired: " + u);
1049: docManager.removeDocument(u);
1050: return;
1051: }
1052: }
1053: }
1054: }
1055:
1056: if (newDoc != null) {
1057: countWeb++;
1058: doc = newDoc;
1059: links = null; // force recalculation of links
1060: countRefresh++;
1061: } else {
1062: cached = true;
1063: countNoRefresh++;
1064: }
1065: } else {
1066: cached = true;
1067: log.debug("Page " + u + " retrieved from cache");
1068: }
1069:
1070: // Add it to the visited vector
1071: // needs to be synchronized with todo-list
1072: // visited.add(task);
1073:
1074: // got a NULL document, that doc was not retrieved
1075: // usually, it was not downloaded because a rule didn't allow
1076: // to download it
1077: if (doc == null) {
1078: log.info("not downloaded " + u);
1079: return;
1080: }
1081:
1082: // Duplicate check
1083: String duplicate = null;
1084: if (duplicateCheck) {
1085: duplicate = getContentVisitedURL(doc);
1086: if (duplicate != null) {
1087: log.info("URLs with same content found: " + urlString
1088: + " = " + duplicate);
1089: } else {
1090: try {
1091: duplicate = docManager.findDuplicate(doc);
1092: if (duplicate != null) {
1093: log
1094: .info("URLs with same content found in cache: "
1095: + urlString + " = " + duplicate);
1096: }
1097: } catch (IOException e) {
1098: e.printStackTrace();
1099: }
1100: }
1101:
1102: if (duplicate != null) {
1103: String pureDuplicate = removeParameters(duplicate);
1104: String pureUrl = removeParameters(urlString);
1105: if (!pureUrl.equals(pureDuplicate) && !cached) {
1106: // different url not yet stored -> store it
1107: try {
1108: // retrieve links from original
1109: HttpDoc linksDoc = docManager
1110: .retrieveFromCache(new URL(duplicate));
1111: if (linksDoc != null) {
1112: doc.setLinks(linksDoc.getLinks());
1113: }
1114: docManager.storeDocument(doc);
1115: } catch (Exception e) {
1116: e.printStackTrace();
1117: }
1118: }
1119: RobotTask newTask;
1120: try {
1121: newTask = createRobotTask(new URL(duplicate),
1122: depth, referer);
1123: // check already here for visited tasks to save memory
1124: if (!visited.contains(newTask)) {
1125: addTask(newTask);
1126: }
1127: } catch (MalformedURLException e) {
1128: e.printStackTrace(); // Can't happen
1129: }
1130: return;
1131: }
1132: }
1133:
1134: // was it an UnAuthorized document ?
1135: if (doc.isUnauthorized()) {
1136: log.info("got HTTP Unauthorized for URL " + u);
1137: }
1138:
1139: if (doc.isOk() || cached) {
1140: // callback
1141: if (webRobotCallback != null) {
1142: int contentLength = 0;
1143: if (doc.getContent() != null) {
1144: contentLength = doc.getContent().length;
1145: }
1146: webRobotCallback.webRobotRetrievedDoc(urlString,
1147: contentLength);
1148: }
1149:
1150: // extract links
1151: try {
1152: if (doc.isHTML() && (depth > 0)) {
1153: // solving encoding problem
1154: // HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
1155: HtmlDocument htmlDoc = null;
1156: HttpHeader contentTypeHeader = doc
1157: .getHeader("Content-type");
1158: if (contentTypeHeader != null) {
1159: String contentType = contentTypeHeader
1160: .getValue();
1161: int index = contentType.toLowerCase().indexOf(
1162: "charset=");
1163: if (index > 0) {
1164: htmlDoc = new HtmlDocument(u, doc
1165: .getContent(), contentType
1166: .substring(index + 8));
1167: } else {
1168: htmlDoc = new HtmlDocument(u, doc
1169: .getContent());
1170: }
1171: } else {
1172: htmlDoc = new HtmlDocument(u, doc.getContent());
1173: }
1174:
1175: // add links
1176:
1177: // this depth-check is critical!
1178: // otherwise far too many RobotTasks will be created
1179: // this will cause a premature OutOfMemoryException!
1180: if (depth > 0) {
1181: if (duplicate != null) {
1182: HttpDoc linksDoc = docManager
1183: .retrieveFromCache(new URL(
1184: duplicate));
1185: doc.setLinks(linksDoc.getLinks());
1186: } else if (cached) {
1187: }
1188: if (links == null) {
1189: links = htmlDoc.getLinks();
1190: doc.setLinks(links);
1191: }
1192: if (duplicate == null) {
1193: HashSet checkedLinks = new HashSet();
1194: for (int i = 0; i < links.size(); i++) {
1195: URL link = (URL) links.elementAt(i);
1196: log.info("Link: " + link);
1197: // check already here for duplicate links to avoid expensive
1198: // creation of RobotTasks
1199: if (!checkedLinks.contains(link)) {
1200: checkedLinks.add(link);
1201: String myReferer = u.toString();
1202: if (u.getUserInfo() != null) {
1203: // remove userinfo from referer
1204: int endindex = myReferer
1205: .indexOf("@") + 1;
1206: myReferer = "http://"
1207: + myReferer
1208: .substring(endindex);
1209: }
1210:
1211: RobotTask newTask = createRobotTask(
1212: (URL) links.elementAt(i),
1213: depth - 1, myReferer);
1214: // check already here for visited tasks to save memory
1215: if (!visited.contains(newTask)) {
1216: // bad workaround to retrieve images first
1217: if (newTask.urlString
1218: .endsWith(".jpg")) {
1219: addTaskAtStart(newTask);
1220: } else {
1221: addTask(newTask);
1222: }
1223: }
1224: }
1225: }
1226: }
1227: }
1228:
1229: if (hasFormHandlers) {
1230: // add forms
1231: Vector forms = htmlDoc.getElements("form");
1232: for (int i = 0; i < forms.size(); i++) {
1233: ExtendedURL eurl = formFiller.fillForm(u,
1234: (Element) forms.elementAt(i));
1235: if (eurl != null) {
1236: RobotTask newTask = createRobotTask(
1237: eurl.getURL(), depth - 1, u
1238: .toString());
1239: newTask
1240: .setParamString(eurl
1241: .getParams());
1242: newTask.setMethod(eurl
1243: .getRequestMethod());
1244: addTask(newTask);
1245: }
1246: }
1247: }
1248:
1249: }
1250: // catch any occurring error to keep on processing
1251: } catch (OutOfMemoryError e) {
1252: throw e;
1253: } catch (Throwable e) {
1254: log.error("Unexpected error while extracting links from URL '"
1255: + u + "': " + e);
1257: e.printStackTrace();
1258: // continue processing
1259: }
1260:
1261: // filter and store the document
1262: if ((docManager != null)) {
1263: try {
1264: if (filters != null) {
1265: doc = filters.process(doc);
1266: } else {
1267: log.debug("No filters defined");
1268: }
1269:
1270: if (isProcessingAllowed(doc)) {
1271: docManager.processDocument(doc);
1272: } else {
1273: String md5 = doc
1274: .getHeaderValue(HttpHeader.CONTENT_MD5);
1275: doc.setContent("Not for indexing".getBytes());
1276: doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
1277: }
1278:
1279: try {
1280: docManager.storeDocument(doc);
1281: } catch (Exception e) {
1282: log.warn("could not store (not for indexing) "
1283: + urlString + ": " + e.getMessage());
1284: }
1285: if (activatedContentHistory && duplicate == null) {
1286: setContentVisitedURL(doc, urlString);
1287: }
1288: } catch (DocManagerException e1) {
1289: log.error("could not process document: "
1290: + e1.getMessage());
1291: exceptionHandler.handleException(this , u, e1);
1292: } catch (FilterException e2) {
1293: log.error(e2.getMessage());
1294: }
1295: }
1296:
1297: } else {
1298: // it was NOT a 200 return code !
1299:
1300: if (doc.isRedirect()) {
1301: String ref = doc.getLocation();
1302: log.info("Got redirect to " + ref);
1303:
1304: try {
1305: URL u2 = new URL(u, ref);
1306: // is it on another host ?
1307:
1308: // On a redirect, browsers use the old Referer instead of the
1309: // URL that got this redirect
1310: // Therefore we do not use u.toString as Referer but the old Referer
1311: RobotTask newTask = createRobotTask(u2, depth - 1,
1312: referer);
1313:
1314: // it will be inserted at the beginning of the vector !
1315: addTaskAtStart(newTask);
1316: } catch (MalformedURLException e) {
1317: // ignore this URL
1318: }
1319: // handle other values
1320: } else if (doc.isNotFound()) {
1321: // the document was not found
1322: exceptionHandler.handleException(this , u,
1323: new HttpException("Document not found"));
1324: } else if (doc.isUnauthorized()) {
1325: // access to the document was not authorized
1326: exceptionHandler.handleException(this , u,
1327: new HttpException(
1328: "No authorization for the document."));
1329: } else {
1330: // another error occurred.
1331: exceptionHandler.handleException(this , u,
1332: new HttpException(
1333: "Unknown document error (Http return code "
1334: + doc.getHttpCode() + ")."));
1335: }
1336: }
1337: }
1338:
1339: /**
1340: * Inform about spidering progress.
1341: * May use iteration, startTime,
1342: * countCache, countWeb, countRefresh, countNoRefresh
1343: */
1344: public void updateProgressInfo() {
1345: }
1346:
1347: /**
1348: * sleep for sleepTime seconds.
1349: */
1350: public void sleepNow() {
1351: if (sleepTime > 0) {
1352: synchronized (this ) {
1353: if (webRobotCallback != null) {
1354: webRobotCallback.webRobotSleeping(true);
1355: }
1356:
1357: try {
1358: Thread.sleep(sleepTime * 1000);
1359: } catch (InterruptedException e) {
1360: }
1361:
1362: if (webRobotCallback != null) {
1363: webRobotCallback.webRobotSleeping(false);
1364: }
1365: }
1366: }
1367: }
1368:
1369: /**
1370: * retrieves a file from the local file system.
1371: * @param url the url of the file to retrieve
1372: * @return HttpDoc containing the content and mime type
1373: */
1374: private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince)
1375: throws HttpException {
1376: HttpDoc doc = new HttpDoc();
1377:
1378: try {
1379: String host = url.getHost();
1380: String filename = url.getFile();
1381: if ((host == null) || (host.equals(""))) {
1382: // local file
1383: // remove leading / or \
1384: if ((filename.startsWith("\\"))
1385: || (filename.startsWith("/"))) {
1386: filename = filename.substring(1);
1387: }
1388: } else {
1389: filename = "//" + host + filename;
1390: }
1391: // get the mimetype and put in the http header
1392: String mimetypestr = getMimeTypeForFilename(filename);
1393: if (mimetypestr != null) {
1394: HttpHeader header = new HttpHeader("content-type",
1395: mimetypestr);
1396: doc.addHeader(header);
1397: }
1398:
1399: // get the content from the file
1400: File file = new File(filename);
1401: if (!file.exists()) {
1402: doc.setHttpCode("httpcode "
1403: + HttpConstants.HTTP_NOTFOUND);
1404: return doc;
1405: }
1406: long fileLastModified = file.lastModified();
1407: long ifModifiedSinceTime = ifModifiedSince == null ? 0
1408: : ifModifiedSince.getTime();
1409: if (fileLastModified > ifModifiedSinceTime) {
1410: byte[] content = readFileToByteArray(file);
1411: doc.setContent(content);
1412: doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
1413: } else {
1414: doc.setHttpCode("httpcode "
1415: + HttpConstants.HTTP_NOTMODIFIED);
1416: }
1417: doc.setLastModified(fileLastModified);
1418: doc.setDate(System.currentTimeMillis());
1419: doc.setURL(url);
1420:
1421: return doc;
1422: } catch (Exception e) {
1423: throw new HttpException(e.getMessage());
1424: }
1425: }
1426:
1427: /**
1428: * Get the Mime type for the given filename.
1429: * @param filename
1430: * @return Mime type
1431: */
1432: protected String getMimeTypeForFilename(String filename) {
1433: if (filename.endsWith(".html") || filename.endsWith(".htm")) {
1434: return "text/html";
1435: } else {
1436: return null;
1437: }
1438: }
1439:
1440: /**
1441: * Clean up temporary data
1442: */
1443: protected void cleanUp() {
1444: stopIt = false;
1445: visited.clear();
1446: todo.clear();
1447: }
1448:
1449: /**
1450: * adds a new task to the task list, but only if the checks in taskAddAllowed() pass and adding new tasks is activated
1451: */
1452: protected void addTask(RobotTask task) {
1453: if (taskAddAllowed(task) && activatedNewTasks) {
1454: todo.add(task);
1455: }
1456: }
1457:
1458: /**
1459: * adds a new task at the beginning of the task list
1460: * @see #addTask(RobotTask)
1461: */
1462: protected void addTaskAtStart(RobotTask task) {
1463: if (taskAddAllowed(task) && activatedNewTasks) {
1464: todo.addAtStart(task);
1465: }
1466: }
1467:
1468: /**
1469: * Checks if a task should be added to the task list
1470: * @param task the task to check
1471: * @return true if this tasks can be added to the task list,
1472: * false otherwise
1473: */
1474: protected boolean taskAddAllowed(RobotTask task) {
1475: if (task == null) {
1476: log.info("Null task not allowed");
1477: return false;
1478: }
1479:
1480: if (!isAllowed(task.getUrl())) {
1481: return false;
1482: }
1483:
1484: if (todo.contains(task)) {
1485: return false;
1486: }
1487:
1488: return true;
1489: }
1490:
1491: /**
1492: * Is it allowed to travel to this new URL ?
1493: * @param u the URL to test
1494: * @return true if traveling to this URL is allowed, false otherwise
1495: */
1496: protected boolean isAllowed(URL u) {
1497:
1498: // do the basic checks
1499: if (basicURLCheck(u)) {
1500:
1501: // if we have an URLCheck then test this URL against it
1502: if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
1503: log.debug("not allowed by URLCheck:" + u);
1504: return false;
1505: }
1506:
1507: if (robCheck.ok(u)) {
1508: return true;
1509: } else {
1510: log.debug("not allowed by robots.txt:" + u);
1511: return false;
1512: }
1513: }
1514: return false;
1515: }
1516:
1517: /**
1518: * Is it allowed to process this document ?
1519: * @param doc the document to check
1520: * @return true if processing of this URL is allowed
1521: */
1522: protected boolean isProcessingAllowed(HttpDoc doc) {
1523: URL u = doc.getURL();
1524: if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
1525: log.debug("processing not allowed by URLCheck:" + u);
1526: return false;
1527: }
1528:
1529: DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
1530: if (downloadRuleSet != null
1531: && !downloadRuleSet
1532: .processAllowed(doc.getHttpHeaders())) {
1533: log.debug("processing not allowed by DownloadRuleSet:" + u);
1534: return false;
1535: }
1536:
1537: return true;
1538: }
1539:
1540: /**
1541: * Basic URL allow check
1542: * it is allowed to walk to a new URL if <ul>
1543: * <li>WalkToOtherHost is true. In this case there will be no additional
1544: * tests.</li>
1545: * <li>The new URL is located below the start URL, e.g. is the start URL
1546: * is http://localhost/test, the URL http://localhost/test/index.html
1547: * is allowed, but http://localhost/ is not allowed.</li>
1548: * <li>AllowWholeHost is true and the new URL is located on the same host
1549: * as the start URL.</li>
1550: * <li>FlexibleHostCheck is true and the host part of the current URL
1551: * is equal to the host part of the start URL modulo the prefix "www."
1552: * </li>
1553: * <li>The URL starts with a string in the "AllowedURLs" list.</li>
1554: * </ul>
1555: */
1556: protected boolean basicURLCheck(URL currURL) {
1557: String currURLStr = currURL.getHost() + currURL.getPath();
1558: String currHost = currURL.getHost().toLowerCase();
1559: String startHost = startURL.getHost().toLowerCase();
1560:
1561: // no more checks, if walkToOtherHosts is true
1562: if (walkToOtherHosts) {
1563: return true;
1564: }
1565:
1566: // new URL below start URL ?
1567: if (currURLStr.startsWith(startDir)) {
1568: return true;
1569: }
1570:
1571: // on the same host ?
1572: if (allowWholeHost
1573: && (currURL.getHost().equalsIgnoreCase(startURL
1574: .getHost()))) {
1575: return true;
1576: }
1577:
1578: // on the same host with flexible test (host name with and without "www.")
1579: if (flexibleHostCheck) {
1580: if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
1581: return true;
1582: }
1583: }
1584:
1585: // allow whole domain ?
1586: if (allowWholeDomain) {
1587: if (currHost.endsWith(getDomain(startHost))) {
1588: return true;
1589: }
1590: }
1591:
1592: // in the list of allowed URLs ?
1593: for (int i = 0; i < allowedURLs.size(); i++) {
1594: String s = (String) allowedURLs.elementAt(i);
1595: if (currURLStr.startsWith(s)) {
1596: return true;
1597: }
1598: }
1599: log.debug("URL " + currURLStr + " not allowed");
1600: return false;
1601: }
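
/*
 * Worked example for the rules above (hypothetical URLs): with a start URL
 * of http://www.example.com/docs/ and the default settings
 * (walkToOtherHosts=false, allowWholeHost=true, allowWholeDomain=true):
 *
 *   http://www.example.com/docs/a.html  -> allowed (below the start URL)
 *   http://www.example.com/other/b.html -> allowed (same host)
 *   http://ftp.example.com/c.html       -> allowed (same domain "example.com")
 *   http://www.elsewhere.org/d.html     -> rejected unless listed in allowedURLs
 */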
1602:
1603: /**
1604: * remove a leading www. from a given hostname
1605: *
1606: * @param hostname some hostname
1607: * @return the hostname if it doesn't start with "www." otherwise
1608: * the hostname without the leading www.
1609: */
1610: private String cutWWW(String hostname) {
1611: if (hostname.toLowerCase().startsWith("www.")) {
1612: return hostname.substring(4);
1613: } else {
1614: return hostname;
1615: }
1616: }
1617:
1618: /**
1619: * Gets the domain name of a given host (just deletes everything
1620: * up to and including the first ".")
1621: *
1622: * @param hostname some hostname
1623: * @return the domain part of this hostname
1624: */
1625: private String getDomain(String hostname) {
1626: int pos = hostname.indexOf(".");
1627: if (pos < 0) {
1628: // this should not happen !
1629: return hostname;
1630: } else {
1631: return hostname.substring(pos + 1);
1632: }
1633: }
1634:
1635: /**
1636: * Method getExceptionHandler.
1637: * @return RobotExceptionHandler the exception handler of the robot
1638: */
1639: public RobotExceptionHandler getExceptionHandler() {
1640: return exceptionHandler;
1641: }
1642:
1643: /**
1644: * Method setExceptionHandler.
1645: * sets the exception handler of the robot
1646: * @param newExceptionHandler the new exception handler
1647: */
1648: public void setExceptionHandler(
1649: RobotExceptionHandler newExceptionHandler) {
1650: if (newExceptionHandler != null) {
1651: exceptionHandler = newExceptionHandler;
1652: }
1653: }
1654:
1655: /**
1656: * Method setStart.
1657: * sets the start URL
1658: * @param startURL the start URL as String
1659: */
1660: public void setStart(String startURL) {
1661: try {
1662: setStartURL(new URL(startURL));
1663: } catch (MalformedURLException e) {
1664: e.printStackTrace();
1665: }
1666: }
1667:
1668: /**
1669: * Method getStart.
1670: * gets the start url as string
1671: * @return String
1672: */
1673: public String getStart() {
1674: URL url = getStartURL();
1675: if (url != null) {
1676: return url.toExternalForm();
1677: } else {
1678: return null;
1679: }
1680: }
1681:
1682: /**
1683: * This method finishes HttpTool, NoRobots, HttpDocManager.
1684: */
1685: public void finish() {
1686: if (httpTool != null) {
1687: httpTool.finish();
1688: }
1689: if (robCheck != null) {
1690: robCheck.finish();
1691: }
1692: if (docManager != null) {
1693: docManager.finish();
1694: }
1695: }
1696:
1697: public static void main(String[] args) {
1698: if (args.length > 0)
1699: System.err.println("Arguments will be ignored!");
1700: Field[] fields = WebRobot.class.getDeclaredFields();
1701: StringBuffer str = new StringBuffer(60);
1702: for (int i = 0; i < fields.length; i++) {
1703: if (!Modifier.isFinal(fields[i].getModifiers())
1704: && !Modifier.isStatic(fields[i].getModifiers())) {
1705: str.delete(0, str.length());
1706: str.append(" robot." + fields[i].getName() + " = "
1707: + fields[i].getName() + ";");
1708: while (str.length() < 50) {
1709: str.append(" ");
1710: }
1711: System.out.println(str.toString() + "// ("
1712: + fields[i].getType().getName() + ")");
1713: }
1714: }
1715: }
1716:
1717: /** default expected count of documents */
1718: private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
1719:
1720: /** expected count of documents */
1721: protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
1722:
1723: /** remember visited content here (md5, urlString) */
1724: protected HashMap content2UrlMap;
1725:
1726: /** counter for pages that were found in cache */
1727: long countCache = 0;
1728:
1729: /** counter for pages retrieved by web */
1730: long countWeb = 0;
1731:
1732: /** counter for pages that didn't need a refresh */
1733: long countNoRefresh = 0;
1734:
1735: /** counter for refreshed pages (=cache+web) */
1736: long countRefresh = 0;
1737:
1738: /**
1739: * Method getContentVisitedURL.
1740: * Checks if the content was visited before and retrieves the corresponding URL.
1741: * @param doc the document whose content is checked
1742: * @return found url or null if not found
1743: */
1744: public String getContentVisitedURL(HttpDoc doc) {
1745: Object key = doc.getContentMD5();
1746: synchronized (content2UrlMap) {
1747: String url = (String) content2UrlMap.get(key);
1748: return url;
1749: }
1750: }
1751:
1752: /**
1753: * Method setContentVisitedURL.
1754: * Makes a URL retrievable by its content by entering it in content2UrlMap.
1755: * @param doc the document whose content MD5 is used as the key
1756: * @param url the URL to associate with that content
1757: */
1758: public void setContentVisitedURL(HttpDoc doc, String url) {
1759: Object key = doc.getContentMD5();
1760: synchronized (content2UrlMap) {
1761: content2UrlMap.put(key, url);
1762: }
1763: }
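
/*
 * Illustrative sketch of the duplicate check: after a document has been
 * processed, its content MD5 is mapped to the URL it was first seen at.
 * When another URL later delivers the same content, getContentVisitedURL()
 * returns that first URL and retrieveURL() only records the alias instead
 * of storing the same content twice (see the duplicateCheck flag).
 *
 *   robot.setContentVisitedURL(doc, "http://www.example.com/index.html");
 *   String firstUrl = robot.getContentVisitedURL(sameContentDoc); // non-null for identical content
 */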
1764:
1765: private final RobotTask createRobotTask(URL url, int maxDepth,
1766: String startReferer) {
1767: url = removeWasteParameters(url);
1768: return new RobotTask(url, maxDepth, startReferer);
1769: }
1770:
1771: /** only true if form-handlers are defined */
1772: boolean hasFormHandlers = false;
1773:
1774: /** list of wasteParameters (will be removed from URLs) **/
1775: protected Vector wasteParameters = new Vector();
1776:
1777: /**
1778: * Set the list of wasteParameters (will be removed from URLs)
1779: * @param wasteParameters a Vector of parameter names; URL parameters
1780: * whose names are contained in this vector will be removed
1781: */
1782: public void setWasteParameters(Vector wasteParameters) {
1783: this .wasteParameters = wasteParameters;
1784: }
1785:
1786: /**
1787: * Gets the list of wasteParameters (will be removed from URLs)
1788: * @return a Vector containing Strings
1789: */
1790: public Vector getWasteParameters() {
1791: return this .wasteParameters;
1792: }
1793:
1794: /** Removes wasteParameters from URL.
1795: * (e.g. a session ID parameter)
1796: * @param url
1797: * @return URL
1798: */
1799: public URL removeWasteParameters(URL url) {
1800: String urlString = url.toExternalForm();
1801: String newUrlString = removeParametersFromString(urlString,
1802: wasteParameters);
1803: if (!urlString.equals(newUrlString)) {
1804: try {
1805: url = new URL(newUrlString);
1806: } catch (MalformedURLException ex) {
1807: ex.printStackTrace();
1808: }
1809: }
1811: return url;
1812: }
1813:
1814: /**
1815: * Remove passed Parameters from UrlString
1816: * @param urlString
1817: * @param wasteParameters
1818: * @return String
1819: */
1820: public static String removeParametersFromString(String urlString,
1821: Vector wasteParameters) {
1822: if (wasteParameters != null && wasteParameters.size() > 0) {
1823: int questionMark = urlString.indexOf("?");
1824: if (questionMark > 0 && questionMark < urlString.length()) {
1825: int restPosition = urlString.indexOf("#", questionMark);
1826: String parameters;
1827: String rest;
1828: if (restPosition < 0) {
1829: parameters = urlString.substring(questionMark + 1);
1830: rest = null;
1831: } else {
1832: parameters = urlString.substring(questionMark + 1,
1833: restPosition);
1834: rest = urlString.substring(restPosition);
1835: }
1836:
1837: StringBuffer filteredUrl = new StringBuffer(urlString
1838: .substring(0, questionMark));
1839: StringTokenizer tokenizer = new StringTokenizer(
1840: parameters, "&");
1841: String and = "?";
1842: boolean changed = false;
1843: while (tokenizer.hasMoreTokens()) {
1844: String token = tokenizer.nextToken();
1845: boolean keep = true;
1846: for (int w = 0; w < wasteParameters.size(); w++) {
1847: String wasteParameter = (String) wasteParameters
1848: .elementAt(w);
1849: if (token.startsWith(wasteParameter + "=")) {
1850: keep = false;
1851: changed = true;
1852: break;
1853: }
1854: }
1855: if (keep) {
1856: filteredUrl.append(and);
1857: filteredUrl.append(token);
1858: and = "&";
1859: }
1860: }
1861: if (rest != null)
1862: filteredUrl.append(rest);
1863: if (changed) {
1864: urlString = filteredUrl.toString();
1865: }
1866: }
1867: }
1868: return urlString;
1869: }
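
/*
 * Worked example (hypothetical URL and parameter name): with the parameter
 * name "sessionid" in the wasteParameters vector,
 *
 *   removeParametersFromString(
 *       "http://www.example.com/page?sessionid=4711&lang=en#top",
 *       wasteParameters)
 *
 * returns "http://www.example.com/page?lang=en#top".
 */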
1870:
1871: /** time of WebRobot start in milliseconds */
1872: protected long startTime = System.currentTimeMillis();
1873:
1874: /** number of allowed retries for document retrieval */
1875: protected int maxRetries = 0;
1876:
1877: /**
1878: * Set allowed retries for document retrieval
1879: * @param maxRetries
1880: */
1881: public void setMaxRetries(int maxRetries) {
1882: this .maxRetries = maxRetries;
1883: }
1884:
1885: /**
1886: * Get allowed retries for document retrieval
1887: * @return maxRetries
1888: */
1889: public int getMaxRetries() {
1890: return maxRetries;
1891: }
1892:
1893: /**
1894: * expiration age of documents in cache.
1895: * Documents older than expirationAge will be removed,
1896: * negative value means no limit.
1897: */
1898: protected long expirationAge = -1;
1899:
1900: /**
1901: * set expiration age of documents in cache.
1902: * Documents older than expirationAge will be removed,
1903: * negative value means no limit.
1904: * @param age
1905: */
1906: public void setExpirationAge(long age) {
1907: expirationAge = age;
1908: }
1909:
1910: /**
1911: * get expiration age of documents in cache.
1912: * @return long
1913: */
1914: public long getExpirationAge() {
1915: return expirationAge;
1916: }
1917:
1918: /**
1919: * Remove Parameters from Url
1920: * @param url
1921: * @return url without parameters
1922: */
1923: private final static String removeParameters(String url) {
1924: int pos = url.indexOf("?");
1925: return pos >= 0 ? url.substring(0, pos) : url;
1926: }
1927:
1928: /**
1929: * Reads a File to a byte array.
1930: * @param file
1931: * @return byte[]
1932: * @throws IOException
1933: */
1934: protected byte[] readFileToByteArray(File file) throws IOException {
1935: FileInputStream in = null;
1936:
1937: try {
1938: byte[] buffer = new byte[(int) file.length()];
1939: in = new FileInputStream(file);
1940: int offset = 0, count; // loop because read() may return fewer bytes than requested
1941: while (offset < buffer.length && (count = in.read(buffer, offset, buffer.length - offset)) > 0) { offset += count; }
1942: return buffer;
1943: } finally {
1944: if (in != null) {
1945: try {
1946: in.close();
1947: } catch (IOException e) {
1948: }
1949: }
1950: }
1951: }
1952:
1953: }
|