Source Code Cross Referenced for WebRobot.java (Web Crawler » JoBo » net.matuschek.spider)



package net.matuschek.spider;

/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given
 * search depth.
 *
 * @author Daniel Matuschek / Oliver Schmidt
 * @version $Revision: 1.35 $
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;

public class WebRobot implements Runnable, Cloneable {

    /** the name of the robot */
    private final static String ROBOT_NAME = "JoBo";

    /** the default agent name */
    private final static String AGENT_NAME = ROBOT_NAME
            + "/1.4 (http://www.matuschek.net/jobo.html)";

    /** the robot exception handler */
    protected RobotExceptionHandler exceptionHandler = new DefaultRobotExceptionHandler();

    /** default maximal search depth */
    private final static int DEFAULT_DEPTH = 10;

    /** the URL where the robot walk starts from */
    protected URL startURL = null;

    /** the host and directory where retrieval started from */
    protected String startDir = "";

    /** maximal search depth */
    protected int maxDepth = DEFAULT_DEPTH;

    /** is it allowed to walk to other hosts than the starting host? */
    protected boolean walkToOtherHosts = false;

    /** DocManager will store or process retrieved documents */
    protected HttpDocManager docManager;

    /** HttpTool will be used to retrieve documents from a web server */
    protected HttpTool httpTool = new HttpTool();

    /** Log4J category for logging */
    protected Category log;

    /** Referer used to retrieve the first document */
    protected String startReferer = "-";

    /** test for robots.txt */
    protected NoRobots robCheck;

    /** current tasks */
    protected TaskList todo = null;

    /** a list of all URLs we got already */
    protected TaskList visited = null;

    /** ignore settings in /robots.txt? */
    protected boolean ignoreRobotsTxt = false;

    /** sleep that number of seconds after every retrieved document */
    protected int sleepTime = 1;

    /** fill out forms */
    protected FormFiller formFiller = new FormFiller();

    /** these URLs can be visited more than once */
    protected Vector visitMany = new Vector();

    /** for callback to the user interface */
    protected WebRobotCallback webRobotCallback = null;

    /** should we stop robot operation? */
    protected boolean stopIt = false;

    /** to check if it is allowed to travel to a given URL */
    protected URLCheck urlCheck = null;

    /** should the robot suspend the current walk()? */
    protected boolean sleep;

    /** list of allowed URLs (even if walkToOtherHosts is false) */
    protected Vector allowedURLs = new Vector();

    /** allow travelling the whole host? */
    protected boolean allowWholeHost = true;

    /**
     * maximum document age in seconds, negative value means
     * no limit
     */
    protected long maxDocumentAge = -1; // no limit

    /**
     * allow travelling to all subdomains of the start host?
     * @see #setAllowWholeDomain(boolean)
     */
    protected boolean allowWholeDomain = true;

    /**
     * do more flexible tests if the new URL is on the same host
     * @see #basicURLCheck(URL)
     */
    protected boolean flexibleHostCheck = false;

    /**
     * FilterChain to filter the document before storing it
     */
    protected FilterChain filters = null;

    /**
     * don't retrieve pages again that are already stored in the DocManager
     */
    protected boolean allowCaching = true;

    /**
     * check for documents with the same content
     */
    protected boolean duplicateCheck = false;

    /**
     * initializes the robot with the default implementation
     * of the TaskList interface
     *
     * @param expectedDocumentCount the expected number of documents
     */
    public WebRobot(int expectedDocumentCount) {
        log = Category.getInstance(getClass().getName());
        content2UrlMap = new HashMap(expectedDocumentCount);
        registerVisitedList(new HashedMemoryTaskList(false,
                expectedDocumentCount));
        registerToDoList(new HashedMemoryTaskList(true,
                expectedDocumentCount));
        this.expectedDocumentCount = expectedDocumentCount;
        this.setAgentName(AGENT_NAME);
    }

    /**
     * initializes the robot with the default implementation of the TaskList
     * interface
     */
    public WebRobot() {
        this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
    }
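
    /**
     * Illustrative usage sketch, not part of the original source: one way a
     * caller might configure and start the robot. The method and parameter
     * names here are hypothetical; the HttpDocManager passed in is any
     * implementation of that interface the caller provides.
     */
    private static WebRobot exampleUsage(HttpDocManager manager)
            throws Exception {
        WebRobot robot = new WebRobot();
        robot.setStartURL(new URL("http://www.example.com/")); // crawl root
        robot.setMaxDepth(2);         // follow links two levels deep
        robot.setSleepTime(1);        // wait a second between downloads
        robot.setDocManager(manager); // store/process retrieved documents
        robot.run();                  // or: new Thread(robot).start();
        return robot;
    }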

    /**
     * Sets the implementation class for the backend task list storage.
     * WebRobot uses the TaskList interface to store future tasks.
     *
     * If you want to use your own TaskList implementation, just call
     * this method.
     *
     * @param todo TaskList to be used for the "to do" list
     */
    public void registerToDoList(TaskList todo) {
        this.todo = todo;
    }

    /**
     * Sets the implementation class for the backend task list storage.
     * WebRobot uses the TaskList interface to store URLs that have
     * been retrieved before.
     *
     * If you want to use your own TaskList implementation, just call
     * this method.
     *
     * @param visited TaskList to be used for the list of visited URLs
     */
    public void registerVisitedList(TaskList visited) {
        this.visited = visited;
    }

    /**
     * @return the start URL for this robot
     */
    public URL getStartURL() {
        return startURL;
    }

    /**
     * Sets the start URL for this robot
     * @param startURL the start URL
     */
    public void setStartURL(URL startURL) {
        String path = startURL.getPath();
        this.startURL = startURL;

        // is it a directory?
        if (path.endsWith("/")) {
            this.startDir = startURL.getHost() + path;
        } else {
            int pos = path.lastIndexOf("/");
            if (pos < 0) {
                // this happens for URLs without a path
                this.startDir = startURL.getHost() + "/";
            } else {
                this.startDir = startURL.getHost()
                        + path.substring(0, pos + 1);
            }
        }
    }
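
    // Worked example for the startDir computation above (illustrative, not
    // part of the original source):
    //   http://host.example.com/a/b/page.html -> startDir = "host.example.com/a/b/"
    //   http://host.example.com/a/b/          -> startDir = "host.example.com/a/b/"
    //   http://host.example.com               -> startDir = "host.example.com/"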

    /**
     * @return the maximal allowed search depth
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * sets the maximal search depth
     * @param maxDepth the maximal search depth
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /**
     * Get the value of bandwidth of the used HttpTool
     * @return value of bandwidth.
     */
    public int getBandwidth() {
        return httpTool.getBandwidth();
    }

    /**
     * Set the value of bandwidth of the used HttpTool
     * @param bandwidth Value to assign to bandwidth.
     */
    public void setBandwidth(int bandwidth) {
        httpTool.setBandwidth(bandwidth);
    }

    /**
     * gets the WalkToOtherHosts status
     * @return true if the Robot is allowed to travel to other
     * hosts than the start host, false otherwise
     */
    public boolean getWalkToOtherHosts() {
        return walkToOtherHosts;
    }

    /**
     * sets the WalkToOtherHosts status
     * @param walkToOtherHosts true if the Robot is allowed to travel to other
     * hosts than the start host, false otherwise
     */
    public void setWalkToOtherHosts(boolean walkToOtherHosts) {
        this.walkToOtherHosts = walkToOtherHosts;
    }

    /**
     * gets the AllowWholeHost value
     * @return true if the Robot is allowed to travel to the whole
     * host where it started from, false otherwise. If false, it is only
     * allowed to travel to URLs below the start URL
     */
    public boolean getAllowWholeHost() {
        return allowWholeHost;
    }

    /**
     * sets the AllowWholeHost status
     * @param allowWholeHost if true, the Robot is allowed to
     * travel to the whole host where it started from. Otherwise it is only
     * allowed to travel to URLs below the start URL.
     */
    public void setAllowWholeHost(boolean allowWholeHost) {
        this.allowWholeHost = allowWholeHost;
    }

    /**
     * Gets the AllowWholeDomain value.
     * @return true if the Robot is allowed to travel to the whole
     * domain of the start host, false otherwise.
     * @see #setAllowWholeDomain(boolean)
     */
    public boolean getAllowWholeDomain() {
        return allowWholeDomain;
    }

    /**
     * Sets the AllowWholeDomain status
     * @param allowWholeDomain if true, the Robot is allowed to travel
     * to all hosts in the same domain as the starting host. E.g. if you
     * start at www.apache.org, it is also allowed to travel to
     * jakarta.apache.org, xml.apache.org ...
     */
    public void setAllowWholeDomain(boolean allowWholeDomain) {
        this.allowWholeDomain = allowWholeDomain;
    }

    /**
     * Gets the state of flexible host checking (enabled or disabled).
     *
     * To find out if a new URL is on the same host, the robot usually
     * compares the host parts of both URLs. Some web servers have an
     * inconsistent addressing scheme and use the hostnames www.domain.com
     * and domain.com. With flexible host checking enabled, the robot will
     * consider both hosts as equal.
     *
     * @return true, if flexible host checking is enabled
     */
    public boolean getFlexibleHostCheck() {
        return flexibleHostCheck;
    }

    /**
     * Defines if the host test should be more flexible.
     *
     * To find out if a new URL is on the same host, the robot usually
     * compares the host parts of both URLs. Some web servers have an
     * inconsistent addressing scheme and use the hostnames www.domain.com
     * and domain.com. With flexible host checking enabled, the robot will
     * consider both hosts as equal.
     *
     * @param flexibleHostCheck set this true to enable flexible host checking
     * (disabled by default)
     */
    public void setFlexibleHostCheck(boolean flexibleHostCheck) {
        this.flexibleHostCheck = flexibleHostCheck;
    }
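
    // Example (illustrative, not part of the original source): if the crawl
    // starts at http://www.example.com/ and a page links to
    // http://example.com/page.html, the strict host comparison rejects the
    // link; with setFlexibleHostCheck(true) both names are treated as the
    // same host, so the link is followed.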

    /**
     * Gets the AllowCaching value.
     * @return true if the Robot is allowed to cache documents in the
     * docManager
     * @see #setAllowCaching(boolean)
     */
    public boolean getAllowCaching() {
        return allowCaching;
    }

    /**
     * Sets the AllowCaching status
     *
     * @param allowCaching if true, the Robot is allowed to use
     * cached documents. That means it will first try to get the document
     * from the docManager cache and will only retrieve it if it is
     * not found in the cache. If the cache returns a document, the robot
     * will NEVER retrieve it again. Therefore, expiration mechanisms have
     * to be included in the HttpDocManager method retrieveFromCache.
     * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
     */
    public void setAllowCaching(boolean allowCaching) {
        this.allowCaching = allowCaching;
    }

    /**
     * @return the document manager of this robot
     * @see HttpDocManager
     */
    public HttpDocManager getDocManager() {
        return docManager;
    }

    /**
     * Sets the document manager for this robot <br />
     * Without a document manager, the robot will travel through the web but
     * won't do anything with the retrieved documents (it simply forgets
     * them).
     * A document manager can store them, extract information or
     * whatever you like.
     * There can be only one document manager, but you are free to combine
     * functionalities of available document managers in a new object (e.g.
     * to store the document and extract meta information).
     * @param docManager the document manager to use
     */
    public void setDocManager(HttpDocManager docManager) {
        this.docManager = docManager;
    }

    /**
     * Sets the CookieManager used by the HttpTool.
     * By default a MemoryCookieManager will be used, but you can
     * use this method to use your own CookieManager implementation.
     *
     * @param cm an object that implements the CookieManager interface
     */
    public void setCookieManager(CookieManager cm) {
        httpTool.setCookieManager(cm);
    }

    /**
     * Gets the CookieManager used by the HttpTool
     *
     * @return the CookieManager that will be used by the HttpTool
     */
    public CookieManager getCookieManager() {
        return httpTool.getCookieManager();
    }

    /**
     * Sets the DownloadRuleSet
     * @param rules the download rule set to use
     */
    public void setDownloadRuleSet(DownloadRuleSet rules) {
        httpTool.setDownloadRuleSet(rules);
    }

    /**
     * Sets the URLCheck for this robot
     * @param check the URLCheck to use
     */
    public void setURLCheck(URLCheck check) {
        this.urlCheck = check;
    }

    /**
     * sets a proxy to use
     * @param proxyDescr the proxy definition in the format host:port
     */
    public void setProxy(String proxyDescr) throws HttpException {
        httpTool.setProxy(proxyDescr);
    }

    /**
     * @return the current proxy setting in the format host:port
     */
    public String getProxy() {
        return httpTool.getProxy();
    }

    /**
     * @return the Referer setting for the first HTTP request
     */
    public String getStartReferer() {
        return startReferer;
    }

    /**
     * sets the Referer setting for the first HTTP request
     * @param startReferer a URL (e.g. http://www.matuschek.net)
     */
    public void setStartReferer(String startReferer) {
        this.startReferer = startReferer;
    }

    /**
     * should we ignore the robots.txt Robot Exclusion protocol?
     * @param ignoreRobotsTxt if set to true, the robot will ignore
     * the settings of the /robots.txt file on the webserver
     * <b>Know what you are doing if you change this setting</b>
     */
    public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
        robCheck.setIgnore(ignoreRobotsTxt);
    }

    /**
     * @return the sleeptime setting
     */
    public int getSleepTime() {
        return sleepTime;
    }

    /**
     * set the sleeptime<br />
     * after every retrieved document the robot will wait this time
     * before getting the next document. This limits the
     * load on the web server.
     * @param sleepTime wait time in seconds
     */
    public void setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
    }
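
    // Politeness sketch (illustrative, not part of the original source):
    //   robot.setSleepTime(2); // pause two seconds after each download
    // With the default of 1, the robot fetches at most roughly one document
    // per second; setSleepTime(0) disables the pause entirely.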

    /**
     * sets the From: HTTP header<br />
     * this should be a valid email address. It is not needed for the robot,
     * but you should use it, because the administrator of the web server
     * can contact you if the robot is doing things that they don't want
     * @param fromAddress an RFC 822 email address
     */
    public void setFromAddress(String fromAddress) {
        httpTool.setFromAddress(fromAddress);
    }

    /**
     * sets the list of form handlers
     * @see net.matuschek.html.FormHandler for more
     * information about form handlers
     */
    public void setFormHandlers(Vector handlers) {
        formFiller.setFormHandlers(handlers);
        if (handlers != null && handlers.size() > 0) {
            hasFormHandlers = true;
        }
    }

    /**
     * @return the list of form handlers
     * @see net.matuschek.html.FormHandler for more information
     * about form handlers
     */
    public Vector getFormHandlers() {
        return formFiller.getFormHandlers();
    }

    /**
     * Gets the name of the "User-Agent" header that the robot will use
     * @return the user agent name
     */
    public String getAgentName() {
        if (httpTool != null) {
            return httpTool.getAgentName();
        } else {
            return null;
        }
    }

    /**
     * sets the agent name ("User-Agent" header) for this robot
     * @param name a name for this robot
     * (e.g. "Mozilla 4.0 (compatible; Robot)")
     */
    public void setAgentName(String name) {
        httpTool.setAgentName(name);
        // robCheck = new NoRobots(ROBOT_NAME, httpTool);
        robCheck = new NoRobots(name, httpTool);
    }

    /**
     * Gets the timeout for getting data in seconds of the used HttpTool
     * @return the value of socketTimeout
     * @see #setTimeout(int)
     */
    public int getTimeout() {
        if (httpTool != null) {
            return httpTool.getTimeout();
        } else {
            return -1;
        }
    }

    /**
     * Sets the timeout for getting data. If HttpTool can't read data from a
     * remote web server after this number of seconds it will stop the download
     * of the current file
     * @param timeout Timeout in seconds
     */
    public void setTimeout(int timeout) {
        httpTool.setTimeout(timeout);
    }

    /**
     * Gets the NTLM authorization of the robot
     * @return the ntlmAuthorization
     */
    public NTLMAuthorization getNtlmAuthorization() {
        if (httpTool != null) {
            return httpTool.getNtlmAuthorization();
        } else {
            return null;
        }
    }

    /**
     * sets an NTLM authorization for this robot
     * @param ntlmAuthorization the NTLM authorization for this robot
     */
    public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
        httpTool.setNtlmAuthorization(ntlmAuthorization);
    }

    /**
     * Gets the setting of the IgnoreRobotsTxt property
     * @return true if robots.txt will be ignored, false otherwise
     */
    public boolean getIgnoreRobotsTxt() {
        return ignoreRobotsTxt;
    }

    /**
     * Gets a vector of URLs that can be visited more than once
     * @return a vector containing URLs formatted as Strings
     */
    public Vector getVisitMany() {
        return visitMany;
    }

    public void setVisitMany(Vector visitMany) {
        this.visitMany = visitMany;
    }

    public void setHttpToolCallback(HttpToolCallback callback) {
        httpTool.setCallback(callback);
    }

    public WebRobotCallback getWebRobotCallback() {
        return webRobotCallback;
    }

    public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
        this.webRobotCallback = webRobotCallback;
    }

    /**
     * Sets the sleep status for this robot. If a WebRobot is set to sleep
     * after starting run(), it will pause after retrieving the current
     * document and wait until setSleep(false) is called.
     */
    public void setSleep(boolean sleep) {
        this.sleep = sleep;
    }

    /**
     * Is the robot sleeping?
     */
    public boolean isSleeping() {
        return this.sleep;
    }

    /**
     * Set the list of allowed URLs
     * @param allowed a Vector containing Strings. URLs will be allowed
     * if they begin with a string in this vector
     */
    public void setAllowedURLs(Vector allowed) {
        this.allowedURLs = allowed;
    }

    /**
     * Gets the list of allowed URLs
     * @return a Vector containing Strings
     * @see #setAllowedURLs(Vector)
     */
    public Vector getAllowedURLs() {
        return this.allowedURLs;
    }

    /**
     * Enable/disable cookies
     * @param enable if true, HTTP cookies will be enabled, if false
     * the robot will not use cookies
     */
    public void setEnableCookies(boolean enable) {
        httpTool.setEnableCookies(enable);
    }

    /**
     * Get the status of the cookie engine
     * @return true, if HTTP cookies are enabled, false otherwise
     */
    public boolean getEnableCookies() {
        return httpTool.getEnableCookies();
    }

    /**
     * Set the maximum age of documents to retrieve to this number
     * of seconds
     * @param maxAge integer value of the maximum document age
     * (in seconds), negative value means no limit.
     */
    public void setMaxDocumentAge(long maxAge) {
        this.maxDocumentAge = maxAge;
    }
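
    // Illustrative refresh policy (not part of the original source):
    //   robot.setAllowCaching(true);
    //   robot.setMaxDocumentAge(24 * 60 * 60); // one day, in seconds
    // A cached copy younger than a day is then reused as-is; an older copy
    // is re-requested with If-Modified-Since set to its Last-Modified date
    // (see the reScan handling in retrieveURL below).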

    /**
     * Gets the maximum age of documents to retrieve
     * @return maximum document age (in seconds), negative value means
     * no limit.
     */
    public long getMaxDocumentAge() {
        return this.maxDocumentAge;
    }

    /**
     * Sets a FilterChain. If the WebRobot uses a FilterChain it will
     * pass every retrieved document through this FilterChain before
     * storing it
     *
     * @param filters a FilterChain to use for filtering HttpDocs
     */
    public void setFilters(FilterChain filters) {
        this.filters = filters;
    }

    /**
     * Delete all cookies
     */
    public void clearCookies() {
        httpTool.clearCookies();
    }

    /**
     * thread run() method, simply calls work()
     * @see #work()
     */
    public void run() {
        work();
    }

    /**
     * do your job: travel through the web using the configured
     * parameters and retrieve documents
     */
    public void work() {
        RobotTask task = createRobotTask(startURL, maxDepth,
                startReferer);
        todo.add(task);
        walkTree();
        // ok, we did it, clean up dynamic data (the visited vector)
        cleanUp();
        log.info("Documents retrieved by: Web=" + countWeb + " Cache="
                + countCache + " Refresh=" + countRefresh
                + " NoRefresh=" + countNoRefresh);
    }
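
    /**
     * Illustrative sketch, not part of the original source: since WebRobot
     * implements Runnable, a crawl can run on a background thread and be
     * stopped cooperatively via stopRobot(), which lets the current
     * download finish before the loop in walkTree() exits.
     */
    private static void exampleBackgroundRun(WebRobot robot)
            throws InterruptedException {
        Thread crawler = new Thread(robot, "jobo-crawler");
        crawler.start();
        Thread.sleep(60 * 1000); // let it crawl for a minute
        robot.stopRobot();       // stop after the current download
        crawler.join();
    }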

    /**
     * stop the current robot run
     * note that this will not abort the current download but stop after
     * the current download has finished
     */
    public void stopRobot() {
        stopIt = true;
    }

    /**
     * Holds information about memory status.
     * @see #handleMemoryError(OutOfMemoryError)
     */
    private int memoryLevel = 0;

    /** Can new tasks be added? (may depend on memoryLevel) */
    protected boolean activatedNewTasks = true;

    /** Are visited URLs collected? (may depend on memoryLevel) */
    protected boolean activatedUrlHistory = true;

    /** Are visited contents collected? (may depend on memoryLevel) */
    protected boolean activatedContentHistory = true;

    /** memory buffer of 200 KB to be freed in case of urgent memory needs */
    private byte memoryBuffer[] = new byte[200 * 1024];

    /**
     * do your job!
     */
    public void walkTree() {
        while ((todo.size() > 0) && (!stopIt)) {
            RobotTask task;
            synchronized (visited) {
                task = todo.removeFirst();
                if (visited.contains(task)
                        && (!visitMany.contains(task.getUrl()
                                .toString()))) {
                    log.debug("already visited: " + task.getUrl());
                    continue;
                }
                if (activatedUrlHistory) {
                    visited.add(task);
                }
            }

            boolean repeat = true;
            while (repeat) {
                try {
                    retrieveURL(task);
                    repeat = false;
                } catch (OutOfMemoryError memoryError) {
                    handleMemoryError(memoryError);
                }
            }

            // sleep, if sleep is set to true
            while (sleep) {
                // callback
                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(true);
                }

                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // ignore interruption and keep sleeping
                }
            }

            // callback
            if (webRobotCallback != null) {
                webRobotCallback.webRobotSleeping(false);
            }

            // callback
            if (webRobotCallback != null) {
                webRobotCallback.webRobotUpdateQueueStatus(todo.size());
            }
            spawnThread();
        }

        // callback
        if (webRobotCallback != null) {
            finishThreads();
        }
    }

    /**
     * Implements OutOfMemory handling strategies.
     * Action depends on memoryLevel
     * @param memoryError
     * @throws OutOfMemoryError
     */
    protected void handleMemoryError(OutOfMemoryError memoryError)
            throws OutOfMemoryError {
        memoryLevel++;
        log.error("OutOfMemoryError level=" + memoryLevel
                + "! (visited=" + visited.size() + ", todo="
                + todo.size() + ")");
        switch (memoryLevel) {
        case 1:
            // don't remember visited URLs and contents any more
            // and try it again
            visited.clear();
            activatedUrlHistory = false;
            content2UrlMap.clear();
            activatedContentHistory = false;
            System.gc();
            break;
        case 2:
            // stop adding new tasks, just process the todo list.
            // free memory buffer
            // and try it again
            activatedNewTasks = false;
            memoryBuffer = null;
            System.gc();
            break;
        case 3:
            // there is nothing we can do any more.
            // throw exception to stop robot
            throw memoryError;
        default:
            // Should never be reached.
            if (memoryBuffer != null) {
                // avoid removal of memoryBuffer by compiler
                System.err.println(memoryBuffer[0]);
            }
            throw memoryError;
        }
    }

    /**
     * calls webRobotDone and finishes docManager if
     * executed in the main thread
     */
    protected void finishThreads() {
        webRobotCallback.webRobotDone();
        if (docManager != null) {
            docManager.finish();
        }
    }

    /**
     * Start subThreads for spidering.
     * WARNING: Should only be implemented and used for local
     * spidering purposes!
     */
    protected synchronized void spawnThread() {
    }

    /** counter for calls of retrieveURL */
    protected int iteration = 0;

    /**
     * retrieve the next URL, save it, extract all included links and
     * add those links to the task list
     * @param task task to retrieve, function does nothing if this is null
     */
    public void retrieveURL(RobotTask task) {
        if (task == null) {
            log.debug("Empty task found, ignoring");
            return;
        }

        long now = System.currentTimeMillis();

        updateProgressInfo();

        URL u = task.getUrl();
        String urlString = u.toString();
        String referer = task.getReferer();
        int depth = task.getMaxDepth();

        if (depth < 0) {
            log.info("Max search depth reached");
            return;
        }

        // we may need this additional check even if we
        // tested it during adding to the task list
        if (!isAllowed(u)) {
            log.info("Url '" + u + "' filtered out.");
            return;
        }

        if (u.getFile().equals("")) {
            try {
                urlString = urlString + "/";
                u = new URL(urlString);
                // fix for double retrieved files
                task.setUrl(u);
            } catch (MalformedURLException e) {
                log.error("URL not well formed: " + e.toString());
                // use exception handler to handle exception
                exceptionHandler.handleException(this, u, e);
                return;
            }
        }

        log.info("retrieving " + urlString);
        httpTool.setReferer(referer);

        HttpDoc doc = null;
        Vector links = null;
        boolean cached = false;

        // look in the cache first, but only for static pages
        boolean reScan = true;
        if ((docManager != null && allowCaching)
                && (task.getMethod() == HttpConstants.GET)
                && (task.getParamString() == null)) {
            doc = docManager.retrieveFromCache(u);
            /* if (doc != null) {
                try {
                    links = ((UrlCollector) docManager).retrieveLinks(doc);
                } catch (IOException e) {
                    log.info("Could not get links for " + u + ": " + e.getMessage());
                    links = null;
                }
            } */

            if (doc != null) {
                countCache++;
                long lastRetrieved = doc.getDateAsMilliSeconds();
                // use floating-point division to keep sub-second precision
                double ageInSeconds = (now - lastRetrieved) / 1000.0;
                if (ageInSeconds < 0) {
                    log.warn("DocumentAge < 0!");
                }
                reScan = maxDocumentAge >= 0
                        && ageInSeconds > maxDocumentAge;
                if (reScan) {
                    long lastModified = doc
                            .getLastModifiedAsMilliSeconds();
                    Date lastModifiedDate = new Date(lastModified);
                    httpTool.setIfModifiedSince(lastModifiedDate);
                }
            } else {
                httpTool.setIfModifiedSince(null);
            }
        }

        // if not found in cache, retrieve from the web page
        if (reScan) {
            HttpDoc newDoc;
            boolean error = false;
            try {
                if (u.getProtocol().equalsIgnoreCase("file")) {
                    // retrieve from file
                    newDoc = retrieveFileURL(u, httpTool
                            .getIfModifiedSince());
                } else {
                    // retrieve from Web
                    newDoc = httpTool.retrieveDocument(u, task
                            .getMethod(), task.getParamString());
                    if (newDoc != null) {
                        newDoc.setDate(now);
                    }
                    sleepNow();
                }

                if (newDoc != null && !newDoc.isNotModified()) {
                    if (!(newDoc.isOk() || newDoc.isRedirect())) {
                        error = true;
                    }
                } else {
                    // (newDoc == null || newDoc.isNotModified()) && doc != null
                    // -> Not modified
                    // -> refresh time stamp
                    if (doc != null) {
                        doc.setDate(now);
                        doc.setCached(false);
                        newDoc = null;
                    }
                }
            } catch (HttpException hex) {
                error = true;
                newDoc = null;
            }
            if (error) {
                int retry = task.retry();
                if (retry <= maxRetries) {
                    synchronized (visited) {
                        todo.add(task);
                        visited.remove(task);
                    }
                    log.info("Adding " + u + " for retry no. " + retry);
                    return;
                } else {
                    doc = docManager.retrieveFromCache(u);
                    if (doc == null) {
                        log.warn("Unsuccessful retries for " + u);
                        return;
                    } else {
                        long docDate = doc.getDateAsMilliSeconds();
                        long age = (now - docDate);
                        age /= 1000;
                        if (expirationAge < 0 || age < expirationAge) {
                            newDoc = doc;
                            cached = true;
                            log.info("Cached document not expired: "
                                    + u);
                        } else {
                            log.warn("Cached document expired: " + u);
                            docManager.removeDocument(u);
                            return;
                        }
                    }
                }
            }

            if (newDoc != null) {
                countWeb++;
                doc = newDoc;
                links = null; // force recalculation of links
                countRefresh++;
            } else {
                cached = true;
                countNoRefresh++;
            }
        } else {
            cached = true;
            log.debug("Page " + u + " retrieved from cache");
        }

        // Add it to the visited vector
        // needs to be synchronized with the todo list
        // visited.add(task);

        // got a NULL document, that doc was not retrieved
        // usually, it was not downloaded because a rule didn't allow
        // downloading it
        if (doc == null) {
            log.info("not downloaded " + u);
            return;
        }

        // Duplicate check
        String duplicate = null;
        if (duplicateCheck) {
            duplicate = getContentVisitedURL(doc);
            if (duplicate != null) {
                log.info("URLs with same content found: " + urlString
                        + " = " + duplicate);
            } else {
                try {
                    duplicate = docManager.findDuplicate(doc);
                    if (duplicate != null) {
                        log.info("URLs with same content found in cache: "
                                + urlString + " = " + duplicate);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (duplicate != null) {
                String pureDuplicate = removeParameters(duplicate);
                String pureUrl = removeParameters(urlString);
                if (!pureUrl.equals(pureDuplicate) && !cached) {
                    // different url not yet stored -> store it
                    try {
                        // retrieve links from original
                        HttpDoc linksDoc = docManager
                                .retrieveFromCache(new URL(duplicate));
                        if (linksDoc != null) {
                            doc.setLinks(linksDoc.getLinks());
                        }
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                RobotTask newTask;
                try {
                    newTask = createRobotTask(new URL(duplicate),
                            depth, referer);
                    // check already here for visited tasks to save memory
                    if (!visited.contains(newTask)) {
                        addTask(newTask);
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace(); // Can't happen
                }
                return;
            }
        }
1133:
1134:                // was it an UnAuthorized document ?
1135:                if (doc.isUnauthorized()) {
1136:                    log.info("got HTTP Unauthorized for URL " + u);
1137:                }
1138:
1139:                if (doc.isOk() || cached) {
1140:                    // callback
1141:                    if (webRobotCallback != null) {
1142:                        int contentLength = 0;
1143:                        if (doc.getContent() != null) {
1144:                            contentLength = doc.getContent().length;
1145:                        }
1146:                        webRobotCallback.webRobotRetrievedDoc(urlString,
1147:                                contentLength);
1148:                    }
1149:
1150:                    // extract links
1151:                    try {
1152:                        if (doc.isHTML() && (depth > 0)) {
1153:                            // solving encoding problem
1154:                            // HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
1155:                            HtmlDocument htmlDoc = null;
1156:                            HttpHeader contentTypeHeader = doc
1157:                                    .getHeader("Content-type");
1158:                            if (contentTypeHeader != null) {
1159:                                String contentType = contentTypeHeader
1160:                                        .getValue();
1161:                                int index = contentType.toLowerCase().indexOf(
1162:                                        "charset=");
1163:                                if (index > 0) {
1164:                                    htmlDoc = new HtmlDocument(u, doc
1165:                                            .getContent(), contentType
1166:                                            .substring(index + 8));
1167:                                } else {
1168:                                    htmlDoc = new HtmlDocument(u, doc
1169:                                            .getContent());
1170:                                }
1171:                            } else {
1172:                                htmlDoc = new HtmlDocument(u, doc.getContent());
1173:                            }
1174:
1175:                            // add links
1176:
1177:                            // this depth-check is critical!
1178:                            // otherwise far too many RobotTasks will be created
1179:                            // this will cause a premature OutOfMemoryException!
1180:                            if (depth > 0) {
1181:                                if (duplicate != null) {
1182:                                    HttpDoc linksDoc = docManager
1183:                                            .retrieveFromCache(new URL(
1184:                                                    duplicate));
1185:                                    doc.setLinks(linksDoc.getLinks());
1186:                                } else if (cached) {
1187:                                }
1188:                                if (links == null) {
1189:                                    links = htmlDoc.getLinks();
1190:                                    doc.setLinks(links);
1191:                                }
1192:                                if (duplicate == null) {
1193:                                    HashSet checkedLinks = new HashSet();
1194:                                    for (int i = 0; i < links.size(); i++) {
1195:                                        URL link = (URL) links.elementAt(i);
1196:                                        log.info("Link: " + link);
1197:                                        // check already here for duplicate links to avoid expensive
1198:                                        // creation of RobotTasks
1199:                                        if (!checkedLinks.contains(link)) {
1200:                                            checkedLinks.add(link);
1201:                                            String myReferer = u.toString();
1202:                                            if (u.getUserInfo() != null) {
1203:                                                // remove userinfo from referer
1204:                                                int endindex = myReferer
1205:                                                        .indexOf("@") + 1;
1206:                                                myReferer = "http://"
1207:                                                        + myReferer
1208:                                                                .substring(endindex);
1209:                                            }
1210:
1211:                                            RobotTask newTask = createRobotTask(
1212:                                                    (URL) links.elementAt(i),
1213:                                                    depth - 1, myReferer);
1214:                                            // check already here for visited tasks to save memory
1215:                                            if (!visited.contains(newTask)) {
1216:                                                // bad workaround to retrieve images first
1217:                                                if (newTask.urlString
1218:                                                        .endsWith(".jpg")) {
1219:                                                    addTaskAtStart(newTask);
1220:                                                } else {
1221:                                                    addTask(newTask);
1222:                                                }
1223:                                            }
1224:                                        }
1225:                                    }
1226:                                }
1227:                            }
1228:
1229:                            if (hasFormHandlers) {
1230:                                // add forms
1231:                                Vector forms = htmlDoc.getElements("form");
1232:                                for (int i = 0; i < forms.size(); i++) {
1233:                                    ExtendedURL eurl = formFiller.fillForm(u,
1234:                                            (Element) forms.elementAt(i));
1235:                                    if (eurl != null) {
1236:                                        RobotTask newTask = createRobotTask(
1237:                                                eurl.getURL(), depth - 1, u
1238:                                                        .toString());
1239:                                        newTask.setParamString(eurl.getParams());
1240:                                        newTask.setMethod(eurl.getRequestMethod());
1244:                                        addTask(newTask);
1245:                                    }
1246:                                }
1247:                            }
1248:
1249:                        }
1250:                        // catch any occurring error to keep on processing
1251:                        // (OutOfMemoryError is rethrown: no way to recover)
1252:                    } catch (OutOfMemoryError e) {
1253:                        throw e;
1254:                    } catch (Throwable e) {
1255:                        log.error("Unexpected error while extracting links from url '"
1256:                                + u + "': " + e);
1257:                        e.printStackTrace();
1258:                        // continue processing
1259:                    }
1260:
1261:                    // filter and store the document
1262:                    if ((docManager != null)) {
1263:                        try {
1264:                            if (filters != null) {
1265:                                doc = filters.process(doc);
1266:                            } else {
1267:                                log.debug("No filters defined");
1268:                            }
1269:
1270:                            if (isProcessingAllowed(doc)) {
1271:                                docManager.processDocument(doc);
1272:                            } else {
1273:                                String md5 = doc
1274:                                        .getHeaderValue(HttpHeader.CONTENT_MD5);
1275:                                doc.setContent("Not for indexing".getBytes());
1276:                                doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
1277:                            }
1278:
1279:                            try {
1280:                                docManager.storeDocument(doc);
1281:                            } catch (Exception e) {
1282:                                log.warn("could not store (not for indexing) "
1283:                                        + urlString + ": " + e.getMessage());
1284:                            }
1285:                            if (activatedContentHistory && duplicate == null) {
1286:                                setContentVisitedURL(doc, urlString);
1287:                            }
1288:                        } catch (DocManagerException e1) {
1289:                            log.error("could not process document: "
1290:                                    + e1.getMessage());
1291:                            exceptionHandler.handleException(this, u, e1);
1292:                        } catch (FilterException e2) {
1293:                            log.error(e2.getMessage());
1294:                        }
1295:                    }
1296:
1297:                } else {
1298:                    // it was NOT a 200 return code !
1299:
1300:                    if (doc.isRedirect()) {
1301:                        String ref = doc.getLocation();
1302:                        log.info("Got redirect to " + ref);
1303:
1304:                        try {
1305:                            URL u2 = new URL(u, ref);
1306:                            // is it on another host ?
1307:
1308:                            // On a redirect, browsers use the old Referer instead of the
1309:                            // URL that got this redirect
1310:                            // Therefore we do not use u.toString as Referer but the old Referer
1311:                            RobotTask newTask = createRobotTask(u2, depth - 1,
1312:                                    referer);
1313:
1314:                            // it will be inserted at the beginning of the vector !
1315:                            addTaskAtStart(newTask);
1316:                        } catch (MalformedURLException e) {
1317:                            // ignore this URL
1318:                        }
1319:                        // handle other values
1320:                    } else if (doc.isNotFound()) {
1321:                        // the document was not found
1322:                        exceptionHandler.handleException(this, u,
1323:                                new HttpException("Document not found"));
1324:                    } else if (doc.isUnauthorized()) {
1325:                        // access to the document was not authorized
1326:                        exceptionHandler.handleException(this, u,
1327:                                new HttpException(
1328:                                        "No authorization for the document."));
1329:                    } else {
1330:                        // another error occurred
1331:                        exceptionHandler.handleException(this, u,
1332:                                new HttpException(
1333:                                        "Unknown document error (Http return code "
1334:                                                + doc.getHttpCode() + ")."));
1335:                    }
1336:                }
1337:            }
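            // Redirect example (illustrative URLs, not from the original
            // source): if page A links to B and B answers "302 Location: C",
            // the task created for C carries A as its Referer, matching the
            // browser behaviour described above, and is queued at the front
            // of the task list.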
1338:
1339:            /**
1340:             * Informs about spidering progress. The default implementation
1341:             * does nothing; subclasses may use iteration, startTime,
1342:             * countCache, countWeb, countRefresh and countNoRefresh.
1343:             */
1344:            public void updateProgressInfo() {
1345:            }
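            // A minimal override sketch (hypothetical subclass code, shown
            // here as a comment; it assumes the counter fields declared
            // further below are visible to the subclass):
            //
            //   public void updateProgressInfo() {
            //       long seconds = (System.currentTimeMillis() - startTime) / 1000;
            //       System.out.println("progress: " + countWeb + " from web, "
            //               + countCache + " from cache after " + seconds + "s");
            //   }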
1346:
1347:            /**
1348:             * sleep for sleepTime seconds.
1349:             */
1350:            public void sleepNow() {
1351:                if (sleepTime > 0) {
1352:                    synchronized (this ) {
1353:                        if (webRobotCallback != null) {
1354:                            webRobotCallback.webRobotSleeping(true);
1355:                        }
1356:
1357:                        try {
1358:                            Thread.sleep(sleepTime * 1000);
1359:                        } catch (InterruptedException e) {
1360:                        }
1361:
1362:                        if (webRobotCallback != null) {
1363:                            webRobotCallback.webRobotSleeping(false);
1364:                        }
1365:                    }
1366:                }
1367:            }
1368:
1369:            /**
1370:             * Retrieves a file from the local file system, honoring ifModifiedSince.
1371:             * @param url the url of the file to retrieve
1372:             * @return HttpDoc containing the content and mime type
1373:             */
1374:            private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince)
1375:                    throws HttpException {
1376:                HttpDoc doc = new HttpDoc();
1377:
1378:                try {
1379:                    String host = url.getHost();
1380:                    String filename = url.getFile();
1381:                    if ((host == null) || (host.equals(""))) {
1382:                        // local file
1383:                        // remove leading / or \
1384:                        if ((filename.startsWith("\\"))
1385:                                || (filename.startsWith("/"))) {
1386:                            filename = filename.substring(1);
1387:                        }
1388:                    } else {
1389:                        filename = "//" + host + filename;
1390:                    }
1391:                    // get the mimetype and put in the http header
1392:                    String mimetypestr = getMimeTypeForFilename(filename);
1393:                    if (mimetypestr != null) {
1394:                        HttpHeader header = new HttpHeader("content-type",
1395:                                mimetypestr);
1396:                        doc.addHeader(header);
1397:                    }
1398:
1399:                    // get the content from the file
1400:                    File file = new File(filename);
1401:                    if (!file.exists()) {
1402:                        doc.setHttpCode("httpcode "
1403:                                + HttpConstants.HTTP_NOTFOUND);
1404:                        return doc;
1405:                    }
1406:                    long fileLastModified = file.lastModified();
1407:                    long ifModifiedSinceTime = ifModifiedSince == null ? 0
1408:                            : ifModifiedSince.getTime();
1409:                    if (fileLastModified > ifModifiedSinceTime) {
1410:                        byte[] content = readFileToByteArray(file);
1411:                        doc.setContent(content);
1412:                        doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
1413:                    } else {
1414:                        doc.setHttpCode("httpcode "
1415:                                + HttpConstants.HTTP_NOTMODIFIED);
1416:                    }
1417:                    doc.setLastModified(fileLastModified);
1418:                    doc.setDate(System.currentTimeMillis());
1419:                    doc.setURL(url);
1420:
1421:                    return doc;
1422:                } catch (Exception e) {
1423:                    throw new HttpException(e.getMessage());
1424:                }
1425:            }
1426:
1427:            /**
1428:             * Gets the Mime type for the given filename.
1429:             * @param filename name of the file
1430:             * @return Mime type, or null if unknown
1431:             */
1432:            protected String getMimeTypeForFilename(String filename) {
1433:                if (filename.endsWith(".html") || filename.endsWith(".htm")) {
1434:                    return "text/html";
1435:                } else {
1436:                    return null;
1437:                }
1438:            }
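            // The rule above only recognizes HTML. A minimal sketch of a
            // broader lookup using the standard library; the helper name
            // guessMimeType is hypothetical and not part of the original class:
            private String guessMimeType(String filename) {
                // URLConnection ships a built-in filename-to-MIME-type table
                String type = java.net.URLConnection
                        .guessContentTypeFromName(filename);
                // fall back to the hand-coded rule above
                return (type != null) ? type : getMimeTypeForFilename(filename);
            }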
1439:
1440:            /** 
1441:             * Clean up temporary data
1442:             */
1443:            protected void cleanUp() {
1444:                stopIt = false;
1445:                visited.clear();
1446:                todo.clear();
1447:            }
1448:
1449:            /** 
1450:             * adds a new task to the task vector if taskAddAllowed and activatedNewTasks permit it
1451:             */
1452:            protected void addTask(RobotTask task) {
1453:                if (taskAddAllowed(task) && activatedNewTasks) {
1454:                    todo.add(task);
1455:                }
1456:            }
1457:
1458:            /** 
1459:             * adds a new task at the beginning of the task list 
1460:             * @see #addTask(RobotTask)
1461:             */
1462:            protected void addTaskAtStart(RobotTask task) {
1463:                if (taskAddAllowed(task) && activatedNewTasks) {
1464:                    todo.addAtStart(task);
1465:                }
1466:            }
1467:
1468:            /**
1469:             * Checks if a task should be added to the task list
1470:             * @param task the task to check
1471:             * @return true if this task can be added to the task list,
1472:             * false otherwise
1473:             */
1474:            protected boolean taskAddAllowed(RobotTask task) {
1475:                if (task == null) {
1476:                    log.info("Null task not allowed");
1477:                    return false;
1478:                }
1479:
1480:                if (!isAllowed(task.getUrl())) {
1481:                    return false;
1482:                }
1483:
1484:                if (todo.contains(task)) {
1485:                    return false;
1486:                }
1487:
1488:                return true;
1489:            }
1490:
1491:            /**
1492:             * Is it allowed to travel to this new URL ?
1493:             * @param u the URL to test
1494:             * @return true if traveling to this URL is allowed, false otherwise
1495:             */
1496:            protected boolean isAllowed(URL u) {
1497:
1498:                // do the basic checks
1499:                if (basicURLCheck(u)) {
1500:
1501:                    // if we have a URLCheck then test this URL against it 
1502:                    if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
1503:                        log.debug("not allowed by URLCheck:" + u);
1504:                        return false;
1505:                    }
1506:
1507:                    if (robCheck.ok(u)) {
1508:                        return true;
1509:                    } else {
1510:                        log.debug("not allowed by robots.txt:" + u);
1511:                        return false;
1512:                    }
1513:                }
1514:                return false;
1515:            }
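            // Check order: isAllowed first applies basicURLCheck (the scope
            // rules documented below), then the optional URLCheck plug-in,
            // and finally robots.txt via robCheck; a URL must pass all three.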
1516:
1517:            /**
1518:             * Is it allowed to process this document ?
1519:             * @param doc the document to check
1520:             * @return true if processing of this document is allowed
1521:             */
1522:            protected boolean isProcessingAllowed(HttpDoc doc) {
1523:                URL u = doc.getURL();
1524:                if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
1525:                    log.debug("processing not allowed by URLCheck:" + u);
1526:                    return false;
1527:                }
1528:
1529:                DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
1530:                if (downloadRuleSet != null
1531:                        && !downloadRuleSet
1532:                                .processAllowed(doc.getHttpHeaders())) {
1533:                    log.debug("processing not allowed by DownloadRuleSet:" + u);
1534:                    return false;
1535:                }
1536:
1537:                return true;
1538:            }
1539:
1540:            /**
1541:             * Basic URL allow check.
1542:             * It is allowed to walk to a new URL if <ul>
1543:             *  <li>WalkToOtherHost is true. In this case there will be no
1544:             *      additional tests.</li>
1545:             *  <li>The new URL is located below the start URL, e.g. if the start URL
1546:             *      is http://localhost/test, the URL http://localhost/test/index.html
1547:             *      is allowed, but http://localhost/ is not allowed.</li>
1548:             *  <li>AllowWholeHost is true and the new URL is located on the same host
1549:             *      as the start URL.</li>
1550:             *  <li>FlexibleHostCheck is true and the host part of the current URL
1551:             *      equals the host part of the start URL modulo the prefix "www."</li>
1552:             *  <li>AllowWholeDomain is true and the new URL is located in the same
1553:             *      domain as the start URL.</li>
1554:             *  <li>The URL starts with a string in the "AllowedURLs" list.</li></ul>
1555:             */
1556:            protected boolean basicURLCheck(URL currURL) {
1557:                String currURLStr = currURL.getHost() + currURL.getPath();
1558:                String currHost = currURL.getHost().toLowerCase();
1559:                String startHost = startURL.getHost().toLowerCase();
1560:
1561:                // no more checks, if walkToOtherHosts is true
1562:                if (walkToOtherHosts) {
1563:                    return true;
1564:                }
1565:
1566:                // new URL below start URL ?
1567:                if (currURLStr.startsWith(startDir)) {
1568:                    return true;
1569:                }
1570:
1571:                // on the same host ?
1572:                if (allowWholeHost
1573:                        && (currURL.getHost().equalsIgnoreCase(startURL
1574:                                .getHost()))) {
1575:                    return true;
1576:                }
1577:
1578:                // on the same host with flexible test (host name with and without "www.")
1579:                if (flexibleHostCheck) {
1580:                    if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
1581:                        return true;
1582:                    }
1583:                }
1584:
1585:                // allow whole domain ?
1586:                if (allowWholeDomain) {
1587:                    if (currHost.endsWith(getDomain(startHost))) {
1588:                        return true;
1589:                    }
1590:                }
1591:
1592:                // in the list of allowed URLs ?
1593:                for (int i = 0; i < allowedURLs.size(); i++) {
1594:                    String s = (String) allowedURLs.elementAt(i);
1595:                    if (currURLStr.startsWith(s)) {
1596:                        return true;
1597:                    }
1598:                }
1599:                log.debug("URL " + currURLStr + " not allowed");
1600:                return false;
1601:            }
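            // Worked example (illustrative values, not from the original
            // source): with startURL http://www.example.com/docs/ and all
            // flags false,
            //   http://www.example.com/docs/a.html -> allowed (below start URL)
            //   http://www.example.com/            -> denied
            //   http://example.com/docs/a.html     -> denied unless
            //                                         flexibleHostCheck is set
            //   http://ftp.example.com/x.html      -> denied unless
            //                                         allowWholeDomain is set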
1602:
1603:            /**
1604:             * remove a leading www. from a given hostname
1605:             * 
1606:             * @param hostname some hostname
1607:             * @return the hostname if it doesn't start with "www." otherwise
1608:             *  the hostname without the leading www.
1609:             */
1610:            private String cutWWW(String hostname) {
1611:                if (hostname.toLowerCase().startsWith("www.")) {
1612:                    return hostname.substring(4);
1613:                } else {
1614:                    return hostname;
1615:                }
1616:            }
1617:
1618:            /** 
1619:             * Gets the domain name of a given host by deleting everything up to
1620:             * and including the first "." (e.g. "www.example.com" -> "example.com").
1621:             *
1622:             * @param hostname some hostname
1623:             * @return the domain part of this hostname
1624:             */
1625:            private String getDomain(String hostname) {
1626:                int pos = hostname.indexOf(".");
1627:                if (pos < 0) {
1628:                    // this should not happen !
1629:                    return hostname;
1630:                } else {
1631:                    return hostname.substring(pos + 1);
1632:                }
1633:            }
1634:
1635:            /**
1636:             * Method getExceptionHandler.
1637:             * @return RobotExceptionHandler the exceptionhandler of the robot
1638:             */
1639:            public RobotExceptionHandler getExceptionHandler() {
1640:                return exceptionHandler;
1641:            }
1642:
1643:            /**
1644:             * Method setExceptionHandler.
1645:             * sets the exceptionhandler of the robot
1646:             * @param newExceptionHandler the new exception handler
1647:             */
1648:            public void setExceptionHandler(
1649:                    RobotExceptionHandler newExceptionHandler) {
1650:                if (newExceptionHandler != null) {
1651:                    exceptionHandler = newExceptionHandler;
1652:                }
1653:            }
1654:
1655:            /**
1656:             * Method setStart.
1657:             * sets the start URL 
1658:             * @param startURL the start URL as String
1659:             */
1660:            public void setStart(String startURL) {
1661:                try {
1662:                    setStartURL(new URL(startURL));
1663:                } catch (MalformedURLException e) {
1664:                    e.printStackTrace();
1665:                }
1666:            }
1667:
1668:            /**
1669:             * Method getStart.
1670:             * gets the start url as string
1671:             * @return String
1672:             */
1673:            public String getStart() {
1674:                URL url = getStartURL();
1675:                if (url != null) {
1676:                    return url.toExternalForm();
1677:                } else {
1678:                    return null;
1679:                }
1680:            }
1681:
1682:            /**
1683:             * This method finishes HttpTool, NoRobots, HttpDocManager.
1684:             */
1685:            public void finish() {
1686:                if (httpTool != null) {
1687:                    httpTool.finish();
1688:                }
1689:                if (robCheck != null) {
1690:                    robCheck.finish();
1691:                }
1692:                if (docManager != null) {
1693:                    docManager.finish();
1694:                }
1695:            }
1696:
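            // Note: main() below appears to be a development helper rather
            // than a crawler entry point; it prints one "robot.<field> =
            // <field>;" line per non-static, non-final field, presumably for
            // maintaining a copy method by hand.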
1697:            public static void main(String[] args) {
1698:                if (args.length > 0)
1699:                    System.err.println("Arguments will be ignored!");
1700:                Field[] fields = WebRobot.class.getDeclaredFields();
1701:                StringBuffer str = new StringBuffer(60);
1702:                for (int i = 0; i < fields.length; i++) {
1703:                    if (!Modifier.isFinal(fields[i].getModifiers())
1704:                            && !Modifier.isStatic(fields[i].getModifiers())) {
1705:                        str.delete(0, str.length());
1706:                        str.append("		robot." + fields[i].getName() + " = "
1707:                                + fields[i].getName() + ";");
1708:                        while (str.length() < 50) {
1709:                            str.append(" ");
1710:                        }
1711:                        System.out.println(str.toString() + "// ("
1712:                                + fields[i].getType().getName() + ")");
1713:                    }
1714:                }
1715:            }
1716:
1717:            /** default expected count of documents */
1718:            private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
1719:
1720:            /** expected count of documents */
1721:            protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
1722:
1723:            /** remember visited content here (md5, urlString) */
1724:            protected HashMap content2UrlMap;
1725:
1726:            /**  counter for pages that were found in cache */
1727:            long countCache = 0;
1728:
1729:            /** counter for pages retrieved by web */
1730:            long countWeb = 0;
1731:
1732:            /** counter for pages that didn't need a refresh */
1733:            long countNoRefresh = 0;
1734:
1735:            /** counter for refreshed pages (=cache+web) */
1736:            long countRefresh = 0;
1737:
1738:            /**
1739:             * Method getContentVisitedURL.
1740:             * Checks if the content was visited before and retrieves the corresponding URL.
1741:             * @param doc the document whose content MD5 is looked up
1742:             * @return found url or null if not found
1743:             */
1744:            public String getContentVisitedURL(HttpDoc doc) {
1745:                Object key = doc.getContentMD5();
1746:                synchronized (content2UrlMap) {
1747:                    String url = (String) content2UrlMap.get(key);
1748:                    return url;
1749:                }
1750:            }
1751:
1752:            /**
1753:             * Method setContentVisitedURL.
1754:             * Makes an URL retrievable by its content by entering it in content2UrlMap.
1755:             * @param doc the document whose content MD5 is used as key
1756:             * @param url the URL to store for this content
1757:             */
1758:            public void setContentVisitedURL(HttpDoc doc, String url) {
1759:                Object key = doc.getContentMD5();
1760:                synchronized (content2UrlMap) {
1761:                    content2UrlMap.put(key, url);
1762:                }
1763:            }
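            // A minimal usage sketch of the content history (mirrors the
            // calls made during document processing above; variable names
            // are illustrative):
            //
            //   String duplicate = getContentVisitedURL(doc);
            //   if (duplicate == null) {
            //       setContentVisitedURL(doc, urlString); // first sighting
            //   } else {
            //       // identical content was already seen under 'duplicate'
            //   }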
1764:
1765:            private final RobotTask createRobotTask(URL url, int maxDepth,
1766:                    String startReferer) {
1767:                url = removeWasteParameters(url);
1768:                return new RobotTask(url, maxDepth, startReferer);
1769:            }
1770:
1771:            /** only true if form-handlers are defined */
1772:            boolean hasFormHandlers = false;
1773:
1774:            /** list of wasteParameters (will be removed from URLs) **/
1775:            protected Vector wasteParameters = new Vector();
1776:
1777:            /** 
1778:             * Sets the list of wasteParameters. URL parameters are removed
1779:             * if their name equals a string in this vector.
1780:             * @param wasteParameters parameter names to strip from URLs
1781:             */
1782:            public void setWasteParameters(Vector wasteParameters) {
1783:                this .wasteParameters = wasteParameters;
1784:            }
1785:
1786:            /**
1787:             * Gets the list of wasteParameters (will be removed from URLs)
1788:             * @return a Vector containing Strings
1789:             */
1790:            public Vector getWasteParameters() {
1791:                return this .wasteParameters;
1792:            }
1793:
1794:            /** Removes wasteParameters from an URL
1795:             * (e.g. ID parameters).
1796:             * @param url the URL to clean
1797:             * @return URL without the waste parameters
1798:             */
1799:            public URL removeWasteParameters(URL url) {
1800:                String urlString = url.toExternalForm();
1801:                String newUrlString = removeParametersFromString(urlString,
1802:                        wasteParameters);
1803:                // compare by value, not by reference
1804:                if (!urlString.equals(newUrlString)) {
1805:                    try {
1806:                        url = new URL(newUrlString);
1807:                    } catch (MalformedURLException ex) {
1808:                        ex.printStackTrace();
1809:                    }
1810:                }
1811:                return url;
1812:            }
1813:
1814:            /**
1815:             * Removes the passed parameters from a URL string.
1816:             * @param urlString the URL to filter
1817:             * @param wasteParameters names of the parameters to remove
1818:             * @return the filtered URL string
1819:             */
1820:            public static String removeParametersFromString(String urlString,
1821:                    Vector wasteParameters) {
1822:                if (wasteParameters != null && wasteParameters.size() > 0) {
1823:                    int questionMark = urlString.indexOf("?");
1824:                    if (questionMark > 0 && questionMark < urlString.length()) {
1825:                        int restPosition = urlString.indexOf("#", questionMark);
1826:                        String parameters;
1827:                        String rest;
1828:                        if (restPosition < 0) {
1829:                            parameters = urlString.substring(questionMark + 1);
1830:                            rest = null;
1831:                        } else {
1832:                            parameters = urlString.substring(questionMark + 1,
1833:                                    restPosition);
1834:                            rest = urlString.substring(restPosition);
1835:                        }
1836:
1837:                        StringBuffer filteredUrl = new StringBuffer(urlString
1838:                                .substring(0, questionMark));
1839:                        StringTokenizer tokenizer = new StringTokenizer(
1840:                                parameters, "&");
1841:                        String and = "?";
1842:                        boolean changed = false;
1843:                        while (tokenizer.hasMoreTokens()) {
1844:                            String token = tokenizer.nextToken();
1845:                            boolean keep = true;
1846:                            for (int w = 0; w < wasteParameters.size(); w++) {
1847:                                String wasteParameter = (String) wasteParameters
1848:                                        .elementAt(w);
1849:                                if (token.startsWith(wasteParameter + "=")) {
1850:                                    keep = false;
1851:                                    changed = true;
1852:                                    break;
1853:                                }
1854:                            }
1855:                            if (keep) {
1856:                                filteredUrl.append(and);
1857:                                filteredUrl.append(token);
1858:                                and = "&";
1859:                            }
1860:                        }
1861:                        if (rest != null)
1862:                            filteredUrl.append(rest);
1863:                        if (changed) {
1864:                            urlString = filteredUrl.toString();
1865:                        }
1866:                    }
1867:                }
1868:                return urlString;
1869:            }
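            // Worked example (illustrative values): with wasteParameters
            // containing "sessionid",
            //   removeParametersFromString(
            //       "http://host/page?a=1&sessionid=xyz&b=2#top", wasteParameters)
            // returns "http://host/page?a=1&b=2#top". When nothing matches,
            // the original String instance is returned unchanged.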
1870:
1871:            /** time of WebRobot start in milliseconds */
1872:            protected long startTime = System.currentTimeMillis();
1873:
1874:            /** number of allowed retries for document retrieval */
1875:            protected int maxRetries = 0;
1876:
1877:            /**
1878:             * Set allowed retries for document retrieval
1879:             * @param maxRetries
1880:             */
1881:            public void setMaxRetries(int maxRetries) {
1882:                this .maxRetries = maxRetries;
1883:            }
1884:
1885:            /**
1886:             * Get allowed retries for document retrieval
1887:             * @return maxRetries
1888:             */
1889:            public int getMaxRetries() {
1890:                return maxRetries;
1891:            }
1892:
1893:            /** 
1894:             * expiration age of documents in cache.
1895:             * Documents older than expirationAge will be removed,
1896:             * negative value means no limit. 
1897:             */
1898:            protected long expirationAge = -1;
1899:
1900:            /**
1901:             * set expiration age of documents in cache.
1902:             * Documents older than expirationAge will be removed,
1903:             * negative value means no limit. 
1904:             * @param age
1905:             */
1906:            public void setExpirationAge(long age) {
1907:                expirationAge = age;
1908:            }
1909:
1910:            /**
1911:             * get expiration age of documents in cache.
1912:             * @return long
1913:             */
1914:            public long getExpirationAge() {
1915:                return expirationAge;
1916:            }
1917:
1918:            /**
1919:             * Remove Parameters from Url
1920:             * @param url
1921:             * @return url without parameters
1922:             */
1923:            private final static String removeParameters(String url) {
1924:                int pos = url.indexOf("?");
1925:                return pos >= 0 ? url.substring(0, pos) : url;
1926:            }
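            // e.g. removeParameters("http://host/p?x=1") returns "http://host/p"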
1927:
1928:            /**
1929:             * Reads a file completely into a byte array.
1930:             * @param file the file to read
1931:             * @return byte[] the file content
1932:             * @throws IOException if reading fails
1933:             */
1934:            protected byte[] readFileToByteArray(File file) throws IOException {
1935:                FileInputStream in = null;
1936:
1937:                try {
1938:                    byte[] buffer = new byte[(int) file.length()];
1939:                    in = new FileInputStream(file);
1940:                    // a single read() may return fewer bytes than requested,
1941:                    // so loop until the buffer is full or EOF is reached
1942:                    int offset = 0;
1943:                    while (offset < buffer.length) {
1944:                        int count = in.read(buffer, offset, buffer.length - offset);
1945:                        if (count < 0) {
1946:                            break;
1947:                        }
1948:                        offset += count;
1949:                    }
1950:                    return buffer;
1951:                } finally {
1952:                    if (in != null) {
1953:                        try {
1954:                            in.close();
1955:                        } catch (IOException e) {
1956:                        }
1957:                    }
1958:                }
1959:            }
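            // On Java 7+ the read loop above could presumably be replaced by
            // java.nio.file.Files.readAllBytes(file.toPath()); the manual
            // loop matches the older Java versions this code targets.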
1960:
1961:        }