// NoRobots - implements the Robot Exclusion Standard
//
// Copyright (C)1996,1998 by Jef Poskanzer <jef@acme.com>.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.

/**
 * Moved to the net.matuschek.spider package by Daniel Matuschek,
 * with some minimal modifications to use HttpTool for the retrieval
 * of robots.txt.
 */
package net.matuschek.spider;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.log4j.Category;

import net.matuschek.http.*;

/**
 * Implements the Robot Exclusion Standard.
 * <p>
 * The basic idea of the Robot Exclusion Standard is that each web server
 * can set up a single file called "/robots.txt" which contains pathnames
 * that robots should not look at.
 * See <a href="http://www.robotstxt.org/wc/norobots.html">the full spec</a>
 * for details.
 * <p>
 * Using this class is very simple: you create the object using your robot's
 * name and the HttpTool used to retrieve the data, and then you call
 * {@link #ok} on each URL. For efficiency, the class caches entries for
 * servers you have visited recently.
 * <p>
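 * A minimal usage sketch (the robot name and URL here are placeholders;
 * any further HttpTool configuration is assumed to happen elsewhere):
 * <pre>
 *     HttpTool tool = new HttpTool();
 *     tool.setAgentName("MyRobot");
 *     NoRobots robots = new NoRobots("MyRobot", tool);
 *     URL url = new URL("http://example.com/some/page.html");
 *     if (robots.ok(url)) {
 *         // allowed to fetch url
 *     }
 * </pre>
 *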
 * @author cn
 * @version 0.1
 */
public class NoRobots {

    Category log = Category.getInstance(getClass().getName());

    // The file with the robot rules in it.
    private static final String robotFile = "/robots.txt";

    // The name of this robot.
    private String robotName;

    // A table of all the servers we have visited recently.
    private Hashtable servers = new net.matuschek.util.LruHashtable(500);

    // Tool to get /robots.txt.
    private HttpTool httpTool;

    // If true, robots.txt is not checked at all.
    private boolean ignore = false;
    /**
     * Constructor.
     * @param robotName the name of the robot
     * @param inhttpTool the HttpTool instance for downloading the robot file
     */
    public NoRobots(String robotName, HttpTool inhttpTool) {
        this.robotName = robotName;
        this.httpTool = inhttpTool;
        /*
        this.httpTool = new HttpTool();
        httpTool.setAgentName(inhttpTool.getAgentName());
        try {
            httpTool.setProxy(inhttpTool.getProxy());
        } catch (HttpException e) {
            // ignore
        }
        */
    }

    /**
     * Checks whether it is okay for this robot to fetch the given URL.
     * Reads the information in the robots.txt file on the URL's host. If a
     * robots.txt file is there and it disallows this robot from retrieving
     * the requested URL, the method returns false.
     * @param url the URL we want to retrieve
     * @return boolean true if allowed to retrieve the URL, false otherwise
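     * <p>
     * For example (hypothetical host and rules, for illustration only):
     * if http://example.com/robots.txt contains
     * <pre>
     *     User-agent: *
     *     Disallow: /private
     * </pre>
     * then ok() returns false for http://example.com/private/a.html
     * and true for http://example.com/index.html.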
     */
    public boolean ok(URL url) {
        // If ignore is set to true, this check always returns true.
        if (ignore) {
            return true;
        }

        String protocol = url.getProtocol();
        String host = url.getHost();
        int port = url.getPort();
        if (port == -1) {
            // No explicit port given; assume the standard HTTP port.
            port = 80;
        }

        String file = url.getFile();

        // The URL is disallowed if its path starts with any of the
        // Disallow prefixes recorded for this server.
        Vector disallows = getDisallows(protocol, host, port);
        Enumeration en = disallows.elements();
        while (en.hasMoreElements()) {
            String pattern = (String) en.nextElement();
            if (file.startsWith(pattern))
                return false;
        }
        return true;
    }

    /**
     * Method getDisallows.
     * Gets the disallow list for the given server. If it is not already in
     * the servers hash table, we fetch it, parse it, and save it.
     * @param protocol the protocol of the server (e.g. "http")
     * @param host the hostname of the server
     * @param port the port of the server
     * @return Vector the disallowed path prefixes for this robot
     */
    private Vector getDisallows(String protocol, String host, int port) {
        String key = protocol + "://" + host + ":" + port;
        Vector disallows = (Vector) servers.get(key);
        if (disallows != null)
            return disallows;

        disallows = new Vector();
        try {
            URL robotUrl = new URL(protocol, host, port, robotFile);
            try {

                // Get the document.
                log.debug("Retrieving robot file '" + robotUrl + "'.");
                httpTool.setReferer("-");
                String robotsFile = "";
                try {
                    HttpDoc doc = httpTool.retrieveDocument(robotUrl,
                            HttpConstants.GET, "");
                    // doc may be null if the document could not be retrieved
                    if (doc != null && doc.isOk()) {
                        robotsFile = new String(doc.getContent());
                    }
                } catch (HttpException e) {
                    // Ignore HTTP errors; an unreadable robots.txt
                    // means no restrictions.
                    log.info("Cannot read robots.txt: " + e.getMessage());
                }
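
                // Parse the file record by record. A robots.txt file
                // typically looks like this (example rules, for
                // illustration only):
                //
                //     User-agent: *      # record applying to all robots
                //     Disallow: /cgi-bin
                //     Disallow: /tmp
                //
                // Records are separated by blank lines, and '#' starts a
                // comment. Only records whose User-agent line matches this
                // robot's name contribute Disallow entries.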
                BufferedReader robotReader = new BufferedReader(
                        new StringReader(robotsFile));
                boolean userAgentIsMe = false;
                while (true) {
                    String line = robotReader.readLine();
                    if (line == null)
                        break;
                    line = line.trim();

                    // Completely ignore lines that are just a comment - they
                    // don't even separate records.
                    if (line.startsWith("#"))
                        continue;

                    // Trim off any other comments.
                    int cmt = line.indexOf('#');
                    if (cmt != -1)
                        line = line.substring(0, cmt).trim();

                    if (line.length() == 0)
                        // A blank line ends the current record.
                        userAgentIsMe = false;
                    else if (line.toLowerCase().startsWith("user-agent:")) {
                        if (!userAgentIsMe) {
                            String value = line.substring(11).trim();
                            if (match(value, robotName))
                                userAgentIsMe = true;
                        }
                    } else if (line.toLowerCase().startsWith("disallow:")) {
                        if (userAgentIsMe) {
                            String value = line.substring(9).trim();
                            // An empty Disallow value means "allow
                            // everything", so only record non-empty
                            // path prefixes.
                            if (value.length() > 0)
                                disallows.addElement(value);
                        }
                    }
                }
            } catch (IOException ignore) {
                // Reading from an in-memory string should not fail;
                // treat any error as "no restrictions".
            }
        } catch (MalformedURLException ignore) {
            // An unbuildable robots.txt URL means no restrictions.
        }

        servers.put(key, disallows);
        return disallows;
    }

    /**
     * Method match.
     * Checks whether a string matches a given wildcard pattern.
     * Only handles ? and *, plus multiple patterns separated by |.
     * @param pattern the wildcard pattern
     * @param string the string to check against the pattern
     * @return boolean true if the string matches the pattern
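     * <p>
     * A few illustrative calls (hypothetical inputs):
     * <pre>
     *     match("web*", "webcrawler")        // true
     *     match("spider?|web*", "spider2")   // true
     *     match("spider", "webcrawler")      // false
     * </pre>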
     */
    protected static boolean match(String pattern, String string) {
        for (int p = 0;; ++p) {
            for (int s = 0;; ++p, ++s) {
                boolean sEnd = (s >= string.length());
                boolean pEnd = (p >= pattern.length()
                        || pattern.charAt(p) == '|');
                if (sEnd && pEnd)
                    return true;
                if (sEnd || pEnd)
                    break;
                if (pattern.charAt(p) == '?')
                    continue;
                if (pattern.charAt(p) == '*') {
                    int i;
                    ++p;
                    for (i = string.length(); i >= s; --i)
                        if (match(pattern.substring(p),
                                string.substring(i))) /* not quite right */
                            return true;
                    break;
                }
                if (pattern.charAt(p) != string.charAt(s))
                    break;
            }
            p = pattern.indexOf('|', p);
            if (p == -1)
                return false;
        }
    }

    /**
     * Method getIgnore.
     * Tells whether the robot exclusion standard is ignored.
     * @return boolean true if the check on robots.txt is not done
     */
    public boolean getIgnore() {
        return ignore;
    }

    /**
     * Method setIgnore.
     * Enables or disables checking of the robot exclusion standard.
     * @param ignore if ignore is true then the robot exclusion standard is
     * ignored
     */
    public void setIgnore(boolean ignore) {
        this.ignore = ignore;
    }

    /**
     * Finishes the underlying HttpTool.
     */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
    }
}