package net.javacoding.jspider.core.rule.impl;

import net.javacoding.jspider.api.model.Decision;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.model.DecisionInternal;
import net.javacoding.jspider.core.model.SiteInternal;
import net.javacoding.jspider.core.util.URLUtil;
import net.javacoding.jspider.core.util.html.RobotsTXTLine;
import net.javacoding.jspider.core.util.html.RobotsTXTLineSet;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

/**
 * Rule implementation that applies the rules expressed by a site's robots.txt
 * file to the resources we want to fetch on that site.
 * A robots.txt file allows webmasters to exclude certain resources and folders
 * from being spidered by web robots, to prevent inclusion in search engines, etc.
 *
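 * <p>
 * Minimal usage sketch (illustrative only: the robots.txt URL, the user agent
 * string, and the <code>context</code> and <code>site</code> variables are
 * assumed to be obtained elsewhere):
 * <pre>
 *   InputStream robotsTxt = new URL("http://example.com/robots.txt").openStream();
 *   RobotsTXTRule rule = new RobotsTXTRule("JSpider", robotsTxt);
 *   Decision decision = rule.apply(context, site, new URL("http://example.com/private/page.html"));
 * </pre>
 *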
 * $Id: RobotsTXTRule.java,v 1.13 2003/03/28 17:26:28 vanrogu Exp $
 *
 * @author Günther Van Roey
 */
public class RobotsTXTRule extends BaseRuleImpl {

    /** user agent under which we're operating. */
    protected String effectiveUserAgent;

    /** user agent in the robots.txt file we're obeying. */
    protected String obeyedUserAgent;

    /** all lines in the robots.txt file that apply to us and forbid access to a part of the site. */
    protected RobotsTXTLine[] forbiddenPaths;

    /**
     * Public constructor.
     * @param userAgent the user agent under which we're operating
     * @param is the input stream to read the robots.txt file from
     * @throws IOException in case something goes wrong reading the robots.txt
     */
    public RobotsTXTRule(String userAgent, InputStream is) throws IOException {
        RobotsTXTLineSet lineSet = RobotsTXTLineSet.findLineSet(is, userAgent);
        this.effectiveUserAgent = userAgent;
        if (lineSet == null) {
            // no user-agent record applies to us, so nothing is forbidden
            this.obeyedUserAgent = null;
            forbiddenPaths = new RobotsTXTLine[0];
        } else {
            this.obeyedUserAgent = lineSet.getUserAgent();
            forbiddenPaths = lineSet.getLines();
        }
    }

    /**
     * Returns the user agent from the robots.txt file we're obeying (can be '*').
     * This user agent identification is the first match we encountered in the file;
     * a match is found when our effective user agent contains the user agent
     * identification as a substring, compared case-insensitively.
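     * <p>
     * Illustrative example (hypothetical robots.txt content; the returned value
     * follows from the substring match described above):
     * <pre>
     *   // robots.txt:
     *   //   User-agent: jspider
     *   //   Disallow: /private
     *   RobotsTXTRule rule = new RobotsTXTRule("JSpider/1.0", robotsTxtStream);
     *   rule.getObeyedUserAgent();   // "jspider" (the matched identification from the file)
     * </pre>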
     * @return the user agent selector we're obeying.
     */
    public String getObeyedUserAgent() {
        return obeyedUserAgent;
    }

    /**
     * Applies the rule to a given URL.
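     * <p>
     * Illustrative outcome sketch (assumes the site is configured to obey
     * robots.txt and that an applicable Disallow line matches this path):
     * <pre>
     *   Decision decision = rule.apply(context, site, new URL("http://example.com/private/data.html"));
     *   // decision carries Decision.RULE_FORBIDDEN; for non-matching URLs a
     *   // neutral decision is returned instead.
     * </pre>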
     * @param context the spider context we're working in
     * @param currentSite the site we're spidering
     * @param url the URL of the resource to be tested for spider permission
     * @return Decision object expressing this rule's decision on the resource
     */
    public Decision apply(SpiderContext context, Site currentSite, URL url) {
        String path = url.getPath();
        Decision decision = new DecisionInternal();

        // only enforce the forbidden paths if the site is configured to obey robots.txt
        if (context.getStorage().getSiteDAO().find(URLUtil.getSiteURL(url)).getObeyRobotsTXT()) {
            // forbid the resource as soon as one applicable 'Disallow' line matches the URL
            for (int i = 0; i < forbiddenPaths.length; i++) {
                RobotsTXTLine forbiddenPath = forbiddenPaths[i];
                if (forbiddenPath.matches(url)) {
                    decision = new DecisionInternal(Decision.RULE_FORBIDDEN, "access to '" + path + "' forbidden");
                    break;
                }
            }
        }
        return decision;
    }

}