001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * RobotsHonoringPolicy.java
020: * Created on Oct 30, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.datamodel;
025:
026: import java.util.logging.Logger;
027:
028: import javax.management.AttributeNotFoundException;
029:
030: import org.archive.crawler.settings.CrawlerSettings;
031: import org.archive.crawler.settings.ModuleType;
032: import org.archive.crawler.settings.SimpleType;
033: import org.archive.crawler.settings.StringList;
034: import org.archive.crawler.settings.TextField;
035:
036: /**
 * RobotsHonoringPolicy represents the strategy used by the crawler
038: * for determining how robots.txt files will be honored.
039: *
040: * Five kinds of policies exist:
041: * <dl>
042: * <dt>classic:</dt>
043: * <dd>obey the first set of robots.txt directives that apply to your
044: * current user-agent</dd>
045: * <dt>ignore:</dt>
046: * <dd>ignore robots.txt directives entirely</dd>
047: * <dt>custom:</dt>
048: * <dd>obey a specific operator-entered set of robots.txt directives
049: * for a given host</dd>
050: * <dt>most-favored:</dt>
051: * <dd>obey the most liberal restrictions offered (if *any* crawler is
052: * allowed to get a page, get it)</dd>
053: * <dt>most-favored-set:</dt>
054: * <dd>given some set of user-agent patterns, obey the most liberal
055: * restriction offered to any</dd>
056: * </dl>
057: *
 * The last two have the option of adopting a different user-agent
 * to reflect the restrictions we've opted to use.
060: *
061: * @author John Erik Halse
062: *
063: */
064: public class RobotsHonoringPolicy extends ModuleType {
065:
066: private static final long serialVersionUID = 8850011643923116605L;
067:
068: private static Logger logger = Logger
069: .getLogger("org.archive.crawler.datamodel.RobotsHonoringPolicy");
070:
071: public final static int CLASSIC = 0;
072: public final static int IGNORE = 1;
073: public final static int CUSTOM = 2;
074: public final static int MOST_FAVORED = 3;
075: public final static int MOST_FAVORED_SET = 4;
076:
077: public final static String ATTR_NAME = "robots-honoring-policy";
078: public final static String ATTR_TYPE = "type";
079: public final static String ATTR_MASQUERADE = "masquerade";
080: public final static String ATTR_CUSTOM_ROBOTS = "custom-robots";
081: public final static String ATTR_USER_AGENTS = "user-agents";
082:
083: /**
084: * Creates a new instance of RobotsHonoringPolicy.
085: *
086: * @param name the name of the RobotsHonoringPolicy attirubte.
087: */
088: public RobotsHonoringPolicy(String name) {
089: super (name, "Robots honoring policy");
090:
091: String[] allowedTypes = new String[] { "classic", "ignore",
092: "custom", "most-favored", "most-favored-set" };
093:
094: addElementToDefinition(new SimpleType(
095: ATTR_TYPE,
096: "Policy type. The 'classic' policy simply obeys all "
097: + "robots.txt rules for the configured user-agent. The "
098: + "'ignore' policy ignores all robots rules. The 'custom' "
099: + "policy allows you to specify a policy, in robots.txt "
100: + "format, as a setting. The 'most-favored' policy will "
101: + "crawl an URL if the robots.txt allows any user-agent to "
102: + "crawl it. The 'most-favored-set' policy requires you "
103: + "to supply an list of alternate user-agents, and for"
104: + "every page, if any agent of the set is allowed, the"
105: + "page will be crawled.", "classic",
106: allowedTypes));
107: addElementToDefinition(new SimpleType(
108: ATTR_MASQUERADE,
109: "Should we masquerade as another user agent when obeying "
110: + "the rules declared for it. Only relevant if the "
111: + "policy type is 'most-favored' or 'most-favored-set'.",
112: new Boolean(false)));
113: addElementToDefinition(new SimpleType(ATTR_CUSTOM_ROBOTS,
114: "Custom robots to use if policy type is 'custom'. "
115: + "Compose as if an actual robots.txt file.",
116: new TextField("")));
117: addElementToDefinition(new StringList(ATTR_USER_AGENTS,
118: "Alternate user-agent values to consider using for "
119: + "the 'most-favored-set' policy."));
120: }
121:
122: public RobotsHonoringPolicy() {
123: this (ATTR_NAME);
124: }
125:
126: /**
127: * If policy-type is most favored crawler of set, then this method
128: * gets a list of all useragents in that set.
129: *
130: * @return List of Strings with user agents
131: */
132: public StringList getUserAgents(CrawlerSettings settings) {
133: if (isType(settings, RobotsHonoringPolicy.MOST_FAVORED_SET)) {
134: try {
135: return (StringList) getAttribute(settings,
136: ATTR_USER_AGENTS);
137: } catch (AttributeNotFoundException e) {
138: logger.severe(e.getMessage());
139: }
140: }
141: return null;
142: }
143:
144: /**
145: * This method returns true if the crawler should masquerade as the user agent
146: * which restrictions it opted to use.
147: *
148: * (Only relevant for policy-types: most-favored and most-favored-set).
149: *
150: * @return true if we should masquerade
151: */
152: public boolean shouldMasquerade(CrawlURI curi) {
153: try {
154: return ((Boolean) getAttribute(curi, ATTR_MASQUERADE))
155: .booleanValue();
156: } catch (AttributeNotFoundException e) {
157: logger.severe(e.getMessage());
158: return false;
159: }
160: }
161:
162: /**
163: * Get the supplied custom robots.txt
164: *
165: * @return String with content of alternate robots.txt
166: */
167: public String getCustomRobots(CrawlerSettings settings) {
168: if (isType(settings, RobotsHonoringPolicy.CUSTOM)) {
169: try {
170: return getAttribute(settings, ATTR_CUSTOM_ROBOTS)
171: .toString();
172: } catch (AttributeNotFoundException e) {
173: logger.severe(e.getMessage());
174: }
175: }
176: return null;
177: }
178:
179: /**
180: * Get the policy-type.
181: *
182: * @see #CLASSIC
183: * @see #IGNORE
184: * @see #CUSTOM
185: * @see #MOST_FAVORED
186: * @see #MOST_FAVORED_SET
187: *
188: * @return policy type
189: */
190: public int getType(Object context) {
191: int type = CLASSIC;
192: try {
193: String typeName = (String) getAttribute(context, "type");
194: if (typeName.equals("classic")) {
195: type = RobotsHonoringPolicy.CLASSIC;
196: } else if (typeName.equals("ignore")) {
197: type = RobotsHonoringPolicy.IGNORE;
198: } else if (typeName.equals("custom")) {
199: type = RobotsHonoringPolicy.CUSTOM;
200: } else if (typeName.equals("most-favored")) {
201: type = RobotsHonoringPolicy.MOST_FAVORED;
202: } else if (typeName.equals("most-favored-set")) {
203: type = RobotsHonoringPolicy.MOST_FAVORED_SET;
204: } else {
205: throw new IllegalArgumentException();
206: }
207: } catch (AttributeNotFoundException e) {
208: logger.severe(e.getMessage());
209: }
210: return type;
211: }
212:
213: /**
214: * Check if policy is of a certain type.
215: *
216: * @param o An object that can be resolved into a settings object.
217: * @param type the type to check against.
218: * @return true if the policy is of the submitted type
219: */
220: public boolean isType(Object o, int type) {
221: return type == getType(o);
222: }
223:
224: }
|