001: /* ExternalGeoLocationDecideRule
002: *
003: * Created on May 25, 2005
004: *
005: * Copyright (C) 2005 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.deciderules;
024:
025: import java.net.InetAddress;
026: import java.net.UnknownHostException;
027: import java.util.logging.Level;
028: import java.util.logging.Logger;
029:
030: import org.apache.commons.httpclient.URIException;
031: import org.archive.crawler.datamodel.CandidateURI;
032: import org.archive.crawler.datamodel.CrawlHost;
033: import org.archive.crawler.settings.SimpleType;
034: import org.xbill.DNS.Address;
035:
036: /**
037: * A rule that can be configured to take alternate implementations
038: * of the ExternalGeoLocationInterface.
039: * If no implementation specified, or none found, returns configured decision.
040: * If host in URI has been resolved checks CrawlHost for the country code
041: * determination.
042: * If country code is not present, does country lookup, and saves the country
043: * code to <code>CrawlHost</code> for future consultation.
044: * If country code is present in <code>CrawlHost</code>, compares it against
045: * the configured code.
046: * Note that if a host's IP address changes during the crawl, we still consider
047: * the associated hostname to be in the country of its original IP address.
048: *
049: * @author Igor Ranitovic
050: */
051: public class ExternalGeoLocationDecideRule extends PredicatedDecideRule {
052:
053: private static final long serialVersionUID = -32974116429860725L;
054:
055: private static final Logger LOGGER = Logger
056: .getLogger(ExternalGeoLocationDecideRule.class.getName());
057: static final String ATTR_IMPLEMENTATION = "implementation-class";
058: static final String ATTR_COUNTRY_CODE = "country-code";
059: static final String DEFAULT_COUNTRY_CODE = "--";
060: private String countryCode;
061: private ExternalGeoLookupInterface implementation = null;
062:
063: /**
064: * @param name Name of this rule.
065: */
066: public ExternalGeoLocationDecideRule(String name) {
067: super (name);
068: setDescription("ExternalGeoLocationDecideRule. Rule that "
069: + "instantiates implementations of the ExternalGeoLookupInterface. "
070: + "The implementation needs to be present on the classpath. "
071: + "On initialization, the implementation is instantiated ("
072: + "assumption is that there is public constructor that takes +"
073: + "country code).");
074: addElementToDefinition(new SimpleType(ATTR_IMPLEMENTATION,
075: "Name of implementation of ExternalGeoLookupInterface class to "
076: + "instantiate.", ""));
077: addElementToDefinition(new SimpleType(ATTR_COUNTRY_CODE,
078: "Country code name.", ""));
079:
080: }
081:
082: protected boolean evaluate(Object obj) {
083: ExternalGeoLookupInterface impl = getConfiguredImplementation(obj);
084: if (impl == null) {
085: return false;
086: }
087: CrawlHost crawlHost = null;
088: String host;
089: InetAddress address;
090: try {
091: if (obj instanceof CandidateURI) {
092: host = ((CandidateURI) obj).getUURI().getHost();
093: crawlHost = getSettingsHandler().getOrder()
094: .getController().getServerCache().getHostFor(
095: host);
096: if (crawlHost.getCountryCode() != null) {
097: return (crawlHost.getCountryCode()
098: .equals(countryCode)) ? true : false;
099: }
100: address = crawlHost.getIP();
101: if (address == null) {
102: address = Address.getByName(host);
103: }
104: crawlHost.setCountryCode((String) impl.lookup(address));
105: if (crawlHost.getCountryCode().equals(countryCode)) {
106: LOGGER.fine("Country Code Lookup: " + " " + host
107: + crawlHost.getCountryCode());
108: return true;
109: }
110: }
111: } catch (UnknownHostException e) {
112: LOGGER.log(Level.FINE, "Failed dns lookup " + obj, e);
113: if (crawlHost != null) {
114: crawlHost.setCountryCode(DEFAULT_COUNTRY_CODE);
115: }
116: } catch (URIException e) {
117: LOGGER
118: .log(Level.FINE, "Failed to parse hostname " + obj,
119: e);
120: }
121:
122: return false;
123: }
124:
125: /**
126: * Get implementation, if one specified. If none specified, will keep trying
127: * to find one. Will be messy if the provided class is not-instantiable
128: *
129: * @param o A context object.
130: * @return Instance of <code>ExternalGeoLookupInterface</code> or null.
131: */
132: protected synchronized ExternalGeoLookupInterface getConfiguredImplementation(
133: Object o) {
134: if (this .implementation != null) {
135: return this .implementation;
136: }
137: ExternalGeoLookupInterface result = null;
138: try {
139: String className = (String) getAttribute(o,
140: ATTR_IMPLEMENTATION);
141: countryCode = (String) getAttribute(o, ATTR_COUNTRY_CODE);
142: if (className != null && className.length() != 0) {
143: Object obj = Class.forName(className).getConstructor(
144: new Class[] { String.class }).newInstance(
145: new Object[] { countryCode });
146: if (!(obj instanceof ExternalGeoLookupInterface)) {
147: LOGGER
148: .severe("Implementation "
149: + className
150: + " does not implement ExternalGeoLookupInterface");
151: }
152: result = (ExternalGeoLookupInterface) obj;
153: this .implementation = result;
154: }
155: } catch (Exception e) {
156: LOGGER.severe(e.getMessage());
157: }
158: return result;
159: }
160: }
|