001: /*
002: * Title: RobotDecoratorMapper
003: * Description:
004: *
005: * This software is published under the terms of the OpenSymphony Software
006: * License version 1.1, of which a copy has been included with this
007: * distribution in the LICENSE.txt file.
008: */
009:
010: package com.opensymphony.module.sitemesh.mapper;
011:
012: import com.opensymphony.module.sitemesh.Config;
013: import com.opensymphony.module.sitemesh.Decorator;
014: import com.opensymphony.module.sitemesh.DecoratorMapper;
015: import com.opensymphony.module.sitemesh.Page;
016: import com.opensymphony.module.sitemesh.RequestConstants;
017:
018: import javax.servlet.http.HttpServletRequest;
019: import javax.servlet.http.HttpSession;
020: import java.util.Properties;
021:
022: /**
023: * The RobotDecoratorMapper will use the specified decorator when the requester
024: * is identified as a robot (also known as spider, crawler, ferret) of a search engine.
025: *
026: * <p>The name of this decorator should be supplied in the <code>decorator</code>
027: * property.</p>
028: *
029: * @author <a href="mailto:pathos@pandora.be">Mathias Bogaert</a>
030: * @version $Revision: 1.3 $
031: *
032: * @see com.opensymphony.module.sitemesh.DecoratorMapper
033: */
034: public class RobotDecoratorMapper extends AbstractDecoratorMapper {
035: private String decoratorName = null;
036:
037: /** All known robot hosts (list can be found <a href="http://www.spiderhunter.com">here</a>). */
038: private static final String[] botHosts = { "alltheweb.com",
039: "alta-vista.net", "altavista.com", "atext.com",
040: "euroseek.net", "excite.com", "fast-search.net",
041: "google.com", "googlebot.com", "infoseek.co.jp",
042: "infoseek.com", "inktomi.com", "inktomisearch.com",
043: "linuxtoday.com.au", "lycos.com", "lycos.com",
044: "northernlight.com", "pa-x.dec.com" };
045:
046: /**
047: * All known robot user-agent headers (list can be found
048: * <a href="http://www.robotstxt.org/wc/active.html">here</a>).
049: *
050: * <p>NOTE: To avoid bad detection:</p>
051: *
052: * <ul>
053: * <li>Robots with ID of 2 letters only were removed</li>
054: * <li>Robot called "webs" were removed</li>
055: * <li>directhit was changed in direct_hit (its real id)</li>
056: * </ul>
057: */
058: private static final String[] botAgents = { "acme.spider",
059: "ahoythehomepagefinder", "alkaline", "appie",
060: "arachnophilia", "architext", "aretha", "ariadne",
061: "aspider", "atn.txt", "atomz", "auresys", "backrub",
062: "bigbrother", "bjaaland", "blackwidow", "blindekuh",
063: "bloodhound", "brightnet", "bspider",
064: "cactvschemistryspider", "calif", "cassandra", "cgireader",
065: "checkbot", "churl", "cmc", "collective", "combine",
066: "conceptbot", "core", "cshkust", "cusco", "cyberspyder",
067: "deweb", "dienstspider", "diibot", "direct_hit", "dnabot",
068: "download_express", "dragonbot", "dwcp", "ebiness", "eit",
069: "emacs", "emcspider", "esther", "evliyacelebi", "fdse",
070: "felix", "ferret", "fetchrover", "fido", "finnish",
071: "fireball", "fish", "fouineur", "francoroute", "freecrawl",
072: "funnelweb", "gazz", "gcreep", "getbot", "geturl", "golem",
073: "googlebot", "grapnel", "griffon", "gromit", "gulliver",
074: "hambot", "harvest", "havindex", "hometown",
075: "wired-digital", "htdig", "htmlgobble",
076: "hyperdecontextualizer", "ibm", "iconoclast", "ilse",
077: "imagelock", "incywincy", "informant", "infoseek",
078: "infoseeksidewinder", "infospider", "inspectorwww",
079: "intelliagent", "iron33", "israelisearch", "javabee",
080: "jcrawler", "jeeves", "jobot", "joebot", "jubii",
081: "jumpstation", "katipo", "kdd", "kilroy", "ko_yappo_robot",
082: "labelgrabber.txt", "larbin", "legs", "linkscan",
083: "linkwalker", "lockon", "logo_gif", "lycos", "macworm",
084: "magpie", "mediafox", "merzscope", "meshexplorer",
085: "mindcrawler", "moget", "momspider", "monster", "motor",
086: "muscatferret", "mwdsearch", "myweb", "netcarta",
087: "netmechanic", "netscoop", "newscan-online", "nhse",
088: "nomad", "northstar", "nzexplorer", "occam", "octopus",
089: "orb_search", "packrat", "pageboy", "parasite", "patric",
090: "perignator", "perlcrawler", "phantom", "piltdownman",
091: "pioneer", "pitkow", "pjspider", "pka",
092: "plumtreewebaccessor", "poppi", "portalb", "puu", "python",
093: "raven", "rbse", "resumerobot", "rhcs", "roadrunner",
094: "robbie", "robi", "roverbot", "safetynetrobot", "scooter",
095: "search_au", "searchprocess", "senrigan", "sgscout",
096: "shaggy", "shaihulud", "sift", "simbot", "site-valet",
097: "sitegrabber", "sitetech", "slurp", "smartspider",
098: "snooper", "solbot", "spanner", "speedy", "spider_monkey",
099: "spiderbot", "spiderman", "spry", "ssearcher", "suke",
100: "sven", "tach_bw", "tarantula", "tarspider", "tcl",
101: "techbot", "templeton", "titin", "titan", "tkwww",
102: "tlspider", "ucsd", "udmsearch", "urlck", "valkyrie",
103: "victoria", "visionsearch", "voyager", "vwbot", "w3index",
104: "w3m2", "wanderer", "webbandit", "webcatcher", "webcopy",
105: "webfetcher", "webfoot", "weblayers", "weblinker",
106: "webmirror", "webmoose", "webquest", "webreader",
107: "webreaper", "websnarf", "webspider", "webvac", "webwalk",
108: "webwalker", "webwatch", "wget", "whowhere", "wmir",
109: "wolp", "wombat", "worm", "wwwc", "wz101", "xget",
110: "nederland.zoek" };
111:
112: public void init(Config config, Properties properties,
113: DecoratorMapper parent) throws InstantiationException {
114: super .init(config, properties, parent);
115: decoratorName = properties.getProperty("decorator");
116: }
117:
118: public Decorator getDecorator(HttpServletRequest request, Page page) {
119: Decorator result = null;
120:
121: if (decoratorName != null && isBot(request)) {
122: result = getNamedDecorator(request, decoratorName);
123: }
124:
125: return result == null ? super .getDecorator(request, page)
126: : result;
127: }
128:
129: /** Check if the current request came from a robot (also known as spider, crawler, ferret) */
130: private static boolean isBot(HttpServletRequest request) {
131: if (request == null)
132: return false;
133:
134: // force creation of a session
135: HttpSession session = request.getSession(true);
136:
137: if (Boolean.FALSE.equals(session
138: .getAttribute(RequestConstants.ROBOT))) {
139: return false;
140: } else if (Boolean.TRUE.equals(session
141: .getAttribute(RequestConstants.ROBOT))) {
142: // a key was found in the session indicating it is a robot
143: return true;
144: } else {
145: if ("robots.txt".indexOf(request.getRequestURI()) != -1) {
146: // there is a specific request for the robots.txt file, so we assume
147: // it must be a robot (only robots request robots.txt)
148:
149: // set a key in the session, so the next time we don't have to manually
150: // detect the robot again
151: session.setAttribute(RequestConstants.ROBOT,
152: Boolean.TRUE);
153: return true;
154: } else {
155: String userAgent = request.getHeader("User-Agent");
156:
157: if (userAgent != null && userAgent.trim().length() > 2) {
158: // first check for common user-agent headers, so that we can speed
159: // this thing up, hopefully clever spiders will not send a fake header
160: if (userAgent.indexOf("MSIE") != -1
161: || userAgent.indexOf("Gecko") != -1 // MSIE and Mozilla
162: || userAgent.indexOf("Opera") != -1
163: || userAgent.indexOf("iCab") != -1 // Opera and iCab (mac browser)
164: || userAgent.indexOf("Konqueror") != -1
165: || userAgent.indexOf("KMeleon") != -1 // Konqueror and KMeleon
166: || userAgent.indexOf("4.7") != -1
167: || userAgent.indexOf("Lynx") != -1) { // NS 4.78 and Lynx
168: // indicate this session is not a robot
169: session.setAttribute(RequestConstants.ROBOT,
170: Boolean.FALSE);
171: return false;
172: }
173:
174: for (int i = 0; i < botAgents.length; i++) {
175: if (userAgent.indexOf(botAgents[i]) != -1) {
176: // set a key in the session, so the next time we don't have to manually
177: // detect the robot again
178: session.setAttribute(
179: RequestConstants.ROBOT,
180: Boolean.TRUE);
181: return true;
182: }
183: }
184: }
185:
186: // detect the robot from the host or user-agent
187: String remoteHost = request.getRemoteHost(); // requires one DNS lookup
188:
189: // if the DNS server didn't return a hostname, getRemoteHost returns the
190: // IP address, which is ignored here (the last char is checked, because some
191: // remote hosts begin with the IP)
192: if (remoteHost != null
193: && remoteHost.length() > 0
194: && remoteHost.charAt(remoteHost.length() - 1) > 64) {
195: for (int i = 0; i < botHosts.length; i++) {
196: if (remoteHost.indexOf(botHosts[i]) != -1) {
197: // set a key in the session, so the next time we don't have to manually
198: // detect the robot again
199: session.setAttribute(
200: RequestConstants.ROBOT,
201: Boolean.TRUE);
202: return true;
203: }
204: }
205: }
206:
207: // remote host and user agent are not in the predefined list,
208: // so it must be an unknown robot or not a robot
209:
210: // indicate this session is not a robot
211: session.setAttribute(RequestConstants.ROBOT,
212: Boolean.FALSE);
213: return false;
214: }
215: }
216: }
217: }
|