/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * PreconditionEnforcer.java
 * Created on May 22, 2003
 *
 * $Header$
 */
package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
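 *
 * <p>Checks run in order: DNS resolution, scheme support (only http and
 * https proceed), robots.txt policy, then any configured login
 * credentials. The first unmet precondition defers the URI behind a
 * prerequisite or skips it to the postprocessor chain.
 *
 * <p>A sketch of where this processor typically sits in a crawl's
 * processor chains (illustrative order, not a literal configuration):
 * <pre>
 *   Preselector -&gt; PreconditionEnforcer -&gt; FetchDNS -&gt; FetchHTTP -&gt; ...
 * </pre>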
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor implements
        CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60 * 60 * 6); // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60 * 60 * 24); // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION =
        "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION =
        "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY =
        "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;

        e = addElementToDefinition(new SimpleType(
                ATTR_IP_VALIDITY_DURATION,
                "The minimum interval for which a dns-record will be considered "
                        + "valid (in seconds). "
                        + "If the record's DNS TTL is larger, that will be used instead.",
                DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
                ATTR_ROBOTS_VALIDITY_DURATION,
                "The time in seconds that fetched robots.txt information is "
                        + "considered to be valid. "
                        + "If the value is set to '0', then the robots.txt information"
                        + " will never expire.",
                DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
                ATTR_CALCULATE_ROBOTS_ONLY,
                "Whether to only calculate the robots status of a URI, "
                        + "without actually applying any exclusions found. If true, "
                        + "excluded URIs will only be annotated in the crawl.log, but "
                        + "still fetched. Default is false.",
                DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }

    protected void innerProcess(CrawlURI curi) {

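        // Each consider*()/credentialPrecondition() call below returns
        // true when it has taken over handling of the curi (deferred it
        // behind a prerequisite or marked it failed); in that case no
        // further checks apply here.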
        if (considerDnsPreconditions(curi)) {
            return;
        }

        // make sure we only process schemes we understand (i.e. not dns)
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PreconditionEnforcer doesn't understand URIs of scheme "
                    + scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }

        // OK, it's allowed

        // For all curis that will in fact be fetched, set appropriate delays.
        // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));

        return;
    }

    /**
     * Consider the robots precondition.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition or
     *         processing should be terminated for some other reason. False
     *         if we can proceed to process this URI.
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && uuri.getPath() != null
                    && curi.getUURI().getPath().equals("/robots.txt")) {
                // allow processing to continue
                curi.setPrerequisite(true);
                return false;
            }
        } catch (URIException e) {
            logger.severe("Failed getting path for " + curi);
        }
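        // (A /robots.txt fetch is itself the prerequisite, so above it is
        // let through rather than blocked waiting on robots information.)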
        // require /robots.txt if not present
        if (isRobotsExpired(curi)) {
            // Need to get robots
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("No valid robots for "
                        + getController().getServerCache().getServerFor(curi)
                        + "; deferring " + curi);
            }

            // Robots expired - should be refetched even though it's
            // already crawled.
            try {
                String prereq = curi.getUURI().resolve("/robots.txt")
                        .toString();
                curi.markPrerequisite(prereq,
                        getController().getPostprocessorChain());
            } catch (URIException e1) {
                logger.severe("Failed to resolve /robots.txt against " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }
        // test against robots.txt if available
        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs.isValidRobots()) {
            String ua = getController().getOrder().getUserAgent(curi);
            if (cs.getRobots().disallows(curi, ua)) {
                if (((Boolean) getUncheckedAttribute(curi,
                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue()) {
                    // annotate URI as excluded, but continue to process normally
                    curi.addAnnotation("robotExcluded");
                    return false;
                }
                // mark as precluded; in FetchHTTP, this will
                // prevent fetching and cause a skip to the end
                // of processing (unless an intervening processor
                // overrules)
                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
                curi.putString("error", "robots.txt exclusion");
                logger.fine("robots.txt precluded " + curi);
                return true;
            }
            return false;
        }
        // No valid robots found => attempt to get robots.txt failed
        curi.skipToProcessorChain(getController().getPostprocessorChain());
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.putString("error", "robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
            // DNS URIs never have a DNS precondition
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController()
                    .getPostprocessorChain());
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host,
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable.
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || (ch.hasBeenLookedUp() && ch.getIP() == null)) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch
                        + " cancelling processing for CrawlURI "
                        + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController()
                    .getPostprocessorChain());
            return true;
        }

        // If we haven't done a dns lookup (and this isn't itself a dns
        // uri), queue one as a prerequisite and defer further processing
        // of this URI.
        if (isIpExpired(curi)
                && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI "
                    + curi.toString() + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                        getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS preconditions OK
        return false;
    }

    /**
     * Get the minimum time a dns-record is considered valid.
     *
     * @param curi the URI this time is valid for.
     * @return the configured minimum validity of a dns-record, in seconds;
     *         zero means never expire, while negative values (a legacy
     *         default) are replaced with the current default minimum in
     *         {@link #isIpExpired(CrawlURI)}.
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }

        return d.longValue();
    }

    /** Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Never expire ip if duration is zero (either set by the
            // operator or, more likely, set to zero by FetchDNS after a
            // failed lookup attempt).
            return false;
        }

        // catch old "default" -1 settings that are now problematic,
        // convert to new minimum
        if (duration <= 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }

        long ttl = host.getIpTTL();
        if (ttl > duration) {
            // Use the larger of the operator-set minimum duration
            // or the DNS record TTL
            duration = ttl;
        }
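        // Example: with the six-hour default minimum and a record TTL of
        // 24 hours, the 24-hour TTL governs; with a TTL of five minutes,
        // the six-hour minimum governs.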

        // Duration and ttl are in seconds. Convert to millis.
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /** Get the maximum time a fetched robots.txt is considered valid.
     *
     * @param curi the URI whose settings are consulted.
     * @return the time a robots.txt is valid, in milliseconds (note that
     *         unlike {@link #getIPValidityDuration(CrawlURI)} this method
     *         returns milliseconds, not seconds).
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            // This should never happen, but if it does, return default.
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // convert from seconds to milliseconds
        return d.longValue() * 1000;
    }

    /**
     * Is the robots policy expired?
     *
     * This method will also return true if we haven't tried to get the
     * robots.txt for this server.
     *
     * @param curi the URI being processed.
     * @return true if the robots policy is expired.
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server = getController().getServerCache()
                .getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
            // Have not attempted to fetch robots
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
            // When zero, robots should be valid forever
            return false;
        }
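        // Example: with the one-day default, robots fetched at time t
        // (in milliseconds) count as expired once the current time
        // exceeds t + 86,400,000 ms.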
        // Expired when the fetch time plus the validity interval
        // precedes the current time.
        return robotsFetched + duration < System.currentTimeMillis();
    }

    /**
     * Consider credential preconditions.
     *
     * Looks to see if there are any credential preconditions (e.g. html
     * form login credentials) for this <code>CrawlServer</code>. If there
     * are, have they been run already? If not, make the running of these
     * logins a precondition of accessing any other url on this
     * <code>CrawlServer</code>.
     *
     * <p>
     * Someday, optimize to avoid running the bulk of the code below on
     * every call. The argument for running it every time is that overrides
     * and refinements may change what comes back from the credential store.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition that needs
     *         to be met before we can proceed. False if we can proceed to
     *         process this URI.
     */
    private boolean credentialPrecondition(final CrawlURI curi) {

        boolean result = false;

        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }

        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }

        while (i.hasNext()) {
            Credential c = (Credential) i.next();

            if (c.isPrerequisite(curi)) {
                // This credential has a prereq. and this curi is it. Let it
                // through. Add its avatar to the curi as a mark. Also, does
                // this curi need to be posted? Note, we test whether it is
                // a prereq BEFORE we check that the curi is of the
                // credential's domain, because sites such as Yahoo have you
                // go to another domain altogether to log in.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }

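            // Otherwise, only credentials whose root URI covers this curi
            // and which declare a prerequisite are candidates for queueing
            // a login.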
            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (!authenticated(c, curi)) {
                // Hasn't been authenticated. Queue it and move on (the
                // assumption is that we can do one authentication at a
                // time -- usually one html form).
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    CrawlServer server = getController().getServerCache()
                            .getServerFor(curi);
                    logger.severe(server.getName()
                            + " has credential(s) of type " + c
                            + " but prereq is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                                getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("Unable to set credentials prerequisite "
                                + prereq);
                        getController().logUriError(e, curi.getUURI(), prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq
                                + " of type " + c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Has the passed credential already been authenticated?
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run.
     */
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        boolean result = false;
        CrawlServer server = getController().getServerCache()
                .getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return result;
        }
        Set avatars = server.getCredentialAvatars();
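        // A matching avatar on the server marks that this credential's
        // login prerequisite has already been run against it.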
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar) i.next();
            String key = null;
            try {
                key = credential.getKey(curi);
            } catch (AttributeNotFoundException e) {
                logger.severe("Failed getting key for " + credential
                        + " for " + curi);
                continue;
            }
            if (ca.match(credential.getClass(), key)) {
                result = true;
                break;
            }
        }
        return result;
    }
}
|