Source Code Cross Referenced for WaitEvaluator.java in  » Web-Crawler » heritrix » org » archive » crawler » postprocessor » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.postprocessor 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* WaitEvaluator
002:         * 
003:         * $Id: WaitEvaluator.java 4654 2006-09-25 20:19:54Z paul_jack $
004:         * 
005:         * Created on 26.11.2004
006:         *
007:         * Copyright (C) 2004 Internet Archive.
008:         * 
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         * 
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         * 
016:         * Heritrix is distributed in the hope that it will be useful, 
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         * 
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         */
025:        package org.archive.crawler.postprocessor;
026:
027:        import java.util.logging.Level;
028:        import java.util.logging.Logger;
029:
030:        import javax.management.AttributeNotFoundException;
031:
032:        import org.archive.crawler.datamodel.CrawlURI;
033:        import org.archive.crawler.framework.Processor;
034:        import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
035:        import org.archive.crawler.settings.SimpleType;
036:
037:        /**
038:         * A processor that determines when a URI should be revisited next. Does
039:         * <b>not</b> account for DNS and robots.txt expiration. That should be 
040:         * handled seperately by the Frontiers.
041:         *
042:         * @author Kristinn Sigurdsson
043:         */
044:        public class WaitEvaluator extends Processor implements 
045:                AdaptiveRevisitAttributeConstants {
046:
047:            private static final long serialVersionUID = 7452762726125458413L;
048:
049:            Logger logger = Logger.getLogger(WaitEvaluator.class.getName());
050:
051:            /** Default wait time after initial visit. */
052:            public final static String ATTR_INITIAL_WAIT_INTERVAL = "initial-wait-interval-seconds";
053:            protected final static Long DEFAULT_INITIAL_WAIT_INTERVAL = new Long(
054:                    86400); // 1 day
055:            /** Maximum wait between visits */
056:            public final static String ATTR_MAX_WAIT_INTERVAL = "max-wait-interval-seconds";
057:            protected final static Long DEFAULT_MAX_WAIT_INTERVAL = new Long(
058:                    2419200); // 4 weeks
059:            /** Minimum wait between visits */
060:            public final static String ATTR_MIN_WAIT_INTERVAL = "min-wait-interval-seconds";
061:            protected final static Long DEFAULT_MIN_WAIT_INTERVAL = new Long(
062:                    3600); // 1 hour
063:            /** Factor increase on wait when unchanged */
064:            public final static String ATTR_UNCHANGED_FACTOR = "unchanged-factor";
065:            protected final static Double DEFAULT_UNCHANGED_FACTOR = new Double(
066:                    1.5);
067:            /** Factor decrease on wait when changed */
068:            public final static String ATTR_CHANGED_FACTOR = "changed-factor";
069:            protected final static Double DEFAULT_CHANGED_FACTOR = new Double(
070:                    1.5);
071:            /** Fixed wait time for 'unknown' change status. I.e. wait time for URIs 
072:             *  whose content change detection is not available. */
073:            public final static String ATTR_DEFAULT_WAIT_INTERVAL = "default-wait-interval-seconds";
074:            protected final static Long DEFAULT_DEFAULT_WAIT_INTERVAL = new Long(
075:                    259200); // 3 days
076:            /** Indicates if the amount of time the URI was overdue should be added
077:             *  to the wait time before the new wait time is calculated.  */
078:            public final static String ATTR_USE_OVERDUE_TIME = "use-overdue-time";
079:            protected final static Boolean DEFAULT_USE_OVERDUE_TIME = new Boolean(
080:                    false);
081:
082:            /**
083:             * Constructor
084:             * 
085:             * @param name The name of the module
086:             */
087:            public WaitEvaluator(String name) {
088:                this (
089:                        name,
090:                        "Evaluates how long to wait before fetching a URI again. "
091:                                + "Typically, this processor should be in the post processing "
092:                                + "chain. It will pass if another wait evaluator has already "
093:                                + "processed the CrawlURI.",
094:                        DEFAULT_INITIAL_WAIT_INTERVAL,
095:                        DEFAULT_MAX_WAIT_INTERVAL, DEFAULT_MIN_WAIT_INTERVAL,
096:                        DEFAULT_UNCHANGED_FACTOR, DEFAULT_CHANGED_FACTOR);
097:            }
098:
099:            /**
100:             * Constructor
101:             * 
102:             * @param name The name of the module
103:             * @param description Description of the module
104:             * @param default_inital_wait_interval The default value for initial wait
105:             *           time
106:             * @param default_max_wait_interval The maximum value for wait time
107:             * @param default_min_wait_interval The minimum value for wait time
108:             * @param default_unchanged_factor The factor for changing wait times of
109:             *           unchanged documents (will be multiplied by this value)
110:             * @param default_changed_factor The factor for changing wait times of
111:             *           changed documents (will be divided by this value)
112:             */
113:            public WaitEvaluator(String name, String description,
114:                    Long default_inital_wait_interval,
115:                    Long default_max_wait_interval,
116:                    Long default_min_wait_interval,
117:                    Double default_unchanged_factor,
118:                    Double default_changed_factor) {
119:                super (name, description);
120:
121:                addElementToDefinition(new SimpleType(
122:                        ATTR_INITIAL_WAIT_INTERVAL,
123:                        "The initial wait time between revisits. Will then be "
124:                                + "updated according to crawler experiance. I.e. shorter "
125:                                + "wait, visit more often, if document has changed between "
126:                                + "visits, and vica versa.",
127:                        default_inital_wait_interval));
128:                addElementToDefinition(new SimpleType(
129:                        ATTR_MAX_WAIT_INTERVAL,
130:                        "The maximum settable wait time between revisits. Once a "
131:                                + "URIs wait time reaches this value, it will not grow "
132:                                + "further, regardless of subsequent visits that discover "
133:                                + "no changes. Note that this does not ensure that the URI "
134:                                + "does not wait any longer, since the crawler might be "
135:                                + "'behind,' forcing a URI to wait until other URIs, "
136:                                + "scheduled for earlier are completed..",
137:                        default_max_wait_interval));
138:                addElementToDefinition(new SimpleType(
139:                        ATTR_MIN_WAIT_INTERVAL,
140:                        "The minum settable wait time between revisits. Once a "
141:                                + "URIs wait time reaches this value, it will not be shortened "
142:                                + "further, regardlesss of subsequent visits that discover "
143:                                + "changes.", default_min_wait_interval));
144:                addElementToDefinition(new SimpleType(
145:                        ATTR_DEFAULT_WAIT_INTERVAL,
146:                        "Fixed wait time for 'unknown' change status. I.e. wait time "
147:                                + "for URIs whose content change detection is not available.",
148:                        DEFAULT_DEFAULT_WAIT_INTERVAL));
149:                addElementToDefinition(new SimpleType(
150:                        ATTR_UNCHANGED_FACTOR,
151:                        "The factor by which a URIs wait time is increased when a "
152:                                + "revisit reveals an unchanged document. A value of 1 will "
153:                                + "leave it unchanged, a value of 2 will double it etc.",
154:                        default_unchanged_factor));
155:                addElementToDefinition(new SimpleType(
156:                        ATTR_CHANGED_FACTOR,
157:                        "The factor by which a URIs wait time is decreased when a "
158:                                + "revisit reveals a changed document. A value of 1 will leave "
159:                                + "it unchanged, a value of two will half it etc.",
160:                        default_changed_factor));
161:                addElementToDefinition(new SimpleType(
162:                        ATTR_USE_OVERDUE_TIME,
163:                        "Indicates if the amount of time the URI was overdue should "
164:                                + "be added to the wait time before the new wait time is "
165:                                + "calculated.", DEFAULT_USE_OVERDUE_TIME));
166:
167:                // Register persistent CrawlURI items 
168:                CrawlURI.addAlistPersistentMember(A_WAIT_INTERVAL);
169:            }
170:
171:            protected void innerProcess(CrawlURI curi)
172:                    throws InterruptedException {
173:
174:                if (curi.isSuccess() == false) {
175:                    // If the URI was not crawled successfully, we can not reevaluate
176:                    // the wait interval.
177:                    return;
178:                }
179:
180:                if (curi.containsKey(A_WAIT_REEVALUATED)
181:                        && ((Boolean) curi.getObject(A_WAIT_REEVALUATED))
182:                                .booleanValue()) {
183:                    // This CrawlURIs wait interval has already been reevaluted during
184:                    // this processing round.
185:                    return;
186:                }
187:
188:                long min;
189:                try {
190:                    min = ((Long) getAttribute(curi, ATTR_MIN_WAIT_INTERVAL))
191:                            .longValue() * 1000;
192:                } catch (AttributeNotFoundException e1) {
193:                    min = DEFAULT_MIN_WAIT_INTERVAL.longValue();
194:                    logger.fine("Unable to load minimum wait interval for "
195:                            + curi.toString());
196:                }
197:
198:                long max;
199:                try {
200:                    max = ((Long) getAttribute(curi, ATTR_MAX_WAIT_INTERVAL))
201:                            .longValue() * 1000;
202:                } catch (AttributeNotFoundException e1) {
203:                    max = DEFAULT_MAX_WAIT_INTERVAL.longValue();
204:                    logger.fine("Unable to load maximum wait interval for "
205:                            + curi.toString());
206:                }
207:
208:                long waitInterval;
209:                if (!curi.containsKey(A_CONTENT_STATE_KEY)
210:                        || curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNKNOWN) {
211:                    try {
212:                        waitInterval = ((Long) getAttribute(curi,
213:                                ATTR_DEFAULT_WAIT_INTERVAL)).longValue() * 1000;
214:                    } catch (AttributeNotFoundException e1) {
215:                        waitInterval = DEFAULT_DEFAULT_WAIT_INTERVAL
216:                                .longValue();
217:                        logger.fine("Unable to load default wait interval for "
218:                                + curi.toString());
219:                    }
220:                } else {
221:                    /* Calculate curi's time of next processing */
222:                    waitInterval = DEFAULT_INITIAL_WAIT_INTERVAL.longValue() * 1000;
223:
224:                    // Retrieve wait interval
225:                    if (curi.containsKey(A_WAIT_INTERVAL)) {
226:                        waitInterval = curi.getLong(A_WAIT_INTERVAL);
227:
228:                        // Should override time be taken into account?
229:                        boolean useOverrideTime = DEFAULT_USE_OVERDUE_TIME
230:                                .booleanValue();
231:                        try {
232:                            useOverrideTime = ((Boolean) getAttribute(curi,
233:                                    ATTR_USE_OVERDUE_TIME)).booleanValue();
234:                        } catch (AttributeNotFoundException e1) {
235:                            useOverrideTime = DEFAULT_USE_OVERDUE_TIME
236:                                    .booleanValue();
237:                            logger.fine("Unable to load use-overdue-time for "
238:                                    + curi.toString());
239:                        }
240:
241:                        if (useOverrideTime) {
242:                            waitInterval += curi.getLong(A_FETCH_OVERDUE);
243:                        }
244:
245:                        // Revise the wait interval
246:                        if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
247:                            // Had changed. Decrease wait interval time.
248:                            double factor;
249:                            try {
250:                                factor = ((Double) getAttribute(curi,
251:                                        ATTR_CHANGED_FACTOR)).doubleValue();
252:                            } catch (AttributeNotFoundException e2) {
253:                                factor = DEFAULT_CHANGED_FACTOR.doubleValue();
254:                                logger
255:                                        .fine("Unable to load changed factor for "
256:                                                + curi.toString());
257:                            }
258:                            waitInterval = (long) (waitInterval / factor);
259:                        } else if (curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_UNCHANGED) {
260:                            // Had not changed. Increase wait interval time
261:                            double factor;
262:                            try {
263:                                factor = ((Double) getAttribute(curi,
264:                                        ATTR_UNCHANGED_FACTOR)).doubleValue();
265:                            } catch (AttributeNotFoundException e2) {
266:                                factor = DEFAULT_UNCHANGED_FACTOR.doubleValue();
267:                                logger
268:                                        .fine("Unable to load unchanged factor for "
269:                                                + curi.toString());
270:                            }
271:                            waitInterval = (long) (waitInterval * factor);
272:                        }
273:                    } else {
274:                        // If wait element not found, use initial wait interval 
275:                        try {
276:                            waitInterval = ((Long) getAttribute(curi,
277:                                    ATTR_INITIAL_WAIT_INTERVAL)).longValue() * 1000;
278:                        } catch (AttributeNotFoundException e1) {
279:                            // If this fails use default (already set) and log error.
280:                            logger
281:                                    .fine("Unable to load initial wait interval for "
282:                                            + curi.toString());
283:                        }
284:                    }
285:                }
286:
287:                if (waitInterval < min) {
288:                    waitInterval = min;
289:                } else if (waitInterval > max) {
290:                    waitInterval = max;
291:                }
292:
293:                if (logger.isLoggable(Level.FINE)) {
294:                    logger.fine("URI " + curi.toString() + ", change: "
295:                            + curi.getInt(A_CONTENT_STATE_KEY)
296:                            + " new wait interval: " + waitInterval);
297:                }
298:                // Update wait interval
299:                curi.putLong(A_WAIT_INTERVAL, waitInterval);
300:                curi.putObject(A_WAIT_REEVALUATED, new Boolean(true));
301:            }
302:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.