Source Code Cross Referenced for QuotaEnforcer.java in  » Web-Crawler » heritrix » org » archive » crawler » prefetch » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.prefetch 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /* QuotaEnforcer
002:         * 
003:         * Created on Nov 4, 2005
004:         *
005:         * Copyright (C) 2005 Internet Archive.
006:         * 
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         * 
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         * 
014:         * Heritrix is distributed in the hope that it will be useful, 
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         * 
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.crawler.prefetch;
024:
025:        import java.util.logging.Level;
026:        import java.util.logging.Logger;
027:
028:        import org.archive.crawler.datamodel.CoreAttributeConstants;
029:        import org.archive.crawler.datamodel.CrawlSubstats;
030:        import org.archive.crawler.datamodel.CrawlURI;
031:        import org.archive.crawler.datamodel.FetchStatusCodes;
032:        import org.archive.crawler.framework.Processor;
033:        import org.archive.crawler.settings.SimpleType;
034:
035:        /**
036:         * A simple quota enforcer. If the host, server, or frontier group
037:         * associated with the current CrawlURI is already over its quotas, 
038:         * blocks the current URI's processing with S_BLOCKED_BY_QUOTA.
039:         * 
040:         * @author gojomo
041:         * @version $Date: 2007-04-06 00:40:50 +0000 (Fri, 06 Apr 2007) $, $Revision: 5040 $
042:         */
043:        public class QuotaEnforcer extends Processor implements 
044:                FetchStatusCodes {
045:
046:            private static final long serialVersionUID = 6091720623469404595L;
047:
048:            private final Logger LOGGER = Logger.getLogger(this .getClass()
049:                    .getName());
050:
051:            // indexed table of reused string categorical names/keys
052:            protected static final int SERVER = 0;
053:            protected static final int HOST = 1;
054:            protected static final int GROUP = 2;
055:            protected static final int NAME = 0;
056:            protected static final int SUCCESSES = 1;
057:            protected static final int SUCCESS_KB = 2;
058:            protected static final int RESPONSES = 3;
059:            protected static final int RESPONSE_KB = 4;
060:            protected static final String[][] keys = new String[][] {
061:                    { "server", "server-max-fetch-successes",
062:                            "server-max-success-kb",
063:                            "server-max-fetch-responses", "server-max-all-kb" },
064:                    { "host", "host-max-fetch-successes",
065:                            "host-max-success-kb", "host-max-fetch-responses",
066:                            "host-max-all-kb" },
067:                    { "group", "group-max-fetch-successes",
068:                            "group-max-success-kb",
069:                            "group-max-fetch-responses", "group-max-all-kb" } };
070:
071:            // server quotas
072:            // successes
073:            /** server max successful fetches */
074:            protected static final String ATTR_SERVER_MAX_FETCH_SUCCESSES = keys[SERVER][SUCCESSES];
075:            protected static final Long DEFAULT_SERVER_MAX_FETCH_SUCCESSES = new Long(
076:                    -1);
077:            /** server max successful fetch bytes */
078:            protected static final String ATTR_SERVER_MAX_SUCCESS_KB = keys[SERVER][SUCCESS_KB];;
079:            protected static final Long DEFAULT_SERVER_MAX_SUCCESS_KB = new Long(
080:                    -1);
081:            // all-responses
082:            /** server max fetch responses (including error codes) */
083:            protected static final String ATTR_SERVER_MAX_FETCH_RESPONSES = keys[SERVER][RESPONSES];
084:            protected static final Long DEFAULT_SERVER_MAX_FETCH_RESPONSES = new Long(
085:                    -1);
086:            /** server max all fetch bytes (including error responses) */
087:            protected static final String ATTR_SERVER_MAX_ALL_KB = keys[SERVER][RESPONSE_KB];
088:            protected static final Long DEFAULT_SERVER_MAX_ALL_KB = new Long(-1);
089:
090:            // host quotas
091:            // successes
092:            /** host max successful fetches */
093:            protected static final String ATTR_HOST_MAX_FETCH_SUCCESSES = keys[HOST][SUCCESSES];;
094:            protected static final Long DEFAULT_HOST_MAX_FETCH_SUCCESSES = new Long(
095:                    -1);
096:            /** host max successful fetch bytes */
097:            protected static final String ATTR_HOST_MAX_SUCCESS_KB = keys[HOST][SUCCESS_KB];;
098:            protected static final Long DEFAULT_HOST_MAX_SUCCESS_KB = new Long(
099:                    -1);
100:            // all-responses
101:            /** host max fetch responses (including error codes) */
102:            protected static final String ATTR_HOST_MAX_FETCH_RESPONSES = keys[HOST][RESPONSES];
103:            protected static final Long DEFAULT_HOST_MAX_FETCH_RESPONSES = new Long(
104:                    -1);
105:            /** host max all fetch bytes (including error responses) */
106:            protected static final String ATTR_HOST_MAX_ALL_KB = keys[HOST][RESPONSE_KB];
107:            protected static final Long DEFAULT_HOST_MAX_ALL_KB = new Long(-1);
108:
109:            // group quotas
110:            // successes
111:            /** group max successful fetches */
112:            protected static final String ATTR_GROUP_MAX_FETCH_SUCCESSES = keys[GROUP][SUCCESSES];
113:            protected static final Long DEFAULT_GROUP_MAX_FETCH_SUCCESSES = new Long(
114:                    -1);
115:            /** group max successful fetch bytes */
116:            protected static final String ATTR_GROUP_MAX_SUCCESS_KB = keys[GROUP][SUCCESS_KB];
117:            protected static final Long DEFAULT_GROUP_MAX_SUCCESS_KB = new Long(
118:                    -1);
119:            // all-responses
120:            /** group max fetch responses (including error codes) */
121:            protected static final String ATTR_GROUP_MAX_FETCH_RESPONSES = keys[GROUP][RESPONSES];
122:            protected static final Long DEFAULT_GROUP_MAX_FETCH_RESPONSES = new Long(
123:                    -1);
124:            /** group max all fetch bytes (including error responses) */
125:            protected static final String ATTR_GROUP_MAX_ALL_KB = keys[GROUP][RESPONSE_KB];
126:            protected static final Long DEFAULT_GROUP_MAX_ALL_KB = new Long(-1);
127:
128:            /** whether to force-retire when over-quote detected */
129:            protected static final String ATTR_FORCE_RETIRE = "force-retire";
130:            protected static final Boolean DEFAULT_FORCE_RETIRE = true;
131:
132:            /**
133:             * Constructor.
134:             * @param name Name of this processor.
135:             */
136:            public QuotaEnforcer(String name) {
137:                super (name, "QuotaEnforcer.");
138:
139:                addElementToDefinition(new SimpleType(
140:                        ATTR_FORCE_RETIRE,
141:                        "Whether an over-quota situation should result in the "
142:                                + "containing queue being force-retired (if the Frontier "
143:                                + "supports this). Note that if your queues combine URIs "
144:                                + "that are different with regard to the quota category, "
145:                                + "the retirement may hold back URIs not in the same "
146:                                + "quota category. " + "Default is false.",
147:                        DEFAULT_FORCE_RETIRE));
148:
149:                String maxFetchSuccessesDesc = "Maximum number of fetch successes "
150:                        + "(e.g. 200 responses) to collect from one CATEGORY. "
151:                        + "Default is -1, meaning no limit.";
152:                String maxSuccessKbDesc = "Maximum amount of fetch success content "
153:                        + "(e.g. 200 responses) in KB to collect from one CATEGORY. "
154:                        + "Default is -1, meaning no limit.";
155:                String maxFetchResponsesDesc = "Maximum number of fetch responses "
156:                        + "(incl. error responses) to collect from one CATEGORY. "
157:                        + "Default is -1, meaning no limit.";
158:                String maxAllKbDesc = "Maximum amount of response content "
159:                        + "(incl. error responses) in KB to collect from one CATEGORY. "
160:                        + "Default is -1, meaning no limit.";
161:                // server successes
162:                addElementToDefinition(new SimpleType(
163:                        ATTR_SERVER_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
164:                                .replaceAll("CATEGORY", "server"),
165:                        DEFAULT_SERVER_MAX_FETCH_SUCCESSES));
166:                addElementToDefinition(new SimpleType(
167:                        ATTR_SERVER_MAX_SUCCESS_KB, maxSuccessKbDesc
168:                                .replaceAll("CATEGORY", "server"),
169:                        DEFAULT_SERVER_MAX_SUCCESS_KB));
170:                // server all-responses
171:                addElementToDefinition(new SimpleType(
172:                        ATTR_SERVER_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
173:                                .replaceAll("CATEGORY", "server"),
174:                        DEFAULT_SERVER_MAX_FETCH_RESPONSES));
175:                addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_ALL_KB,
176:                        maxAllKbDesc.replaceAll("CATEGORY", "server"),
177:                        DEFAULT_SERVER_MAX_ALL_KB));
178:                // host successes
179:                addElementToDefinition(new SimpleType(
180:                        ATTR_HOST_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
181:                                .replaceAll("CATEGORY", "host"),
182:                        DEFAULT_HOST_MAX_FETCH_SUCCESSES));
183:                addElementToDefinition(new SimpleType(ATTR_HOST_MAX_SUCCESS_KB,
184:                        maxSuccessKbDesc.replaceAll("CATEGORY", "host"),
185:                        DEFAULT_HOST_MAX_SUCCESS_KB));
186:                // host all-responses
187:                addElementToDefinition(new SimpleType(
188:                        ATTR_HOST_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
189:                                .replaceAll("CATEGORY", "host"),
190:                        DEFAULT_HOST_MAX_FETCH_RESPONSES));
191:                addElementToDefinition(new SimpleType(ATTR_HOST_MAX_ALL_KB,
192:                        maxAllKbDesc.replaceAll("CATEGORY", "host"),
193:                        DEFAULT_HOST_MAX_ALL_KB));
194:                // group successes
195:                addElementToDefinition(new SimpleType(
196:                        ATTR_GROUP_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
197:                                .replaceAll("CATEGORY", "group (queue)"),
198:                        DEFAULT_GROUP_MAX_FETCH_SUCCESSES));
199:                addElementToDefinition(new SimpleType(
200:                        ATTR_GROUP_MAX_SUCCESS_KB, maxSuccessKbDesc.replaceAll(
201:                                "CATEGORY", "group (queue)"),
202:                        DEFAULT_GROUP_MAX_SUCCESS_KB));
203:                // group all-responses
204:                addElementToDefinition(new SimpleType(
205:                        ATTR_GROUP_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
206:                                .replaceAll("CATEGORY", "group (queue)"),
207:                        DEFAULT_GROUP_MAX_FETCH_RESPONSES));
208:                addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_ALL_KB,
209:                        maxAllKbDesc.replaceAll("CATEGORY", "group (queue)"),
210:                        DEFAULT_GROUP_MAX_ALL_KB));
211:
212:            }
213:
214:            protected void innerProcess(CrawlURI curi) {
215:                CrawlSubstats.HasCrawlSubstats[] haveStats = new CrawlSubstats.HasCrawlSubstats[] {
216:                        getController().getServerCache().getServerFor(curi), // server
217:                        getController().getServerCache().getHostFor(curi), // host
218:                        getController().getFrontier().getGroup(curi) // group
219:                };
220:
221:                for (int cat = SERVER; cat <= GROUP; cat++) {
222:                    if (checkQuotas(curi, haveStats[cat], cat)) {
223:                        return;
224:                    }
225:                }
226:            }
227:
228:            /**
229:             * Check all quotas for the given substats and category (server, host, or
230:             * group). 
231:             * 
232:             * @param curi CrawlURI to mark up with results
233:             * @param hasStats  holds CrawlSubstats with actual values to test
234:             * @param CAT category index (SERVER, HOST, GROUP) to quota settings keys
235:             * @return true if quota precludes fetching of CrawlURI
236:             */
237:            protected boolean checkQuotas(final CrawlURI curi,
238:                    final CrawlSubstats.HasCrawlSubstats hasStats, final int CAT) {
239:                if (hasStats == null) {
240:                    if (LOGGER.isLoggable(Level.FINE)) {
241:                        LOGGER.fine(curi.toString() + " null stats category: "
242:                                + CAT);
243:                    }
244:                    return false;
245:                }
246:                CrawlSubstats substats = hasStats.getSubstats();
247:                long[] actuals = new long[] {
248:                        -1, // dummy
249:                        substats.getFetchSuccesses(),
250:                        substats.getSuccessBytes() / 1024,
251:                        substats.getFetchResponses(),
252:                        substats.getTotalBytes() / 1024, };
253:                for (int q = SUCCESSES; q <= RESPONSE_KB; q++) {
254:                    if (applyQuota(curi, keys[CAT][q], actuals[q])) {
255:                        return true;
256:                    }
257:                }
258:                return false;
259:            }
260:
261:            /**
262:             * Apply the quota specified by the given key against the actual 
263:             * value provided. If the quota and actual values rule out processing the 
264:             * given CrawlURI,  mark up the CrawlURI appropriately. 
265:             * 
266:             * @param curi CrawlURI whose processing is subject to a potential quota
267:             * limitation
268:             * @param quotaKey settings key to get applicable quota
269:             * @param actual current value to compare to quota 
270:             * @return true is CrawlURI is blocked by a quota, false otherwise
271:             */
272:            protected boolean applyQuota(CrawlURI curi, String quotaKey,
273:                    long actual) {
274:                long quota = ((Long) getUncheckedAttribute(curi, quotaKey))
275:                        .longValue();
276:                if (quota >= 0 && actual >= quota) {
277:                    curi.setFetchStatus(S_BLOCKED_BY_QUOTA);
278:                    curi.addAnnotation("Q:" + quotaKey);
279:                    curi.skipToProcessorChain(getController()
280:                            .getPostprocessorChain());
281:                    if ((Boolean) getUncheckedAttribute(curi, ATTR_FORCE_RETIRE)) {
282:                        curi.putObject(CoreAttributeConstants.A_FORCE_RETIRE,
283:                                (Boolean) true);
284:                    }
285:                    return true;
286:                }
287:                return false;
288:            }
289:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.