001: /* QuotaEnforcer
002: *
003: * Created on Nov 4, 2005
004: *
005: * Copyright (C) 2005 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.prefetch;
024:
025: import java.util.logging.Level;
026: import java.util.logging.Logger;
027:
028: import org.archive.crawler.datamodel.CoreAttributeConstants;
029: import org.archive.crawler.datamodel.CrawlSubstats;
030: import org.archive.crawler.datamodel.CrawlURI;
031: import org.archive.crawler.datamodel.FetchStatusCodes;
032: import org.archive.crawler.framework.Processor;
033: import org.archive.crawler.settings.SimpleType;
034:
035: /**
036: * A simple quota enforcer. If the host, server, or frontier group
037: * associated with the current CrawlURI is already over its quotas,
038: * blocks the current URI's processing with S_BLOCKED_BY_QUOTA.
039: *
040: * @author gojomo
041: * @version $Date: 2007-04-06 00:40:50 +0000 (Fri, 06 Apr 2007) $, $Revision: 5040 $
042: */
043: public class QuotaEnforcer extends Processor implements
044: FetchStatusCodes {
045:
046: private static final long serialVersionUID = 6091720623469404595L;
047:
048: private final Logger LOGGER = Logger.getLogger(this .getClass()
049: .getName());
050:
051: // indexed table of reused string categorical names/keys
052: protected static final int SERVER = 0;
053: protected static final int HOST = 1;
054: protected static final int GROUP = 2;
055: protected static final int NAME = 0;
056: protected static final int SUCCESSES = 1;
057: protected static final int SUCCESS_KB = 2;
058: protected static final int RESPONSES = 3;
059: protected static final int RESPONSE_KB = 4;
060: protected static final String[][] keys = new String[][] {
061: { "server", "server-max-fetch-successes",
062: "server-max-success-kb",
063: "server-max-fetch-responses", "server-max-all-kb" },
064: { "host", "host-max-fetch-successes",
065: "host-max-success-kb", "host-max-fetch-responses",
066: "host-max-all-kb" },
067: { "group", "group-max-fetch-successes",
068: "group-max-success-kb",
069: "group-max-fetch-responses", "group-max-all-kb" } };
070:
071: // server quotas
072: // successes
073: /** server max successful fetches */
074: protected static final String ATTR_SERVER_MAX_FETCH_SUCCESSES = keys[SERVER][SUCCESSES];
075: protected static final Long DEFAULT_SERVER_MAX_FETCH_SUCCESSES = new Long(
076: -1);
077: /** server max successful fetch bytes */
078: protected static final String ATTR_SERVER_MAX_SUCCESS_KB = keys[SERVER][SUCCESS_KB];;
079: protected static final Long DEFAULT_SERVER_MAX_SUCCESS_KB = new Long(
080: -1);
081: // all-responses
082: /** server max fetch responses (including error codes) */
083: protected static final String ATTR_SERVER_MAX_FETCH_RESPONSES = keys[SERVER][RESPONSES];
084: protected static final Long DEFAULT_SERVER_MAX_FETCH_RESPONSES = new Long(
085: -1);
086: /** server max all fetch bytes (including error responses) */
087: protected static final String ATTR_SERVER_MAX_ALL_KB = keys[SERVER][RESPONSE_KB];
088: protected static final Long DEFAULT_SERVER_MAX_ALL_KB = new Long(-1);
089:
090: // host quotas
091: // successes
092: /** host max successful fetches */
093: protected static final String ATTR_HOST_MAX_FETCH_SUCCESSES = keys[HOST][SUCCESSES];;
094: protected static final Long DEFAULT_HOST_MAX_FETCH_SUCCESSES = new Long(
095: -1);
096: /** host max successful fetch bytes */
097: protected static final String ATTR_HOST_MAX_SUCCESS_KB = keys[HOST][SUCCESS_KB];;
098: protected static final Long DEFAULT_HOST_MAX_SUCCESS_KB = new Long(
099: -1);
100: // all-responses
101: /** host max fetch responses (including error codes) */
102: protected static final String ATTR_HOST_MAX_FETCH_RESPONSES = keys[HOST][RESPONSES];
103: protected static final Long DEFAULT_HOST_MAX_FETCH_RESPONSES = new Long(
104: -1);
105: /** host max all fetch bytes (including error responses) */
106: protected static final String ATTR_HOST_MAX_ALL_KB = keys[HOST][RESPONSE_KB];
107: protected static final Long DEFAULT_HOST_MAX_ALL_KB = new Long(-1);
108:
109: // group quotas
110: // successes
111: /** group max successful fetches */
112: protected static final String ATTR_GROUP_MAX_FETCH_SUCCESSES = keys[GROUP][SUCCESSES];
113: protected static final Long DEFAULT_GROUP_MAX_FETCH_SUCCESSES = new Long(
114: -1);
115: /** group max successful fetch bytes */
116: protected static final String ATTR_GROUP_MAX_SUCCESS_KB = keys[GROUP][SUCCESS_KB];
117: protected static final Long DEFAULT_GROUP_MAX_SUCCESS_KB = new Long(
118: -1);
119: // all-responses
120: /** group max fetch responses (including error codes) */
121: protected static final String ATTR_GROUP_MAX_FETCH_RESPONSES = keys[GROUP][RESPONSES];
122: protected static final Long DEFAULT_GROUP_MAX_FETCH_RESPONSES = new Long(
123: -1);
124: /** group max all fetch bytes (including error responses) */
125: protected static final String ATTR_GROUP_MAX_ALL_KB = keys[GROUP][RESPONSE_KB];
126: protected static final Long DEFAULT_GROUP_MAX_ALL_KB = new Long(-1);
127:
128: /** whether to force-retire when over-quote detected */
129: protected static final String ATTR_FORCE_RETIRE = "force-retire";
130: protected static final Boolean DEFAULT_FORCE_RETIRE = true;
131:
132: /**
133: * Constructor.
134: * @param name Name of this processor.
135: */
136: public QuotaEnforcer(String name) {
137: super (name, "QuotaEnforcer.");
138:
139: addElementToDefinition(new SimpleType(
140: ATTR_FORCE_RETIRE,
141: "Whether an over-quota situation should result in the "
142: + "containing queue being force-retired (if the Frontier "
143: + "supports this). Note that if your queues combine URIs "
144: + "that are different with regard to the quota category, "
145: + "the retirement may hold back URIs not in the same "
146: + "quota category. " + "Default is false.",
147: DEFAULT_FORCE_RETIRE));
148:
149: String maxFetchSuccessesDesc = "Maximum number of fetch successes "
150: + "(e.g. 200 responses) to collect from one CATEGORY. "
151: + "Default is -1, meaning no limit.";
152: String maxSuccessKbDesc = "Maximum amount of fetch success content "
153: + "(e.g. 200 responses) in KB to collect from one CATEGORY. "
154: + "Default is -1, meaning no limit.";
155: String maxFetchResponsesDesc = "Maximum number of fetch responses "
156: + "(incl. error responses) to collect from one CATEGORY. "
157: + "Default is -1, meaning no limit.";
158: String maxAllKbDesc = "Maximum amount of response content "
159: + "(incl. error responses) in KB to collect from one CATEGORY. "
160: + "Default is -1, meaning no limit.";
161: // server successes
162: addElementToDefinition(new SimpleType(
163: ATTR_SERVER_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
164: .replaceAll("CATEGORY", "server"),
165: DEFAULT_SERVER_MAX_FETCH_SUCCESSES));
166: addElementToDefinition(new SimpleType(
167: ATTR_SERVER_MAX_SUCCESS_KB, maxSuccessKbDesc
168: .replaceAll("CATEGORY", "server"),
169: DEFAULT_SERVER_MAX_SUCCESS_KB));
170: // server all-responses
171: addElementToDefinition(new SimpleType(
172: ATTR_SERVER_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
173: .replaceAll("CATEGORY", "server"),
174: DEFAULT_SERVER_MAX_FETCH_RESPONSES));
175: addElementToDefinition(new SimpleType(ATTR_SERVER_MAX_ALL_KB,
176: maxAllKbDesc.replaceAll("CATEGORY", "server"),
177: DEFAULT_SERVER_MAX_ALL_KB));
178: // host successes
179: addElementToDefinition(new SimpleType(
180: ATTR_HOST_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
181: .replaceAll("CATEGORY", "host"),
182: DEFAULT_HOST_MAX_FETCH_SUCCESSES));
183: addElementToDefinition(new SimpleType(ATTR_HOST_MAX_SUCCESS_KB,
184: maxSuccessKbDesc.replaceAll("CATEGORY", "host"),
185: DEFAULT_HOST_MAX_SUCCESS_KB));
186: // host all-responses
187: addElementToDefinition(new SimpleType(
188: ATTR_HOST_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
189: .replaceAll("CATEGORY", "host"),
190: DEFAULT_HOST_MAX_FETCH_RESPONSES));
191: addElementToDefinition(new SimpleType(ATTR_HOST_MAX_ALL_KB,
192: maxAllKbDesc.replaceAll("CATEGORY", "host"),
193: DEFAULT_HOST_MAX_ALL_KB));
194: // group successes
195: addElementToDefinition(new SimpleType(
196: ATTR_GROUP_MAX_FETCH_SUCCESSES, maxFetchSuccessesDesc
197: .replaceAll("CATEGORY", "group (queue)"),
198: DEFAULT_GROUP_MAX_FETCH_SUCCESSES));
199: addElementToDefinition(new SimpleType(
200: ATTR_GROUP_MAX_SUCCESS_KB, maxSuccessKbDesc.replaceAll(
201: "CATEGORY", "group (queue)"),
202: DEFAULT_GROUP_MAX_SUCCESS_KB));
203: // group all-responses
204: addElementToDefinition(new SimpleType(
205: ATTR_GROUP_MAX_FETCH_RESPONSES, maxFetchResponsesDesc
206: .replaceAll("CATEGORY", "group (queue)"),
207: DEFAULT_GROUP_MAX_FETCH_RESPONSES));
208: addElementToDefinition(new SimpleType(ATTR_GROUP_MAX_ALL_KB,
209: maxAllKbDesc.replaceAll("CATEGORY", "group (queue)"),
210: DEFAULT_GROUP_MAX_ALL_KB));
211:
212: }
213:
214: protected void innerProcess(CrawlURI curi) {
215: CrawlSubstats.HasCrawlSubstats[] haveStats = new CrawlSubstats.HasCrawlSubstats[] {
216: getController().getServerCache().getServerFor(curi), // server
217: getController().getServerCache().getHostFor(curi), // host
218: getController().getFrontier().getGroup(curi) // group
219: };
220:
221: for (int cat = SERVER; cat <= GROUP; cat++) {
222: if (checkQuotas(curi, haveStats[cat], cat)) {
223: return;
224: }
225: }
226: }
227:
228: /**
229: * Check all quotas for the given substats and category (server, host, or
230: * group).
231: *
232: * @param curi CrawlURI to mark up with results
233: * @param hasStats holds CrawlSubstats with actual values to test
234: * @param CAT category index (SERVER, HOST, GROUP) to quota settings keys
235: * @return true if quota precludes fetching of CrawlURI
236: */
237: protected boolean checkQuotas(final CrawlURI curi,
238: final CrawlSubstats.HasCrawlSubstats hasStats, final int CAT) {
239: if (hasStats == null) {
240: if (LOGGER.isLoggable(Level.FINE)) {
241: LOGGER.fine(curi.toString() + " null stats category: "
242: + CAT);
243: }
244: return false;
245: }
246: CrawlSubstats substats = hasStats.getSubstats();
247: long[] actuals = new long[] {
248: -1, // dummy
249: substats.getFetchSuccesses(),
250: substats.getSuccessBytes() / 1024,
251: substats.getFetchResponses(),
252: substats.getTotalBytes() / 1024, };
253: for (int q = SUCCESSES; q <= RESPONSE_KB; q++) {
254: if (applyQuota(curi, keys[CAT][q], actuals[q])) {
255: return true;
256: }
257: }
258: return false;
259: }
260:
261: /**
262: * Apply the quota specified by the given key against the actual
263: * value provided. If the quota and actual values rule out processing the
264: * given CrawlURI, mark up the CrawlURI appropriately.
265: *
266: * @param curi CrawlURI whose processing is subject to a potential quota
267: * limitation
268: * @param quotaKey settings key to get applicable quota
269: * @param actual current value to compare to quota
270: * @return true is CrawlURI is blocked by a quota, false otherwise
271: */
272: protected boolean applyQuota(CrawlURI curi, String quotaKey,
273: long actual) {
274: long quota = ((Long) getUncheckedAttribute(curi, quotaKey))
275: .longValue();
276: if (quota >= 0 && actual >= quota) {
277: curi.setFetchStatus(S_BLOCKED_BY_QUOTA);
278: curi.addAnnotation("Q:" + quotaKey);
279: curi.skipToProcessorChain(getController()
280: .getPostprocessorChain());
281: if ((Boolean) getUncheckedAttribute(curi, ATTR_FORCE_RETIRE)) {
282: curi.putObject(CoreAttributeConstants.A_FORCE_RETIRE,
283: (Boolean) true);
284: }
285: return true;
286: }
287: return false;
288: }
289: }
|