/* DomainSensitiveFrontier
 *
 * $Id: DomainSensitiveFrontier.java 4656 2006-09-25 21:34:50Z paul_jack $
 *
 * Created on 2004-may-06
 *
 * Copyright (C) 2004 Royal Library of Sweden.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.util.Hashtable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.filter.URIRegExpFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.prefetch.QuotaEnforcer;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * Behaves like {@link BdbFrontier} (i.e., a basic mostly breadth-first
 * frontier), but with the addition that you can set the number of documents
 * to download on a per-site basis.
 *
 * Useful for cases of frequent revisits to sites that change frequently.
 *
 * <p>Choose the number of documents you want to download and specify
 * the count in <code>max-docs</code>. The <code>counter-mode</code>
 * attribute decides what that count applies to. With
 * <code>count-per-host</code> the crawler downloads at most
 * <code>max-docs</code> documents per host, and with
 * <code>count-per-domain</code> at most <code>max-docs</code> per domain.
 * If you create an override, the overridden <code>max-docs</code>
 * count is used instead, whether it is higher or lower.
 * <p>With <code>count-per-override</code>, the default, <code>max-docs</code>
 * acts like the crawl order <code>max-docs</code> and the crawler will
 * download this total amount of documents only. Overrides will
 * download <code>max-docs</code> total in the overridden domain or host.
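 *
 * <p>For example (illustrative only), a successful download from the host
 * <code>www.kb.se</code> is counted against the following key, depending
 * on <code>counter-mode</code>:
 * <pre>
 *   count-per-override : the settings scope ("root" when no override applies)
 *   count-per-host     : the host, "www.kb.se"
 *   count-per-domain   : the domain, "kb.se"
 * </pre>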
 *
 * @author Oskar Grenholm <oskar dot grenholm at kb dot se>
 * @deprecated As of release 1.10.0. Replaced by {@link BdbFrontier} and
 * {@link QuotaEnforcer}.
 */
public class DomainSensitiveFrontier extends BdbFrontier implements
        CrawlURIDispositionListener {

    private static final long serialVersionUID = -3330190056282726202L;

    private static final Logger logger = Logger
            .getLogger(DomainSensitiveFrontier.class.getName());

    public static final String ATTR_MAX_DOCS = "max-docs";
    public static final String ATTR_COUNTER_MODE = "counter-mode";
    public static final String COUNT_OVERRIDE = "count-per-override";
    public static final String COUNT_HOST = "count-per-host";
    public static final String COUNT_DOMAIN = "count-per-domain";
    public static final String[] ATTR_AVAILABLE_MODES = new String[] {
            COUNT_OVERRIDE, COUNT_HOST, COUNT_DOMAIN };
    public static final String DEFAULT_MODE = COUNT_OVERRIDE;

    // TODO: Make this a BigMap.
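    // Maps a counting key (override scope, host, or domain, depending on
    // counter-mode) to the number of documents successfully crawled for it.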
    private Hashtable<String, Long> hostCounters = new Hashtable<String, Long>();
    private boolean countPerOverride = true;
    private String counterMode;

    public DomainSensitiveFrontier(String name) {
        super(ATTR_NAME,
                "DomainSensitiveFrontier. *Deprecated* Use "
                + "BdbFrontier+QuotaEnforcer instead. "
                + "Overrides BdbFrontier to add specification of number of "
                + "documents to download (Expects 'exclude-filter' "
                + "to be part of CrawlScope).");
        Type e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCS,
                "Maximum number of documents to download for host or domain"
                + " (Zero means no limit).", new Long(0)));
        e.setOverrideable(true);
        e = addElementToDefinition(new SimpleType(ATTR_COUNTER_MODE,
                "If " + COUNT_OVERRIDE + ", acts like the crawl "
                + "order maximum download count and the crawler will download "
                + "this total amount of docs only. Override to change the max "
                + "count for the overridden domain or host. "
                + "Else if " + COUNT_HOST + " the crawler will download "
                + ATTR_MAX_DOCS + " per host. Add an override to change "
                + "max count on a per-domain or a per-host basis. For "
                + "example, if you set " + ATTR_MAX_DOCS + " to 30 in "
                + "this mode, the crawler will download 30 docs from "
                + "each host in scope. If you override for kb.se setting "
                + ATTR_MAX_DOCS
                + " to 20, it will instead download only 20 docs from each "
                + "host of kb.se. (The override value can be larger or "
                + "smaller.) "
                + "Finally " + COUNT_DOMAIN + " behaves similarly to "
                + COUNT_HOST + ", but instead sets the max on a per-domain "
                + "basis. Here you can do overrides on the domain level, but "
                + "not on the host level. So if you set " + ATTR_MAX_DOCS
                + " to 30 in this mode, the crawler will download 30 docs "
                + "from each domain in scope. If you override for kb.se "
                + "setting " + ATTR_MAX_DOCS + " to 20, it will instead "
                + "download only 20 docs in total from the whole kb.se "
                + "domain. (The override value can be larger or smaller.)",
                DEFAULT_MODE, ATTR_AVAILABLE_MODES));
        e.setOverrideable(false);
    }

    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        super.initialize(c);
        this.controller.addCrawlURIDispositionListener(this);
        try {
            counterMode = ((String) getAttribute(ATTR_COUNTER_MODE));
            if (counterMode.equalsIgnoreCase(COUNT_DOMAIN)
                    || counterMode.equalsIgnoreCase(COUNT_HOST))
                countPerOverride = false;
            else
                countPerOverride = true;
        } catch (AttributeNotFoundException e) {
            e.printStackTrace();
        } catch (MBeanException e) {
            e.printStackTrace();
        } catch (ReflectionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Check if the max document download limit for this host or domain has
     * been reached.
     *
     * If so, delete the rest of the URIs for this host or domain waiting in
     * the queue. Then add a URIRegExpFilter for this host or domain, so
     * we won't get any more URIs from this one later on.
     * @param curi CrawlURI.
     * @return True if the queue was discarded.
     */
    private synchronized boolean checkDownloadLimits(CrawlURI curi) {
        long thisMaxDocs = 0;
        long thisCounter = 0;
        boolean discarded = false;
        boolean retVal = false;
        if (curi.getUURI().getScheme().equals("dns")) {
            return false;
        }
        try {
            String host = curi.getUURI().getHost();
            CrawlerSettings cs = controller.getSettingsHandler()
                    .getSettings(host);
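            // Walk from the most specific settings object up through its
            // parents. In count-per-override mode every override level is
            // checked against its own max-docs; in the host and domain modes
            // countPerOverride is false, so the loop body runs only once.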
            do {
                String scope;
                if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
                    scope = cs.getScope() != null ? cs.getScope() : "root";
                else if (counterMode.equalsIgnoreCase(COUNT_HOST))
                    scope = host;
                else { // Get domain part of host, e.g. www.kb.se -> kb.se
                    int i = host.lastIndexOf(".");
                    i = host.lastIndexOf(".", i - 1);
                    scope = host.substring(i + 1, host.length());
                }
                thisMaxDocs = ((Long) getAttribute(cs, ATTR_MAX_DOCS))
                        .longValue();
                thisCounter = this.hostCounters.get(scope) != null
                        ? this.hostCounters.get(scope).longValue()
                        : 0;
                // Have we hit the max document download limit for this host
                // or domain?
                if (thisMaxDocs > 0 && thisCounter >= thisMaxDocs) {
                    logger.fine("Discarding Queue: " + host + " ");
                    curi.addAnnotation("dsfLimit");
                    if (!discarded) {
                        long count = 0;
                        WorkQueue wq = getQueueFor(curi);
                        wq.unpeek();
                        count += wq.deleteMatching(this, ".*");
                        decrementQueuedCount(count);
                        discarded = true;
                        // I tried adding annotation but we're past log time
                        // for Curi so it doesn't work.
                        // curi.addAnnotation("maxDocsForHost");
                    }
                    // Adding an exclude filter for this host or domain.
                    OrFilter or = (OrFilter) this.controller.getScope()
                            .getAttribute(ClassicScope.ATTR_EXCLUDE_FILTER);
                    // If we have hit the max for root, block everything.
                    // Else block just the scope.
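                    // The scope pattern matches any URI whose host ends with
                    // the scope, e.g. http://www.kb.se/... for scope kb.se.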
                    String filter = scope.equalsIgnoreCase("root") ? ".*"
                            : "^((https?://)?[a-zA-Z0-9\\.]*)" + scope
                                    + "($|/.*)";
                    logger.fine("Adding filter: [" + filter + "].");
                    URIRegExpFilter urf = new URIRegExpFilter(curi.toString(),
                            filter);
                    or.addFilter(this.controller.getSettingsHandler()
                            .getSettings(null), urf);
                    thisMaxDocs = 0;
                    thisCounter = 0;
                    retVal = true;
                }
            } while ((cs = cs.getParent()) != null && countPerOverride);
        } catch (Exception e) {
            logger.severe("ERROR: checkDownloadLimits(), "
                    + "while processing {" + curi.toString() + "} "
                    + e.getClass() + ", message: " + e.getMessage()
                    + ". Stack trace:");
            e.printStackTrace();
        }
        return retVal;
    }

    protected synchronized void incrementHostCounters(CrawlURI curi) {
        if (!curi.getUURI().toString().startsWith("dns:")) {
            try {
                String host = curi.getUURI().getHost();
                CrawlerSettings cs = controller.getSettingsHandler()
                        .getSettings(host);
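                // Bump the counter for this URI's counting key; in
                // count-per-override mode the counters of all parent
                // overrides are bumped as well.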
                do {
                    String scope;
                    if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
                        scope = cs.getScope() != null ? cs.getScope() : "root";
                    else if (counterMode.equalsIgnoreCase(COUNT_HOST))
                        scope = host;
                    else { // Get only domain part of host, e.g. www.kb.se -> kb.se
                        int i = host.lastIndexOf(".");
                        i = host.lastIndexOf(".", i - 1);
                        scope = host.substring(i + 1, host.length());
                    }
                    long counter = this.hostCounters.get(scope) != null
                            ? this.hostCounters.get(scope).longValue()
                            : 0;
                    this.hostCounters.put(scope, new Long(++counter));
                } while ((cs = cs.getParent()) != null && countPerOverride);
            } catch (Exception e) {
                logger.severe("ERROR: incrementHostCounters() "
                        + e.getMessage());
            }
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        incrementHostCounters(curi);
        checkDownloadLimits(curi);
    }

    public void crawledURINeedRetry(CrawlURI curi) {
    }

    public void crawledURIDisregard(CrawlURI curi) {
    }

    public void crawledURIFailure(CrawlURI curi) {
    }
}
|