01: /* WagCostAssignmentPolicy
02: *
03: * $Id: WagCostAssignmentPolicy.java 3704 2005-07-18 17:30:21Z stack-sf $
04: *
05: * Created on Dec 10, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.frontier;
26:
27: import org.archive.crawler.datamodel.CrawlURI;
28: import org.archive.net.UURI;
29:
30: /**
31: * A CostAssignmentPolicy based on some wild guesses of kinds of URIs
32: * that should be deferred into the (potentially never-crawled) future.
33: *
34: * @author gojomo
35: */
36: public class WagCostAssignmentPolicy extends CostAssignmentPolicy {
37:
38: /**
39: * Add constant penalties for certain features of URI (and
40: * its 'via') that make it more delayable/skippable.
41: *
42: * @param curi CrawlURI to be assigned a cost
43: *
44: * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
45: */
46: public int costOf(CrawlURI curi) {
47: int cost = 1;
48: UURI uuri = curi.getUURI();
49: if (uuri.hasQuery()) {
50: // has query string
51: cost++;
52: int qIndex = uuri.toString().indexOf('?');
53: if (curi.flattenVia().startsWith(
54: uuri.toString().substring(0, qIndex))) {
55: // non-query-string portion of URI is same as previous
56: cost++;
57: }
58: // TODO: other potential query-related cost penalties:
59: // - more than X query-string attributes
60: // - calendarish terms
61: // - query-string over certain size
62: }
63: // TODO: other potential path-based penalties
64: // - new path is simply extension of via path
65: // - many path segments
66: // TODO: other potential hops-based penalties
67: // - more than X hops
68: // - each speculative hop
69: return cost;
70: }
71: }
|