01: /* AntiCalendarCostAssignmentPolicy
02: *
03: * $Id: AntiCalendarCostAssignmentPolicy.java 4953 2007-03-03 01:32:53Z gojomo $
04: *
05: * Created on Dec 15, 2004
06: *
07: * Copyright (C) 2004 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.frontier;
26:
27: import java.util.regex.Matcher;
28:
29: import org.archive.crawler.datamodel.CrawlURI;
30: import org.archive.util.TextUtils;
31:
32: /**
33: * CostAssignmentPolicy that further penalizes URIs with
34: * calendar-suggestive strings in them, with an extra unit
35: * of cost.
36: *
37: * Will catch some 'innocent' URIs, but only when uncaught
38: * large-volume chaff is ranked higher than caught 'wheat'
39: * will this cause notable problems.
40: *
41: * @author gojomo
42: */
43: public class AntiCalendarCostAssignmentPolicy extends
44: UnitCostAssignmentPolicy {
45: public static String CALENDARISH = "(?i)(calendar)|(year)|(month)|(day)|(date)|(viewcal)"
46: + "|(\\D19\\d\\d\\D)|(\\D20\\d\\d\\D)|(event)|(yr=)"
47: + "|(calendrier)|(jour)";
48:
49: /* (non-Javadoc)
50: * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
51: */
52: public int costOf(CrawlURI curi) {
53: int cost = super .costOf(curi);
54: Matcher m = TextUtils.getMatcher(CALENDARISH, curi.toString());
55: if (m.find()) {
56: cost++;
57: // TODO: consider if multiple occurences should cost more
58: }
59: TextUtils.recycleMatcher(m);
60: return cost;
61: }
62: }
|