01: /* StripWWWNRule
02: *
03: * Created on Oct 5, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.url.canonicalize;
24:
25: import java.util.regex.Pattern;
26:
27: /**
28: * Strip any 'www[0-9]*' found on http/https URLs IF they have some
29: * path/query component (content after third slash). Top 'slash page'
30: * URIs are left unstripped: we prefer crawling redundant
31: * top pages to missing an entire site only available from either
32: * the www-full or www-less hostname, but not both.
33: * @author stack
34: * @version $Date: 2006-09-18 20:32:47 +0000 (Mon, 18 Sep 2006) $, $Revision: 4634 $
35: */
36: public class StripWWWNRule extends BaseRule {
37: private static final long serialVersionUID = 3619916990307308590L;
38:
39: private static final String DESCRIPTION = "Strip any 'www[0-9]*' found. "
40: + "Use this rule to equate 'http://www.archive.org/index.html' and "
41: + "'http://www0001.archive.org/index.html' with "
42: + "'http://archive.org/index.html'. The resulting canonicalization "
43: + "returns 'http://archive.org/index.html'. It removes any www's "
44: + "or wwwNNN's found, where 'N' is one or more numerics, EXCEPT "
45: + "on URIs that have no path/query component "
46: + ". Top-level 'slash page' URIs are left unstripped: we prefer "
47: + "crawling redundant top pages to missing an entire site only "
48: + "available from either the www-full or www-less hostname, but not "
49: + "both. Operates on http and https schemes only. "
50: + "Use StripWWWRule to strip a lone 'www' only (This rule is a "
51: + "more general version of StripWWWRule).";
52:
53: private static final Pattern REGEX = Pattern
54: .compile("(?i)^(https?://)(?:www[0-9]*\\.)([^/]*/.+)$");
55:
56: public StripWWWNRule(String name) {
57: super (name, DESCRIPTION);
58: }
59:
60: public String canonicalize(String url, Object context) {
61: return doStripRegexMatch(url, REGEX.matcher(url));
62: }
63: }
|