01: /*
02: * Created on 2006-aug-25
03: *
04: * Copyright (C) 2006 Royal Library of Sweden.
05: *
06: * This program is free software; you can redistribute it and/or
07: * modify it under the terms of the GNU Lesser General Public License
08: * as published by the Free Software Foundation; either version 2
09: * of the License, or (at your option) any later version.
10: *
11: * This program is distributed in the hope that it will be useful,
12: * but WITHOUT ANY WARRANTY; without even the implied warranty of
13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: * GNU Lesser General Public License for more details.
15: *
16: * You should have received a copy of the GNU Lesser General Public License
17: * along with this program; if not, write to the Free Software
18: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19: */
20: package org.archive.crawler.url.canonicalize;
21:
22: import java.util.regex.Matcher;
23: import java.util.regex.Pattern;
24:
25: public class StripExtraSlashes extends BaseRule {
26:
27: private static final String DESCRIPTION = "Strip any extra slashes, '/', found in the path. "
28: + "Use this rule to equate 'http://www.archive.org//A//B/index.html' and "
29: + "'http://www.archive.org/A/B/index.html'.";
30:
31: private static final Pattern REGEX = Pattern
32: .compile("(^https?://.*?)//+(.*)");
33:
34: public StripExtraSlashes(String name) {
35: super (name, DESCRIPTION);
36: }
37:
38: public String canonicalize(String url, Object context) {
39: Matcher matcher = REGEX.matcher(url);
40: while (matcher.matches()) {
41: url = matcher.group(1) + "/" + matcher.group(2);
42: matcher = REGEX.matcher(url);
43: }
44: return url;
45: }
46: }
|