01: /* FixupQueryStr
02: *
03: * Created on Oct 5, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.url.canonicalize;
24:
25: /**
26: * Strip any trailing question mark.
27: * @author stack
28: * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
29: */
30: public class FixupQueryStr extends BaseRule {
31:
32: private static final long serialVersionUID = 3169526832544474794L;
33:
34: private static final String DESCRIPTION = "Fixup the question mark that leads off the query string. "
35: + "This rule returns 'http://www.archive.org/index.html' if passed"
36: + " 'http://www.archive.org/index.html?'. It will also strip '?&'"
37: + " if '?&' is all that comprises the query string. Also strips"
38: + " extraneous leading '&': Returns 'http://archive.org/index.html?x=y"
39: + " if passed 'http://archive.org/index.html?&x=y."
40: + " Will also strip '&' if last thing in query string."
41: + " Operates on all schemes. This is a good rule to run toward the"
42: + " end of canonicalization processing.";
43:
44: public FixupQueryStr(String name) {
45: super (name, DESCRIPTION);
46: }
47:
48: public String canonicalize(String url, Object context) {
49: if (url == null || url.length() <= 0) {
50: return url;
51: }
52:
53: int index = url.lastIndexOf('?');
54: if (index > 0) {
55: if (index == (url.length() - 1)) {
56: // '?' is last char in url. Strip it.
57: url = url.substring(0, url.length() - 1);
58: } else if (url.charAt(index + 1) == '&') {
59: // Next char is '&'. Strip it.
60: if (url.length() == (index + 2)) {
61: // Then url ends with '?&'. Strip them.
62: url = url.substring(0, url.length() - 2);
63: } else {
64: // The '&' is redundant. Strip it.
65: url = url.substring(0, index + 1)
66: + url.substring(index + 2);
67: }
68: } else if (url.charAt(url.length() - 1) == '&') {
69: // If we have a lone '&' on end of query str,
70: // strip it.
71: url = url.substring(0, url.length() - 1);
72: }
73: }
74: return url;
75: }
76: }
|