01: /* CanonicalizerTest
02: *
03: * Created on Oct 7, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.url;
24:
25: import java.io.File;
26:
27: import org.apache.commons.httpclient.URIException;
28: import org.archive.crawler.datamodel.CrawlOrder;
29: import org.archive.crawler.settings.MapType;
30: import org.archive.crawler.settings.XMLSettingsHandler;
31: import org.archive.crawler.url.canonicalize.FixupQueryStr;
32: import org.archive.crawler.url.canonicalize.LowercaseRule;
33: import org.archive.crawler.url.canonicalize.StripSessionIDs;
34: import org.archive.crawler.url.canonicalize.StripUserinfoRule;
35: import org.archive.crawler.url.canonicalize.StripWWWRule;
36: import org.archive.net.UURIFactory;
37: import org.archive.util.TmpDirTestCase;
38:
39: /**
40: * Test canonicalization.
41: * @author stack
42: * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
43: */
44: public class CanonicalizerTest extends TmpDirTestCase {
45: private File orderFile;
46: protected XMLSettingsHandler settingsHandler;
47:
48: private MapType rules = null;
49:
50: protected void setUp() throws Exception {
51: super .setUp();
52: this .orderFile = new File(getTmpDir(), this .getClass()
53: .getName()
54: + ".order.xml");
55: this .settingsHandler = new XMLSettingsHandler(orderFile);
56: this .settingsHandler.initialize();
57:
58: this .rules = (MapType) (settingsHandler.getSettingsObject(null))
59: .getModule(CrawlOrder.ATTR_NAME).getAttribute(
60: CrawlOrder.ATTR_RULES);
61: this .rules.addElement(null, new LowercaseRule("lowercase"));
62: this .rules.addElement(null, new StripUserinfoRule("userinfo"));
63: this .rules.addElement(null, new StripWWWRule("www"));
64: this .rules.addElement(null, new StripSessionIDs("ids"));
65: this .rules.addElement(null, new FixupQueryStr("querystr"));
66: }
67:
68: public void testCanonicalize() throws URIException {
69: final String scheme = "http://";
70: final String nonQueryStr = "archive.org/index.html";
71: final String result = scheme + nonQueryStr;
72: assertTrue("Mangled original", result.equals(Canonicalizer
73: .canonicalize(UURIFactory.getInstance(result),
74: this .rules.iterator(UURIFactory
75: .getInstance(result)))));
76: String tmp = scheme + "www." + nonQueryStr;
77: assertTrue("Mangled www", result.equals(Canonicalizer
78: .canonicalize(UURIFactory.getInstance(tmp), this .rules
79: .iterator(UURIFactory.getInstance(result)))));
80: tmp = scheme + "www." + nonQueryStr
81: + "?jsessionid=01234567890123456789012345678901";
82: assertTrue("Mangled sessionid", result.equals(Canonicalizer
83: .canonicalize(UURIFactory.getInstance(tmp), this .rules
84: .iterator(UURIFactory.getInstance(result)))));
85: tmp = scheme + "www." + nonQueryStr
86: + "?jsessionid=01234567890123456789012345678901";
87: assertTrue("Mangled sessionid", result.equals(Canonicalizer
88: .canonicalize(UURIFactory.getInstance(tmp), this.rules
89: .iterator(UURIFactory.getInstance(result)))));
90: }
91: }
|