01: /* RegexRuleTest
02: *
03: * Created on Oct 6, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.url.canonicalize;
24:
25: import java.io.File;
26:
27: import javax.management.InvalidAttributeValueException;
28:
29: import org.apache.commons.httpclient.URIException;
30: import org.archive.crawler.datamodel.CrawlOrder;
31: import org.archive.crawler.settings.MapType;
32: import org.archive.crawler.settings.XMLSettingsHandler;
33: import org.archive.net.UURIFactory;
34: import org.archive.util.TmpDirTestCase;
35:
36: /**
37: * Test the regex rule.
38: * @author stack
39: * @version $Date: 2005-07-18 17:30:21 +0000 (Mon, 18 Jul 2005) $, $Revision: 3704 $
40: */
41: public class RegexRuleTest extends TmpDirTestCase {
42: private File orderFile;
43: protected XMLSettingsHandler settingsHandler;
44: private MapType rules = null;
45:
46: protected void setUp() throws Exception {
47: super .setUp();
48: this .orderFile = new File(getTmpDir(), this .getClass()
49: .getName()
50: + ".order.xml");
51: this .settingsHandler = new XMLSettingsHandler(orderFile);
52: this .settingsHandler.initialize();
53: this .rules = (MapType) (settingsHandler.getSettingsObject(null))
54: .getModule(CrawlOrder.ATTR_NAME).getAttribute(
55: CrawlOrder.ATTR_RULES);
56: }
57:
58: public void testCanonicalize() throws URIException,
59: InvalidAttributeValueException {
60: final String url = "http://www.aRchive.Org/index.html";
61: RegexRule rr = new RegexRule("Test "
62: + this .getClass().getName());
63: this .rules.addElement(null, rr);
64: rr.canonicalize(url, UURIFactory.getInstance(url));
65: String product = rr.canonicalize(url, null);
66: assertTrue("Default doesn't work.", url.equals(product));
67: }
68:
69: public void testSessionid() throws InvalidAttributeValueException {
70: final String urlBase = "http://joann.com/catalog.jhtml";
71: final String urlMinusSessionid = urlBase + "?CATID=96029";
72: final String url = urlBase
73: + ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO?CATID=96029";
74: RegexRule rr = new RegexRule("Test",
75: "^(.+)(?:;\\$sessionid\\$[A-Z0-9]{32})(\\?.*)+$",
76: "$1$2");
77: this .rules.addElement(null, rr);
78: String product = rr.canonicalize(url, null);
79: assertTrue("Failed " + url, urlMinusSessionid.equals(product));
80: }
81:
82: public void testNullFormat() throws InvalidAttributeValueException {
83: final String urlBase = "http://joann.com/catalog.jhtml";
84: final String url = urlBase
85: + ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO";
86: RegexRule rr = new RegexRule("Test",
87: "^(.+)(?:;\\$sessionid\\$[A-Z0-9]{32})$", "$1$2");
88: this .rules.addElement(null, rr);
89: String product = rr.canonicalize(url, null);
90: assertTrue("Failed " + url, urlBase.equals(product));
91: }
92: }
|