01: /* RobotstxtTest
02: *
03: * $Id: RobotstxtTest.java 4668 2006-09-26 21:49:01Z paul_jack $
04: *
05: * Created Sep 1, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.datamodel;
26:
27: import java.io.BufferedReader;
28: import java.io.IOException;
29: import java.io.StringReader;
30: import java.util.HashMap;
31: import java.util.LinkedList;
32: import java.util.List;
33:
34: import junit.framework.TestCase;
35:
36: public class RobotstxtTest extends TestCase {
37: public void testParseRobots() throws IOException {
38: LinkedList<String> userAgents = new LinkedList<String>();
39: HashMap<String, List<String>> disallows = new HashMap<String, List<String>>();
40: BufferedReader reader = new BufferedReader(new StringReader(
41: "BLAH"));
42: assertFalse(Robotstxt.parse(reader, userAgents, disallows));
43: assertTrue(disallows.size() == 0);
44: // Parse archive robots.txt with heritrix agent.
45: String agent = "archive.org_bot";
46: reader = new BufferedReader(new StringReader("User-agent: "
47: + agent + "\n" + "Disallow: /cgi-bin/\n"
48: + "Disallow: /details/software\n"));
49: assertFalse(Robotstxt.parse(reader, userAgents, disallows));
50: assertTrue(disallows.size() == 1);
51: assertTrue(userAgents.size() == 1);
52: assertEquals(userAgents.get(0), agent);
53: // Parse archive robots.txt with star agent.
54: agent = "*";
55: reader = new BufferedReader(new StringReader("User-agent: "
56: + agent + "\n" + "Disallow: /cgi-bin/\n"
57: + "Disallow: /details/software\n"));
58: disallows = new HashMap<String, List<String>>();
59: userAgents = new LinkedList<String>();
60: assertFalse(Robotstxt.parse(reader, userAgents, disallows));
61: assertTrue(disallows.size() == 1);
62: assertTrue(userAgents.size() == 1);
63: assertEquals(userAgents.get(0), "");
64: }
65: }
|