01: /* SeedFileIteratorTest
02: *
03: * $Id: SeedFileIteratorTest.java 4651 2006-09-25 18:31:13Z paul_jack $
04: *
05: * Created on May 31, 2005
06: *
07: * Copyright (C) 2005 Internet Archive.
08: *
09: * This file is part of the Heritrix web crawler (crawler.archive.org).
10: *
11: * Heritrix is free software; you can redistribute it and/or modify
12: * it under the terms of the GNU Lesser Public License as published by
13: * the Free Software Foundation; either version 2.1 of the License, or
14: * any later version.
15: *
16: * Heritrix is distributed in the hope that it will be useful,
17: * but WITHOUT ANY WARRANTY; without even the implied warranty of
18: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19: * GNU Lesser Public License for more details.
20: *
21: * You should have received a copy of the GNU Lesser Public License
22: * along with Heritrix; if not, write to the Free Software
23: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24: */
25: package org.archive.crawler.scope;
26:
27: import java.io.BufferedReader;
28: import java.io.BufferedWriter;
29: import java.io.IOException;
30: import java.io.StringReader;
31: import java.io.StringWriter;
32: import java.util.LinkedList;
33:
34: import junit.framework.TestCase;
35:
36: import org.archive.net.UURI;
37:
38: /**
39: * Test {@link SeedFileIterator}.
40: * @author gojomo
41: * @version $Revision: 4651 $, $Date: 2006-09-25 18:31:13 +0000 (Mon, 25 Sep 2006) $
42: */
43: public class SeedFileIteratorTest extends TestCase {
44: public void testHyphenInHost() {
45: final String seedFileContent = "http://www.examp-le.com/";
46: StringWriter sw = new StringWriter();
47: StringReader sr = new StringReader(seedFileContent);
48: UURI seed = (UURI) (new SeedFileIterator(
49: new BufferedReader(sr), sw)).next();
50: assertEquals("Hyphen is problem", seed.toString(),
51: seedFileContent);
52: }
53:
54: public void testGeneral() throws IOException {
55: String seedFile = "# comment\n" + // comment
56: "\n" + // blank line
57: "www.example.com\n" + // naked host, implied scheme
58: "www.example.org/foo\n" + // naked host+path, implied scheme
59: "http://www.example.net\n" + // full HTTP URL
60: "+http://www.example.us"; // 'directive' (should be ignored)
61: StringWriter ignored = new StringWriter();
62: SeedFileIterator iter = new SeedFileIterator(
63: new BufferedReader(new StringReader(seedFile)),
64: new BufferedWriter(ignored));
65: LinkedList<String> seeds = new LinkedList<String>();
66: while (iter.hasNext()) {
67: UURI n = iter.next();
68: if (n instanceof UURI) {
69: seeds.add(n.getURI());
70: }
71: }
72: assertTrue("didn't get naked host", seeds
73: .contains("http://www.example.com/"));
74: assertTrue("didn't get naked host+path", seeds
75: .contains("http://www.example.org/foo"));
76: assertTrue("didn't get full http URL", seeds
77: .contains("http://www.example.net/"));
78: assertTrue("got wrong number of URLs", seeds.size() == 3);
79: assertTrue("ignored entry not reported", ignored.toString()
80: .indexOf("+http://www.example.us") >= 0);
81: }
82: }
|