001: package org.archive.crawler.scope;
002:
003: /* SeedCachingScopeTest
004: *
005: * $Id: SeedCachingScopeTest.java 4651 2006-09-25 18:31:13Z paul_jack $
006: *
007: * Created on Mar 30, 2005
008: *
009: * Copyright (C) 2005 Internet Archive.
010: *
011: * This file is part of the Heritrix web crawler (crawler.archive.org).
012: *
013: * Heritrix is free software; you can redistribute it and/or modify
014: * it under the terms of the GNU Lesser Public License as published by
015: * the Free Software Foundation; either version 2.1 of the License, or
016: * any later version.
017: *
018: * Heritrix is distributed in the hope that it will be useful,
019: * but WITHOUT ANY WARRANTY; without even the implied warranty of
020: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
021: * GNU Lesser Public License for more details.
022: *
023: * You should have received a copy of the GNU Lesser Public License
024: * along with Heritrix; if not, write to the Free Software
025: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
026: */
027:
028: import java.io.File;
029: import java.io.FileWriter;
030: import java.io.IOException;
031: import java.io.PrintWriter;
032: import java.util.Comparator;
033: import java.util.Iterator;
034: import java.util.Set;
035: import java.util.TreeSet;
036:
037: import org.apache.commons.httpclient.URIException;
038: import org.archive.crawler.datamodel.CrawlURI;
039: import org.archive.net.UURI;
040: import org.archive.net.UURIFactory;
041: import org.archive.util.TmpDirTestCase;
042:
043: /**
044: * Test {@link SeedCachingScope}.
045: * @author stack gojomo
046: * @version $Revision: 4651 $, $Date: 2006-09-25 18:31:13 +0000 (Mon, 25 Sep 2006) $
047: */
048: public class SeedCachingScopeTest extends TmpDirTestCase {
049: /**
050: * Constrained SeedCachingScope subclass for testing
051: *
052: * @author gojomo
053: */
054: private class UnitTestSeedCachingScope extends SeedCachingScope {
055:
056: private static final long serialVersionUID = -1651873833038665447L;
057:
058: private File seedsfile;
059:
060: public UnitTestSeedCachingScope(File seedsfile) {
061: super ("test");
062: this .seedsfile = seedsfile;
063: }
064:
065: public File getSeedfile() {
066: return seedsfile;
067: }
068: }
069:
070: private static Set<UURI> seeds = null;
071:
072: /**
073: * Comparator for treeset of uuris.
074: */
075: private static final Comparator<UURI> CMP = new Comparator<UURI>() {
076: public int compare(UURI o1, UURI o2) {
077: int result = -1;
078: if (o1 == null && o1 == null) {
079: result = 0;
080: } else if (o1 == null) {
081: result = -1;
082: } else if (o2 == null) {
083: result = 1;
084: } else {
085: String s1 = o1.toString();
086: String s2 = o2.toString();
087: result = s1.compareTo(s2);
088: result = (result < 0) ? result = -1
089: : (result > 0) ? result = 1 : 0;
090: }
091: return result;
092: }
093: };
094:
095: /**
096: * Seed file reference.
097: */
098: private File seedsfile;
099:
100: /* (non-Javadoc)
101: * @see org.archive.util.TmpDirTestCase#setUp()
102: */
103: protected void setUp() throws Exception {
104: super .setUp();
105:
106: // First create array of seeds and add to treeset.
107: SeedCachingScopeTest.seeds = new TreeSet<UURI>(
108: SeedCachingScopeTest.CMP);
109: String[] uris = { "mailto:www.google.com",
110: "http://www.port.com:80/etc/motd2",
111: "http://a:b@userinfo.com/etc/motd2",
112: "news:www.google.com", "http://www.google.com",
113: "https://www.google.com", "gopher://www.google.com",
114: "news://www.google.com", "rss://www.google.com",
115: "telnet://www.google.com",
116: "ftp://myname@example.com/etc/motd",
117: "ftp://example.com/etc/motd2" };
118: for (int i = 0; i < uris.length; i++) {
119: SeedCachingScopeTest.seeds.add(UURIFactory
120: .getInstance(uris[i]));
121: }
122:
123: // Write a seeds file w/ our list of seeds.
124: this .seedsfile = new File(getTmpDir(),
125: SeedCachingScopeTest.class.getName() + ".seedfile");
126: PrintWriter writer = new PrintWriter(new FileWriter(
127: this .seedsfile));
128: for (int i = 0; i < uris.length; i++) {
129: writer.println(uris[i]);
130: }
131: writer.close();
132: }
133:
134: /* (non-Javadoc)
135: * @see org.archive.util.TmpDirTestCase#tearDown()
136: */
137: protected void tearDown() throws Exception {
138: super .tearDown();
139: if (this .seedsfile.exists()) {
140: this .seedsfile.delete();
141: }
142: }
143:
144: public void testGeneral() throws URIException {
145: // First make sure that I can get the seed set from seed file.
146: SeedCachingScope sl = checkContent(SeedCachingScopeTest.seeds);
147: // Now do add and see if get set matches seed file content.
148: final CrawlURI curi = new CrawlURI(UURIFactory
149: .getInstance("http://one.two.three"));
150: sl.addSeed(curi);
151: Set<UURI> set = new TreeSet<UURI>(SeedCachingScopeTest.CMP);
152: set.addAll(SeedCachingScopeTest.seeds);
153: set.add(curi.getUURI());
154: checkContent(sl, set);
155: }
156:
157: public void testNoScheme() throws IOException {
158: final String NOSCHEME = "x.y.z";
159: FileWriter fw = new FileWriter(this .seedsfile, true);
160: // Write to new (last) line the URL.
161: fw.write("\n");
162: fw.write(NOSCHEME);
163: fw.flush();
164: fw.close();
165: boolean found = false;
166: SeedCachingScope sl = new UnitTestSeedCachingScope(seedsfile);
167: for (Iterator i = sl.seedsIterator(); i.hasNext();) {
168: UURI uuri = (UURI) i.next();
169: if (uuri.getHost() == null) {
170: continue;
171: }
172: if (uuri.getHost().equals(NOSCHEME)) {
173: found = true;
174: break;
175: }
176: }
177: assertTrue("Did not find " + NOSCHEME, found);
178: }
179:
180: private SeedCachingScope checkContent(Set seedSet) {
181: return checkContent(null, seedSet);
182: }
183:
184: private SeedCachingScope checkContent(SeedCachingScope sl,
185: Set seedSet) {
186: if (sl == null) {
187: sl = new UnitTestSeedCachingScope(this .seedsfile);
188: }
189: int count = 0;
190: for (Iterator i = sl.seedsIterator(); i.hasNext();) {
191: count++;
192: UURI uuri = (UURI) i.next();
193: assertTrue("Does not contain: " + uuri.toString(), seedSet
194: .contains(uuri));
195: }
196: assertTrue("Different sizes: " + count + ", " + seedSet.size(),
197: count == seedSet.size());
198: return sl;
199: }
200: }
|