001: /* CrawlURITest
002: *
003: * Created on Jul 26, 2004
004: *
005: * Copyright (C) 2004 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.datamodel;
024:
025: import java.io.File;
026: import java.io.FileInputStream;
027: import java.io.FileOutputStream;
028: import java.io.IOException;
029: import java.io.ObjectInputStream;
030: import java.io.ObjectOutputStream;
031:
032: import org.apache.commons.httpclient.URIException;
033: import org.archive.net.UURIFactory;
034: import org.archive.util.TmpDirTestCase;
035:
036: /**
037: * @author stack
038: * @version $Revision: 3771 $, $Date: 2005-08-29 21:52:36 +0000 (Mon, 29 Aug 2005) $
039: */
040: public class CrawlURITest extends TmpDirTestCase {
041:
042: CrawlURI seed = null;
043:
044: protected void setUp() throws Exception {
045: super .setUp();
046: final String url = "http://www.dh.gov.uk/Home/fs/en";
047: this .seed = new CrawlURI(UURIFactory.getInstance(url));
048: this .seed.setSchedulingDirective(CandidateURI.MEDIUM);
049: this .seed.setIsSeed(true);
050: // Force caching of string.
051: this .seed.toString();
052: // TODO: should this via really be itself?
053: this .seed.setVia(UURIFactory.getInstance(url));
054: }
055:
056: /**
057: * Test serialization/deserialization works.
058: *
059: * @throws IOException
060: * @throws ClassNotFoundException
061: */
062: final public void testSerialization() throws IOException,
063: ClassNotFoundException {
064: File serialize = new File(getTmpDir(), this .getClass()
065: .getName()
066: + ".serialize");
067: try {
068: FileOutputStream fos = new FileOutputStream(serialize);
069: ObjectOutputStream oos = new ObjectOutputStream(fos);
070: oos.writeObject(this .seed);
071: oos.reset();
072: oos.writeObject(this .seed);
073: oos.reset();
074: oos.writeObject(this .seed);
075: oos.close();
076: // Read in the object.
077: FileInputStream fis = new FileInputStream(serialize);
078: ObjectInputStream ois = new ObjectInputStream(fis);
079: CrawlURI deserializedCuri = (CrawlURI) ois.readObject();
080: deserializedCuri = (CrawlURI) ois.readObject();
081: deserializedCuri = (CrawlURI) ois.readObject();
082: assertTrue("Deserialized not equal to original", this .seed
083: .toString().equals(deserializedCuri.toString()));
084: String host = this .seed.getUURI().getHost();
085: assertTrue("Deserialized host not null", host != null
086: && host.length() >= 0);
087: } finally {
088: serialize.delete();
089: }
090: }
091:
092: public void testCandidateURIWithLoadedAList() throws URIException {
093: CandidateURI c = CandidateURI
094: .createSeedCandidateURI(UURIFactory
095: .getInstance("http://www.archive.org"));
096: c.putString("key", "value");
097: CrawlURI curi = new CrawlURI(c, 0);
098: assertTrue("Didn't find AList item", curi.getString("key")
099: .equals("value"));
100: }
101:
102: // TODO: move to QueueAssignmentPolicies
103: // public void testCalculateClassKey() throws URIException {
104: // final String uri = "http://mprsrv.agri.gov.cn";
105: // CrawlURI curi = new CrawlURI(UURIFactory.getInstance(uri));
106: // String key = curi.getClassKey();
107: // assertTrue("Key1 is bad " + key,
108: // key.equals(curi.getUURI().getAuthorityMinusUserinfo()));
109: // final String baduri = "ftp://pfbuser:pfbuser@mprsrv.agri.gov.cn/clzreceive/";
110: // curi = new CrawlURI(UURIFactory.getInstance(baduri));
111: // key = curi.getClassKey();
112: // assertTrue("Key2 is bad " + key,
113: // key.equals(curi.getUURI().getAuthorityMinusUserinfo()));
114: // }
115: }
|