01: /* BadURIsStopPageParsingSelfTest
02: *
03: * Created on Mar 10, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.selftest;
24:
25: import java.io.File;
26: import java.util.ArrayList;
27: import java.util.Arrays;
28: import java.util.Iterator;
29: import java.util.List;
30:
31: /**
32: * Selftest for figuring problems parsing URIs in a page.
33: *
34: * @author stack
35: * @see <a
36: * href="https://sourceforge.net/tracker/?func=detail&aid=788219&group_id=73833&atid=539099">[ 788219 ]
37: * URI Syntax Errors stop page parsing.</a>
38: * @version $Revision: 4931 $, $Date: 2007-02-21 18:48:17 +0000 (Wed, 21 Feb 2007) $
39: */
40: public class BadURIsStopPageParsingSelfTest extends SelfTestCase {
41: /**
42: * Files to find as a list.
43: *
44: * We don't find goodtwo.html because it has a BASE that is out
45: * of scope.
46: */
47: private static final List<File> FILES_TO_FIND = Arrays
48: .asList(new File[] { new File("goodone.html"),
49: new File("goodthree.html"), new File("one.html"),
50: new File("two.html"), new File("three.html") });
51:
52: public void stestFilesFound() {
53: assertInitialized();
54: List<File> foundFiles = filesFoundInArc();
55: ArrayList<File> editedFoundFiles = new ArrayList<File>(
56: foundFiles.size());
57: for (Iterator i = foundFiles.iterator(); i.hasNext();) {
58: File f = (File) i.next();
59: if (f.getAbsolutePath().endsWith("polishex.html")) {
60: // There is a URI in our list with the above as suffix. Its in
61: // the arc as a 404. Remove it. It doesn't exist on disk so it
62: // will cause the below testFilesInArc to fail.
63: continue;
64: }
65: editedFoundFiles.add(f);
66: }
67: testFilesInArc(FILES_TO_FIND, editedFoundFiles);
68: }
69: }
|