01: /* CharsetSelfTest
02: *
03: * Created on Mar 10, 2004
04: *
05: * Copyright (C) 2004 Internet Archive.
06: *
07: * This file is part of the Heritrix web crawler (crawler.archive.org).
08: *
09: * Heritrix is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU Lesser Public License as published by
11: * the Free Software Foundation; either version 2.1 of the License, or
12: * any later version.
13: *
14: * Heritrix is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU Lesser Public License for more details.
18: *
19: * You should have received a copy of the GNU Lesser Public License
20: * along with Heritrix; if not, write to the Free Software
21: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22: */
23: package org.archive.crawler.selftest;
25: import java.io.File;
26: import java.util.Arrays;
27: import java.util.List;
29: /**
30: * Simple test to ensure we can extract links from multibyte pages.
31: *
32: * That is, can we regex over a multibyte stream.
33: *
34: * @author stack
35: * @version $Revision: 4931 $, $Date: 2007-02-21 18:48:17 +0000 (Wed, 21 Feb 2007) $
36: */
37: public class CharsetSelfTest extends SelfTestCase {
38: /**
39: * Files to find as a list.
40: */
41: private static final List<File> FILES_TO_FIND = Arrays
42: .asList(new File[] { new File("utf8.jsp"),
43: new File("shiftjis.jsp"),
44: new File("charsetselftest_end.html") });
46: /**
47: * Look for last file in link chain.
48: *
49: * The way the pages are setup under the CharsetSelfTest directory under
50: * the webapp is that we have one multibyte page w/ a single link buried in
51: * it that points off to another multibyte page. On the end of the link
52: * chain is a page named END_OF_CHAIN_PAGE. This test looks to see that
53: * arc has all pages in the chain.
54: */
55: public void stestCharset() {
56: assertInitialized();
57: testFilesInArc(FILES_TO_FIND);
58: }
59: }