001: /* DocTest
002: *
003: * Created on September 12, 2006
004: *
005: * Copyright (C) 2006 Internet Archive.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.util.ms;
024:
025: import java.io.Closeable;
026: import java.io.File;
027: import java.io.FileInputStream;
028: import java.io.FileOutputStream;
029: import java.io.IOException;
030: import java.io.InputStreamReader;
031: import java.io.OutputStreamWriter;
032: import java.io.Reader;
033: import java.io.Writer;
034:
035: import org.apache.poi.hdf.extractor.WordDocument;
036:
037: import junit.framework.TestCase;
038:
039: public class DocTest extends TestCase {
040:
041: final private static File TEST_DIR = new File("testdata/ms");
042:
043: // Rename to testAgainstPOI to actually run the test.
044: public void testAgainstPOI() throws IOException {
045: int errors = 0;
046: long start = System.currentTimeMillis();
047: for (File f : TEST_DIR.listFiles())
048: try {
049: start = System.currentTimeMillis();
050: if (f.getName().endsWith(".doc")) {
051: errors += runDoc(f);
052: }
053: } finally {
054: long duration = System.currentTimeMillis() - start;
055: System.out.println("Duration in milliseconds: "
056: + duration);
057: }
058: if (errors > 0) {
059: throw new IOException(errors + " errors, see stdout.");
060: }
061: }
062:
063: private int runDoc(File doc) throws IOException {
064: System.out.println("===== Now processing " + doc.getName());
065: String name = doc.getName();
066: int p = name.lastIndexOf('.');
067: String expectedName = name.substring(0, p) + ".txt";
068: File expectedFile = new File(TEST_DIR, expectedName);
069: if (!expectedFile.exists()) {
070: createExpectedOutput(doc, expectedFile);
071: }
072: return runFiles(doc, expectedFile);
073: }
074:
075: private void createExpectedOutput(File doc, File output)
076: throws IOException {
077: FileInputStream finp = new FileInputStream(doc);
078: FileOutputStream fout = new FileOutputStream(output);
079:
080: try {
081: WordDocument wd = new WordDocument(finp);
082: Writer writer = new OutputStreamWriter(fout, "UTF-16BE");
083: wd.writeAllText(writer);
084: } finally {
085: close(finp);
086: close(fout);
087: }
088: }
089:
090: private static void close(Closeable c) {
091: try {
092: c.close();
093: } catch (IOException e) {
094: e.printStackTrace();
095: }
096: }
097:
098: private int runFiles(File doc, File expected) throws IOException {
099: FileInputStream expectedIn = new FileInputStream(expected);
100: Reader expectedReader = new InputStreamReader(expectedIn,
101: "UTF-16BE");
102: Reader docReader = Doc.getText(doc);
103: try {
104: return runReaders(docReader, expectedReader);
105: } finally {
106: close(docReader);
107: close(expectedReader);
108: }
109: }
110:
111: private int runReaders(Reader doc, Reader expected)
112: throws IOException {
113: int count = 0;
114: int errors = 0;
115: boolean go = true;
116: while (go) {
117: int ch = doc.read();
118: int expectedCh = correctPOI(expected.read());
119: if ((ch < 0) || (expectedCh < 0)) {
120: go = false;
121: if ((ch >= 0) || (expectedCh >= 0)) {
122: errors++;
123: System.out.println("File lengths differ.");
124: }
125: }
126: if (ch != expectedCh) {
127: errors += 1;
128: report(count, expectedCh, ch);
129: }
130: count++;
131: }
132: return errors;
133: }
134:
135: private void report(int count, int expected, int actual) {
136: StringBuilder msg = new StringBuilder("#").append(count);
137: msg.append(": Expected ");
138: msg.append(expected).append(" (").append(toChar(expected));
139: msg.append(") but got ").append(actual).append(" (");
140: msg.append(toChar(actual)).append(").");
141: System.out.println(msg);
142: }
143:
144: private static String toChar(int ch) {
145: if (ch < 0) {
146: return "EOF";
147: } else {
148: return Character.toString((char) ch);
149: }
150: }
151:
152: /**
153: * Corrects POI's Cp1252 output. There's a bug somewhere in POI that
154: * makes it produce incorrect characters. Not sure where and don't have
155: * time to track it down. But I have visually checked the input
156: * documents to verify that Doc is producing the right character, and
157: * that POI is not.
158: *
159: * @param ch the POI-produced character to check
160: * @return the corrected character
161: */
162: private static int correctPOI(int ch) {
163: switch (ch) {
164: case 8734:
165: // POI produced the infinity sign when it should have
166: // produced the degrees sign.
167: return 176;
168: case 214:
169: // POI produced an umat O instead of an ellipses mark.
170: return 8230;
171: case 237:
172: // POI produced an acute i instead of a fancy single quote
173: return 8217;
174: case 236:
175: // POI produced a reverse acute i instead of fancy double quote
176: return 8220;
177: case 238:
178: // POI produced a caret i instead of fancy double quote
179: return 8221;
180: default:
181: return ch;
182: }
183: }
184:
185: }
|