01: /*
02: * Licensed to the Apache Software Foundation (ASF) under one or more
03: * contributor license agreements. See the NOTICE file distributed with
04: * this work for additional information regarding copyright ownership.
05: * The ASF licenses this file to You under the Apache License, Version 2.0
06: * (the "License"); you may not use this file except in compliance with
07: * the License. You may obtain a copy of the License at
08: *
09: * http://www.apache.org/licenses/LICENSE-2.0
10: *
11: * Unless required by applicable law or agreed to in writing, software
12: * distributed under the License is distributed on an "AS IS" BASIS,
13: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14: * See the License for the specific language governing permissions and
15: * limitations under the License.
16: */
17: package org.apache.poi.hwpf.extractor;
18:
19: import java.io.FileInputStream;
20: import java.util.Iterator;
21:
22: import org.apache.poi.hwpf.HWPFDocument;
23: import org.apache.poi.hwpf.model.TextPiece;
24: import org.apache.poi.hwpf.usermodel.Paragraph;
25: import org.apache.poi.hwpf.usermodel.Range;
26:
27: import junit.framework.TestCase;
28:
29: /**
30: * Test the different routes to extracting text
31: *
32: * @author Nick Burch (nick at torchbox dot com)
33: */
34: public class TestDifferentRoutes extends TestCase {
35: private String[] p_text = new String[] {
36: "This is a simple word document\r",
37: "\r",
38: "It has a number of paragraphs in it\r",
39: "\r",
40: "Some of them even feature bold, italic and underlined text\r",
41: "\r", "\r", "This bit is in a different font and size\r",
42: "\r", "\r", "This bit features some red text.\r", "\r",
43: "\r", "It is otherwise very very boring.\r" };
44:
45: private HWPFDocument doc;
46:
47: protected void setUp() throws Exception {
48: String dirname = System.getProperty("HWPF.testdata.path");
49:
50: String filename = dirname + "/test2.doc";
51: doc = new HWPFDocument(new FileInputStream(filename));
52: }
53:
54: /**
55: * Test model based extraction
56: */
57: public void testExtractFromModel() {
58: Range r = doc.getRange();
59:
60: String[] text = new String[r.numParagraphs()];
61: for (int i = 0; i < r.numParagraphs(); i++) {
62: Paragraph p = r.getParagraph(i);
63: text[i] = p.text();
64: }
65:
66: assertEquals(p_text.length, text.length);
67: for (int i = 0; i < p_text.length; i++) {
68: assertEquals(p_text[i], text[i]);
69: }
70: }
71:
72: /**
73: * Test textPieces based extraction
74: */
75: public void testExtractFromTextPieces() throws Exception {
76: StringBuffer textBuf = new StringBuffer();
77:
78: Iterator textPieces = doc.getTextTable().getTextPieces()
79: .iterator();
80: while (textPieces.hasNext()) {
81: TextPiece piece = (TextPiece) textPieces.next();
82:
83: String encoding = "Cp1252";
84: if (piece.usesUnicode()) {
85: encoding = "UTF-16LE";
86: }
87: String text = new String(piece.getRawBytes(), encoding);
88: textBuf.append(text);
89: }
90:
91: StringBuffer exp = new StringBuffer();
92: for (int i = 0; i < p_text.length; i++) {
93: exp.append(p_text[i]);
94: }
95: assertEquals(exp.toString(), textBuf.toString());
96: }
97: }
|