01: /* Copyright 2004 Ryan Ackley
02: *
03: * Licensed under the Apache License, Version 2.0 (the "License");
04: * you may not use this file except in compliance with the License.
05: * You may obtain a copy of the License at
06: *
07: * http://www.apache.org/licenses/LICENSE-2.0
08: *
09: * Unless required by applicable law or agreed to in writing, software
10: * distributed under the License is distributed on an "AS IS" BASIS,
11: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12: * See the License for the specific language governing permissions and
13: * limitations under the License.
14: */
15:
16: package org.textmining.text.extraction.chp;
17:
18: import java.util.List;
19: import java.util.ArrayList;
20: import java.io.OutputStream;
21: import java.io.IOException;
22:
23: import org.apache.poi.poifs.common.POIFSConstants;
24: import org.apache.poi.util.LittleEndian;
25: import org.apache.poi.hwpf.model.io.*;
26: import org.apache.poi.hwpf.model.*;
27:
28: /**
29: * This class holds all of the character formatting properties from a Word
30: * 6.0/95 document.
31: *
32: * @author Ryan Ackley
33: */
34: public class Word6CHPBinTable {
35: /** List of character properties.*/
36: ArrayList _textRuns = new ArrayList();
37:
38: /**
39: * Constructor used to read a binTable in from a Word document.
40: *
41: * @param documentStream The POIFS "WordDocument" stream from a Word document
42: * @param offset The offset of the Chp bin table in the main stream.
43: * @param size The size of the Chp bin table in the main stream.
44: * @param fcMin The start of text in the main stream.
45: */
46: public Word6CHPBinTable(byte[] documentStream, int offset,
47: int size, int fcMin) {
48: PlexOfCps binTable = new PlexOfCps(documentStream, offset,
49: size, 2);
50:
51: int length = binTable.length();
52: for (int x = 0; x < length; x++) {
53: GenericPropertyNode node = binTable.getProperty(x);
54:
55: int pageNum = LittleEndian.getShort((byte[]) node
56: .getBytes());
57: int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum;
58:
59: CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(
60: documentStream, pageOffset, fcMin);
61:
62: int fkpSize = cfkp.size();
63:
64: for (int y = 0; y < fkpSize; y++) {
65: _textRuns.add(cfkp.getCHPX(y));
66: }
67: }
68: }
69:
70: public List getTextRuns() {
71: return _textRuns;
72: }
73:
74: }
|