001: /* Copyright 2004 Ryan Ackley
002: *
003: * Licensed under the Apache License, Version 2.0 (the "License");
004: * you may not use this file except in compliance with the License.
005: * You may obtain a copy of the License at
006: *
007: * http://www.apache.org/licenses/LICENSE-2.0
008: *
009: * Unless required by applicable law or agreed to in writing, software
010: * distributed under the License is distributed on an "AS IS" BASIS,
011: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012: * See the License for the specific language governing permissions and
013: * limitations under the License.
014: */
015:
016: package org.textmining.text.extraction;
017:
018: import org.apache.poi.util.LittleEndian;
019: import org.apache.poi.hwpf.model.*;
020: import org.textmining.text.extraction.sprm.*;
021: import org.textmining.text.extraction.chp.*;
022:
023: import java.util.*;
024: import java.io.*;
025:
026: /**
027: * This class is used to extract text from Word 6 documents only. It should
028: * only be called from the org.textmining.text.extraction.WordExtractor because
029: * it will automatically determine the version.
030: *
031: * @author Ryan Ackley
032: */
033: class Word6Extractor {
034:
035: public Word6Extractor() {
036: }
037:
038: /**
039: * Extracts the text
040: *
041: * @param mainStream The POIFS document stream entitled "WordDocument".
042: *
043: * @return The text from the document
044: * @throws Exception If there are any unexpected exceptions.
045: */
046: public String extractText(byte[] mainStream) throws Exception {
047: int fcMin = LittleEndian.getInt(mainStream, 0x18);
048: int fcMax = LittleEndian.getInt(mainStream, 0x1C);
049:
050: int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
051: int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
052:
053: // get a list of character properties
054: Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream,
055: chpTableOffset, chpTableSize, fcMin);
056: List textRuns = chpTable.getTextRuns();
057:
058: // iterate through the
059: WordTextBuffer finalTextBuf = new WordTextBuffer();
060: Iterator runsIt = textRuns.iterator();
061: while (runsIt.hasNext()) {
062: CHPX chpx = (CHPX) runsIt.next();
063: int runStart = chpx.getStart() + fcMin;
064: int runEnd = chpx.getEnd() + fcMin;
065:
066: if (!isDeleted(chpx.getGrpprl())) {
067: String s = new String(mainStream, runStart, Math.min(
068: runEnd, fcMax)
069: - runStart, "Cp1252");
070: finalTextBuf.append(s);
071: if (runEnd >= fcMax) {
072: break;
073: }
074: }
075: }
076:
077: return finalTextBuf.toString();
078: }
079:
080: /**
081: * Used to determine if a run of text has been deleted.
082: * @param grpprl The list of sprms for this run of text.
083: * @return
084: */
085: private boolean isDeleted(byte[] grpprl) {
086: int offset = 0;
087: boolean deleted = false;
088: while (offset < grpprl.length) {
089: switch (LittleEndian.getUnsignedByte(grpprl, offset++)) {
090: case 65:
091: deleted = grpprl[offset++] != 0;
092: break;
093: case 66:
094: offset++;
095: break;
096: case 67:
097: offset++;
098: break;
099: case 68:
100: offset += grpprl[offset];
101: break;
102: case 69:
103: offset += 2;
104: break;
105: case 70:
106: offset += 4;
107: break;
108: case 71:
109: offset++;
110: break;
111: case 72:
112: offset += 2;
113: break;
114: case 73:
115: offset += 3;
116: break;
117: case 74:
118: offset += grpprl[offset];
119: break;
120: case 75:
121: offset++;
122: break;
123: case 80:
124: offset += 2;
125: break;
126: case 81:
127: offset += grpprl[offset];
128: break;
129: case 82:
130: offset += grpprl[offset];
131: break;
132: case 83:
133: break;
134: case 85:
135: offset++;
136: break;
137: case 86:
138: offset++;
139: break;
140: case 87:
141: offset++;
142: break;
143: case 88:
144: offset++;
145: break;
146: case 89:
147: offset++;
148: break;
149: case 90:
150: offset++;
151: break;
152: case 91:
153: offset++;
154: break;
155: case 92:
156: offset++;
157: break;
158: case 93:
159: offset += 2;
160: break;
161: case 94:
162: offset++;
163: break;
164: case 95:
165: offset += 3;
166: break;
167: case 96:
168: offset += 2;
169: break;
170: case 97:
171: offset += 2;
172: break;
173: case 98:
174: offset++;
175: break;
176: case 99:
177: offset++;
178: break;
179: case 100:
180: offset++;
181: break;
182: case 101:
183: offset++;
184: break;
185: case 102:
186: offset++;
187: break;
188: case 103:
189: offset += grpprl[offset];
190: break;
191: case 104:
192: offset++;
193: break;
194: case 105:
195: offset += grpprl[offset];
196: break;
197: case 106:
198: offset += grpprl[offset];
199: break;
200: case 107:
201: offset += 2;
202: break;
203: case 108:
204: offset += grpprl[offset];
205: break;
206: case 109:
207: offset += 2;
208: break;
209: case 110:
210: offset += 2;
211: break;
212: case 117:
213: offset++;
214: break;
215: case 118:
216: offset++;
217: break;
218:
219: }
220: }
221: return deleted;
222: }
223: }
|