001: /* Copyright 2004 Ryan Ackley
002: *
003: * Licensed under the Apache License, Version 2.0 (the "License");
004: * you may not use this file except in compliance with the License.
005: * You may obtain a copy of the License at
006: *
007: * http://www.apache.org/licenses/LICENSE-2.0
008: *
009: * Unless required by applicable law or agreed to in writing, software
010: * distributed under the License is distributed on an "AS IS" BASIS,
011: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012: * See the License for the specific language governing permissions and
013: * limitations under the License.
014: */
015: package org.textmining.text.extraction;
016:
017: import org.apache.poi.poifs.filesystem.*;
018: import org.apache.poi.poifs.storage.RTFSignatureException;
019: import org.apache.poi.util.LittleEndian;
020: import org.apache.poi.hwpf.model.*;
021: import org.textmining.text.extraction.sprm.*;
022:
023: import java.util.*;
024: import java.io.*;
025:
026: /**
027: * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
028: *
029: * @author Ryan Ackley
030: */
031: public class WordExtractor {
032:
033: /**
034: * Constructor
035: */
036: public WordExtractor() {
037: }
038:
039: /**
040: * Gets the text from a Word document.
041: *
042: * @param in The InputStream representing the Word file.
043: */
044: public String extractText(InputStream in) throws Exception {
045: WordTextBuffer finalTextBuf = null;
046: try {
047: ArrayList text = new ArrayList();
048: POIFSFileSystem fsys = new POIFSFileSystem(in);
049:
050: // load our POIFS document streams.
051: DocumentEntry headerProps = (DocumentEntry) fsys.getRoot()
052: .getEntry("WordDocument");
053: DocumentInputStream din = fsys
054: .createDocumentInputStream("WordDocument");
055: byte[] header = new byte[headerProps.getSize()];
056:
057: din.read(header);
058: din.close();
059:
060: int info = LittleEndian.getShort(header, 0xa);
061: if ((info & 0x4) != 0) {
062: //throw new FastSavedException("Fast-saved files are unsupported at this time");
063: }
064: if ((info & 0x100) != 0) {
065: throw new PasswordProtectedException(
066: "This document is password protected");
067: }
068:
069: // determine the version of Word this document came from.
070: int nFib = LittleEndian.getShort(header, 0x2);
071: switch (nFib) {
072: case 101:
073: case 102:
074: case 103:
075: case 104:
076: // this is a Word 6.0 doc send it to the extractor for that version.
077: Word6Extractor oldExtractor = new Word6Extractor();
078:
079: return oldExtractor.extractText(header);
080: }
081:
082: //Get the information we need from the header
083: boolean useTable1 = (info & 0x200) != 0;
084:
085: //get the location of the piece table
086: int complexOffset = LittleEndian.getInt(header, 0x1a2);
087:
088: // determine which table stream we must use.
089: String tableName = null;
090: if (useTable1) {
091: tableName = "1Table";
092: } else {
093: tableName = "0Table";
094: }
095:
096: DocumentEntry table = (DocumentEntry) fsys.getRoot()
097: .getEntry(tableName);
098: byte[] tableStream = new byte[table.getSize()];
099:
100: din = fsys.createDocumentInputStream(tableName);
101:
102: din.read(tableStream);
103: din.close();
104:
105: int chpOffset = LittleEndian.getInt(header, 0xfa);
106: int chpSize = LittleEndian.getInt(header, 0xfe);
107: int fcMin = LittleEndian.getInt(header, 0x18);
108: CHPBinTable cbt = new CHPBinTable(header, tableStream,
109: chpOffset, chpSize, fcMin);
110:
111: // load our text pieces and our character runs
112: ComplexFileTable cft = new ComplexFileTable(header,
113: tableStream, complexOffset, fcMin);
114: TextPieceTable tpt = cft.getTextPieceTable();
115: List textPieces = tpt.getTextPieces();
116:
117: // make the POIFS objects available for garbage collection
118: din = null;
119: fsys = null;
120: table = null;
121: headerProps = null;
122:
123: List textRuns = cbt.getTextRuns();
124: Iterator runIt = textRuns.iterator();
125: Iterator textIt = textPieces.iterator();
126:
127: TextPiece currentPiece = (TextPiece) textIt.next();
128: int currentTextStart = currentPiece.getStart();
129: int currentTextEnd = currentPiece.getEnd();
130:
131: finalTextBuf = new WordTextBuffer();
132:
133: // iterate through all text runs extract the text only if they haven't been
134: // deleted
135: while (runIt.hasNext()) {
136: CHPX chpx = (CHPX) runIt.next();
137: boolean deleted = isDeleted(chpx.getGrpprl());
138: if (deleted) {
139: continue;
140: }
141:
142: int runStart = chpx.getStart();
143: int runEnd = chpx.getEnd();
144:
145: while (runStart >= currentTextEnd) {
146: currentPiece = (TextPiece) textIt.next();
147: currentTextStart = currentPiece.getStart();
148: currentTextEnd = currentPiece.getEnd();
149: }
150:
151: if (runEnd < currentTextEnd) {
152: String str = currentPiece.substring(runStart
153: - currentTextStart, runEnd
154: - currentTextStart);
155: finalTextBuf.append(str);
156: } else if (runEnd > currentTextEnd) {
157: while (runEnd > currentTextEnd) {
158: String str = currentPiece.substring(runStart
159: - currentTextStart, currentTextEnd
160: - currentTextStart);
161: finalTextBuf.append(str);
162: if (textIt.hasNext()) {
163: currentPiece = (TextPiece) textIt.next();
164: currentTextStart = currentPiece.getStart();
165: runStart = currentTextStart;
166: currentTextEnd = currentPiece.getEnd();
167: } else {
168:
169: return finalTextBuf.toString();
170: }
171: }
172: String str = currentPiece.substring(0, runEnd
173: - currentTextStart);
174: finalTextBuf.append(str);
175: } else {
176: String str = currentPiece.substring(runStart
177: - currentTextStart, runEnd
178: - currentTextStart);
179: if (textIt.hasNext()) {
180: currentPiece = (TextPiece) textIt.next();
181: currentTextStart = currentPiece.getStart();
182: currentTextEnd = currentPiece.getEnd();
183: }
184: finalTextBuf.append(str);
185: }
186: }
187: // if(null!=finalTextBuf)
188: return finalTextBuf.toString();
189: // else
190: // return "";
191:
192: } catch (RTFSignatureException e) {
193: throw new RTFSignatureException();
194:
195: } catch (Throwable e) {
196:
197: if (finalTextBuf != null)
198: return finalTextBuf.toString();
199: else
200: return null;
201: }
202: // }
203: }
204:
205: /**
206: * Used to determine if a run of text has been deleted.
207: *
208: * @param grpprl The list of sprms for a particular run of text.
209: * @return true if this run of text has been deleted.
210: */
211: private boolean isDeleted(byte[] grpprl) {
212: SprmIterator iterator = new SprmIterator(grpprl);
213: while (iterator.hasNext()) {
214: SprmOperation op = iterator.next();
215: // 0 is the operation that signals a FDelRMark operation
216: if (op.getOperation() == 0 && op.getOperand() != 0) {
217: return true;
218: }
219: }
220: return false;
221: }
222: }
|